diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json index f34104a6..3a5ad3ad 100644 --- a/dev/.documenter-siteinfo.json +++ b/dev/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.9.3","generation_timestamp":"2023-10-26T20:44:11","documenter_version":"1.1.2"}} \ No newline at end of file +{"documenter":{"julia_version":"1.9.3","generation_timestamp":"2023-10-28T08:29:12","documenter_version":"1.1.2"}} \ No newline at end of file diff --git a/dev/assets/Manifest.toml b/dev/assets/Manifest.toml index 873ccd34..c386c64c 100644 --- a/dev/assets/Manifest.toml +++ b/dev/assets/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.9.3" manifest_format = "2.0" -project_hash = "118b18706ee7a5c993d7252b3dc1283898fcb222" +project_hash = "7bd5312b540d49b0a8586423c747705dd5615096" [[deps.ADTypes]] git-tree-sha1 = "5d2e21d7b0d8c22f67483ef95ebdc39c0e6b6003" @@ -369,9 +369,9 @@ version = "1.9.1" [[deps.DiffEqBase]] deps = ["ArrayInterface", "ChainRulesCore", "DataStructures", "DocStringExtensions", "EnumX", "EnzymeCore", "FastBroadcast", "ForwardDiff", "FunctionWrappers", "FunctionWrappersWrappers", "LinearAlgebra", "Logging", "Markdown", "MuladdMacro", "Parameters", "PreallocationTools", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "Requires", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Static", "StaticArraysCore", "Statistics", "Tricks", "TruncatedStacktraces", "ZygoteRules"] -git-tree-sha1 = "36a590efdbee58b38f903ffc3b378f4a5336bc3f" +git-tree-sha1 = "59842b690c2ed12428a741c1334f3c68d1377fb6" uuid = "2b5f629d-d688-5b77-993f-72d75c75574e" -version = "6.134.0" +version = "6.135.0" [deps.DiffEqBase.extensions] DiffEqBaseDistributionsExt = "Distributions" @@ -409,7 +409,7 @@ version = "2.33.1" [[deps.DiffEqGPU]] deps = ["Adapt", "ChainRulesCore", "DiffEqBase", "Distributed", "DocStringExtensions", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LinearSolve", "MuladdMacro", "Parameters", "Random", "RecursiveArrayTools", "Requires", "SciMLBase", "Setfield", "SimpleDiffEq", "StaticArrays", "TOML", "ZygoteRules"] -path = "/var/lib/buildkite-agent/builds/gpuci-16/julialang/diffeqgpu-dot-jl" +path = "/var/lib/buildkite-agent/builds/gpuci-9/julialang/diffeqgpu-dot-jl" uuid = "071ae1c0-96b5-11e9-1965-c90190d839ea" version = "3.2.0" @@ -512,21 +512,21 @@ version = "1.0.4" [[deps.Enzyme]] deps = ["CEnum", "EnzymeCore", "Enzyme_jll", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "ObjectFile", "Preferences", "Printf", "Random"] -git-tree-sha1 = "a8a13f3259f59f1738e14296b9191299226ea6b6" +git-tree-sha1 = "821daa883570994b4c0f9994de2840cc506a51f3" uuid = "7da242da-08ed-463a-9acd-ee780be4f1d9" -version = "0.11.9" +version = "0.11.10" [[deps.EnzymeCore]] deps = ["Adapt"] -git-tree-sha1 = "d8701002a745c450c03b890f10d53636d1a8a7ea" +git-tree-sha1 = "ab81396e4e7b61f5590db02fa1c17fae4f16d7ab" uuid = "f151be2c-9106-41f4-ab19-57ee4f262869" -version = "0.6.2" +version = "0.6.3" [[deps.Enzyme_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] -git-tree-sha1 = "6febd718de952cb31717a34f81f98872c065e825" +git-tree-sha1 = "e248a990a4a917e587c6b7fcf64a4311649a267f" uuid = "7cc45869-7501-5eee-bdea-0790c847d4ef" -version = "0.0.88+0" +version = "0.0.89+0" [[deps.EpollShim_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -1085,9 +1085,9 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[deps.LinearSolve]] deps = ["ArrayInterface", "ConcreteStructs", "DocStringExtensions", "EnumX", "EnzymeCore", "FastLapackInterface", 
"GPUArraysCore", "InteractiveUtils", "KLU", "Krylov", "Libdl", "LinearAlgebra", "MKL_jll", "PrecompileTools", "Preferences", "RecursiveFactorization", "Reexport", "Requires", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Sparspak", "SuiteSparse", "UnPack"] -git-tree-sha1 = "5f97525abf8d81f9554e836a7e5802fd89bb0594" +git-tree-sha1 = "a563cd835c9ed5295c35a7d140e0bf0a514bb10a" uuid = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" -version = "2.12.1" +version = "2.13.0" [deps.LinearSolve.extensions] LinearSolveBandedMatricesExt = "BandedMatrices" @@ -1277,15 +1277,17 @@ version = "1.2.0" [[deps.NonlinearSolve]] deps = ["ADTypes", "ArrayInterface", "ConcreteStructs", "DiffEqBase", "EnumX", "FiniteDiff", "ForwardDiff", "LineSearches", "LinearAlgebra", "LinearSolve", "PrecompileTools", "RecursiveArrayTools", "Reexport", "SciMLBase", "SimpleNonlinearSolve", "SparseArrays", "SparseDiffTools", "StaticArraysCore", "UnPack"] -git-tree-sha1 = "9203b3333c9610664de2e8cbc23cfd726663df7d" +git-tree-sha1 = "adbde57261effaa44b4d3d5942651a4bb22aefc3" uuid = "8913a72c-1f9b-4ce2-8d82-65094dcecaec" -version = "2.4.0" +version = "2.5.0" [deps.NonlinearSolve.extensions] + NonlinearSolveBandedMatricesExt = "BandedMatrices" NonlinearSolveFastLevenbergMarquardtExt = "FastLevenbergMarquardt" NonlinearSolveLeastSquaresOptimExt = "LeastSquaresOptim" [deps.NonlinearSolve.weakdeps] + BandedMatrices = "aae01518-5342-5314-be14-df237901396f" FastLevenbergMarquardt = "7a0df574-e128-4d35-8cbd-3d84502bf7ce" LeastSquaresOptim = "0fc2ff8b-aaa3-5acd-a817-1944a5e08891" @@ -1661,17 +1663,19 @@ version = "0.1.0" [[deps.SciMLBase]] deps = ["ADTypes", "ArrayInterface", "ChainRulesCore", "CommonSolve", "ConstructionBase", "Distributed", "DocStringExtensions", "EnumX", "FillArrays", "FunctionWrappersWrappers", "IteratorInterfaceExtensions", "LinearAlgebra", "Logging", "Markdown", "PrecompileTools", "Preferences", "RecipesBase", "RecursiveArrayTools", "Reexport", "RuntimeGeneratedFunctions", "SciMLOperators", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables", "TruncatedStacktraces", "ZygoteRules"] -git-tree-sha1 = "151c322c309d879d114d1c0bee69c61d5933357f" +git-tree-sha1 = "1c2a4e245744dd76b2eb677d3535ffad16d8b989" uuid = "0bca4576-84f4-4d90-8ffe-ffa030f20462" -version = "2.4.3" +version = "2.5.0" [deps.SciMLBase.extensions] + SciMLBasePartialFunctionsExt = "PartialFunctions" SciMLBasePyCallExt = "PyCall" SciMLBasePythonCallExt = "PythonCall" SciMLBaseRCallExt = "RCall" SciMLBaseZygoteExt = "Zygote" [deps.SciMLBase.weakdeps] + PartialFunctions = "570af359-4316-4cb7-8c74-252c00c2016b" PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" RCall = "6f49c342-dc21-5d91-9882-a32aef131414" @@ -1778,9 +1782,9 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[deps.SparseDiffTools]] deps = ["ADTypes", "Adapt", "ArrayInterface", "Compat", "DataStructures", "FiniteDiff", "ForwardDiff", "Graphs", "LinearAlgebra", "PackageExtensionCompat", "Random", "Reexport", "SciMLOperators", "Setfield", "SparseArrays", "StaticArrayInterface", "StaticArrays", "Tricks", "UnPack", "VertexSafeGraphs"] -git-tree-sha1 = "336fd944a1bbb8873bfa8171387608ca93317d68" +git-tree-sha1 = "eac5b754896a56df0b9e049a3d93cc27912a78f2" uuid = "47a9eef4-7e08-11e9-0b38-333d64bd3804" -version = "2.8.0" +version = "2.9.0" [deps.SparseDiffTools.extensions] SparseDiffToolsEnzymeExt = "Enzyme" diff --git a/dev/assets/Project.toml b/dev/assets/Project.toml index 7f2bb96c..865d533f 100644 --- a/dev/assets/Project.toml 
+++ b/dev/assets/Project.toml @@ -32,4 +32,5 @@ Plots = "1" SafeTestsets = "0.0.1, 0.1" SciMLSensitivity = "7" StaticArrays = "1" +Statistics = "1" StochasticDiffEq = "6.57" diff --git a/dev/examples/ad/index.html b/dev/examples/ad/index.html index e5956a73..1c1b236c 100644 --- a/dev/examples/ad/index.html +++ b/dev/examples/ad/index.html @@ -66,4 +66,4 @@ @time sol = solve(monteprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 10_000, saveat = 1.0f0)
EnsembleSolution Solution of length 10000 with uType:
-SciMLBase.ODESolution{ForwardDiff.Dual{Nothing, Float64, 3}, 2, uType, Nothing, Nothing, Vector{Float32}, rateType, SciMLBase.ODEProblem{Vector{ForwardDiff.Dual{Nothing, Float64, 3}}, Tuple{Float32, Float32}, true, Vector{Float32}, SciMLBase.ODEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}, IType, DiffEqBase.Stats, Nothing} where {uType, rateType, IType}
+SciMLBase.ODESolution{ForwardDiff.Dual{Nothing, Float64, 3}, 2, uType, Nothing, Nothing, Vector{Float32}, rateType, SciMLBase.ODEProblem{Vector{ForwardDiff.Dual{Nothing, Float64, 3}}, Tuple{Float32, Float32}, true, Vector{Float32}, SciMLBase.ODEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}, IType, DiffEqBase.Stats, Nothing} where {uType, rateType, IType} diff --git a/dev/examples/bruss/index.html b/dev/examples/bruss/index.html index 43d1698a..6ad5363d 100644 --- a/dev/examples/bruss/index.html +++ b/dev/examples/bruss/index.html @@ -89,4 +89,4 @@ 11.5 u: 2-element Vector{CUDA.CuArray{Float64, 3, CUDA.Mem.DeviceBuffer}}: [0.0 0.12134432813715873 … 0.1213443281371586 0.0; 0.0 0.12134432813715873 … 0.1213443281371586 0.0; … ; 0.0 0.12134432813715873 … 0.1213443281371586 0.0; 0.0 0.12134432813715873 … 0.1213443281371586 0.0;;; 0.0 0.0 … 0.0 0.0; 0.14892258453196755 0.14892258453196755 … 0.14892258453196755 0.14892258453196755; … ; 0.14892258453196738 0.14892258453196738 … 0.14892258453196738 0.14892258453196738; 0.0 0.0 … 0.0 0.0] - [3.0478431718205936 3.0478158588988173 … 3.0479344071354673 3.0478829076429497; 3.0478921742826572 3.047861583972248 … 3.0479950350896505 3.0479368431672165; … ; 3.047757449709251 3.0477354559246868 … 3.0478300382513863 3.04778922650763; 3.0477977675276757 3.047773336691987 … 3.0478788678640973 3.0478331843281357;;; 2.567054744855225 2.5670562676043445 … 2.567049778293458 2.5670525600516014; 2.56705171174118 2.5670533462176053 … 2.5670463664078818 2.5670493630361464; … ; 2.5670601483305377 2.5670614822617197 … 2.5670558195263085 2.5670582401346573; 2.5670575914290477 2.5670590130863213 … 2.5670529666027693 2.5670555547531957] + [3.0478431718176133 3.0478158588958384 … 3.047934407132488 3.0478829076399707; 3.0478921742796787 3.0478615839692695 … 3.0479950350866707 3.0479368431642384; … ; 3.0477574497062734 3.04773545592171 … 3.0478300382484087 3.047789226504652; 3.047797767524695 3.0477733366890103 … 3.047878867861117 3.047833184325156;;; 2.567054744858359 2.5670562676074793 … 2.567049778296592 2.567052560054736; 2.567051711744317 2.56705334622074 … 2.5670463664110166 2.5670493630392817; … ; 2.567060148333675 2.567061482264857 … 2.567055819529447 2.567058240137795; 2.567057591432184 2.567059013089459 … 2.567052966605906 2.567055554756332] diff --git a/dev/examples/reaction_diffusion/index.html b/dev/examples/reaction_diffusion/index.html index 5f7d33bd..ce61c4e5 100644 --- a/dev/examples/reaction_diffusion/index.html +++ b/dev/examples/reaction_diffusion/index.html @@ -1,2 +1,2 @@ -GPU-Accelerated Stochastic Partial Differential Equations · DiffEqGPU.jl
+GPU-Accelerated Stochastic Partial Differential Equations · DiffEqGPU.jl
diff --git a/dev/examples/reductions/index.html b/dev/examples/reductions/index.html index 282ef0d5..cf0489e3 100644 --- a/dev/examples/reductions/index.html +++ b/dev/examples/reductions/index.html @@ -28,4 +28,4 @@ reduction = reduction, u_init = Vector{eltype(prob.u0)}([0.0])) sim4 = solve(prob2, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 100, batch_size = 20)
EnsembleSolution Solution of length 1 with uType:
-Float64
+Float64 diff --git a/dev/examples/sde/index.html b/dev/examples/sde/index.html index ad5f4c76..df48366a 100644 --- a/dev/examples/sde/index.html +++ b/dev/examples/sde/index.html @@ -23,4 +23,4 @@ monteprob = EnsembleProblem(prob, prob_func = prob_func) sol = solve(monteprob, SOSRI(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 10_000, saveat = 1.0f0)
EnsembleSolution Solution of length 10000 with uType:
-SciMLBase.RODESolution{Float32, 2, uType, Nothing, Nothing, Vector{Float32}, randType, SciMLBase.SDEProblem{Vector{Float32}, Tuple{Float32, Float32}, true, Vector{Float32}, Nothing, SciMLBase.SDEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), typeof(Main.multiplicative_noise), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, typeof(Main.multiplicative_noise), Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, Nothing}, StochasticDiffEq.SOSRI, IType, DiffEqBase.Stats, Nothing} where {uType, randType, IType}
+SciMLBase.RODESolution{Float32, 2, uType, Nothing, Nothing, Vector{Float32}, randType, SciMLBase.SDEProblem{Vector{Float32}, Tuple{Float32, Float32}, true, Vector{Float32}, Nothing, SciMLBase.SDEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), typeof(Main.multiplicative_noise), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, typeof(Main.multiplicative_noise), Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, Nothing}, StochasticDiffEq.SOSRI, IType, DiffEqBase.Stats, Nothing} where {uType, randType, IType} diff --git a/dev/getting_started/index.html b/dev/getting_started/index.html index d2f32321..d531eb3c 100644 --- a/dev/getting_started/index.html +++ b/dev/getting_started/index.html @@ -54,48 +54,48 @@ prob = ODEProblem(f, u0, (0.0f0, 1.0f0)) # Float32 is better on GPUs! sol = solve(prob, Tsit5())
retcode: Success
 Interpolation: specialized 4th order "free" interpolation
-t: 51-element Vector{Float32}:
+t: 50-element Vector{Float32}:
  0.0
- 0.0028032635
- 0.007292892
- 0.013943267
- 0.022358477
- 0.03266243
- 0.044566914
- 0.057291754
- 0.07193968
- 0.087012306
+ 0.0028032633
+ 0.00770806
+ 0.014659371
+ 0.023422156
+ 0.034057636
+ 0.046285566
+ 0.05950984
+ 0.074400246
+ 0.090368554
  ⋮
- 0.8247635
- 0.84982157
- 0.8745116
- 0.89433014
- 0.91924965
- 0.9438157
- 0.96827626
- 0.9933905
+ 0.8050874
+ 0.83047694
+ 0.85561347
+ 0.88050914
+ 0.90504134
+ 0.9308001
+ 0.95659125
+ 0.98157626
  1.0
-u: 51-element Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}:
+u: 50-element Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}:
  Float32[0.88071316, 0.091738656, 0.5644176, 0.89315474, 0.11550513, 0.9899834, 0.76465535, 0.5730443, 0.5333632, 0.46715936  …  0.51651764, 0.69562453, 0.13039295, 0.017110005, 0.3178709, 0.7581309, 0.32872194, 0.65469325, 0.5585072, 0.12346421]
- Float32[0.88458586, 0.10383201, 0.56542593, 0.91055316, 0.14688145, 0.98178184, 0.72525895, 0.59418094, 0.573652, 0.43604094  …  0.4663532, 0.7562544, 0.07489444, 0.09894383, 0.27474692, 0.7218533, 0.30385882, 0.74906904, 0.6420995, 0.16706456]
- Float32[0.8993485, 0.11652332, 0.5549422, 0.9226965, 0.20035666, 0.9682945, 0.64829654, 0.6165711, 0.6383632, 0.39841184  …  0.38447455, 0.86521155, -0.024065342, 0.2509739, 0.21489702, 0.6452851, 0.25786027, 0.89370865, 0.7635648, 0.23452367]
- Float32[0.94403464, 0.11908797, 0.512699, 0.9032732, 0.27996275, 0.94630456, 0.49943888, 0.6203278, 0.73663926, 0.3718802  …  0.25955936, 1.0538634, -0.18425739, 0.5192544, 0.1464948, 0.48769277, 0.18006498, 1.0911947, 0.91577864, 0.3277313]
- Float32[1.0456772, 0.09229674, 0.41570693, 0.80836636, 0.3661887, 0.91255134, 0.24255256, 0.5678781, 0.8680704, 0.39424247  …  0.094817884, 1.3401356, -0.38622913, 0.9196656, 0.09464145, 0.20607205, 0.07886583, 1.3067919, 1.063521, 0.4331251]
- Float32[1.2451123, 0.009822205, 0.23700213, 0.56974095, 0.42532614, 0.8577494, -0.1955113, 0.40833247, 1.0415791, 0.5242374  …  -0.118558235, 1.764022, -0.5804089, 1.4731584, 0.08890158, -0.28040943, -0.013598273, 1.503326, 1.1853585, 0.5476369]
- Float32[1.5779717, -0.16074678, -0.03486809, 0.10167053, 0.4077717, 0.76762325, -0.9066363, 0.09505341, 1.2570384, 0.8555032  …  -0.3853483, 2.353389, -0.6376506, 2.1472414, 0.18251278, -1.075195, -0.0032046076, 1.5974889, 1.2684758, 0.68334526]
- Float32[2.0385969, -0.44505167, -0.36323085, -0.6404346, 0.30168927, 0.62329584, -1.9623735, -0.35553157, 1.5006028, 1.4946269  …  -0.704176, 3.0949616, -0.3621099, 2.8454118, 0.4555171, -2.279417, 0.26366287, 1.4488839, 1.3324683, 0.8927335]
- Float32[2.653575, -0.92650205, -0.68213314, -1.7437161, 0.20209287, 0.35969245, -3.62819, -0.88641196, 1.8046279, 2.715105  …  -1.137421, 4.0728436, 0.60271794, 3.567788, 1.1182202, -4.298576, 1.0722924, 0.7389011, 1.4554504, 1.3578767]
- Float32[3.2878957, -1.6229483, -0.6983753, -2.8794007, 0.5136371, -0.081603736, -5.89597, -1.127807, 2.196508, 4.6699176  …  -1.7071708, 5.187741, 2.6080158, 4.255335, 2.4045234, -7.4276085, 2.6378803, -1.0187249, 1.7589356, 2.3444717]
+ Float32[0.88458586, 0.10383208, 0.56542593, 0.9105531, 0.14688143, 0.98178184, 0.725259, 0.59418094, 0.573652, 0.43604097  …  0.46635318, 0.7562544, 0.07489447, 0.09894379, 0.2747469, 0.7218533, 0.30385882, 0.749069, 0.6420995, 0.16706459]
+ Float32[0.9013025, 0.11726365, 0.5532294, 0.92281747, 0.20539533, 0.96700597, 0.6402653, 0.6178738, 0.6443927, 0.3957117  …  0.3768051, 0.8760311, -0.033707015, 0.26627496, 0.20992415, 0.63702106, 0.253282, 0.90664876, 0.77402776, 0.24059023]
+ Float32[0.9506427, 0.11814861, 0.5062973, 0.8983726, 0.28816503, 0.94372785, 0.48070958, 0.6184501, 0.74748284, 0.37123397  …  0.24583998, 1.076143, -0.20188585, 0.55090535, 0.14055808, 0.4674204, 0.17131874, 1.1111343, 0.93024594, 0.3372425]
+ Float32[1.0623785, 0.08638905, 0.4001696, 0.79027975, 0.37505215, 0.9076652, 0.20401168, 0.5563591, 0.8853406, 0.40204  …  0.073421545, 1.3801469, -0.41001603, 0.9741743, 0.090954676, 0.1634308, 0.066957146, 1.3308672, 1.0788912, 0.44552672]
+ Float32[1.278569, -0.0058233947, 0.20831701, 0.52600056, 0.428239, 0.8488757, -0.2666873, 0.37851772, 1.0660826, 0.5520373  …  -0.14858639, 1.8276249, -0.5984681, 1.5513371, 0.09369785, -0.359674, -0.020600917, 1.5228127, 1.1977124, 0.56266296]
+ Float32[1.6344533, -0.19264252, -0.07817726, 0.015917871, 0.39763498, 0.7514975, -1.030123, 0.039752055, 1.2891968, 0.9227872  …  -0.42609212, 2.4470403, -0.6241573, 2.2441795, 0.20730077, -1.2141467, 0.014517415, 1.5952181, 1.2774434, 0.7059188]
+ Float32[2.1275413, -0.50668025, -0.41936472, -0.79362315, 0.27951172, 0.5912294, -2.1819494, -0.4400623, 1.5444939, 1.6427702  …  -0.76444525, 3.2350993, -0.26647314, 2.9609494, 0.52785707, -2.5357955, 0.34722802, 1.3856934, 1.3453207, 0.9431921]
+ Float32[2.7601502, -1.0255971, -0.71567386, -1.9411954, 0.20904303, 0.30135608, -3.95969, -0.9587043, 1.8606849, 2.981347  …  -1.2199625, 4.248185, 0.85137504, 3.6808665, 1.2795985, -4.724534, 1.273345, 0.5378595, 1.4889644, 1.4750801]
+ Float32[3.416101, -1.8075991, -0.61632633, -3.0732634, 0.7101897, -0.21086018, -6.4749594, -1.0869303, 2.3067675, 5.218714  …  -1.8599517, 5.4480147, 3.2286246, 4.417976, 2.8007317, -8.321874, 3.0981548, -1.6155888, 1.866251, 2.6711357]
  ⋮
- Float32[-3.8355356f9, 1.1298996f10, -3.93494f10, 6.2908006f9, -3.2500275f10, -3.4033916f10, 3.4276102f10, -8.358764f9, -1.3181027f8, 6.9724595f9  …  -3.758224f10, 3.1913632f10, 2.2516085f10, 3.781391f10, -1.4528389f10, 5.0571884f9, 5.5028557f9, 1.6473422f10, -1.3774336f10, -6.9072525f8]
- Float32[-1.1802888f10, 2.3964957f10, -7.842121f10, 2.128094f10, -7.817608f10, -7.8497145f10, 7.4242105f10, -1.5591672f10, 5.0940733f9, 1.9588368f10  …  -8.79277f10, 7.620545f10, 4.6994047f10, 8.263724f10, -1.7877727f10, 1.0438716f10, 7.8208154f9, 4.0790344f10, -3.0817853f10, 7.139321f8]
- Float32[-3.1122395f10, 4.835905f10, -1.5162422f11, 6.8681708f10, -1.8565762f11, -1.7294005f11, 1.5758757f11, -2.8887898f10, 2.2604016f10, 5.4178144f10  …  -1.9935493f11, 1.7930378f11, 9.4641455f10, 1.7341137f11, -1.4456337f10, 2.112963f10, 2.1751183f9, 9.453497f10, -6.3974597f10, 8.769985f9]
- Float32[-6.452887f10, 8.201022f10, -2.5630245f11, 1.6806388f11, -3.7132304f11, -3.191059f11, 2.88068f11, -4.7664165f10, 5.865629f10, 1.2108814f11  …  -3.795655f11, 3.5548863f11, 1.6306613f11, 3.080752f11, 2.4782664f8, 3.6635824f10, -2.2505982f10, 1.7990689f11, -1.1051795f11, 2.9476325f10]
- Float32[-1.5636973f11, 1.4985986f11, -5.021798f11, 4.8890043f11, -8.8476824f11, -6.7263385f11, 6.1807664f11, -8.995169f10, 1.7117613f11, 3.2547294f11  …  -8.40242f11, 8.3662484f11, 3.1607397f11, 6.190827f11, 4.815659f10, 7.027574f10, -1.2605745f11, 3.9060007f11, -2.1035781f11, 1.0438328f11]
- Float32[-3.688076f11, 2.4462803f11, -1.0109411f12, 1.3265278f12, -2.0706932f12, -1.3690599f12, 1.3258288f12, -1.6722428f11, 4.572891f11, 8.401172f11  …  -1.8141f12, 1.9319423f12, 5.9251157f11, 1.2009144f12, 1.4473537f11, 1.2264527f11, -4.2864322f11, 8.121761f11, -3.8142922f11, 3.138882f11]
- Float32[-8.665197f11, 3.2343844f11, -2.1468291f12, 3.4262035f12, -4.7932254f12, -2.718374f12, 2.87366f12, -3.0060757f11, 1.1718073f12, 2.105633f12  …  -3.8650286f12, 4.4083467f12, 1.08114706f12, 2.2714987f12, 2.8944125f11, 1.775824f11, -1.21367f12, 1.6383607f12, -6.7264394f11, 8.601409f11]
- Float32[-2.10209f12, 1.8158826f11, -4.964374f12, 8.729674f12, -1.1245941f13, -5.389325f12, 6.4526274f12, -5.0728174f11, 3.0218288f12, 5.279704f12  …  -8.353269f12, 1.0181811f13, 1.9492538f12, 4.2902613f12, 4.039475f11, 1.2968861f11, -3.182691f12, 3.292595f12, -1.200864f12, 2.2715708f12]
- Float32[-2.6602567f12, 3.8560997f10, -6.2509354f12, 1.1102161f13, -1.4053066f13, -6.433953f12, 8.0016766f12, -5.6851615f11, 3.8712493f12, 6.6998103f12  …  -1.0227779f13, 1.2668629f13, 2.2638093f12, 5.06079f12, 3.965273f11, 6.051217f10, -4.0504638f12, 3.9452772f12, -1.4038904f12, 2.9080688f12]

Notice that the solution values sol[i] are CUDA-based arrays, which can be moved back to the CPU using Array(sol[i]).

More details on effective use of within-method GPU parallelism can be found in the within-method GPU parallelism tutorial.

Example of Parameter-Parallelism with GPU Ensemble Methods

On the other side of the spectrum, what if we want to solve tons of small ODEs? For this use case, we would use the ensemble methods to solve the same ODE many times with different parameters. This looks like:

using DiffEqGPU, OrdinaryDiffEq, StaticArrays, CUDA
+ Float32[-1.2448122f9, 6.120147f9, -2.2411454f10, 2.5883497f9, -1.6325854f10, -1.7176054f10, 1.845867f10, -5.097449f9, -1.0280615f9, 3.1593787f9  …  -1.9029856f10, 1.6097072f10, 1.2417926f10, 2.0032188f10, -1.0834926f10, 2.8556372f9, 3.1497367f9, 7.697759f9, -6.9380485f9, -5.535416f8]
+ Float32[-5.054011f9, 1.3453031f10, -4.616155f10, 8.2861164f9, -3.969783f10, -4.1313247f10, 4.0932168f10, -9.638242f9, 4.9811946f8, 8.812047f9  …  -4.5696623f10, 3.8924104f10, 2.6688123f10, 4.5313098f10, -1.5538131f10, 5.968125f9, 6.2155515f9, 2.0368216f10, -1.6654228f10, -6.140038f8]
+ Float32[-1.4944735f10, 2.8362789f10, -9.165974f10, 2.815186f10, -9.5766045f10, -9.474463f10, 8.862145f10, -1.800876f10, 7.6212874f9, 2.4890206f10  …  -1.0672122f11, 9.316511f10, 5.550552f10, 9.85828f10, -1.7973344f10, 1.2330889f10, 7.6901f9, 4.9893302f10, -3.677066f10, 1.6863081f9]
+ Float32[-3.893791f10, 5.6953594f10, -1.7770832f11, 9.044478f10, -2.2902504f11, -2.0856529f11, 1.8912282f11, -3.359453f10, 3.0586415f10, 6.920851f10  …  -2.425359f11, 2.2062703f11, 1.117672f11, 2.0673906f11, -1.1531223f10, 2.5009125f10, -2.3971753f9, 1.1517141f11, -7.575034f10, 1.3046186f10]
+ Float32[-9.4686446f10, 1.0731566f11, -3.412096f11, 2.6793178f11, -5.3960088f11, -4.4109395f11, 3.9954403f11, -6.260374f10, 9.412401f10, 1.8581643f11  …  -5.3513342f11, 5.1393908f11, 2.1738078f11, 4.1722443f11, 1.5812787f10, 4.882664f10, -5.2608455f10, 2.5214506f11, -1.465566f11, 5.201068f10]
+ Float32[-2.3425732f11, 1.9188338f11, -6.936081f11, 7.865207f11, -1.3207119f12, -9.422044f11, 8.834734f11, -1.2063414f11, 2.7343546f11, 5.0998555f11  …  -1.2084028f12, 1.2411704f12, 4.2599105f11, 8.479224f11, 8.660919f10, 9.264514f10, -2.3117296f11, 5.5311847f11, -2.7944665f11, 1.7754315f11]
+ Float32[-5.7578055f11, 2.9436674f11, -1.4863718f12, 2.1885256f12, -3.2131108f12, -1.963872f12, 1.982272f12, -2.2853788f11, 7.4989955f11, 1.3616497f12  …  -2.6955859f12, 2.9757908f12, 8.1371673f11, 1.6795491f12, 2.162986f11, 1.5407199f11, -7.498414f11, 1.175319f12, -5.1397958f11, 5.3616073f11]
+ Float32[-1.3832491f12, 3.0316875f11, -3.320267f12, 5.6475113f12, -7.5383245f12, -3.914882f12, 4.4024636f12, -4.0234097f11, 1.9380348f12, 3.4359225f12  …  -5.81553f12, 6.876512f12, 1.4829776f12, 3.1870962f12, 3.6673844f11, 1.8028655f11, -2.0433033f12, 2.376735f12, -9.128034f11, 1.4484123f12]
+ Float32[-2.6601705f12, 3.8654902f10, -6.250732f12, 1.1101971f13, -1.4052883f13, -6.4339533f12, 8.0015576f12, -5.68527f11, 3.8711103f12, 6.6996934f12  …  -1.022768f13, 1.2668507f13, 2.263842f12, 5.060772f12, 3.9652835f11, 6.0590236f10, -4.0504087f12, 3.945264f12, -1.403861f12, 2.9080258f12]

Notice that the solution values sol[i] are CUDA-based arrays, which can be moved back to the CPU using Array(sol[i]).
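
As a quick sketch (assuming the sol object returned by the solve call above), moving a saved state back to the host looks like:

u_end = Array(sol[end])   # copies the CuArray{Float32} state at the final save point to a CPU Vector{Float32}
t_end = sol.t[end]        # the time points already live on the CPU as a Vector{Float32}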

More details on effective use of within-method GPU parallelism can be found in the within-method GPU parallelism tutorial.

Example of Parameter-Parallelism with GPU Ensemble Methods

On the other side of the spectrum, what if we want to solve tons of small ODEs? For this use case, we would use the ensemble methods to solve the same ODE many times with different parameters. This looks like:

using DiffEqGPU, OrdinaryDiffEq, StaticArrays, CUDA
 
 function lorenz(u, p, t)
     σ = p[1]
@@ -116,4 +116,4 @@
 
 sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
     trajectories = 10_000)
EnsembleSolution Solution of length 10000 with uType:
-SciMLBase.ODESolution{Float32, 2, SubArray{StaticArraysCore.SVector{3, Float32}, 1, Matrix{StaticArraysCore.SVector{3, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, Nothing, SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, DiffEqGPU.ImmutableODEProblem{StaticArraysCore.SVector{3, Float32}, Tuple{Float32, Float32}, false, StaticArraysCore.SVector{3, Float32}, SciMLBase.ODEFunction{false, SciMLBase.AutoSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, GPUTsit5, SciMLBase.LinearInterpolation{SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, SubArray{StaticArraysCore.SVector{3, Float32}, 1, Matrix{StaticArraysCore.SVector{3, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}}, Nothing, Nothing}

To dig more into this example, see the ensemble GPU solving tutorial.

+SciMLBase.ODESolution{Float32, 2, SubArray{StaticArraysCore.SVector{3, Float32}, 1, Matrix{StaticArraysCore.SVector{3, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, Nothing, SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, DiffEqGPU.ImmutableODEProblem{StaticArraysCore.SVector{3, Float32}, Tuple{Float32, Float32}, false, StaticArraysCore.SVector{3, Float32}, SciMLBase.ODEFunction{false, SciMLBase.AutoSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, GPUTsit5, SciMLBase.LinearInterpolation{SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, SubArray{StaticArraysCore.SVector{3, Float32}, 1, Matrix{StaticArraysCore.SVector{3, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}}, Nothing, Nothing}

To dig more into this example, see the ensemble GPU solving tutorial.

diff --git a/dev/index.html b/dev/index.html index cd8933f5..65345976 100644 --- a/dev/index.html +++ b/dev/index.html @@ -1,11 +1,11 @@ DiffEqGPU: Massively Data-Parallel GPU Solving of ODEs · DiffEqGPU.jl

DiffEqGPU: Massively Data-Parallel GPU Solving of ODEs

This library is a component package of the DifferentialEquations.jl ecosystem. It includes functionality for making use of GPUs in the differential equation solvers.

Installation

To install DiffEqGPU.jl, use the Julia package manager:

using Pkg
-Pkg.add("DiffEqGPU")

This will also install all of the dependencies, including CUDA.jl, which in turn installs the versions of CUDA and cuDNN that these libraries require. Note that the same requirements of CUDA.jl apply to DiffEqGPU, such as requiring a GPU with CUDA v11 compatibility. For more information on these requirements, see the requirements of CUDA.jl.

Contributing

Reproducibility

The documentation of this SciML package was built using these direct dependencies,
Status `/var/lib/buildkite-agent/builds/gpuci-16/julialang/diffeqgpu-dot-jl/docs/Project.toml`
+Pkg.add("DiffEqGPU")

This will also install all of the dependencies, including CUDA.jl, which in turn installs the versions of CUDA and cuDNN that these libraries require. Note that the same requirements of CUDA.jl apply to DiffEqGPU, such as requiring a GPU with CUDA v11 compatibility. For more information on these requirements, see the requirements of CUDA.jl.

Contributing

Reproducibility

The documentation of this SciML package was built using these direct dependencies,
Status `/var/lib/buildkite-agent/builds/gpuci-9/julialang/diffeqgpu-dot-jl/docs/Project.toml`
   [79e6a3ab] Adapt v3.7.1
   [6e4b80f9] BenchmarkTools v1.3.2
  [052768ef] CUDA v4.4.1
-  [2b5f629d] DiffEqBase v6.134.0
-  [071ae1c0] DiffEqGPU v3.2.0 `/var/lib/buildkite-agent/builds/gpuci-16/julialang/diffeqgpu-dot-jl`
+  [2b5f629d] DiffEqBase v6.135.0
+  [071ae1c0] DiffEqGPU v3.2.0 `/var/lib/buildkite-agent/builds/gpuci-9/julialang/diffeqgpu-dot-jl`
   [e30172f5] Documenter v1.1.2
   [587475ba] Flux v0.14.6
   [f6369f11] ForwardDiff v0.10.36
@@ -35,7 +35,7 @@
   JULIA_DEPOT_PATH = /root/.cache/julia-buildkite-plugin/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b
   LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
   JULIA_PKG_SERVER =
-  JULIA_IMAGE_THREADS = 1
A more complete overview of all dependencies and their versions is also provided.
Status `/var/lib/buildkite-agent/builds/gpuci-16/julialang/diffeqgpu-dot-jl/docs/Manifest.toml`
+  JULIA_IMAGE_THREADS = 1
A more complete overview of all dependencies and their versions is also provided.
Status `/var/lib/buildkite-agent/builds/gpuci-9/julialang/diffeqgpu-dot-jl/docs/Manifest.toml`
   [47edcb42] ADTypes v0.2.4
   [a4c015fc] ANSIColoredPrinters v0.0.1
   [621f4979] AbstractFFTs v1.5.0
@@ -81,9 +81,9 @@
   [e2d170a0] DataValueInterfaces v1.0.0
   [244e2a9f] DefineSingletons v0.1.2
   [8bb1440f] DelimitedFiles v1.9.1
-  [2b5f629d] DiffEqBase v6.134.0
+  [2b5f629d] DiffEqBase v6.135.0
   [459566f4] DiffEqCallbacks v2.33.1
-  [071ae1c0] DiffEqGPU v3.2.0 `/var/lib/buildkite-agent/builds/gpuci-16/julialang/diffeqgpu-dot-jl`
+  [071ae1c0] DiffEqGPU v3.2.0 `/var/lib/buildkite-agent/builds/gpuci-9/julialang/diffeqgpu-dot-jl`
   [77a26b50] DiffEqNoiseProcess v5.19.0
   [163ba53b] DiffResults v1.1.0
   [b552c78f] DiffRules v1.15.1
@@ -94,8 +94,8 @@
   [fa6b7ba4] DualNumbers v0.6.8
   [da5c29d0] EllipsisNotation v1.7.0
   [4e289a0a] EnumX v1.0.4
-  [7da242da] Enzyme v0.11.9
-  [f151be2c] EnzymeCore v0.6.2
+  [7da242da] Enzyme v0.11.10
+  [f151be2c] EnzymeCore v0.6.3
   [460bff9d] ExceptionUnwrapping v0.1.9
   [d4d017d3] ExponentialUtilities v1.25.0
   [e2ba6199] ExprTools v0.1.10
@@ -148,7 +148,7 @@
   [50d2b5c4] Lazy v0.15.1
   [2d8b4e74] LevyArea v1.0.0
   [d3d80556] LineSearches v7.2.0
-  [7ed4a6bd] LinearSolve v2.12.1
+  [7ed4a6bd] LinearSolve v2.13.0
   [2ab3a3ac] LogExpFunctions v0.3.26
   [e6f89c97] LoggingExtras v1.0.3
   [bdcacae8] LoopVectorization v0.12.165
@@ -167,7 +167,7 @@
   [872c559c] NNlib v0.9.7
   [77ba4419] NaNMath v1.0.2
   [71a1bf82] NameResolution v0.1.5
-  [8913a72c] NonlinearSolve v2.4.0
+  [8913a72c] NonlinearSolve v2.5.0
   [d8793406] ObjectFile v0.4.1
   [6fe1bfb0] OffsetArrays v1.12.10
   [0b1bfda6] OneHotArrays v0.2.4
@@ -212,7 +212,7 @@
   [94e857df] SIMDTypes v0.1.0
   [476501e8] SLEEFPirates v0.6.39
   [1bc83da4] SafeTestsets v0.1.0
-  [0bca4576] SciMLBase v2.4.3
+  [0bca4576] SciMLBase v2.5.0
   [e9a6253c] SciMLNLSolve v0.1.9
   [c0aeaf25] SciMLOperators v0.3.6
   [1ed8b502] SciMLSensitivity v7.46.0
@@ -227,7 +227,7 @@
   [ce78b400] SimpleUnPack v1.1.0
   [66db9d55] SnoopPrecompile v1.0.3
   [a2af1166] SortingAlgorithms v1.2.0
-  [47a9eef4] SparseDiffTools v2.8.0
+  [47a9eef4] SparseDiffTools v2.9.0
   [dc90abb0] SparseInverseSubset v0.1.1
   [e56a9233] Sparspak v0.3.9
   [276daf66] SpecialFunctions v2.3.1
@@ -272,7 +272,7 @@
  [4ee394cb] CUDA_Driver_jll v0.5.0+1
  [76a88914] CUDA_Runtime_jll v0.6.0+0
   [83423d85] Cairo_jll v1.16.1+1
-  [7cc45869] Enzyme_jll v0.0.88+0
+  [7cc45869] Enzyme_jll v0.0.89+0
   [2702e6a9] EpollShim_jll v0.0.20230411+0
   [2e619515] Expat_jll v2.5.0+0
  [b22a6f82] FFMPEG_jll v4.4.2+2
@@ -401,4 +401,4 @@
   [8e850b90] libblastrampoline_jll v5.8.0+0
   [8e850ede] nghttp2_jll v1.48.0+0
   [3f19e933] p7zip_jll v17.4.0+0
-Info Packages marked with ⌃ and ⌅ have new versions available, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated -m`

You can also download the manifest file and the project file.

+Info Packages marked with ⌃ and ⌅ have new versions available, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated -m`

You can also download the manifest file and the project file.

diff --git a/dev/manual/backends/index.html b/dev/manual/backends/index.html index 1769f82e..2de9cf4b 100644 --- a/dev/manual/backends/index.html +++ b/dev/manual/backends/index.html @@ -1,2 +1,2 @@ -Compute Backends (GPU Choices) · DiffEqGPU.jl

Compute Backends (GPU Choices)

DiffEqGPU.jl supports a multitude of different GPU devices. The device must be chosen during the construction of EnsembleGPUArray or EnsembleGPUKernel and corresponds to the compute backends of KernelAbstractions.jl. The choices for backends are:

  • CUDA.CUDABackend(): For NVIDIA GPUs via code generation for CUDA kernels.
  • AMDGPU.ROCBackend(): For AMD GPUs via code generation for ROCm kernels.
  • oneAPI.oneAPIBackend(): For Intel GPUs via code generation for OneAPI kernels.
  • Metal.MetalBackend(): For Apple Silicon (M-Series such as M1 or M2) via code generation for Metal kernels.

This is used, for example, like EnsembleGPUKernel(oneAPI.oneAPIBackend()) to enable the computations for Intel GPUs. The choice of backend is mandatory and requires the installation of the respective package. Thus, for example, using the oneAPI backend requires that the user has successfully installed oneAPI.jl and has an Intel GPU.

+Compute Backends (GPU Choices) · DiffEqGPU.jl

Compute Backends (GPU Choices)

DiffEqGPU.jl supports a multitude of different GPU devices. The device must be chosen during the construction of EnsembleGPUArray or EnsembleGPUKernel and corresponds to the compute backends of KernelAbstractions.jl. The choices for backends are:

  • CUDA.CUDABackend(): For NVIDIA GPUs via code generation for CUDA kernels.
  • AMDGPU.ROCBackend(): For AMD GPUs via code generation for ROCm kernels.
  • oneAPI.oneAPIBackend(): For Intel GPUs via code generation for OneAPI kernels.
  • Metal.MetalBackend(): For Apple Silicon (M-Series such as M1 or M2) via code generation for Metal kernels.

This is used, for example, like EnsembleGPUKernel(oneAPI.oneAPIBackend()) to enable the computations for Intel GPUs. The choice of backend is mandatory and requires the installation of the respective package. Thus, for example, using the oneAPI backend requires that the user has successfully installed oneAPI.jl and has an Intel GPU.
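
As a hedged sketch of backend selection (the backend and ensemblealg names are illustrative, and each branch assumes the corresponding package is installed and matching hardware is present):

using DiffEqGPU, CUDA                       # or: using AMDGPU / oneAPI / Metal
backend = CUDA.CUDABackend()                # swap for AMDGPU.ROCBackend(), oneAPI.oneAPIBackend(), or Metal.MetalBackend()
ensemblealg = EnsembleGPUKernel(backend)    # the same backend argument also works for EnsembleGPUArray(backend)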

diff --git a/dev/manual/choosing_ensembler/index.html b/dev/manual/choosing_ensembler/index.html index 1bb95dd4..d5d90bd4 100644 --- a/dev/manual/choosing_ensembler/index.html +++ b/dev/manual/choosing_ensembler/index.html @@ -1,2 +1,2 @@ -Choosing the Ensemble: EnsembleGPUArray vs EnsembleGPUKernel · DiffEqGPU.jl

Choosing the Ensemble: EnsembleGPUArray vs EnsembleGPUKernel

The short answer for how to choose an ensemble method is: if EnsembleGPUKernel works on your problem, you should use it. A more detailed discussion is the following:

  • EnsembleGPUKernel is more asynchronous and has lower kernel call counts than EnsembleGPUArray. This should amount to lower overhead in any case where the algorithms are the same.
  • EnsembleGPUKernel is restrictive on the types of ODE solvers that have been implemented for it. If the most efficient method is not in the list of GPU kernel methods, it may be more efficient to use EnsembleGPUArray with the better method.
  • EnsembleGPUKernel requires equations to be written in out-of-place form, along with a few other restrictions, and thus in some cases can be less automatic than EnsembleGPUArray depending on how the code was originally written.
+Choosing the Ensemble: EnsembleGPUArray vs EnsembleGPUKernel · DiffEqGPU.jl

Choosing the Ensemble: EnsembleGPUArray vs EnsembleGPUKernel

The short answer for how to choose an ensemble method is: if EnsembleGPUKernel works on your problem, you should use it. A more detailed discussion is the following, with a side-by-side usage sketch after the list:

  • EnsembleGPUKernel is more asynchronous and has lower kernel call counts than EnsembleGPUArray. This should amount to lower overhead in any case where the algorithms are the same.
  • EnsembleGPUKernel is restrictive on the types of ODE solvers that have been implemented for it. If the most efficient method is not in the list of GPU kernel methods, it may be more efficient to use EnsembleGPUArray with the better method.
  • EnsembleGPUKernel requires equations to be written in out-of-place form, along with a few other restrictions, and thus in some cases can be less automatic than EnsembleGPUArray depending on how the code was originally written.
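
The usage sketch mentioned above, assuming monteprob_iip is an in-place (mutating) ensemble problem and monteprob_oop is an out-of-place, SVector-based one, as in the tutorials:

# EnsembleGPUArray: in-place f, works with most OrdinaryDiffEq.jl solvers
sol_array = solve(monteprob_iip, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()),
    trajectories = 10_000, saveat = 1.0f0)

# EnsembleGPUKernel: out-of-place f on SVectors, restricted to the specialized GPU solvers
sol_kernel = solve(monteprob_oop, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
    trajectories = 10_000, saveat = 1.0f0)
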
diff --git a/dev/manual/ensemblegpuarray/index.html b/dev/manual/ensemblegpuarray/index.html index 79907503..4403d6b5 100644 --- a/dev/manual/ensemblegpuarray/index.html +++ b/dev/manual/ensemblegpuarray/index.html @@ -12,4 +12,4 @@ prob = ODEProblem(lorenz,u0,tspan,p) prob_func = (prob,i,repeat) -> remake(prob,p=rand(Float32,3).*p) monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy=false) -@time sol = solve(monteprob,Tsit5(),EnsembleGPUArray(CUDADevice()),trajectories=10_000,saveat=1.0f0)source
DiffEqGPU.EnsembleCPUArrayType
EnsembleCPUArray(cpu_offload = 0.2)

An EnsembleArrayAlgorithm which utilizes CPU kernels to parallelize each ODE solve, with a separate ODE integrator on each kernel. This method is meant to be a debugging counterpart to EnsembleGPUArray, having the same behavior and using the same KernelAbstractions.jl process to build the combined ODE, but without the restrictions of f being a GPU-compatible kernel function.

It is unlikely that this method is useful beyond library development and debugging, as almost any case should be faster with EnsembleThreads or EnsembleDistributed.

source
+@time sol = solve(monteprob,Tsit5(),EnsembleGPUArray(CUDADevice()),trajectories=10_000,saveat=1.0f0)source
DiffEqGPU.EnsembleCPUArrayType
EnsembleCPUArray(cpu_offload = 0.2)

An EnsembleArrayAlgorithm which utilizes CPU kernels to parallelize each ODE solve, with a separate ODE integrator on each kernel. This method is meant to be a debugging counterpart to EnsembleGPUArray, having the same behavior and using the same KernelAbstractions.jl process to build the combined ODE, but without the restrictions of f being a GPU-compatible kernel function.

It is unlikely that this method is useful beyond library development and debugging, as almost any case should be faster with EnsembleThreads or EnsembleDistributed.

source
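
As a minimal sketch (assuming the Lorenz monteprob from the EnsembleGPUArray example above), the CPU debugging counterpart is called in exactly the same way:

sol_cpu = solve(monteprob, Tsit5(), EnsembleCPUArray(),
    trajectories = 1_000, saveat = 1.0f0)
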
diff --git a/dev/manual/ensemblegpukernel/index.html b/dev/manual/ensemblegpukernel/index.html index 2a244cba..518302c0 100644 --- a/dev/manual/ensemblegpukernel/index.html +++ b/dev/manual/ensemblegpukernel/index.html @@ -19,13 +19,13 @@ monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false) @time sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(), trajectories = 10_000, - adaptive = false, dt = 0.1f0)source

Specialized Solvers

DiffEqGPU.GPUTsit5Type

GPUTsit5()

A specialized implementation of the 5th order Tsit5 method specifically for kernel generation with EnsembleGPUKernel. For a similar CPU implementation, see SimpleATsit5 from SimpleDiffEq.jl.

source
DiffEqGPU.GPUVern7Type

GPUVern7()

A specialized implementation of the 7th order Vern7 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUVern9Type

GPUVern9()

A specialized implementation of the 9th order Vern9 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUEMType

GPUEM()

A specialized implementation of the Euler-Maruyama GPUEM method with weak order 1.0. Made specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUSIEAType

GPUSIEA()

A specialized implementation of the weak order 2.0 SIEA method for Ito SDEs, made specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURosenbrock23Type

GPURosenbrock23()

A specialized implementation of the W-method Rosenbrock23 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURodas4Type

GPURodas4()

A specialized implementation of the Rodas4 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURodas5PType

GPURodas5P()

A specialized implementation of the Rodas5P method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUKvaerno3Type

GPUKvaerno3()

A specialized implementation of the Kvaerno3 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUKvaerno5Type

GPUKvaerno5()

A specialized implementation of the Kvaerno5 method specifically for kernel generation with EnsembleGPUKernel.

source

Lower Level API

DiffEqGPU.vectorized_solveFunction
vectorized_solve(probs, prob::Union{ODEProblem, SDEProblem}, alg;
+                  adaptive = false, dt = 0.1f0)
source

Specialized Solvers

DiffEqGPU.GPUTsit5Type

GPUTsit5()

A specialized implementation of the 5th order Tsit5 method specifically for kernel generation with EnsembleGPUKernel. For a similar CPU implementation, see SimpleATsit5 from SimpleDiffEq.jl.

source
DiffEqGPU.GPUVern7Type

GPUVern7()

A specialized implementation of the 7th order Vern7 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUVern9Type

GPUVern9()

A specialized implementation of the 9th order Vern9 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUEMType

GPUEM()

A specialized implementation of the Euler-Maruyama GPUEM method with weak order 1.0. Made specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUSIEAType

GPUSIEA()

A specialized implementation of the weak order 2.0 SIEA method for Ito SDEs, made specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURosenbrock23Type

GPURosenbrock23()

A specialized implementation of the W-method Rosenbrock23 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURodas4Type

GPURodas4()

A specialized implementation of the Rodas4 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURodas5PType

GPURodas5P()

A specialized implementation of the Rodas5P method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUKvaerno3Type

GPUKvaerno3()

A specialized implementation of the Kvaerno3 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUKvaerno5Type

GPUKvaerno5()

A specialized implementation of the Kvaerno5 method specifically for kernel generation with EnsembleGPUKernel.

source
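
As a hedged sketch of one of the stiff kernel solvers (the Robertson setup below is illustrative, not an official example from the package; it only follows the out-of-place SVector pattern that EnsembleGPUKernel requires):

using DiffEqGPU, OrdinaryDiffEq, StaticArrays, CUDA

function rober(u, p, t)   # Robertson's stiff chemical kinetics system
    k1, k2, k3 = p
    du1 = -k1 * u[1] + k3 * u[2] * u[3]
    du2 = k1 * u[1] - k2 * u[2]^2 - k3 * u[2] * u[3]
    du3 = k2 * u[2]^2
    return SVector{3}(du1, du2, du3)
end

u0 = @SVector [1.0f0, 0.0f0, 0.0f0]
p = @SVector [0.04f0, 3.0f7, 1.0f4]
prob = ODEProblem{false}(rober, u0, (0.0f0, 1.0f0), p)
prob_func = (prob, i, repeat) -> remake(prob, p = p .* (0.9f0 + 0.2f0 * rand(Float32)))
monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)

sol = solve(monteprob, GPURosenbrock23(), EnsembleGPUKernel(CUDA.CUDABackend()),
    trajectories = 1_000, adaptive = false, dt = 0.001f0)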

Lower Level API

DiffEqGPU.vectorized_solveFunction
vectorized_solve(probs, prob::Union{ODEProblem, SDEProblem}, alg;
                  dt, saveat = nothing,
                  save_everystep = true,
-                 debug = false, callback = CallbackSet(nothing), tstops = nothing)

A lower level interface to the kernel generation solvers of EnsembleGPUKernel with fixed time-stepping.

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • prob: the quintessential problem form. Can be just probs[1]
  • alg: the kernel-based differential equation solver. Must be one of the EnsembleGPUKernel specialized methods.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
DiffEqGPU.vectorized_asolveFunction
vectorized_asolve(probs, prob::ODEProblem, alg;
+                 debug = false, callback = CallbackSet(nothing), tstops = nothing)

A lower level interface to the kernel generation solvers of EnsembleGPUKernel with fixed time-stepping.

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • prob: the quintessential problem form. Can be just probs[1]
  • alg: the kernel-based differential equation solver. Must be one of the EnsembleGPUKernel specialized methods.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
DiffEqGPU.vectorized_asolveFunction
vectorized_asolve(probs, prob::ODEProblem, alg;
                   dt = 0.1f0, saveat = nothing,
                   save_everystep = false,
                   abstol = 1.0f-6, reltol = 1.0f-3,
-                  callback = CallbackSet(nothing), tstops = nothing)

A lower level interface to the kernel generation solvers of EnsembleGPUKernel with adaptive time-stepping.

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • prob: the quintessential problem form. Can be just probs[1]
  • alg: the kernel-based differential equation solver. Must be one of the EnsembleGPUKernel specialized methods.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
DiffEqGPU.vectorized_map_solveFunction

Lower level API for EnsembleArrayAlgorithm. Avoids conversion of solution to CPU arrays.

vectorized_map_solve(probs, alg,
+                  callback = CallbackSet(nothing), tstops = nothing)

A lower level interface to the kernel generation solvers of EnsembleGPUKernel with adaptive time-stepping.

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • prob: the quintessential problem form. Can be just probs[1]
  • alg: the kernel-based differential equation solver. Must be one of the EnsembleGPUKernel specialized methods.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
DiffEqGPU.vectorized_map_solveFunction

Lower level API for EnsembleArrayAlgorithm. Avoids conversion of solution to CPU arrays.

vectorized_map_solve(probs, alg,
                      ensemblealg::Union{EnsembleArrayAlgorithm}, I,
-                     adaptive)

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • alg: the kernel-based differential equation solver. Most of the solvers from OrdinaryDiffEq.jl are supported.
  • ensemblealg: The EnsembleGPUArray() algorithm.
  • I: The iterator argument. Can be set to, e.g., 1:10_000 to simulate 10,000 trajectories.
  • adaptive: The Boolean argument for time-stepping. Use true to enable adaptive time-stepping.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
+ adaptive)

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • alg: the kernel-based differential equation solver. Most of the solvers from OrdinaryDiffEq.jl are supported.
  • ensemblealg: The EnsembleGPUArray() algorithm.
  • I: The iterator argument. Can be set to, e.g., 1:10_000 to simulate 10,000 trajectories.
  • adaptive: The Boolean argument for time-stepping. Use true to enable adaptive time-stepping.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source diff --git a/dev/manual/optimal_trajectories/index.html b/dev/manual/optimal_trajectories/index.html index baf2c8be..519754f6 100644 --- a/dev/manual/optimal_trajectories/index.html +++ b/dev/manual/optimal_trajectories/index.html @@ -1,2 +1,2 @@ -Choosing Optimal Numbers of Trajectories · DiffEqGPU.jl

Choosing Optimal Numbers of Trajectories

There is a balance between two things for choosing the number of trajectories:

  • The number of trajectories needs to be high enough that the work per kernel is sufficient to overcome the kernel call cost.
  • More trajectories means that every trajectory will need more time steps, since the adaptivity syncs all solves.

From our testing, the balance is found at around 10,000 trajectories being optimal for EnsembleGPUArray, since it has higher kernel call costs because every internal operation of the ODE solver requires a kernel call. Thus, for larger sets of trajectories, use a batch size of 10,000. Of course, benchmark for yourself on your own setup, as all GPUs are different.

On the other hand, EnsembleGPUKernel fuses the entire GPU solve into a single kernel, greatly reducing the kernel call cost. This means longer or more expensive ODE solves will spend an even smaller percentage of time on kernel launching, making the cutoff much smaller. We see some cases with around 100 ODEs being viable with EnsembleGPUKernel. Again, this is highly dependent on the ODE and the chosen GPU, and thus one will need to benchmark to get accurate numbers for their system; this is merely a ballpark estimate.

+Choosing Optimal Numbers of Trajectories · DiffEqGPU.jl

Choosing Optimal Numbers of Trajectories

There is a balance between two things for choosing the number of trajectories:

  • The number of trajectories needs to be high enough that the work per kernel is sufficient to overcome the kernel call cost.
  • More trajectories means that every trajectory will need more time steps, since the adaptivity syncs all solves.

From our testing, the balance is found at around 10,000 trajectories being optimal for EnsembleGPUArray, since it has higher kernel call costs because every internal operation of the ODE solver requires a kernel call. Thus, for larger sets of trajectories, use a batch size of 10,000. Of course, benchmark for yourself on your own setup, as all GPUs are different.

On the other hand, EnsembleGPUKernel fuses the entire GPU solve into a single kernel, greatly reducing the kernel call cost. This means longer or more expensive ODE solves will spend an even smaller percentage of time on kernel launching, making the cutoff much smaller. We see some cases with around 100 ODEs being viable with EnsembleGPUKernel. Again, this is highly dependent on the ODE and the chosen GPU, and thus one will need to benchmark to get accurate numbers for their system; this is merely a ballpark estimate.
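
For example (a sketch assuming the Lorenz monteprob from the Getting Started page), batching 100,000 trajectories into groups of 10,000 looks like:

sol = solve(monteprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()),
    trajectories = 100_000, batch_size = 10_000, saveat = 1.0f0)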

diff --git a/dev/tutorials/gpu_ensemble_basic/index.html b/dev/tutorials/gpu_ensemble_basic/index.html index 60530cf3..b566a2cf 100644 --- a/dev/tutorials/gpu_ensemble_basic/index.html +++ b/dev/tutorials/gpu_ensemble_basic/index.html @@ -67,4 +67,4 @@ solve(monteprob_jac, Rodas5(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 10_000, saveat = 1.0f0)
EnsembleSolution Solution of length 10000 with uType:
-SciMLBase.ODESolution{Float32, 2, uType, Nothing, Nothing, Vector{Float32}, rateType, SciMLBase.ODEProblem{Vector{Float32}, Tuple{Float32, Float32}, true, StaticArraysCore.SizedVector{3, Float32, Vector{Float32}}, SciMLBase.ODEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, typeof(Main.lorenz_tgrad), typeof(Main.lorenz_jac), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, A, IType, DiffEqBase.Stats, Nothing} where {uType, rateType, A, IType}
+SciMLBase.ODESolution{Float32, 2, uType, Nothing, Nothing, Vector{Float32}, rateType, SciMLBase.ODEProblem{Vector{Float32}, Tuple{Float32, Float32}, true, StaticArraysCore.SizedVector{3, Float32, Vector{Float32}}, SciMLBase.ODEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, typeof(Main.lorenz_tgrad), typeof(Main.lorenz_jac), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, A, IType, DiffEqBase.Stats, Nothing} where {uType, rateType, A, IType} diff --git a/dev/tutorials/lower_level_api/index.html b/dev/tutorials/lower_level_api/index.html index 508d466d..642963f9 100644 --- a/dev/tutorials/lower_level_api/index.html +++ b/dev/tutorials/lower_level_api/index.html @@ -41,7 +41,7 @@ save_everystep = false, dt = 0.1f0) @time CUDA.@sync ts, us = DiffEqGPU.vectorized_asolve(probs, prob, GPUTsit5(); - save_everystep = false, dt = 0.1f0)
(Float32[0.0 0.0 … 0.0 0.0; 10.0 10.0 … 10.0 10.0], StaticArraysCore.SVector{3, Float32}[[1.0, 0.0, 0.0] [1.0, 0.0, 0.0] … [1.0, 0.0, 0.0] [1.0, 0.0, 0.0]; [0.057184353, 0.055973485, 0.9972001] [0.9682037, 0.98773384, 0.45903432] … [-4.037727, -2.4832535, 15.188514] [-4.061194, -5.0106506, 13.35104]])

Note that the core is the function DiffEqGPU.vectorized_solve, which solves the manually converted, CUDA-based probs and returns us, a vector of CuArrays containing the solution.

Similarly, there exists a lower-level API for EnsembleGPUArray as well, primarily for benchmarking purposes. The solution returned for the state (sol.u) is a matrix whose columns are the different parameter-parallel solutions of the ensemble problem. An example is shown below:

using DiffEqGPU, OrdinaryDiffEq, StaticArrays, CUDA, DiffEqBase
+    save_everystep = false, dt = 0.1f0)
(Float32[0.0 0.0 … 0.0 0.0; 10.0 10.0 … 10.0 10.0], StaticArraysCore.SVector{3, Float32}[[1.0, 0.0, 0.0] [1.0, 0.0, 0.0] … [1.0, 0.0, 0.0] [1.0, 0.0, 0.0]; [-2.5039465, -2.1839972, 8.628112] [0.6071553, 0.60693836, 0.29354915] … [0.5634087, 0.5667132, 0.5146651] [0.042401426, 0.0419496, 0.000810971]])

Note that the core is the function DiffEqGPU.vectorized_solve, which solves the manually converted, CUDA-based probs and returns us, a vector of CuArrays containing the solution.
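
A small sketch of bringing those device results back to the host for inspection (the indexing assumes the save-point-by-trajectory layout shown in the printed output above):

ts_cpu = Array(ts)    # save times on the CPU
us_cpu = Array(us)    # states on the CPU
us_cpu[end, 1]        # final saved state of the first trajectory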

Similarly, there exists a lower-level API for EnsembleGPUArray as well, primarily for benchmarking purposes. The solution returned for the state (sol.u) is a matrix whose columns are the different parameter-parallel solutions of the ensemble problem. An example is shown below:

using DiffEqGPU, OrdinaryDiffEq, StaticArrays, CUDA, DiffEqBase
 
 trajectories = 10_000
 
@@ -82,4 +82,4 @@
  10.0
 u: 2-element Vector{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}:
  [1.0 1.0 … 1.0 1.0; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
- [2.874537 -0.5345339 … 1.1868322 6.235758; 2.8190444 0.69876623 … 1.1867995 6.239402; 6.9810677 20.06372 … 0.7322635 15.252539]
+ [1.8232745 -4.260363 … 0.0056332415 2.2216926; 1.8224066 -4.7686563 … 0.003429688 3.1562388; 2.3079915 11.387499 … 1.3406616f-5 15.157592] diff --git a/dev/tutorials/multigpu/index.html b/dev/tutorials/multigpu/index.html index 81ff6a3b..d67ef5ba 100644 --- a/dev/tutorials/multigpu/index.html +++ b/dev/tutorials/multigpu/index.html @@ -71,4 +71,4 @@ @time sol = solve(monteprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 100_000, - batch_size = 50_000, saveat = 1.0f0) + batch_size = 50_000, saveat = 1.0f0) diff --git a/dev/tutorials/parallel_callbacks/index.html b/dev/tutorials/parallel_callbacks/index.html index 6d036fbd..b164dd0b 100644 --- a/dev/tutorials/parallel_callbacks/index.html +++ b/dev/tutorials/parallel_callbacks/index.html @@ -19,4 +19,4 @@ trajectories = 10, adaptive = false, dt = 0.01f0, callback = gpu_cb, merge_callbacks = true, tstops = [4.0f0])
EnsembleSolution Solution of length 10 with uType:
-SciMLBase.ODESolution{Float32, 2, SubArray{StaticArraysCore.SVector{1, Float32}, 1, Matrix{StaticArraysCore.SVector{1, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, Nothing, SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, DiffEqGPU.ImmutableODEProblem{StaticArraysCore.SVector{1, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, SciMLBase.ODEFunction{false, SciMLBase.AutoSpecialize, typeof(Main.f), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, GPUTsit5, SciMLBase.LinearInterpolation{SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, SubArray{StaticArraysCore.SVector{1, Float32}, 1, Matrix{StaticArraysCore.SVector{1, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}}, Nothing, Nothing}
+SciMLBase.ODESolution{Float32, 2, SubArray{StaticArraysCore.SVector{1, Float32}, 1, Matrix{StaticArraysCore.SVector{1, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, Nothing, SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, DiffEqGPU.ImmutableODEProblem{StaticArraysCore.SVector{1, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, SciMLBase.ODEFunction{false, SciMLBase.AutoSpecialize, typeof(Main.f), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, GPUTsit5, SciMLBase.LinearInterpolation{SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, SubArray{StaticArraysCore.SVector{1, Float32}, 1, Matrix{StaticArraysCore.SVector{1, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}}, Nothing, Nothing} diff --git a/dev/tutorials/weak_order_conv_sde/d8132f6f.svg b/dev/tutorials/weak_order_conv_sde/a5edcbf6.svg similarity index 87% rename from dev/tutorials/weak_order_conv_sde/d8132f6f.svg rename to dev/tutorials/weak_order_conv_sde/a5edcbf6.svg index 70945135..2b5fb4f7 100644 --- a/dev/tutorials/weak_order_conv_sde/d8132f6f.svg +++ b/dev/tutorials/weak_order_conv_sde/a5edcbf6.svg @@ -1,46 +1,46 @@ - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dev/tutorials/weak_order_conv_sde/index.html b/dev/tutorials/weak_order_conv_sde/index.html index a8f3e03f..db25187f 100644 --- a/dev/tutorials/weak_order_conv_sde/index.html +++ b/dev/tutorials/weak_order_conv_sde/index.html @@ -30,4 +30,4 @@ plot(ts, us_expect, lw = 5, xaxis = "Time (t)", yaxis = "y(t)", label = "True Expected value") -plot!(ts, us_calc, lw = 3, ls = :dash, label = "Caculated Expected value")Example block output +plot!(ts, us_calc, lw = 3, ls = :dash, label = "Caculated Expected value")Example block output diff --git a/dev/tutorials/within_method_gpu/index.html b/dev/tutorials/within_method_gpu/index.html index 1b4d2384..670522a3 100644 --- a/dev/tutorials/within_method_gpu/index.html +++ b/dev/tutorials/within_method_gpu/index.html @@ -22,16 +22,16 @@ 0.039545782 0.06525832 0.09650515 - 0.13952377 + 0.13952234 ⋮ - 95.27473 - 95.91403 - 96.57579 - 97.23755 - 97.89931 - 98.561066 - 99.23637 - 99.91167 + 95.27474 + 95.91405 + 96.57581 + 97.23758 + 97.899345 + 98.56111 + 99.23641 + 99.91171 100.0 u: 229-element Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}: Float32[1.0, 0.0, 0.0] @@ -43,14 +43,14 @@ Float32[0.9666134, -0.016576296, -0.0360636] Float32[0.9460261, -0.026261317, -0.05836517] Float32[0.9221204, -0.036940157, -0.08431637] - Float32[0.8910716, -0.04980573, -0.11809613] + Float32[0.89107263, -0.049805336, -0.11809503] ⋮ - Float32[0.00049719884, -0.0022435721, 0.0021882595] - Float32[0.0010859583, -0.0024246967, 0.0015704404] - Float32[0.0015605923, -0.0023972206, 0.00084748876] - Float32[0.0018686663, -0.0021694193, 0.000106063744] - Float32[0.0019964012, -0.0017752235, -0.0005898974] - Float32[0.0019462514, -0.0012601186, -0.0011849877] - Float32[0.001729764, -0.00066427403, -0.0016438589] - Float32[0.0013776367, -5.4696073f-5, -0.0019241067] - Float32[0.001323388, 2.3389219f-5, -0.0019467665]

Notice that both stiff and non-stiff ODE solvers were used here.

Note

Time span was changed to Float32 types, as GPUs generally have very slow Float64 operations, usually around 1/32 of the speed of Float32. cu(x) on an array automatically changes an Array{Float64} to a CuArray{Float32}. If this is not intended, use the CuArray constructor directly. For more information on GPU Float64 performance issues, search around Google for discussions like this.

Warn

Float32 precision is sometimes not enough precision to accurately solve a stiff ODE. Make sure that the precision is necessary by investigating the condition number of the Jacobian. If this value is well-above 1e8, use Float32 with caution!

Restrictions of CuArrays

Note that all the rules of CUDA.jl apply when CuArrays are being used in the solver. For most of the AbstractArray interface they act similarly to Arrays, such as having valid broadcasting operations (x .* y) defined, and these operations will run on the GPU. For more information on the rules and restrictions of CuArrays, see this page from the CUDA.jl documentation.

+ Float32[0.000497219, -0.002243581, 0.0021882413] + Float32[0.0010859751, -0.0024246988, 0.0015704188] + Float32[0.0015606071, -0.0023972152, 0.0008474603] + Float32[0.0018686759, -0.0021694046, 0.00010603073] + Float32[0.0019964029, -0.001775199, -0.00058993173] + Float32[0.001946243, -0.0012600849, -0.0011850193] + Float32[0.0017297472, -0.000664239, -0.0016438798] + Float32[0.001377614, -5.466292f-5, -0.0019241166] + Float32[0.0013233931, 2.3381774f-5, -0.0019467642]

Notice that both stiff and non-stiff ODE solvers were used here.

Note

Time span was changed to Float32 types, as GPUs generally have very slow Float64 operations, usually around 1/32 of the speed of Float32. cu(x) on an array automatically changes an Array{Float64} to a CuArray{Float32}. If this is not intended, use the CuArray constructor directly. For more information on GPU Float64 performance issues, search around Google for discussions like this.
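
A minimal sketch of the difference (the variable name x64 is illustrative):

using CUDA
x64 = rand(Float64, 3)
cu(x64)          # CuArray{Float32}: cu converts the element type to Float32 by default
CuArray(x64)     # CuArray{Float64}: the constructor preserves Float64, at a GPU performance cost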

Warn

Float32 precision is sometimes not enough precision to accurately solve a stiff ODE. Make sure that the precision is necessary by investigating the condition number of the Jacobian. If this value is well-above 1e8, use Float32 with caution!

Restrictions of CuArrays

Note that all the rules of CUDA.jl apply when CuArrays are being used in the solver. For most of the AbstractArray interface they act similarly to Arrays, such as having valid broadcasting operations (x .* y) defined, and these operations will run on the GPU. For more information on the rules and restrictions of CuArrays, see this page from the CUDA.jl documentation.