diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json index f6ac048e..00c37a41 100644 --- a/dev/.documenter-siteinfo.json +++ b/dev/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.9.3","generation_timestamp":"2023-10-19T22:21:58","documenter_version":"1.1.1"}} \ No newline at end of file +{"documenter":{"julia_version":"1.9.3","generation_timestamp":"2023-10-20T00:50:25","documenter_version":"1.1.1"}} \ No newline at end of file diff --git a/dev/assets/Manifest.toml b/dev/assets/Manifest.toml index 5014ce92..be54f500 100644 --- a/dev/assets/Manifest.toml +++ b/dev/assets/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.9.3" manifest_format = "2.0" -project_hash = "3afc6a82a930acaecc9f474467cad008d77854df" +project_hash = "118b18706ee7a5c993d7252b3dc1283898fcb222" [[deps.ADTypes]] git-tree-sha1 = "5d2e21d7b0d8c22f67483ef95ebdc39c0e6b6003" @@ -409,9 +409,9 @@ version = "2.33.1" [[deps.DiffEqGPU]] deps = ["Adapt", "ChainRulesCore", "DiffEqBase", "Distributed", "DocStringExtensions", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LinearSolve", "MuladdMacro", "Parameters", "Random", "RecursiveArrayTools", "Requires", "SciMLBase", "Setfield", "SimpleDiffEq", "StaticArrays", "TOML", "ZygoteRules"] -path = "/var/lib/buildkite-agent/builds/gpuci-15/julialang/diffeqgpu-dot-jl" +path = "/var/lib/buildkite-agent/builds/gpuci-13/julialang/diffeqgpu-dot-jl" uuid = "071ae1c0-96b5-11e9-1965-c90190d839ea" -version = "2.5.1" +version = "3.0.0" [deps.DiffEqGPU.extensions] AMDGPUExt = ["AMDGPU"] diff --git a/dev/assets/Project.toml b/dev/assets/Project.toml index 8193cda0..7f2bb96c 100644 --- a/dev/assets/Project.toml +++ b/dev/assets/Project.toml @@ -23,7 +23,7 @@ Adapt = "3" BenchmarkTools = "1" CUDA = "4, 5" DiffEqBase = "6.120" -DiffEqGPU = "1,2" +DiffEqGPU = "1,2, 3" Documenter = "1" Flux = "0.13, 0.14" ForwardDiff = "0.10" diff --git a/dev/examples/ad/index.html b/dev/examples/ad/index.html index 3a40c620..e68d295d 100644 --- 
a/dev/examples/ad/index.html +++ b/dev/examples/ad/index.html @@ -66,4 +66,4 @@ @time sol = solve(monteprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 10_000, saveat = 1.0f0)
EnsembleSolution Solution of length 10000 with uType:
-SciMLBase.ODESolution{ForwardDiff.Dual{Nothing, Float64, 3}, 2, uType, Nothing, Nothing, Vector{Float32}, rateType, SciMLBase.ODEProblem{Vector{ForwardDiff.Dual{Nothing, Float64, 3}}, Tuple{Float32, Float32}, true, Vector{Float32}, SciMLBase.ODEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}, IType, DiffEqBase.Stats, Nothing} where {uType, rateType, IType}
+SciMLBase.ODESolution{ForwardDiff.Dual{Nothing, Float64, 3}, 2, uType, Nothing, Nothing, Vector{Float32}, rateType, SciMLBase.ODEProblem{Vector{ForwardDiff.Dual{Nothing, Float64, 3}}, Tuple{Float32, Float32}, true, Vector{Float32}, SciMLBase.ODEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}, IType, DiffEqBase.Stats, Nothing} where {uType, rateType, IType} diff --git a/dev/examples/bruss/index.html b/dev/examples/bruss/index.html index 1bae6e63..7f435ee0 100644 --- a/dev/examples/bruss/index.html +++ b/dev/examples/bruss/index.html @@ -89,4 +89,4 @@ 11.5 u: 2-element Vector{CUDA.CuArray{Float64, 3, CUDA.Mem.DeviceBuffer}}: [0.0 0.12134432813715873 … 0.1213443281371586 0.0; 0.0 0.12134432813715873 … 0.1213443281371586 0.0; … ; 0.0 0.12134432813715873 … 0.1213443281371586 0.0; 0.0 0.12134432813715873 … 0.1213443281371586 0.0;;; 0.0 0.0 … 0.0 0.0; 0.14892258453196755 0.14892258453196755 … 0.14892258453196755 0.14892258453196755; … ; 0.14892258453196738 0.14892258453196738 … 0.14892258453196738 0.14892258453196738; 0.0 0.0 … 0.0 0.0] - [3.0478431718205936 3.0478158588988173 … 3.0479344071354673 3.0478829076429497; 3.0478921742826572 3.047861583972248 … 3.0479950350896505 3.0479368431672165; … ; 3.047757449709251 3.0477354559246868 … 3.0478300382513863 3.04778922650763; 3.0477977675276757 3.047773336691987 … 3.0478788678640973 3.0478331843281357;;; 2.567054744855225 2.5670562676043445 … 2.567049778293458 2.5670525600516014; 2.56705171174118 2.5670533462176053 … 2.5670463664078818 2.5670493630361464; … ; 2.5670601483305377 
2.5670614822617197 … 2.5670558195263085 2.5670582401346573; 2.5670575914290477 2.5670590130863213 … 2.5670529666027693 2.5670555547531957] + [3.0478431718176133 3.0478158588958384 … 3.047934407132488 3.0478829076399707; 3.0478921742796787 3.0478615839692695 … 3.0479950350866707 3.0479368431642384; … ; 3.0477574497062734 3.04773545592171 … 3.0478300382484087 3.047789226504652; 3.047797767524695 3.0477733366890103 … 3.047878867861117 3.047833184325156;;; 2.567054744858359 2.5670562676074793 … 2.567049778296592 2.567052560054736; 2.567051711744317 2.56705334622074 … 2.5670463664110166 2.5670493630392817; … ; 2.567060148333675 2.567061482264857 … 2.567055819529447 2.567058240137795; 2.567057591432184 2.567059013089459 … 2.567052966605906 2.567055554756332] diff --git a/dev/examples/reaction_diffusion/index.html b/dev/examples/reaction_diffusion/index.html index 5111f50e..7889812e 100644 --- a/dev/examples/reaction_diffusion/index.html +++ b/dev/examples/reaction_diffusion/index.html @@ -1,2 +1,2 @@ -GPU-Accelerated Stochastic Partial Differential Equations · DiffEqGPU.jl
+GPU-Accelerated Stochastic Partial Differential Equations · DiffEqGPU.jl
diff --git a/dev/examples/reductions/index.html b/dev/examples/reductions/index.html index c1a0ab91..aabaf02f 100644 --- a/dev/examples/reductions/index.html +++ b/dev/examples/reductions/index.html @@ -28,4 +28,4 @@ reduction = reduction, u_init = Vector{eltype(prob.u0)}([0.0])) sim4 = solve(prob2, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 100, batch_size = 20)
EnsembleSolution Solution of length 1 with uType:
-Float64
+Float64 diff --git a/dev/examples/sde/index.html b/dev/examples/sde/index.html index 58ae88fd..dbf825e9 100644 --- a/dev/examples/sde/index.html +++ b/dev/examples/sde/index.html @@ -23,4 +23,4 @@ monteprob = EnsembleProblem(prob, prob_func = prob_func) sol = solve(monteprob, SOSRI(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 10_000, saveat = 1.0f0)
EnsembleSolution Solution of length 10000 with uType:
-SciMLBase.RODESolution{Float32, 2, uType, Nothing, Nothing, Vector{Float32}, randType, SciMLBase.SDEProblem{Vector{Float32}, Tuple{Float32, Float32}, true, Vector{Float32}, Nothing, SciMLBase.SDEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), typeof(Main.multiplicative_noise), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, typeof(Main.multiplicative_noise), Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, Nothing}, StochasticDiffEq.SOSRI, IType, DiffEqBase.Stats, Nothing} where {uType, randType, IType}
+SciMLBase.RODESolution{Float32, 2, uType, Nothing, Nothing, Vector{Float32}, randType, SciMLBase.SDEProblem{Vector{Float32}, Tuple{Float32, Float32}, true, Vector{Float32}, Nothing, SciMLBase.SDEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), typeof(Main.multiplicative_noise), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, typeof(Main.multiplicative_noise), Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, Nothing}, StochasticDiffEq.SOSRI, IType, DiffEqBase.Stats, Nothing} where {uType, randType, IType} diff --git a/dev/getting_started/index.html b/dev/getting_started/index.html index 7e6abc0d..07d60381 100644 --- a/dev/getting_started/index.html +++ b/dev/getting_started/index.html @@ -54,48 +54,48 @@ prob = ODEProblem(f, u0, (0.0f0, 1.0f0)) # Float32 is better on GPUs! sol = solve(prob, Tsit5())
retcode: Success
 Interpolation: specialized 4th order "free" interpolation
-t: 51-element Vector{Float32}:
+t: 50-element Vector{Float32}:
  0.0
- 0.0028032635
- 0.007292892
- 0.013943267
- 0.022358477
- 0.03266243
- 0.044566914
- 0.057291754
- 0.07193968
- 0.087012306
+ 0.0028032633
+ 0.00770806
+ 0.014659371
+ 0.023422156
+ 0.034057636
+ 0.046285566
+ 0.05950984
+ 0.074400246
+ 0.090368554
  ⋮
- 0.8247635
- 0.84982157
- 0.8745116
- 0.89433014
- 0.91924965
- 0.9438157
- 0.96827626
- 0.9933905
+ 0.8050874
+ 0.83047694
+ 0.85561347
+ 0.88050914
+ 0.90504134
+ 0.9308001
+ 0.95659125
+ 0.98157626
  1.0
-u: 51-element Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}:
+u: 50-element Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}:
  Float32[0.88071316, 0.091738656, 0.5644176, 0.89315474, 0.11550513, 0.9899834, 0.76465535, 0.5730443, 0.5333632, 0.46715936  …  0.51651764, 0.69562453, 0.13039295, 0.017110005, 0.3178709, 0.7581309, 0.32872194, 0.65469325, 0.5585072, 0.12346421]
- Float32[0.88458586, 0.10383201, 0.56542593, 0.91055316, 0.14688145, 0.98178184, 0.72525895, 0.59418094, 0.573652, 0.43604094  …  0.4663532, 0.7562544, 0.07489444, 0.09894383, 0.27474692, 0.7218533, 0.30385882, 0.74906904, 0.6420995, 0.16706456]
- Float32[0.8993485, 0.11652332, 0.5549422, 0.9226965, 0.20035666, 0.9682945, 0.64829654, 0.6165711, 0.6383632, 0.39841184  …  0.38447455, 0.86521155, -0.024065342, 0.2509739, 0.21489702, 0.6452851, 0.25786027, 0.89370865, 0.7635648, 0.23452367]
- Float32[0.94403464, 0.11908797, 0.512699, 0.9032732, 0.27996275, 0.94630456, 0.49943888, 0.6203278, 0.73663926, 0.3718802  …  0.25955936, 1.0538634, -0.18425739, 0.5192544, 0.1464948, 0.48769277, 0.18006498, 1.0911947, 0.91577864, 0.3277313]
- Float32[1.0456772, 0.09229674, 0.41570693, 0.80836636, 0.3661887, 0.91255134, 0.24255256, 0.5678781, 0.8680704, 0.39424247  …  0.094817884, 1.3401356, -0.38622913, 0.9196656, 0.09464145, 0.20607205, 0.07886583, 1.3067919, 1.063521, 0.4331251]
- Float32[1.2451123, 0.009822205, 0.23700213, 0.56974095, 0.42532614, 0.8577494, -0.1955113, 0.40833247, 1.0415791, 0.5242374  …  -0.118558235, 1.764022, -0.5804089, 1.4731584, 0.08890158, -0.28040943, -0.013598273, 1.503326, 1.1853585, 0.5476369]
- Float32[1.5779717, -0.16074678, -0.03486809, 0.10167053, 0.4077717, 0.76762325, -0.9066363, 0.09505341, 1.2570384, 0.8555032  …  -0.3853483, 2.353389, -0.6376506, 2.1472414, 0.18251278, -1.075195, -0.0032046076, 1.5974889, 1.2684758, 0.68334526]
- Float32[2.0385969, -0.44505167, -0.36323085, -0.6404346, 0.30168927, 0.62329584, -1.9623735, -0.35553157, 1.5006028, 1.4946269  …  -0.704176, 3.0949616, -0.3621099, 2.8454118, 0.4555171, -2.279417, 0.26366287, 1.4488839, 1.3324683, 0.8927335]
- Float32[2.653575, -0.92650205, -0.68213314, -1.7437161, 0.20209287, 0.35969245, -3.62819, -0.88641196, 1.8046279, 2.715105  …  -1.137421, 4.0728436, 0.60271794, 3.567788, 1.1182202, -4.298576, 1.0722924, 0.7389011, 1.4554504, 1.3578767]
- Float32[3.2878957, -1.6229483, -0.6983753, -2.8794007, 0.5136371, -0.081603736, -5.89597, -1.127807, 2.196508, 4.6699176  …  -1.7071708, 5.187741, 2.6080158, 4.255335, 2.4045234, -7.4276085, 2.6378803, -1.0187249, 1.7589356, 2.3444717]
+ Float32[0.88458586, 0.10383208, 0.56542593, 0.9105531, 0.14688143, 0.98178184, 0.725259, 0.59418094, 0.573652, 0.43604097  …  0.46635318, 0.7562544, 0.07489447, 0.09894379, 0.2747469, 0.7218533, 0.30385882, 0.749069, 0.6420995, 0.16706459]
+ Float32[0.9013025, 0.11726365, 0.5532294, 0.92281747, 0.20539533, 0.96700597, 0.6402653, 0.6178738, 0.6443927, 0.3957117  …  0.3768051, 0.8760311, -0.033707015, 0.26627496, 0.20992415, 0.63702106, 0.253282, 0.90664876, 0.77402776, 0.24059023]
+ Float32[0.9506427, 0.11814861, 0.5062973, 0.8983726, 0.28816503, 0.94372785, 0.48070958, 0.6184501, 0.74748284, 0.37123397  …  0.24583998, 1.076143, -0.20188585, 0.55090535, 0.14055808, 0.4674204, 0.17131874, 1.1111343, 0.93024594, 0.3372425]
+ Float32[1.0623785, 0.08638905, 0.4001696, 0.79027975, 0.37505215, 0.9076652, 0.20401168, 0.5563591, 0.8853406, 0.40204  …  0.073421545, 1.3801469, -0.41001603, 0.9741743, 0.090954676, 0.1634308, 0.066957146, 1.3308672, 1.0788912, 0.44552672]
+ Float32[1.278569, -0.0058233947, 0.20831701, 0.52600056, 0.428239, 0.8488757, -0.2666873, 0.37851772, 1.0660826, 0.5520373  …  -0.14858639, 1.8276249, -0.5984681, 1.5513371, 0.09369785, -0.359674, -0.020600917, 1.5228127, 1.1977124, 0.56266296]
+ Float32[1.6344533, -0.19264252, -0.07817726, 0.015917871, 0.39763498, 0.7514975, -1.030123, 0.039752055, 1.2891968, 0.9227872  …  -0.42609212, 2.4470403, -0.6241573, 2.2441795, 0.20730077, -1.2141467, 0.014517415, 1.5952181, 1.2774434, 0.7059188]
+ Float32[2.1275413, -0.50668025, -0.41936472, -0.79362315, 0.27951172, 0.5912294, -2.1819494, -0.4400623, 1.5444939, 1.6427702  …  -0.76444525, 3.2350993, -0.26647314, 2.9609494, 0.52785707, -2.5357955, 0.34722802, 1.3856934, 1.3453207, 0.9431921]
+ Float32[2.7601502, -1.0255971, -0.71567386, -1.9411954, 0.20904303, 0.30135608, -3.95969, -0.9587043, 1.8606849, 2.981347  …  -1.2199625, 4.248185, 0.85137504, 3.6808665, 1.2795985, -4.724534, 1.273345, 0.5378595, 1.4889644, 1.4750801]
+ Float32[3.416101, -1.8075991, -0.61632633, -3.0732634, 0.7101897, -0.21086018, -6.4749594, -1.0869303, 2.3067675, 5.218714  …  -1.8599517, 5.4480147, 3.2286246, 4.417976, 2.8007317, -8.321874, 3.0981548, -1.6155888, 1.866251, 2.6711357]
  ⋮
- Float32[-3.8355356f9, 1.1298996f10, -3.93494f10, 6.2908006f9, -3.2500275f10, -3.4033916f10, 3.4276102f10, -8.358764f9, -1.3181027f8, 6.9724595f9  …  -3.758224f10, 3.1913632f10, 2.2516085f10, 3.781391f10, -1.4528389f10, 5.0571884f9, 5.5028557f9, 1.6473422f10, -1.3774336f10, -6.9072525f8]
- Float32[-1.1802888f10, 2.3964957f10, -7.842121f10, 2.128094f10, -7.817608f10, -7.8497145f10, 7.4242105f10, -1.5591672f10, 5.0940733f9, 1.9588368f10  …  -8.79277f10, 7.620545f10, 4.6994047f10, 8.263724f10, -1.7877727f10, 1.0438716f10, 7.8208154f9, 4.0790344f10, -3.0817853f10, 7.139321f8]
- Float32[-3.1122395f10, 4.835905f10, -1.5162422f11, 6.8681708f10, -1.8565762f11, -1.7294005f11, 1.5758757f11, -2.8887898f10, 2.2604016f10, 5.4178144f10  …  -1.9935493f11, 1.7930378f11, 9.4641455f10, 1.7341137f11, -1.4456337f10, 2.112963f10, 2.1751183f9, 9.453497f10, -6.3974597f10, 8.769985f9]
- Float32[-6.452887f10, 8.201022f10, -2.5630245f11, 1.6806388f11, -3.7132304f11, -3.191059f11, 2.88068f11, -4.7664165f10, 5.865629f10, 1.2108814f11  …  -3.795655f11, 3.5548863f11, 1.6306613f11, 3.080752f11, 2.4782664f8, 3.6635824f10, -2.2505982f10, 1.7990689f11, -1.1051795f11, 2.9476325f10]
- Float32[-1.5636973f11, 1.4985986f11, -5.021798f11, 4.8890043f11, -8.8476824f11, -6.7263385f11, 6.1807664f11, -8.995169f10, 1.7117613f11, 3.2547294f11  …  -8.40242f11, 8.3662484f11, 3.1607397f11, 6.190827f11, 4.815659f10, 7.027574f10, -1.2605745f11, 3.9060007f11, -2.1035781f11, 1.0438328f11]
- Float32[-3.688076f11, 2.4462803f11, -1.0109411f12, 1.3265278f12, -2.0706932f12, -1.3690599f12, 1.3258288f12, -1.6722428f11, 4.572891f11, 8.401172f11  …  -1.8141f12, 1.9319423f12, 5.9251157f11, 1.2009144f12, 1.4473537f11, 1.2264527f11, -4.2864322f11, 8.121761f11, -3.8142922f11, 3.138882f11]
- Float32[-8.665197f11, 3.2343844f11, -2.1468291f12, 3.4262035f12, -4.7932254f12, -2.718374f12, 2.87366f12, -3.0060757f11, 1.1718073f12, 2.105633f12  …  -3.8650286f12, 4.4083467f12, 1.08114706f12, 2.2714987f12, 2.8944125f11, 1.775824f11, -1.21367f12, 1.6383607f12, -6.7264394f11, 8.601409f11]
- Float32[-2.10209f12, 1.8158826f11, -4.964374f12, 8.729674f12, -1.1245941f13, -5.389325f12, 6.4526274f12, -5.0728174f11, 3.0218288f12, 5.279704f12  …  -8.353269f12, 1.0181811f13, 1.9492538f12, 4.2902613f12, 4.039475f11, 1.2968861f11, -3.182691f12, 3.292595f12, -1.200864f12, 2.2715708f12]
- Float32[-2.6602567f12, 3.8560997f10, -6.2509354f12, 1.1102161f13, -1.4053066f13, -6.433953f12, 8.0016766f12, -5.6851615f11, 3.8712493f12, 6.6998103f12  …  -1.0227779f13, 1.2668629f13, 2.2638093f12, 5.06079f12, 3.965273f11, 6.051217f10, -4.0504638f12, 3.9452772f12, -1.4038904f12, 2.9080688f12]

Notice that the solution values sol[i] are CUDA-based arrays, which can be moved back to the CPU using Array(sol[i]).

More details on effective use of within-method GPU parallelism can be found in the within-method GPU parallelism tutorial.

Example of Parameter-Parallelism with GPU Ensemble Methods

On the other side of the spectrum, what if we want to solve tons of small ODEs? For this use case, we would use the ensemble methods to solve the same ODE many times with different parameters. This looks like:

using DiffEqGPU, OrdinaryDiffEq, StaticArrays, CUDA
+ Float32[-1.2448122f9, 6.120147f9, -2.2411454f10, 2.5883497f9, -1.6325854f10, -1.7176054f10, 1.845867f10, -5.097449f9, -1.0280615f9, 3.1593787f9  …  -1.9029856f10, 1.6097072f10, 1.2417926f10, 2.0032188f10, -1.0834926f10, 2.8556372f9, 3.1497367f9, 7.697759f9, -6.9380485f9, -5.535416f8]
+ Float32[-5.054011f9, 1.3453031f10, -4.616155f10, 8.2861164f9, -3.969783f10, -4.1313247f10, 4.0932168f10, -9.638242f9, 4.9811946f8, 8.812047f9  …  -4.5696623f10, 3.8924104f10, 2.6688123f10, 4.5313098f10, -1.5538131f10, 5.968125f9, 6.2155515f9, 2.0368216f10, -1.6654228f10, -6.140038f8]
+ Float32[-1.4944735f10, 2.8362789f10, -9.165974f10, 2.815186f10, -9.5766045f10, -9.474463f10, 8.862145f10, -1.800876f10, 7.6212874f9, 2.4890206f10  …  -1.0672122f11, 9.316511f10, 5.550552f10, 9.85828f10, -1.7973344f10, 1.2330889f10, 7.6901f9, 4.9893302f10, -3.677066f10, 1.6863081f9]
+ Float32[-3.893791f10, 5.6953594f10, -1.7770832f11, 9.044478f10, -2.2902504f11, -2.0856529f11, 1.8912282f11, -3.359453f10, 3.0586415f10, 6.920851f10  …  -2.425359f11, 2.2062703f11, 1.117672f11, 2.0673906f11, -1.1531223f10, 2.5009125f10, -2.3971753f9, 1.1517141f11, -7.575034f10, 1.3046186f10]
+ Float32[-9.4686446f10, 1.0731566f11, -3.412096f11, 2.6793178f11, -5.3960088f11, -4.4109395f11, 3.9954403f11, -6.260374f10, 9.412401f10, 1.8581643f11  …  -5.3513342f11, 5.1393908f11, 2.1738078f11, 4.1722443f11, 1.5812787f10, 4.882664f10, -5.2608455f10, 2.5214506f11, -1.465566f11, 5.201068f10]
+ Float32[-2.3425732f11, 1.9188338f11, -6.936081f11, 7.865207f11, -1.3207119f12, -9.422044f11, 8.834734f11, -1.2063414f11, 2.7343546f11, 5.0998555f11  …  -1.2084028f12, 1.2411704f12, 4.2599105f11, 8.479224f11, 8.660919f10, 9.264514f10, -2.3117296f11, 5.5311847f11, -2.7944665f11, 1.7754315f11]
+ Float32[-5.7578055f11, 2.9436674f11, -1.4863718f12, 2.1885256f12, -3.2131108f12, -1.963872f12, 1.982272f12, -2.2853788f11, 7.4989955f11, 1.3616497f12  …  -2.6955859f12, 2.9757908f12, 8.1371673f11, 1.6795491f12, 2.162986f11, 1.5407199f11, -7.498414f11, 1.175319f12, -5.1397958f11, 5.3616073f11]
+ Float32[-1.3832491f12, 3.0316875f11, -3.320267f12, 5.6475113f12, -7.5383245f12, -3.914882f12, 4.4024636f12, -4.0234097f11, 1.9380348f12, 3.4359225f12  …  -5.81553f12, 6.876512f12, 1.4829776f12, 3.1870962f12, 3.6673844f11, 1.8028655f11, -2.0433033f12, 2.376735f12, -9.128034f11, 1.4484123f12]
+ Float32[-2.6601705f12, 3.8654902f10, -6.250732f12, 1.1101971f13, -1.4052883f13, -6.4339533f12, 8.0015576f12, -5.68527f11, 3.8711103f12, 6.6996934f12  …  -1.022768f13, 1.2668507f13, 2.263842f12, 5.060772f12, 3.9652835f11, 6.0590236f10, -4.0504087f12, 3.945264f12, -1.403861f12, 2.9080258f12]

Notice that the solution values sol[i] are CUDA-based arrays, which can be moved back to the CPU using Array(sol[i]).

More details on effective use of within-method GPU parallelism can be found in the within-method GPU parallelism tutorial.

Example of Parameter-Parallelism with GPU Ensemble Methods

On the other side of the spectrum, what if we want to solve tons of small ODEs? For this use case, we would use the ensemble methods to solve the same ODE many times with different parameters. This looks like:

using DiffEqGPU, OrdinaryDiffEq, StaticArrays, CUDA
 
 function lorenz(u, p, t)
     σ = p[1]
@@ -116,4 +116,4 @@
 
 sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
     trajectories = 10_000)
EnsembleSolution Solution of length 10000 with uType:
-SciMLBase.ODESolution{Float32, 2, uType, Nothing, Nothing, tType, Nothing, P, A, IType, Nothing, Nothing} where {uType, tType, P, A, IType}

To dig more into this example, see the ensemble GPU solving tutorial.

+SciMLBase.ODESolution{Float32, 2, uType, Nothing, Nothing, tType, Nothing, P, A, IType, Nothing, Nothing} where {uType, tType, P, A, IType}

To dig more into this example, see the ensemble GPU solving tutorial.

diff --git a/dev/index.html b/dev/index.html index 50360c25..89dcc4d1 100644 --- a/dev/index.html +++ b/dev/index.html @@ -1,11 +1,11 @@ DiffEqGPU: Massively Data-Parallel GPU Solving of ODEs · DiffEqGPU.jl

DiffEqGPU: Massively Data-Parallel GPU Solving of ODEs

This library is a component package of the DifferentialEquations.jl ecosystem. It includes functionality for making use of GPUs in the differential equation solvers.

Installation

To install DiffEqGPU.jl, use the Julia package manager:

using Pkg
-Pkg.add("DiffEqGPU")

This will also install all the dependencies, including the CUDA.jl, which will also install all the required versions of CUDA and CuDNN required by these libraries. Note that the same requirements of CUDA.jl apply to DiffEqGPU, such as requiring a GPU with CUDA v11 compatibility. For more information on these requirements, see the requirements of CUDA.jl.

Contributing

Reproducibility

The documentation of this SciML package was built using these direct dependencies,
Status `/var/lib/buildkite-agent/builds/gpuci-15/julialang/diffeqgpu-dot-jl/docs/Project.toml`
+Pkg.add("DiffEqGPU")

This will also install all the dependencies, including the CUDA.jl, which will also install all the required versions of CUDA and CuDNN required by these libraries. Note that the same requirements of CUDA.jl apply to DiffEqGPU, such as requiring a GPU with CUDA v11 compatibility. For more information on these requirements, see the requirements of CUDA.jl.

Contributing

Reproducibility

The documentation of this SciML package was built using these direct dependencies,
Status `/var/lib/buildkite-agent/builds/gpuci-13/julialang/diffeqgpu-dot-jl/docs/Project.toml`
   [79e6a3ab] Adapt v3.6.2
   [6e4b80f9] BenchmarkTools v1.3.2
  [052768ef] CUDA v4.4.1
   [2b5f629d] DiffEqBase v6.133.1
-  [071ae1c0] DiffEqGPU v2.5.1 `/var/lib/buildkite-agent/builds/gpuci-15/julialang/diffeqgpu-dot-jl`
+  [071ae1c0] DiffEqGPU v3.0.0 `/var/lib/buildkite-agent/builds/gpuci-13/julialang/diffeqgpu-dot-jl`
   [e30172f5] Documenter v1.1.1
   [587475ba] Flux v0.14.6
   [f6369f11] ForwardDiff v0.10.36
@@ -34,8 +34,7 @@
   JULIA_CPU_THREADS = 2
   JULIA_DEPOT_PATH = /root/.cache/julia-buildkite-plugin/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b
   LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
-  JULIA_PKG_SERVER =
-  JULIA_IMAGE_THREADS = 1
A more complete overview of all dependencies and their versions is also provided.
Status `/var/lib/buildkite-agent/builds/gpuci-15/julialang/diffeqgpu-dot-jl/docs/Manifest.toml`
+  JULIA_PKG_SERVER =
A more complete overview of all dependencies and their versions is also provided.
Status `/var/lib/buildkite-agent/builds/gpuci-13/julialang/diffeqgpu-dot-jl/docs/Manifest.toml`
   [47edcb42] ADTypes v0.2.4
   [a4c015fc] ANSIColoredPrinters v0.0.1
   [621f4979] AbstractFFTs v1.5.0
@@ -83,7 +82,7 @@
   [8bb1440f] DelimitedFiles v1.9.1
   [2b5f629d] DiffEqBase v6.133.1
   [459566f4] DiffEqCallbacks v2.33.1
-  [071ae1c0] DiffEqGPU v2.5.1 `/var/lib/buildkite-agent/builds/gpuci-15/julialang/diffeqgpu-dot-jl`
+  [071ae1c0] DiffEqGPU v3.0.0 `/var/lib/buildkite-agent/builds/gpuci-13/julialang/diffeqgpu-dot-jl`
   [77a26b50] DiffEqNoiseProcess v5.19.0
   [163ba53b] DiffResults v1.1.0
   [b552c78f] DiffRules v1.15.1
@@ -401,4 +400,4 @@
   [8e850b90] libblastrampoline_jll v5.8.0+0
   [8e850ede] nghttp2_jll v1.48.0+0
   [3f19e933] p7zip_jll v17.4.0+0
-Info Packages marked with  and  have new versions available, but those with  are restricted by compatibility constraints from upgrading. To see why use `status --outdated -m`

You can also download the manifest file and the project file.

+Info Packages marked with and have new versions available, but those with are restricted by compatibility constraints from upgrading. To see why use `status --outdated -m`

You can also download the manifest file and the project file.

diff --git a/dev/manual/backends/index.html b/dev/manual/backends/index.html index ab54587a..bbea32c1 100644 --- a/dev/manual/backends/index.html +++ b/dev/manual/backends/index.html @@ -1,2 +1,2 @@ -Compute Backends (GPU Choices) · DiffEqGPU.jl

Compute Backends (GPU Choices)

DiffEqGPU.jl supports a multitude of different GPU devices. These must be chosen during the construction of the EnsembleGPUArray and EnsembleGPUKernel construction and correpond to the compute backends of KernelAbstractions.jl. The choices for backends are:

  • CUDA.CUDABackend(): For NVIDIA GPUs via code generation for CUDA kernels.
  • AMDGPU.ROCBackend(): For AMD GPUs via code generation for ROCm kernels.
  • oneAPI.oneAPIBackend(): For Intel GPUs via code generation for OneAPI kernels.
  • Metal.MetalBackend(): For Apple Silicon (M-Series such as M1 or M2) via code generation for Metal kernels.

This is used for example like EnsembleGPUKernel(oneAPI.oneAPIBackend()) to enable the computations for Intel GPUs. The choice of backend is mandatory and requires the installation of the respective package. Thus for example, using the OneAPI backend requires that the user has successfully installed oneAPI.jl and has an Intel GPU.

+Compute Backends (GPU Choices) · DiffEqGPU.jl

Compute Backends (GPU Choices)

DiffEqGPU.jl supports a multitude of different GPU devices. These must be chosen during the construction of the EnsembleGPUArray and EnsembleGPUKernel construction and correpond to the compute backends of KernelAbstractions.jl. The choices for backends are:

  • CUDA.CUDABackend(): For NVIDIA GPUs via code generation for CUDA kernels.
  • AMDGPU.ROCBackend(): For AMD GPUs via code generation for ROCm kernels.
  • oneAPI.oneAPIBackend(): For Intel GPUs via code generation for OneAPI kernels.
  • Metal.MetalBackend(): For Apple Silicon (M-Series such as M1 or M2) via code generation for Metal kernels.

This is used for example like EnsembleGPUKernel(oneAPI.oneAPIBackend()) to enable the computations for Intel GPUs. The choice of backend is mandatory and requires the installation of the respective package. Thus for example, using the OneAPI backend requires that the user has successfully installed oneAPI.jl and has an Intel GPU.

diff --git a/dev/manual/choosing_ensembler/index.html b/dev/manual/choosing_ensembler/index.html index 41623c7a..93eadcdf 100644 --- a/dev/manual/choosing_ensembler/index.html +++ b/dev/manual/choosing_ensembler/index.html @@ -1,2 +1,2 @@ -Choosing the Ensemble: EnsembleGPUArray vs EnsembleGPUKernel · DiffEqGPU.jl

Choosing the Ensemble: EnsembleGPUArray vs EnsembleGPUKernel

The short answer for how to choose an ensemble method is that, if EnsembleGPUKernel works on your problem, you should use it. A more complex discussion is the following:

  • EnsembleGPUKernel is more asynchronous and has lower kernel call counts than EnsembleGPUArray. This should amount to lower overhead in any case where the algorithms are the same.
  • EnsembleGPUKernel is restrictive on the types of ODE solvers that have been implemented for it. If the most efficient method is not in the list of GPU kernel methods, it may be more efficient to use EnsembleGPUArray with the better method.
  • EnsembleGPUKernel requires equations to be written in out-of-place form, along with a few other restrictions, and thus in some cases can be less automatic than EnsembleGPUArray depending on how the code was originally written.
+Choosing the Ensemble: EnsembleGPUArray vs EnsembleGPUKernel · DiffEqGPU.jl

Choosing the Ensemble: EnsembleGPUArray vs EnsembleGPUKernel

The short answer for how to choose an ensemble method is that, if EnsembleGPUKernel works on your problem, you should use it. A more complex discussion is the following:

  • EnsembleGPUKernel is more asynchronous and has lower kernel call counts than EnsembleGPUArray. This should amount to lower overhead in any case where the algorithms are the same.
  • EnsembleGPUKernel is restrictive on the types of ODE solvers that have been implemented for it. If the most efficient method is not in the list of GPU kernel methods, it may be more efficient to use EnsembleGPUArray with the better method.
  • EnsembleGPUKernel requires equations to be written in out-of-place form, along with a few other restrictions, and thus in some cases can be less automatic than EnsembleGPUArray depending on how the code was originally written.
diff --git a/dev/manual/ensemblegpuarray/index.html b/dev/manual/ensemblegpuarray/index.html index ba5fef68..c0dce59d 100644 --- a/dev/manual/ensemblegpuarray/index.html +++ b/dev/manual/ensemblegpuarray/index.html @@ -12,4 +12,4 @@ prob = ODEProblem(lorenz,u0,tspan,p) prob_func = (prob,i,repeat) -> remake(prob,p=rand(Float32,3).*p) monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy=false) -@time sol = solve(monteprob,Tsit5(),EnsembleGPUArray(CUDADevice()),trajectories=10_000,saveat=1.0f0)source
DiffEqGPU.EnsembleCPUArrayType
EnsembleCPUArray(cpu_offload = 0.2)

An EnsembleArrayAlgorithm which utilizes the CPU kernels to parallelize each ODE solve with their separate ODE integrator on each kernel. This method is meant to be a debugging counterpart to EnsembleGPUArray, having the same behavior and using the same KernelAbstractions.jl process to build the combined ODE, but without the restrictions of f being a GPU-compatible kernel function.

It is unlikely that this method is useful beyond library development and debugging, as almost any case should be faster with EnsembleThreads or EnsembleDistributed.

source
+@time sol = solve(monteprob,Tsit5(),EnsembleGPUArray(CUDADevice()),trajectories=10_000,saveat=1.0f0)source
DiffEqGPU.EnsembleCPUArrayType
EnsembleCPUArray(cpu_offload = 0.2)

An EnsembleArrayAlgorithm which utilizes the CPU kernels to parallelize each ODE solve with their separate ODE integrator on each kernel. This method is meant to be a debugging counterpart to EnsembleGPUArray, having the same behavior and using the same KernelAbstractions.jl process to build the combined ODE, but without the restrictions of f being a GPU-compatible kernel function.

It is unlikely that this method is useful beyond library development and debugging, as almost any case should be faster with EnsembleThreads or EnsembleDistributed.

source
diff --git a/dev/manual/ensemblegpukernel/index.html b/dev/manual/ensemblegpukernel/index.html index df265a1f..072d4dab 100644 --- a/dev/manual/ensemblegpukernel/index.html +++ b/dev/manual/ensemblegpukernel/index.html @@ -19,13 +19,13 @@ monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false) @time sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(), trajectories = 10_000, - adaptive = false, dt = 0.1f0)source

Specialized Solvers

DiffEqGPU.GPUTsit5Type

GPUTsit5()

A specialized implementation of the 5th order Tsit5 method specifically for kernel generation with EnsembleGPUKernel. For a similar CPU implementation, see SimpleATsit5 from SimpleDiffEq.jl.

source
DiffEqGPU.GPUVern7Type

GPUVern7()

A specialized implementation of the 7th order Vern7 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUVern9Type

GPUVern9()

A specialized implementation of the 9th order Vern9 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUEMType

GPUEM()

A specialized implementation of the Euler-Maruyama GPUEM method with weak order 1.0. Made specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUSIEAType

GPUSIEA()

A specialized implementation of the weak order 2.0 GPUSIEA method for Ito SDEs, specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURosenbrock23Type

GPURosenbrock23()

A specialized implementation of the W-method Rosenbrock23 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURodas4Type

GPURodas4()

A specialized implementation of the Rodas4 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURodas5PType

GPURodas5P()

A specialized implementation of the Rodas5P method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUKvaerno3Type

GPUKvaerno3()

A specialized implementation of the Kvaerno3 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUKvaerno5Type

GPUKvaerno5()

A specialized implementation of the Kvaerno5 method specifically for kernel generation with EnsembleGPUKernel.

source

Lower Level API

DiffEqGPU.vectorized_solveFunction
vectorized_solve(probs, prob::Union{ODEProblem, SDEProblem}, alg;
+                  adaptive = false, dt = 0.1f0)
source

Specialized Solvers

DiffEqGPU.GPUTsit5Type

GPUTsit5()

A specialized implementation of the 5th order Tsit5 method specifically for kernel generation with EnsembleGPUKernel. For a similar CPU implementation, see SimpleATsit5 from SimpleDiffEq.jl.

source
DiffEqGPU.GPUVern7Type

GPUVern7()

A specialized implementation of the 7th order Vern7 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUVern9Type

GPUVern9()

A specialized implementation of the 9th order Vern9 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUEMType

GPUEM()

A specialized implementation of the Euler-Maruyama GPUEM method with weak order 1.0. Made specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUSIEAType

GPUSIEA()

A specialized implementation of the weak order 2.0 GPUSIEA method for Ito SDEs, specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURosenbrock23Type

GPURosenbrock23()

A specialized implementation of the W-method Rosenbrock23 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURodas4Type

GPURodas4()

A specialized implementation of the Rodas4 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPURodas5PType

GPURodas5P()

A specialized implementation of the Rodas5P method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUKvaerno3Type

GPUKvaerno3()

A specialized implementation of the Kvaerno3 method specifically for kernel generation with EnsembleGPUKernel.

source
DiffEqGPU.GPUKvaerno5Type

GPUKvaerno5()

A specialized implementation of the Kvaerno5 method specifically for kernel generation with EnsembleGPUKernel.

source

Lower Level API

DiffEqGPU.vectorized_solveFunction
vectorized_solve(probs, prob::Union{ODEProblem, SDEProblem}, alg;
                  dt, saveat = nothing,
                  save_everystep = true,
-                 debug = false, callback = CallbackSet(nothing), tstops = nothing)

A lower level interface to the kernel generation solvers of EnsembleGPUKernel with fixed time-stepping.

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • prob: the quintessential problem form. Can be just probs[1]
  • alg: the kernel-based differential equation solver. Must be one of the EnsembleGPUKernel specialized methods.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
DiffEqGPU.vectorized_asolveFunction
vectorized_asolve(probs, prob::ODEProblem, alg;
+                 debug = false, callback = CallbackSet(nothing), tstops = nothing)

A lower level interface to the kernel generation solvers of EnsembleGPUKernel with fixed time-stepping.

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • prob: the quintessential problem form. Can be just probs[1]
  • alg: the kernel-based differential equation solver. Must be one of the EnsembleGPUKernel specialized methods.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
DiffEqGPU.vectorized_asolveFunction
vectorized_asolve(probs, prob::ODEProblem, alg;
                   dt = 0.1f0, saveat = nothing,
                   save_everystep = false,
                   abstol = 1.0f-6, reltol = 1.0f-3,
-                  callback = CallbackSet(nothing), tstops = nothing)

A lower level interface to the kernel generation solvers of EnsembleGPUKernel with adaptive time-stepping.

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • prob: the quintessential problem form. Can be just probs[1]
  • alg: the kernel-based differential equation solver. Must be one of the EnsembleGPUKernel specialized methods.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
DiffEqGPU.vectorized_map_solveFunction

Lower level API for EnsembleArrayAlgorithm. Avoids conversion of solution to CPU arrays.

vectorized_map_solve(probs, alg,
+                  callback = CallbackSet(nothing), tstops = nothing)

A lower level interface to the kernel generation solvers of EnsembleGPUKernel with adaptive time-stepping.

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • prob: the quintessential problem form. Can be just probs[1]
  • alg: the kernel-based differential equation solver. Must be one of the EnsembleGPUKernel specialized methods.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
DiffEqGPU.vectorized_map_solveFunction

Lower level API for EnsembleArrayAlgorithm. Avoids conversion of solution to CPU arrays.

vectorized_map_solve(probs, alg,
                      ensemblealg::Union{EnsembleArrayAlgorithm}, I,
-                     adaptive)

Arguments

  • probs: the GPU-setup problems generated by the ensemble.
  • alg: the kernel-based differential equation solver. Most of the solvers from OrdinaryDiffEq.jl are supported.
  • ensemblealg: The EnsembleGPUArray() algorithm.
  • I: The iterator argument. Can be set to for e.g. 1:10_000 to simulate 10,000 trajectories.
  • adaptive: The Boolean argument for time-stepping. Use true to enable adaptive time-stepping.

Keyword Arguments

Only a subset of the common solver arguments are supported.

source
+ adaptive)

Arguments

Keyword Arguments

Only a subset of the common solver arguments are supported.

source diff --git a/dev/manual/optimal_trajectories/index.html b/dev/manual/optimal_trajectories/index.html index 2e187aac..f2ef2009 100644 --- a/dev/manual/optimal_trajectories/index.html +++ b/dev/manual/optimal_trajectories/index.html @@ -1,2 +1,2 @@ -Choosing Optimal Numbers of Trajectories · DiffEqGPU.jl

Choosing Optimal Numbers of Trajectories

There is a balance between two things for choosing the number of trajectories:

  • The number of trajectories needs to be high enough that the work per kernel is sufficient to overcome the kernel call cost.
  • More trajectories means that every trajectory will need more time steps, since the adaptivity syncs all solves.

From our testing, the balance is found at around 10,000 trajectories being optimal for EnsembleGPUArray, since it has higher kernel call costs because every internal operation of the ODE solver requires a kernel call. Thus, for larger sets of trajectories, use a batch size of 10,000. Of course, benchmark for yourself on your own setup, as all GPUs are different.

On the other hand, EnsembleGPUKernel fuses the entire GPU solve into a single kernel, greatly reducing the kernel call cost. This means longer or more expensive ODE solves will require even less of a percentage of time kernel launching, making the cutoff much smaller. We see some cases with around 100 ODEs being viable with EnsembleGPUKernel. Again, this is highly dependent on the ODE and the chosen GPU and thus one will need to benchmark to get accurate numbers for their system, this is merely a ballpark estimate.

+Choosing Optimal Numbers of Trajectories · DiffEqGPU.jl

Choosing Optimal Numbers of Trajectories

There is a balance between two things for choosing the number of trajectories:

  • The number of trajectories needs to be high enough that the work per kernel is sufficient to overcome the kernel call cost.
  • More trajectories means that every trajectory will need more time steps, since the adaptivity syncs all solves.

From our testing, the balance is found at around 10,000 trajectories being optimal for EnsembleGPUArray, since it has higher kernel call costs because every internal operation of the ODE solver requires a kernel call. Thus, for larger sets of trajectories, use a batch size of 10,000. Of course, benchmark for yourself on your own setup, as all GPUs are different.

On the other hand, EnsembleGPUKernel fuses the entire GPU solve into a single kernel, greatly reducing the kernel call cost. This means longer or more expensive ODE solves will require even less of a percentage of time kernel launching, making the cutoff much smaller. We see some cases with around 100 ODEs being viable with EnsembleGPUKernel. Again, this is highly dependent on the ODE and the chosen GPU and thus one will need to benchmark to get accurate numbers for their system, this is merely a ballpark estimate.

diff --git a/dev/tutorials/gpu_ensemble_basic/index.html b/dev/tutorials/gpu_ensemble_basic/index.html index 59294268..42ce7cfe 100644 --- a/dev/tutorials/gpu_ensemble_basic/index.html +++ b/dev/tutorials/gpu_ensemble_basic/index.html @@ -67,4 +67,4 @@ solve(monteprob_jac, Rodas5(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 10_000, saveat = 1.0f0)
EnsembleSolution Solution of length 10000 with uType:
-SciMLBase.ODESolution{Float32, 2, uType, Nothing, Nothing, Vector{Float32}, rateType, SciMLBase.ODEProblem{Vector{Float32}, Tuple{Float32, Float32}, true, StaticArraysCore.SizedVector{3, Float32, Vector{Float32}}, SciMLBase.ODEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, typeof(Main.lorenz_tgrad), typeof(Main.lorenz_jac), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, A, IType, DiffEqBase.Stats, Nothing} where {uType, rateType, A, IType}
+SciMLBase.ODESolution{Float32, 2, uType, Nothing, Nothing, Vector{Float32}, rateType, SciMLBase.ODEProblem{Vector{Float32}, Tuple{Float32, Float32}, true, StaticArraysCore.SizedVector{3, Float32, Vector{Float32}}, SciMLBase.ODEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, typeof(Main.lorenz_tgrad), typeof(Main.lorenz_jac), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, A, IType, DiffEqBase.Stats, Nothing} where {uType, rateType, A, IType} diff --git a/dev/tutorials/lower_level_api/index.html b/dev/tutorials/lower_level_api/index.html index d16de41a..a2d165a9 100644 --- a/dev/tutorials/lower_level_api/index.html +++ b/dev/tutorials/lower_level_api/index.html @@ -41,7 +41,7 @@ save_everystep = false, dt = 0.1f0) @time CUDA.@sync ts, us = DiffEqGPU.vectorized_asolve(probs, prob, GPUTsit5(); - save_everystep = false, dt = 0.1f0)
(Float32[0.0 0.0 … 0.0 0.0; 10.0 10.0 … 10.0 10.0], StaticArraysCore.SVector{3, Float32}[[1.0, 0.0, 0.0] [1.0, 0.0, 0.0] … [1.0, 0.0, 0.0] [1.0, 0.0, 0.0]; [5.861511, 5.857241, 14.757899] [3.04703, 3.0470963, 3.7174907] … [6.586052, 6.638747, 18.88271] [-2.9703623f-8, -9.194951f-8, 26.466835]])

Note that the core is the function DiffEqGPU.vectorized_solve which is the solver for the CUDA-based probs which uses the manually converted problems, and returns us which is a vector of CuArrays for the solution.

Similarly, there exists a lower-level API for EnsembleGPUArray as well, primarily for benchmarking purposes. The solution returned for state (sol.u) is a matrix having columns as different parameter-parallel solutions for the ensemble problem. An example is shown below:

using DiffEqGPU, OrdinaryDiffEq, StaticArrays, CUDA, DiffEqBase
+    save_everystep = false, dt = 0.1f0)
(Float32[0.0 0.0 … 0.0 0.0; 10.0 10.0 … 10.0 10.0], StaticArraysCore.SVector{3, Float32}[[1.0, 0.0, 0.0] [1.0, 0.0, 0.0] … [1.0, 0.0, 0.0] [1.0, 0.0, 0.0]; [-4.153736, -4.5594797, 9.090304] [-4.160997, -4.060143, 12.015403] … [-4.5326314, -4.549619, 13.34241] [1.9596212, 1.959193, 1.9413165]])

Note that the core is the function DiffEqGPU.vectorized_solve which is the solver for the CUDA-based probs which uses the manually converted problems, and returns us which is a vector of CuArrays for the solution.

Similarly, there exists a lower-level API for EnsembleGPUArray as well, primarily for benchmarking purposes. The solution returned for state (sol.u) is a matrix having columns as different parameter-parallel solutions for the ensemble problem. An example is shown below:

using DiffEqGPU, OrdinaryDiffEq, StaticArrays, CUDA, DiffEqBase
 
 trajectories = 10_000
 
@@ -82,4 +82,4 @@
  10.0
 u: 2-element Vector{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}:
  [1.0 1.0 … 1.0 1.0; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
- [-0.009350048 -4.3812757 … -0.00016895184 4.1092334; -0.016221652 -7.267738 … -0.00021049446 4.193789; 14.588885 20.678904 … 18.329168 14.777752]
+ [-3.537868 6.8042526 … 2.8931596 -4.712209; -3.9131458 6.857932 … 2.8436182 -2.8206668; 15.165056 23.594097 … 6.5120435 22.65308] diff --git a/dev/tutorials/multigpu/index.html b/dev/tutorials/multigpu/index.html index 43f86b0c..7f6fa278 100644 --- a/dev/tutorials/multigpu/index.html +++ b/dev/tutorials/multigpu/index.html @@ -71,4 +71,4 @@ @time sol = solve(monteprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 100_000, - batch_size = 50_000, saveat = 1.0f0) + batch_size = 50_000, saveat = 1.0f0) diff --git a/dev/tutorials/parallel_callbacks/index.html b/dev/tutorials/parallel_callbacks/index.html index fadeda94..b3470db8 100644 --- a/dev/tutorials/parallel_callbacks/index.html +++ b/dev/tutorials/parallel_callbacks/index.html @@ -19,4 +19,4 @@ trajectories = 10, adaptive = false, dt = 0.01f0, callback = gpu_cb, merge_callbacks = true, tstops = [4.0f0])
EnsembleSolution Solution of length 10 with uType:
-SciMLBase.ODESolution{Float32, 2, SubArray{StaticArraysCore.SVector{1, Float32}, 1, Matrix{StaticArraysCore.SVector{1, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, Nothing, SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, DiffEqGPU.ImmutableODEProblem{StaticArraysCore.SVector{1, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, SciMLBase.ODEFunction{false, SciMLBase.AutoSpecialize, typeof(Main.f), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, GPUTsit5, SciMLBase.LinearInterpolation{SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, SubArray{StaticArraysCore.SVector{1, Float32}, 1, Matrix{StaticArraysCore.SVector{1, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}}, Nothing, Nothing}
+SciMLBase.ODESolution{Float32, 2, SubArray{StaticArraysCore.SVector{1, Float32}, 1, Matrix{StaticArraysCore.SVector{1, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, Nothing, SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, Nothing, DiffEqGPU.ImmutableODEProblem{StaticArraysCore.SVector{1, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, SciMLBase.ODEFunction{false, SciMLBase.AutoSpecialize, typeof(Main.f), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, GPUTsit5, SciMLBase.LinearInterpolation{SubArray{Float32, 1, Matrix{Float32}, Tuple{UnitRange{Int64}, Int64}, true}, SubArray{StaticArraysCore.SVector{1, Float32}, 1, Matrix{StaticArraysCore.SVector{1, Float32}}, Tuple{UnitRange{Int64}, Int64}, true}}, Nothing, Nothing} diff --git a/dev/tutorials/weak_order_conv_sde/713812a7.svg b/dev/tutorials/weak_order_conv_sde/028d9c3b.svg similarity index 80% rename from dev/tutorials/weak_order_conv_sde/713812a7.svg rename to dev/tutorials/weak_order_conv_sde/028d9c3b.svg index f19d93c6..27dbfeaf 100644 --- a/dev/tutorials/weak_order_conv_sde/713812a7.svg +++ b/dev/tutorials/weak_order_conv_sde/028d9c3b.svg @@ -1,46 +1,46 @@ - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dev/tutorials/weak_order_conv_sde/index.html b/dev/tutorials/weak_order_conv_sde/index.html index 71441d98..bcdf1efd 100644 --- a/dev/tutorials/weak_order_conv_sde/index.html +++ b/dev/tutorials/weak_order_conv_sde/index.html @@ -30,4 +30,4 @@ plot(ts, us_expect, lw = 5, xaxis = "Time (t)", yaxis = "y(t)", label = "True Expected value") -plot!(ts, us_calc, lw = 3, ls = :dash, label = "Caculated 
Expected value")Example block output +plot!(ts, us_calc, lw = 3, ls = :dash, label = "Caculated Expected value")Example block output diff --git a/dev/tutorials/within_method_gpu/index.html b/dev/tutorials/within_method_gpu/index.html index b76ae007..a45ee1a7 100644 --- a/dev/tutorials/within_method_gpu/index.html +++ b/dev/tutorials/within_method_gpu/index.html @@ -22,16 +22,16 @@ 0.039545782 0.06525832 0.09650515 - 0.13952377 + 0.13952234 ⋮ - 95.27473 - 95.91403 - 96.57579 - 97.23755 - 97.89931 - 98.561066 - 99.23637 - 99.91167 + 95.27474 + 95.91405 + 96.57581 + 97.23758 + 97.899345 + 98.56111 + 99.23641 + 99.91171 100.0 u: 229-element Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}: Float32[1.0, 0.0, 0.0] @@ -43,14 +43,14 @@ Float32[0.9666134, -0.016576296, -0.0360636] Float32[0.9460261, -0.026261317, -0.05836517] Float32[0.9221204, -0.036940157, -0.08431637] - Float32[0.8910716, -0.04980573, -0.11809613] + Float32[0.89107263, -0.049805336, -0.11809503] ⋮ - Float32[0.00049719884, -0.0022435721, 0.0021882595] - Float32[0.0010859583, -0.0024246967, 0.0015704404] - Float32[0.0015605923, -0.0023972206, 0.00084748876] - Float32[0.0018686663, -0.0021694193, 0.000106063744] - Float32[0.0019964012, -0.0017752235, -0.0005898974] - Float32[0.0019462514, -0.0012601186, -0.0011849877] - Float32[0.001729764, -0.00066427403, -0.0016438589] - Float32[0.0013776367, -5.4696073f-5, -0.0019241067] - Float32[0.001323388, 2.3389219f-5, -0.0019467665]

Notice that both stiff and non-stiff ODE solvers were used here.

Note

Time span was changed to Float32 types, as GPUs generally have very slow Float64 operations, usually around 1/32 of the speed of Float32. cu(x) on an array automatically changes an Array{Float64} to a CuArray{Float32}. If this is not intended, use the CuArray constructor directly. For more information on GPU Float64 performance issues, search around Google for discussions like this.

Warn

Float32 precision is sometimes not enough precision to accurately solve a stiff ODE. Make sure that the precision is necessary by investigating the condition number of the Jacobian. If this value is well-above 1e8, use Float32 with caution!

Restrictions of CuArrays

Note that all the rules of CUDA.jl apply when CuArrays are being used in the solver. While for most of the AbstractArray interface they act similarly to Arrays, such as having valid broadcasting operations (x .* y) defined, they will work on GPUs. For more information on the rules and restrictions of CuArrays, see this page from the CUDA.jl documentation.

+ Float32[0.000497219, -0.002243581, 0.0021882413] + Float32[0.0010859751, -0.0024246988, 0.0015704188] + Float32[0.0015606071, -0.0023972152, 0.0008474603] + Float32[0.0018686759, -0.0021694046, 0.00010603073] + Float32[0.0019964029, -0.001775199, -0.00058993173] + Float32[0.001946243, -0.0012600849, -0.0011850193] + Float32[0.0017297472, -0.000664239, -0.0016438798] + Float32[0.001377614, -5.466292f-5, -0.0019241166] + Float32[0.0013233931, 2.3381774f-5, -0.0019467642]

Notice that both stiff and non-stiff ODE solvers were used here.

Note

Time span was changed to Float32 types, as GPUs generally have very slow Float64 operations, usually around 1/32 of the speed of Float32. cu(x) on an array automatically changes an Array{Float64} to a CuArray{Float32}. If this is not intended, use the CuArray constructor directly. For more information on GPU Float64 performance issues, search around Google for discussions like this.

Warn

Float32 precision is sometimes not enough precision to accurately solve a stiff ODE. Make sure that the precision is necessary by investigating the condition number of the Jacobian. If this value is well-above 1e8, use Float32 with caution!

Restrictions of CuArrays

Note that all the rules of CUDA.jl apply when CuArrays are being used in the solver. While for most of the AbstractArray interface they act similarly to Arrays, such as having valid broadcasting operations (x .* y) defined, they will work on GPUs. For more information on the rules and restrictions of CuArrays, see this page from the CUDA.jl documentation.