Use extension for LoopVectorization dependency #83

Merged · 8 commits · Jan 16, 2024

19 changes: 12 additions & 7 deletions Project.toml
@@ -3,23 +3,28 @@ name = "ImplicitGlobalGrid"
uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0"
version = "0.14.0"

[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"

[weakdeps]
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"

[extensions]
ImplicitGlobalGrid_LoopVectorizationExt = "LoopVectorization"

[compat]
AMDGPU = "0.5, 0.6, 0.7, 0.8"
CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4, 5"
LoopVectorization = "0.12"
MPI = "0.20"
julia = "1.9"

[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"

[extras]
CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "MPIPreferences"]
test = ["Test", "MPIPreferences", "LoopVectorization"]
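The [weakdeps] and [extensions] sections make LoopVectorization an optional dependency: the extension module ImplicitGlobalGrid_LoopVectorizationExt is only compiled and loaded when a user loads LoopVectorization together with ImplicitGlobalGrid (Julia >= 1.9, as required by the compat entry). A minimal usage sketch, not part of this PR, assuming both packages are installed:

import LoopVectorization            # weak dependency; loading it triggers the extension
using ImplicitGlobalGrid

# Base.get_extension (Julia >= 1.9, unexported) lets one check that the extension is active:
ext = Base.get_extension(ImplicitGlobalGrid, :ImplicitGlobalGrid_LoopVectorizationExt)
@assert ext !== nothing             # without importing LoopVectorization, this would be nothing
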
3 changes: 3 additions & 0 deletions ext/ImplicitGlobalGrid_LoopVectorizationExt.jl
@@ -0,0 +1,3 @@
module ImplicitGlobalGrid_LoopVectorizationExt
include(joinpath(@__DIR__, "..", "src", "LoopVectorizationExt", "memcopy_LV.jl"))
end
49 changes: 49 additions & 0 deletions src/Exceptions.jl
@@ -0,0 +1,49 @@
module Exceptions
export @ModuleInternalError, @IncoherentCallError, @NotInitializedError, @NotLoadedError, @IncoherentArgumentError, @KeywordArgumentError, @ArgumentEvaluationError, @ArgumentError
export ModuleInternalError, IncoherentCallError, NotInitializedError, NotLoadedError, IncoherentArgumentError, KeywordArgumentError, ArgumentEvaluationError

macro ModuleInternalError(msg) esc(:(throw(ModuleInternalError($msg)))) end
macro IncoherentCallError(msg) esc(:(throw(IncoherentCallError($msg)))) end
macro NotInitializedError(msg) esc(:(throw(NotInitializedError($msg)))) end
macro NotLoadedError(msg) esc(:(throw(NotLoadedError($msg)))) end
macro IncoherentArgumentError(msg) esc(:(throw(IncoherentArgumentError($msg)))) end
macro KeywordArgumentError(msg) esc(:(throw(KeywordArgumentError($msg)))) end
macro ArgumentEvaluationError(msg) esc(:(throw(ArgumentEvaluationError($msg)))) end
macro ArgumentError(msg) esc(:(throw(ArgumentError($msg)))) end

struct ModuleInternalError <: Exception
msg::String
end
Base.showerror(io::IO, e::ModuleInternalError) = print(io, "ModuleInternalError: ", e.msg)

struct IncoherentCallError <: Exception
msg::String
end
Base.showerror(io::IO, e::IncoherentCallError) = print(io, "IncoherentCallError: ", e.msg)

struct NotInitializedError <: Exception
msg::String
end
Base.showerror(io::IO, e::NotInitializedError) = print(io, "NotInitializedError: ", e.msg)

struct NotLoadedError <: Exception
msg::String
end
Base.showerror(io::IO, e::NotLoadedError) = print(io, "NotLoadedError: ", e.msg)

struct IncoherentArgumentError <: Exception
msg::String
end
Base.showerror(io::IO, e::IncoherentArgumentError) = print(io, "IncoherentArgumentError: ", e.msg)

struct KeywordArgumentError <: Exception
msg::String
end
Base.showerror(io::IO, e::KeywordArgumentError) = print(io, "KeywordArgumentError: ", e.msg)

struct ArgumentEvaluationError <: Exception
msg::String
end
Base.showerror(io::IO, e::ArgumentEvaluationError) = print(io, "ArgumentEvaluationError: ", e.msg)

end # Module Exceptions
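Each macro is a thin wrapper around throw, so call sites stay compact while still raising a typed exception with a descriptive message. A small usage sketch, assuming it is run from the repository root so the include path resolves (the package itself loads the module via using .Exceptions, as shown in the next file):

include(joinpath("src", "Exceptions.jl"))
using .Exceptions

check_initialized(flag::Bool) = flag || @NotInitializedError("the global grid is not initialized.")

try
    check_initialized(false)
catch e
    showerror(stdout, e)   # prints: NotInitializedError: the global grid is not initialized.
end
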
7 changes: 7 additions & 0 deletions src/ImplicitGlobalGrid.jl
@@ -34,9 +34,16 @@ To see a description of a function type `?<functionname>`.
"""
module ImplicitGlobalGrid

## Include of exception module
include("Exceptions.jl");
using .Exceptions

## Include of shared constant parameters, types and syntax sugar
include("shared.jl")

## Alphabetical include of defaults for extensions
include(joinpath("LoopVectorizationExt", "memcopy_LV_default.jl"))

## Alphabetical include of files
include("finalize_global_grid.jl")
include("gather.jl")
9 changes: 9 additions & 0 deletions src/LoopVectorizationExt/memcopy_LV.jl
@@ -0,0 +1,9 @@
import ImplicitGlobalGrid
import ImplicitGlobalGrid: GGNumber
using LoopVectorization

function ImplicitGlobalGrid.memcopy_loopvect!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
@tturbo for i ∈ eachindex(dst, src) # NOTE: tturbo will use maximally Threads.nthreads() threads. Set the number of threads e.g. as: export JULIA_NUM_THREADS=12. NOTE: tturbo fails if src_flat and dst_flat are used due to an issue in ArrayInterface : https://github.com/JuliaArrays/ArrayInterface.jl/issues/228
@inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds (currently anyways done by LoopVectorization) as this copy between two flat vectors (which must have the right length) is considered safe.
end
end
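When the extension is active, this method takes precedence over the default stub in memcopy_LV_default.jl and performs the copy with LoopVectorization's @tturbo. A rough usage sketch (memcopy_loopvect! is an internal function, called directly here only for illustration; Float64 is assumed to be covered by GGNumber as defined in src/shared.jl):

import LoopVectorization            # activates the extension
using ImplicitGlobalGrid

src = rand(Float64, 1024)
dst = similar(src)
ImplicitGlobalGrid.memcopy_loopvect!(dst, src)   # multithreaded copy via @tturbo
@assert dst == src
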
3 changes: 3 additions & 0 deletions src/LoopVectorizationExt/memcopy_LV_default.jl
@@ -0,0 +1,3 @@
const ERRMSG_EXTENSION_NOT_LOADED = "the LoopVectorization extension was not loaded. Make sure to import LoopVectorization before ImplicitGlobalGrid."

memcopy_loopvect!(args...) = @NotLoadedError(ERRMSG_EXTENSION_NOT_LOADED)
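In a session where LoopVectorization is not loaded, the same call dispatches to this stub and raises the NotLoadedError defined in src/Exceptions.jl. A sketch of the failure mode (fresh session, LoopVectorization deliberately not imported):

using ImplicitGlobalGrid

try
    ImplicitGlobalGrid.memcopy_loopvect!(zeros(4), ones(4))
catch e
    @assert e isa ImplicitGlobalGrid.Exceptions.NotLoadedError   # raised by the stub above
end
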
1 change: 0 additions & 1 deletion src/shared.jl
@@ -2,7 +2,6 @@ import MPI
using CUDA
using AMDGPU
using Base.Threads
using LoopVectorization


##-------------------------
44 changes: 19 additions & 25 deletions src/update_halo.jl
@@ -583,13 +583,14 @@ function write_h2h!(sendbuf::AbstractArray{T}, A::Array{T}, sendranges::Array{Un
ix = (length(sendranges[1])==1) ? sendranges[1][1] : sendranges[1];
iy = (length(sendranges[2])==1) ? sendranges[2][1] : sendranges[2];
iz = (length(sendranges[3])==1) ? sendranges[3][1] : sendranges[3];
if (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(sendbuf, view(A,ix, :, :), loopvectorization(dim));
elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(sendbuf, view(A,ix, :,iz), loopvectorization(dim));
elseif (dim == 1 && length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(sendbuf, view(A,ix,iy,iz), loopvectorization(dim));
elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(sendbuf, view(A, :,iy, :), loopvectorization(dim));
elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(sendbuf, view(A, :,iy,iz), loopvectorization(dim));
elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2) ) memcopy!(sendbuf, view(A, :, :,iz), loopvectorization(dim));
elseif (dim == 1 || dim == 2 || dim == 3) memcopy!(sendbuf, view(A,sendranges...), loopvectorization(dim)); # This general case is slower than the three optimised cases above (the result would be the same, of course).
if (length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(sendbuf, 1, :, :), view(A,ix, :, :), loopvectorization(dim));
elseif (length(ix)==1 && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(sendbuf, 1, 1, :), view(A,ix,iy, :), loopvectorization(dim));
elseif (length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(sendbuf, 1, :, 1), view(A,ix, :,iz), loopvectorization(dim));
elseif (length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(sendbuf, 1, 1, 1), view(A,ix,iy,iz), loopvectorization(dim));
elseif (ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(sendbuf, :, 1, :), view(A, :,iy, :), loopvectorization(dim));
elseif (ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(sendbuf, :, 1, 1), view(A, :,iy,iz), loopvectorization(dim));
elseif (ix == 1:size(A,1) && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(sendbuf, :, :, 1), view(A, :, :,iz), loopvectorization(dim));
else memcopy!(sendbuf, view(A,sendranges...), loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course).
end
end

@@ -598,13 +599,14 @@ function read_h2h!(recvbuf::AbstractArray{T}, A::Array{T}, recvranges::Array{Uni
ix = (length(recvranges[1])==1) ? recvranges[1][1] : recvranges[1];
iy = (length(recvranges[2])==1) ? recvranges[2][1] : recvranges[2];
iz = (length(recvranges[3])==1) ? recvranges[3][1] : recvranges[3];
if (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(A,ix, :, :), recvbuf, loopvectorization(dim));
elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A,ix, :,iz), recvbuf, loopvectorization(dim));
elseif (dim == 1 && length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(A,ix,iy,iz), recvbuf, loopvectorization(dim));
elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A, :,iy, :), recvbuf, loopvectorization(dim));
elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(A, :,iy,iz), recvbuf, loopvectorization(dim));
elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2) ) memcopy!(view(A, :, :,iz), recvbuf, loopvectorization(dim));
elseif (dim == 1 || dim == 2 || dim == 3) memcopy!(view(A,recvranges...), recvbuf, loopvectorization(dim)); # This general case is slower than the three optimised cases above (the result would be the same, of course).
if (length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(A,ix, :, :), view(recvbuf, 1, :, :), loopvectorization(dim));
elseif (length(ix)==1 && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A,ix,iy, :), view(recvbuf, 1, 1, :), loopvectorization(dim));
elseif (length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A,ix, :,iz), view(recvbuf, 1, :, 1), loopvectorization(dim));
elseif (length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(A,ix,iy,iz), view(recvbuf, 1, 1, 1), loopvectorization(dim));
elseif (ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A, :,iy, :), view(recvbuf, :, 1, :), loopvectorization(dim));
elseif (ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(A, :,iy,iz), view(recvbuf, :, 1, 1), loopvectorization(dim));
elseif (ix == 1:size(A,1) && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A, :, :,iz), view(recvbuf, :, :, 1), loopvectorization(dim));
else memcopy!(view(A,recvranges...), recvbuf, loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course).
end
end
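Taking a matching view of the buffer in the cases above means that the dst and src handed to memcopy! always share the same axes, which is what eachindex(dst, src) in the @tturbo kernel requires. An illustration with hypothetical sizes, not taken from the package:

A       = rand(8, 6, 4)
sendbuf = zeros(1, 6, 4)            # halo buffer for a slice at fixed ix
src = view(A, 2, :, :)              # 6x4 slice of the field
dst = view(sendbuf, 1, :, :)        # matching 6x4 view of the buffer
dst .= src                          # same axes, so eachindex(dst, src) is well defined
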

@@ -757,7 +759,7 @@ function sendrecv_halo_local(n::Integer, dim::Integer, F::GGField, i::Integer)
end

function memcopy!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorization::Bool) where T <: GGNumber
if loopvectorization && !(T <: Complex) # NOTE: LoopVectorization does not yet support Complex numbers and copy reinterpreted arrays leads to bad performance.
if loopvectorization && nthreads() > 1 && length(src) > 1 && !(T <: Complex) # NOTE: LoopVectorization does not yet support Complex numbers and copy reinterpreted arrays leads to bad performance.
memcopy_loopvect!(dst, src)
else
dst_flat = view(dst,:)
@@ -766,7 +768,9 @@ end
end
end


# (CPU functions)

function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
if nthreads() > 1 && sizeof(src) >= GG_THREADCOPY_THRESHOLD
@threads for i = 1:length(dst) # NOTE: Set the number of threads e.g. as: export JULIA_NUM_THREADS=12
Expand All @@ -777,16 +781,6 @@ function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T
end
end

function memcopy_loopvect!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
if nthreads() > 1 && length(src) > 1
@tturbo for i ∈ eachindex(dst, src) # NOTE: tturbo will use maximally Threads.nthreads() threads. Set the number of threads e.g. as: export JULIA_NUM_THREADS=12. NOTE: tturbo fails if src_flat and dst_flat are used due to an issue in ArrayInterface : https://github.com/JuliaArrays/ArrayInterface.jl/issues/228
@inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds (currently anyways done by LoopVectorization) as this copy between two flat vectors (which must have the right length) is considered safe.
end
else
@inbounds copyto!(dst, src)
end
end


# (CUDA functions)

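The guards that previously lived inside memcopy_loopvect! (thread count and array length) now sit in memcopy! itself, alongside the existing Complex check, so the extension function is only invoked when it can pay off. A condensed, standalone sketch of that dispatch (hypothetical helper name lv_copy!, not the package code):

using Base.Threads: nthreads

lv_copy!(dst, src) = copyto!(dst, src)            # stand-in for the @tturbo kernel of the extension

function memcopy_sketch!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorization::Bool) where T
    if loopvectorization && nthreads() > 1 && length(src) > 1 && !(T <: Complex)
        lv_copy!(dst, src)                        # LoopVectorization path (extension loaded)
    else
        copyto!(view(dst, :), view(src, :))       # placeholder for the package's fallback copy on flattened views
    end
end
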
1 change: 1 addition & 0 deletions test/test_update_halo.jl
@@ -4,6 +4,7 @@

push!(LOAD_PATH, "../src")
using Test
import LoopVectorization
using ImplicitGlobalGrid; GG = ImplicitGlobalGrid
import MPI
using CUDA