Skip to content

Commit

Permalink
Initial AMDGPU support (#858)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfherbst authored Mar 1, 2025
1 parent c9cb6cb commit a9e42fd
Show file tree
Hide file tree
Showing 14 changed files with 118 additions and 61 deletions.
8 changes: 5 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ PkgVersion = "eebad327-c553-4316-9ea0-9fa01ccd7688"
Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
PseudoPotentialIO = "cb339c56-07fa-4cb2-923a-142469552264"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand All @@ -46,6 +45,7 @@ Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
GenericLinearAlgebra = "14197337-ba66-59df-a3e3-ca00e7dcff7a"
GeometryOptimization = "673bf261-a53d-43b9-876f-d3c1fc8329c2"
Expand All @@ -59,6 +59,7 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
wannier90_jll = "c5400fa0-8d08-52c2-913f-1e3f656c1ce9"

[extensions]
DFTKAMDGPUExt = "AMDGPU"
DFTKCUDAExt = "CUDA"
DFTKGenericLinearAlgebraExt = "GenericLinearAlgebra"
DFTKGeometryOptimizationExt = "GeometryOptimization"
Expand All @@ -72,6 +73,7 @@ DFTKWannierExt = "Wannier"
DFTKWriteVTKExt = "WriteVTK"

[compat]
AMDGPU = "1.2.3"
ASEconvert = "0.1"
AbstractFFTs = "1"
Aqua = "0.8.5"
Expand Down Expand Up @@ -116,7 +118,6 @@ Plots = "1"
Polynomials = "3, 4"
PrecompileTools = "1"
Preferences = "1"
Primes = "0.5"
Printf = "1"
PseudoPotentialData = "0.2.2"
PseudoPotentialIO = "0.1"
Expand All @@ -140,6 +141,7 @@ julia = "1.10"
wannier90_jll = "3.1"

[extras]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ASEconvert = "3da9722f-58c2-4165-81be-b4d7253e8fd2"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
AtomsBuilder = "f5cc8831-eeb7-4288-8d9f-d6c1ddb77004"
Expand Down Expand Up @@ -169,4 +171,4 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
wannier90_jll = "c5400fa0-8d08-52c2-913f-1e3f656c1ce9"

[targets]
test = ["Test", "TestItemRunner", "ASEconvert", "AtomsBuilder", "Aqua", "AtomsIO", "AtomsIOPython", "CUDA", "CUDA_Runtime_jll", "ComponentArrays", "DoubleFloats", "FiniteDiff", "FiniteDifferences", "GenericLinearAlgebra", "GeometryOptimization", "IntervalArithmetic", "JLD2", "JSON3", "Logging", "Plots", "PseudoPotentialData", "PythonCall", "QuadGK", "Random", "KrylovKit", "Wannier", "WriteVTK", "wannier90_jll"]
test = ["Test", "TestItemRunner", "AMDGPU", "ASEconvert", "AtomsBuilder", "Aqua", "AtomsIO", "AtomsIOPython", "CUDA", "CUDA_Runtime_jll", "ComponentArrays", "DoubleFloats", "FiniteDiff", "FiniteDifferences", "GenericLinearAlgebra", "GeometryOptimization", "IntervalArithmetic", "JLD2", "JSON3", "Logging", "Plots", "PseudoPotentialData", "PythonCall", "QuadGK", "Random", "KrylovKit", "Wannier", "WriteVTK", "wannier90_jll"]
16 changes: 16 additions & 0 deletions examples/amdgpu.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
using AtomsBuilder
using DFTK
using AMDGPU
using PseudoPotentialData

# Silicon PBE model using semicore GTH pseudopotentials from the CP2K table.
pseudofamily = PseudoFamily("cp2k.nc.sr.pbe.v0_1.semicore.gth")
model = model_DFT(bulk(:Si); functionals=PBE(), pseudopotentials=pseudofamily)

# Choose where DFT quantities are stored and the main computations run:
# a ROCm GPU when one is present, otherwise fall back to the CPU.
arch = has_rocm_gpu() ? DFTK.GPU(AMDGPU.ROCArray) : DFTK.CPU()

basis = PlaneWaveBasis(model; Ecut=20, kgrid=(1, 1, 1), architecture=arch)

# Anderson acceleration does not yet work here, so use a damped SCF solver.
scfres = self_consistent_field(basis; tol=1e-2, solver=scf_damping_solver())
4 changes: 2 additions & 2 deletions examples/gpu.jl → examples/cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ using PseudoPotentialData

model = model_DFT(bulk(:Si);
functionals=PBE(),
pseudopotentials=PseudoFamily("cp2k.nc.sr.pbe.v0_1.semicore.gth"))
pseudopotentials=PseudoFamily("dojo.nc.sr.pbe.v0_4_1.standard.upf"))

# If available, use CUDA to store DFT quantities and perform main computations
architecture = has_cuda() ? DFTK.GPU(CuArray) : DFTK.CPU()

basis = PlaneWaveBasis(model; Ecut=30, kgrid=(5, 5, 5), architecture)
scfres = self_consistent_field(basis; tol=1e-2, solver=scf_damping_solver())
scfres = self_consistent_field(basis; tol=1e-6)
2 changes: 1 addition & 1 deletion examples/silicon.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ a = 10.26 # Silicon lattice constant in Bohr
lattice = a / 2 * [[0 1 1.];
[1 0 1.];
[1 1 0.]]
Si = ElementPsp(:Si, PseudoFamily("cp2k.nc.sr.lda.v0_1.semicore.gth"))
Si = ElementPsp(:Si, PseudoFamily("dojo.nc.sr.lda.v0_4_1.standard.upf"))
atoms = [Si, Si]
positions = [ones(3)/8, -ones(3)/8]

Expand Down
15 changes: 15 additions & 0 deletions ext/DFTKAMDGPUExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
module DFTKAMDGPUExt
using AMDGPU
using LinearAlgebra
import DFTK: GPU
using DFTK

# Wait until all pending operations on the ROCm device have completed.
DFTK.synchronize_device(::GPU{<:AMDGPU.ROCArray}) = AMDGPU.synchronize()

# Temporary workaround to not trigger https://github.com/JuliaGPU/AMDGPU.jl/issues/734:
# run the factorisation via rocSOLVER's potrf! on a copy of the data and wrap
# the result in a Cholesky object ourselves.
function LinearAlgebra.cholesky(A::Hermitian{T, <:AMDGPU.ROCArray}) where {T}
    factor_data, info = AMDGPU.rocSOLVER.potrf!(A.uplo, copy(A.data))
    LinearAlgebra.Cholesky(factor_data, A.uplo, info)
end

end
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,8 @@ function DFTK.build_fft_plans!(tmp::AbstractArray{<:Complex})
# opBFFT = inv(opFFT).p
opFFT = generic_plan_fft(tmp) # Fallback for now
opBFFT = generic_plan_bfft(tmp)
# TODO Can be cut once FourierTransforms supports AbstractFFTs properly
ipFFT = DummyInplace{typeof(opFFT)}(opFFT)
ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
ipFFT = DummyInplace(opFFT)
ipBFFT = DummyInplace(opBFFT)

ipFFT, opFFT, ipBFFT, opBFFT
end
Expand All @@ -51,7 +50,7 @@ struct GenericPlan{T}
factor::T
end

function Base.:*(p::GenericPlan, X::AbstractArray)
function Base.:*(p::GenericPlan, X::AbstractArray{T, 3}) where {T}
pl1, pl2, pl3 = p.subplans
ret = similar(X)
for i = 1:size(X, 1), j = 1:size(X, 2)
Expand All @@ -66,7 +65,7 @@ function Base.:*(p::GenericPlan, X::AbstractArray)
p.factor .* ret
end

LinearAlgebra.mul!(Y, p::GenericPlan, X) = Y .= p * X
LinearAlgebra.mul!(Y, p::GenericPlan, X) = Y .= p * X
LinearAlgebra.ldiv!(Y, p::GenericPlan, X) = Y .= p \ X

length(p::GenericPlan) = prod(length, p.subplans)
Expand Down
17 changes: 9 additions & 8 deletions ext/DFTKGenericLinearAlgebraExt/ctfft.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using Primes

# COV_EXCL_START
## import Primes

# These files are copied (and very slightly modified) from [FourierTransforms.jl](https://github.com/JuliaComputing/FourierTransforms.jl)
# commit [6a206bcfc8f49a129ca34beaf57d05cdc148dc8a](https://github.com/JuliaComputing/FourierTransforms.jl/tree/6a206bcfc8f49a129ca34beaf57d05cdc148dc8a).
Expand Down Expand Up @@ -618,12 +617,14 @@ function fft_factors(T::Type, n::Integer)
end
end
end
# get any leftover prime factors:
for (f,k) in factor(m)
for i=1:k
push!(factors, f)
end
end
@assert isone(m)
## Disabled to get rid of Primes dependency
## # get any leftover prime factors:
## for (f,k) in Primes.factor(m)
## for i=1:k
## push!(factors, f)
## end
## end
end
factors
end
Expand Down
6 changes: 5 additions & 1 deletion ext/DFTKIntervalArithmeticExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ import SpecialFunctions: erfc
# ... this is far from proper and a bit specific for our use case here
# (that's why it's not contributed upstream).
# should be done e.g. by changing the rounding mode ...
#
# Could be made more rigorous by using core-math C library, which provides
# a interval rounding version.
#
erfc(i::Interval) = Interval(prevfloat(erfc(i.lo)), nextfloat(erfc(i.hi)))
Base.nextfloat(x::Interval) = Interval(nextfloat(x.lo), nextfloat(x.hi))
Base.prevfloat(x::Interval) = Interval(prevfloat(x.lo), prevfloat(x.hi))
Expand All @@ -20,7 +24,7 @@ Base.prevfloat(x::Interval) = Interval(prevfloat(x.lo), prevfloat(x.hi))
# see issue #513 on IntervalArithmetic repository
DFTK.cis2pi(x::Interval) = exp(2 * (pi * (im * x)))

DFTK.value_type(::Type{<:Interval{T}}) where {T} = T
DFTK.value_type(::Type{<:Interval{T}}) where {T} = T # IntervalArithmetic.numtype(T)

function compute_Glims_fast(lattice::AbstractMatrix{<:Interval}, args...; kwargs...)
# This is done to avoid a call like ceil(Int, ::Interval)
Expand Down
10 changes: 6 additions & 4 deletions src/fft.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ function ifft!(f_real::AbstractArray3, fft_grid::FFTGrid,
fill!(f_real, 0)
f_real[Gvec_mapping] = f_fourier

mul!(f_real, fft_grid.ipBFFT, f_real) # perform IFFT
# Perform iFFT, equivalent to mul!(f_real, fft_grid.ipBFFT, f_real)
f_real = fft_grid.ipBFFT * f_real
normalize && (f_real .*= fft_grid.ifft_normalization)
f_real
end
Expand Down Expand Up @@ -165,8 +166,8 @@ function fft!(f_fourier::AbstractVector, fft_grid::FFTGrid,
@assert size(f_real) == fft_grid.fft_size
@assert length(f_fourier) == length(Gvec_mapping)

# FFT
mul!(f_real, fft_grid.ipFFT, f_real)
# Perform FFT, equivalent to mul!(f_real, fft_grid.ipFFT, f_real)
f_real = fft_grid.ipFFT * f_real

# Truncate
f_fourier .= view(f_real, Gvec_mapping)
Expand Down Expand Up @@ -365,7 +366,8 @@ function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Union{Float3
end

# TODO Some grid sizes are broken in the generic FFT implementation
# in FourierTransforms, for more details see workarounds/fft_generic.jl
# in FourierTransforms, for more details see
# ext/DFTKGenericLinearAlgebraExt/DFTKGenericLinearAlgebraExt.jl
default_primes(::Type{Float32}) = (2, 3, 5)
default_primes(::Type{Float64}) = default_primes(Float32)
next_working_fft_size(::Type{Float32}, size::Int) = size
Expand Down
9 changes: 3 additions & 6 deletions src/workarounds/dummy_inplace_fft.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
# A dummy wrapper around an out-of-place FFT plan to make it appear in-place
# This is needed for some generic FFT implementations, which do not have in-place plans
# This is needed for some FFT implementations, which do not have in-place plans
struct DummyInplace{opFFT}
fft::opFFT
end
LinearAlgebra.mul!(Y, p::DummyInplace, X) = copy!(Y, p * X)
LinearAlgebra.ldiv!(Y, p::DummyInplace, X) = copy!(Y, p \ X)

import Base: *, \, length
*(p::DummyInplace, X) = p.fft * X
\(p::DummyInplace, X) = p.fft \ X
import Base: *, length
Base.:*(p::DummyInplace, X) = copy!(X, p.fft * X)
length(p::DummyInplace) = length(p.fft)
4 changes: 2 additions & 2 deletions src/workarounds/forwarddiff_rules.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ end
function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Dual}
opFFT = AbstractFFTs.plan_fft(tmp)
opBFFT = AbstractFFTs.plan_bfft(tmp)
ipFFT = DummyInplace{typeof(opFFT)}(opFFT)
ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
ipFFT = DummyInplace(opFFT)
ipBFFT = DummyInplace(opBFFT)
ipFFT, opFFT, ipBFFT, opBFFT
end

Expand Down
1 change: 0 additions & 1 deletion test/aqua.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,5 @@
DFTK.potential_terms, DFTK.kernel_terms]),
piracies=false,
deps_compat=(; check_extras=(; ignore=[:CUDA_Runtime_jll])),
stale_deps=(; ignore=[:Primes, ]),
)
end
77 changes: 49 additions & 28 deletions test/gpu.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
# These are not yet the best tests, but just to ensure our GPU support
# does not just break randomly
#
# TODO Also test other cases on AMDGPU
# TODO Refactor this file to reduce code duplication
#
# TODO Test Hamiltonian application on GPU
# TODO Direct minimisation
# TODO Float32
# TODO meta GGA

@testitem "CUDA silicon functionality test" tags=[:gpu] setup=[TestCases] begin
@testitem "GPU silicon functionality test" tags=[:gpu] setup=[TestCases] begin
using DFTK
using CUDA
using AMDGPU
using LinearAlgebra
silicon = TestCases.silicon

Expand All @@ -14,12 +23,27 @@
self_consistent_field(basis; tol=1e-9, solver=scf_damping_solver(damping=1.0))
end

# Bad copy and paste for now ... think of something more clever later
scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-9
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-9
# Test that forces compute: symmetric structure, forces are zero
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_gpu)) < 1e-10
@testset "CUDA" begin
if CUDA.has_cuda() && CUDA.has_cuda_gpu()
scfres_cuda = run_problem(; architecture=DFTK.GPU(CuArray))
@test scfres_cuda.ρ isa CuArray
@test abs(scfres_cpu.energies.total - scfres_cuda.energies.total) < 1e-9
@test norm(scfres_cpu.ρ - Array(scfres_cuda.ρ)) < 1e-9
@test norm(compute_forces(scfres_cuda)) < 1e-6 # Symmetric structure
end
end

@testset "AMDGPU" begin
if AMDGPU.has_rocm_gpu()
scfres_rocm = run_problem(; architecture=DFTK.GPU(ROCArray))
@test scfres_rocm.ρ isa ROCArray
@test abs(scfres_cpu.energies.total - scfres_rocm.energies.total) < 1e-9
@test norm(scfres_cpu.ρ - Array(scfres_rocm.ρ)) < 1e-9
@test norm(compute_forces(scfres_rocm)) < 1e-6 # Symmetric structure
end
end
end

@testitem "CUDA iron functionality test" tags=[:gpu] setup=[TestCases] begin
Expand All @@ -37,24 +61,26 @@ end
ρ = guess_density(basis, magnetic_moments)

# TODO Bump tolerance a bit here ... still leads to NaNs unfortunately
self_consistent_field(basis; ρ, tol=1e-7, mixing=KerkerMixing(),
solver=scf_damping_solver(damping=1.0))
self_consistent_field(basis; ρ, tol=1e-7, mixing=KerkerMixing())
end

scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-7
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-6
# Test that forces compute: symmetric structure, forces are zero
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_gpu)) < 1e-9
if CUDA.has_cuda() && CUDA.has_cuda_gpu()
ArrayType = CuArray
scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_cuda = run_problem(; architecture=DFTK.GPU(ArrayType))
@test abs(scfres_cpu.energies.total - scfres_cuda.energies.total) < 1e-7
@test norm(scfres_cpu.ρ - Array(scfres_cuda.ρ)) < 1e-6
# Test that forces compute: symmetric structure, forces are zero
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_cuda)) < 1e-9
end
end

@testitem "CUDA aluminium forces test" tags=[:gpu] setup=[TestCases] begin
using DFTK
using CUDA
using LinearAlgebra
aluminium = TestCases.aluminium
positions = aluminium.positions
positions = copy(aluminium.positions)
# Perturb equilibrium position for non-zero forces
positions[1] += [0.01, 0.0, -0.01]

Expand All @@ -68,17 +94,12 @@ end
self_consistent_field(basis; tol=1e-10)
end

scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-10
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-8
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_gpu)) < 1e-7
if CUDA.has_cuda() && CUDA.has_cuda_gpu()
ArrayType = CuArray
scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_cuda = run_problem(; architecture=DFTK.GPU(ArrayType))
@test abs(scfres_cpu.energies.total - scfres_cuda.energies.total) < 1e-10
@test norm(scfres_cpu.ρ - Array(scfres_cuda.ρ)) < 1e-8
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_cuda)) < 1e-7
end
end


# TODO Test hamiltonian application on GPU
# TODO Direct minimisation
# TODO Float32
# TODO meta GGA
# TODO Aluminium with LdosMixing
# TODO Anderson acceleration
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ runfile = joinpath(@__DIR__, "runtests_runner.jl")
# Trigger some precompilation and build steps
using ASEconvert
using CUDA
using AMDGPU
using DFTK
using Interpolations

Expand Down

0 comments on commit a9e42fd

Please sign in to comment.