Skip to content

Commit

Permalink
Initial AMDGPU support (#858)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfherbst authored Mar 1, 2025
1 parent c9cb6cb commit a9e42fd
Show file tree
Hide file tree
Showing 14 changed files with 118 additions and 61 deletions.
8 changes: 5 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ PkgVersion = "eebad327-c553-4316-9ea0-9fa01ccd7688"
Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
PseudoPotentialIO = "cb339c56-07fa-4cb2-923a-142469552264"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand All @@ -46,6 +45,7 @@ Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
GenericLinearAlgebra = "14197337-ba66-59df-a3e3-ca00e7dcff7a"
GeometryOptimization = "673bf261-a53d-43b9-876f-d3c1fc8329c2"
Expand All @@ -59,6 +59,7 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
wannier90_jll = "c5400fa0-8d08-52c2-913f-1e3f656c1ce9"

[extensions]
DFTKAMDGPUExt = "AMDGPU"
DFTKCUDAExt = "CUDA"
DFTKGenericLinearAlgebraExt = "GenericLinearAlgebra"
DFTKGeometryOptimizationExt = "GeometryOptimization"
Expand All @@ -72,6 +73,7 @@ DFTKWannierExt = "Wannier"
DFTKWriteVTKExt = "WriteVTK"

[compat]
AMDGPU = "1.2.3"
ASEconvert = "0.1"
AbstractFFTs = "1"
Aqua = "0.8.5"
Expand Down Expand Up @@ -116,7 +118,6 @@ Plots = "1"
Polynomials = "3, 4"
PrecompileTools = "1"
Preferences = "1"
Primes = "0.5"
Printf = "1"
PseudoPotentialData = "0.2.2"
PseudoPotentialIO = "0.1"
Expand All @@ -140,6 +141,7 @@ julia = "1.10"
wannier90_jll = "3.1"

[extras]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ASEconvert = "3da9722f-58c2-4165-81be-b4d7253e8fd2"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
AtomsBuilder = "f5cc8831-eeb7-4288-8d9f-d6c1ddb77004"
Expand Down Expand Up @@ -169,4 +171,4 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
wannier90_jll = "c5400fa0-8d08-52c2-913f-1e3f656c1ce9"

[targets]
test = ["Test", "TestItemRunner", "ASEconvert", "AtomsBuilder", "Aqua", "AtomsIO", "AtomsIOPython", "CUDA", "CUDA_Runtime_jll", "ComponentArrays", "DoubleFloats", "FiniteDiff", "FiniteDifferences", "GenericLinearAlgebra", "GeometryOptimization", "IntervalArithmetic", "JLD2", "JSON3", "Logging", "Plots", "PseudoPotentialData", "PythonCall", "QuadGK", "Random", "KrylovKit", "Wannier", "WriteVTK", "wannier90_jll"]
test = ["Test", "TestItemRunner", "AMDGPU", "ASEconvert", "AtomsBuilder", "Aqua", "AtomsIO", "AtomsIOPython", "CUDA", "CUDA_Runtime_jll", "ComponentArrays", "DoubleFloats", "FiniteDiff", "FiniteDifferences", "GenericLinearAlgebra", "GeometryOptimization", "IntervalArithmetic", "JLD2", "JSON3", "Logging", "Plots", "PseudoPotentialData", "PythonCall", "QuadGK", "Random", "KrylovKit", "Wannier", "WriteVTK", "wannier90_jll"]
16 changes: 16 additions & 0 deletions examples/amdgpu.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
using AtomsBuilder
using DFTK
using AMDGPU
using PseudoPotentialData

# Silicon PBE model using semicore GTH pseudopotentials from the CP2K table.
pseudofamily = PseudoFamily("cp2k.nc.sr.pbe.v0_1.semicore.gth")
model = model_DFT(bulk(:Si); functionals=PBE(), pseudopotentials=pseudofamily)

# Choose where DFT quantities are stored and the main computations run:
# a ROCm GPU when one is present, otherwise fall back to the CPU.
arch = has_rocm_gpu() ? DFTK.GPU(AMDGPU.ROCArray) : DFTK.CPU()

basis = PlaneWaveBasis(model; Ecut=20, kgrid=(1, 1, 1), architecture=arch)

# Anderson acceleration does not yet work here, so use a damped SCF solver.
scfres = self_consistent_field(basis; tol=1e-2, solver=scf_damping_solver())
4 changes: 2 additions & 2 deletions examples/gpu.jl → examples/cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ using PseudoPotentialData

model = model_DFT(bulk(:Si);
functionals=PBE(),
pseudopotentials=PseudoFamily("cp2k.nc.sr.pbe.v0_1.semicore.gth"))
pseudopotentials=PseudoFamily("dojo.nc.sr.pbe.v0_4_1.standard.upf"))

# If available, use CUDA to store DFT quantities and perform main computations
architecture = has_cuda() ? DFTK.GPU(CuArray) : DFTK.CPU()

basis = PlaneWaveBasis(model; Ecut=30, kgrid=(5, 5, 5), architecture)
scfres = self_consistent_field(basis; tol=1e-2, solver=scf_damping_solver())
scfres = self_consistent_field(basis; tol=1e-6)
2 changes: 1 addition & 1 deletion examples/silicon.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ a = 10.26 # Silicon lattice constant in Bohr
lattice = a / 2 * [[0 1 1.];
[1 0 1.];
[1 1 0.]]
Si = ElementPsp(:Si, PseudoFamily("cp2k.nc.sr.lda.v0_1.semicore.gth"))
Si = ElementPsp(:Si, PseudoFamily("dojo.nc.sr.lda.v0_4_1.standard.upf"))
atoms = [Si, Si]
positions = [ones(3)/8, -ones(3)/8]

Expand Down
15 changes: 15 additions & 0 deletions ext/DFTKAMDGPUExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
module DFTKAMDGPUExt
using AMDGPU
using LinearAlgebra
import DFTK: GPU
using DFTK

# Wait until all pending operations on the ROCm device have completed.
DFTK.synchronize_device(::GPU{<:AMDGPU.ROCArray}) = AMDGPU.synchronize()

# Temporary workaround to not trigger https://github.com/JuliaGPU/AMDGPU.jl/issues/734:
# run the factorisation via rocSOLVER's potrf! on a copy of the data and wrap
# the result in a Cholesky object ourselves.
function LinearAlgebra.cholesky(A::Hermitian{T, <:AMDGPU.ROCArray}) where {T}
    factor_data, info = AMDGPU.rocSOLVER.potrf!(A.uplo, copy(A.data))
    LinearAlgebra.Cholesky(factor_data, A.uplo, info)
end

end
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,8 @@ function DFTK.build_fft_plans!(tmp::AbstractArray{<:Complex})
# opBFFT = inv(opFFT).p
opFFT = generic_plan_fft(tmp) # Fallback for now
opBFFT = generic_plan_bfft(tmp)
# TODO Can be cut once FourierTransforms supports AbstractFFTs properly
ipFFT = DummyInplace{typeof(opFFT)}(opFFT)
ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
ipFFT = DummyInplace(opFFT)
ipBFFT = DummyInplace(opBFFT)

ipFFT, opFFT, ipBFFT, opBFFT
end
Expand All @@ -51,7 +50,7 @@ struct GenericPlan{T}
factor::T
end

function Base.:*(p::GenericPlan, X::AbstractArray)
function Base.:*(p::GenericPlan, X::AbstractArray{T, 3}) where {T}
pl1, pl2, pl3 = p.subplans
ret = similar(X)
for i = 1:size(X, 1), j = 1:size(X, 2)
Expand All @@ -66,7 +65,7 @@ function Base.:*(p::GenericPlan, X::AbstractArray)
p.factor .* ret
end

LinearAlgebra.mul!(Y, p::GenericPlan, X) = Y .= p * X
LinearAlgebra.mul!(Y, p::GenericPlan, X) = Y .= p * X
LinearAlgebra.ldiv!(Y, p::GenericPlan, X) = Y .= p \ X

length(p::GenericPlan) = prod(length, p.subplans)
Expand Down
17 changes: 9 additions & 8 deletions ext/DFTKGenericLinearAlgebraExt/ctfft.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using Primes

# COV_EXCL_START
## import Primes

# These files are copied (and very slightly modified) from [FourierTransforms.jl](https://github.com/JuliaComputing/FourierTransforms.jl)
# commit [6a206bcfc8f49a129ca34beaf57d05cdc148dc8a](https://github.com/JuliaComputing/FourierTransforms.jl/tree/6a206bcfc8f49a129ca34beaf57d05cdc148dc8a).
Expand Down Expand Up @@ -618,12 +617,14 @@ function fft_factors(T::Type, n::Integer)
end
end
end
# get any leftover prime factors:
for (f,k) in factor(m)
for i=1:k
push!(factors, f)
end
end
@assert isone(m)
## Disabled to get rid of Primes dependency
## # get any leftover prime factors:
## for (f,k) in Primes.factor(m)
## for i=1:k
## push!(factors, f)
## end
## end
end
factors
end
Expand Down
6 changes: 5 additions & 1 deletion ext/DFTKIntervalArithmeticExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ import SpecialFunctions: erfc
# ... this is far from proper and a bit specific for our use case here
# (that's why it's not contributed upstream).
# should be done e.g. by changing the rounding mode ...
#
# Could be made more rigorous by using core-math C library, which provides
# a interval rounding version.
#
erfc(i::Interval) = Interval(prevfloat(erfc(i.lo)), nextfloat(erfc(i.hi)))
Base.nextfloat(x::Interval) = Interval(nextfloat(x.lo), nextfloat(x.hi))
Base.prevfloat(x::Interval) = Interval(prevfloat(x.lo), prevfloat(x.hi))
Expand All @@ -20,7 +24,7 @@ Base.prevfloat(x::Interval) = Interval(prevfloat(x.lo), prevfloat(x.hi))
# see issue #513 on IntervalArithmetic repository
DFTK.cis2pi(x::Interval) = exp(2 * (pi * (im * x)))

DFTK.value_type(::Type{<:Interval{T}}) where {T} = T
DFTK.value_type(::Type{<:Interval{T}}) where {T} = T # IntervalArithmetic.numtype(T)

function compute_Glims_fast(lattice::AbstractMatrix{<:Interval}, args...; kwargs...)
# This is done to avoid a call like ceil(Int, ::Interval)
Expand Down
10 changes: 6 additions & 4 deletions src/fft.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ function ifft!(f_real::AbstractArray3, fft_grid::FFTGrid,
fill!(f_real, 0)
f_real[Gvec_mapping] = f_fourier

mul!(f_real, fft_grid.ipBFFT, f_real) # perform IFFT
# Perform iFFT, equivalent to mul!(f_real, fft_grid.ipBFFT, f_real)
f_real = fft_grid.ipBFFT * f_real
normalize && (f_real .*= fft_grid.ifft_normalization)
f_real
end
Expand Down Expand Up @@ -165,8 +166,8 @@ function fft!(f_fourier::AbstractVector, fft_grid::FFTGrid,
@assert size(f_real) == fft_grid.fft_size
@assert length(f_fourier) == length(Gvec_mapping)

# FFT
mul!(f_real, fft_grid.ipFFT, f_real)
# Perform FFT, equivalent to mul!(f_real, fft_grid.ipFFT, f_real)
f_real = fft_grid.ipFFT * f_real

# Truncate
f_fourier .= view(f_real, Gvec_mapping)
Expand Down Expand Up @@ -365,7 +366,8 @@ function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Union{Float3
end

# TODO Some grid sizes are broken in the generic FFT implementation
# in FourierTransforms, for more details see workarounds/fft_generic.jl
# in FourierTransforms, for more details see
# ext/DFTKGenericLinearAlgebraExt/DFTKGenericLinearAlgebraExt.jl
default_primes(::Type{Float32}) = (2, 3, 5)
default_primes(::Type{Float64}) = default_primes(Float32)
next_working_fft_size(::Type{Float32}, size::Int) = size
Expand Down
9 changes: 3 additions & 6 deletions src/workarounds/dummy_inplace_fft.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
# A dummy wrapper around an out-of-place FFT plan to make it appear in-place
# This is needed for some generic FFT implementations, which do not have in-place plans
# This is needed for some FFT implementations, which do not have in-place plans
struct DummyInplace{opFFT}
fft::opFFT
end
LinearAlgebra.mul!(Y, p::DummyInplace, X) = copy!(Y, p * X)
LinearAlgebra.ldiv!(Y, p::DummyInplace, X) = copy!(Y, p \ X)

import Base: *, \, length
*(p::DummyInplace, X) = p.fft * X
\(p::DummyInplace, X) = p.fft \ X
import Base: *, length
Base.:*(p::DummyInplace, X) = copy!(X, p.fft * X)
length(p::DummyInplace) = length(p.fft)
4 changes: 2 additions & 2 deletions src/workarounds/forwarddiff_rules.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ end
function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Dual}
opFFT = AbstractFFTs.plan_fft(tmp)
opBFFT = AbstractFFTs.plan_bfft(tmp)
ipFFT = DummyInplace{typeof(opFFT)}(opFFT)
ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
ipFFT = DummyInplace(opFFT)
ipBFFT = DummyInplace(opBFFT)
ipFFT, opFFT, ipBFFT, opBFFT
end

Expand Down
1 change: 0 additions & 1 deletion test/aqua.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,5 @@
DFTK.potential_terms, DFTK.kernel_terms]),
piracies=false,
deps_compat=(; check_extras=(; ignore=[:CUDA_Runtime_jll])),
stale_deps=(; ignore=[:Primes, ]),
)
end
77 changes: 49 additions & 28 deletions test/gpu.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
# These are not yet the best tests, but just to ensure our GPU support
# does not just break randomly
#
# TODO Also test other cases on AMDGPU
# TODO Refactor this file to reduce code duplication
#
# TODO Test Hamiltonian application on GPU
# TODO Direct minimisation
# TODO Float32
# TODO meta GGA

@testitem "CUDA silicon functionality test" tags=[:gpu] setup=[TestCases] begin
@testitem "GPU silicon functionality test" tags=[:gpu] setup=[TestCases] begin
using DFTK
using CUDA
using AMDGPU
using LinearAlgebra
silicon = TestCases.silicon

Expand All @@ -14,12 +23,27 @@
self_consistent_field(basis; tol=1e-9, solver=scf_damping_solver(damping=1.0))
end

# Bad copy and paste for now ... think of something more clever later
scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-9
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-9
# Test that forces compute: symmetric structure, forces are zero
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_gpu)) < 1e-10
@testset "CUDA" begin
if CUDA.has_cuda() && CUDA.has_cuda_gpu()
scfres_cuda = run_problem(; architecture=DFTK.GPU(CuArray))
@test scfres_cuda.ρ isa CuArray
@test abs(scfres_cpu.energies.total - scfres_cuda.energies.total) < 1e-9
@test norm(scfres_cpu.ρ - Array(scfres_cuda.ρ)) < 1e-9
@test norm(compute_forces(scfres_cuda)) < 1e-6 # Symmetric structure
end
end

@testset "AMDGPU" begin
if AMDGPU.has_rocm_gpu()
scfres_rocm = run_problem(; architecture=DFTK.GPU(ROCArray))
@test scfres_rocm.ρ isa ROCArray
@test abs(scfres_cpu.energies.total - scfres_rocm.energies.total) < 1e-9
@test norm(scfres_cpu.ρ - Array(scfres_rocm.ρ)) < 1e-9
@test norm(compute_forces(scfres_rocm)) < 1e-6 # Symmetric structure
end
end
end

@testitem "CUDA iron functionality test" tags=[:gpu] setup=[TestCases] begin
Expand All @@ -37,24 +61,26 @@ end
ρ = guess_density(basis, magnetic_moments)

# TODO Bump tolerance a bit here ... still leads to NaNs unfortunately
self_consistent_field(basis; ρ, tol=1e-7, mixing=KerkerMixing(),
solver=scf_damping_solver(damping=1.0))
self_consistent_field(basis; ρ, tol=1e-7, mixing=KerkerMixing())
end

scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-7
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-6
# Test that forces compute: symmetric structure, forces are zero
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_gpu)) < 1e-9
if CUDA.has_cuda() && CUDA.has_cuda_gpu()
ArrayType = CuArray
scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_cuda = run_problem(; architecture=DFTK.GPU(ArrayType))
@test abs(scfres_cpu.energies.total - scfres_cuda.energies.total) < 1e-7
@test norm(scfres_cpu.ρ - Array(scfres_cuda.ρ)) < 1e-6
# Test that forces compute: symmetric structure, forces are zero
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_cuda)) < 1e-9
end
end

@testitem "CUDA aluminium forces test" tags=[:gpu] setup=[TestCases] begin
using DFTK
using CUDA
using LinearAlgebra
aluminium = TestCases.aluminium
positions = aluminium.positions
positions = copy(aluminium.positions)
# Perturb equilibrium position for non-zero forces
positions[1] += [0.01, 0.0, -0.01]

Expand All @@ -68,17 +94,12 @@ end
self_consistent_field(basis; tol=1e-10)
end

scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-10
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-8
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_gpu)) < 1e-7
if CUDA.has_cuda() && CUDA.has_cuda_gpu()
ArrayType = CuArray
scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_cuda = run_problem(; architecture=DFTK.GPU(ArrayType))
@test abs(scfres_cpu.energies.total - scfres_cuda.energies.total) < 1e-10
@test norm(scfres_cpu.ρ - Array(scfres_cuda.ρ)) < 1e-8
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_cuda)) < 1e-7
end
end


# TODO Test hamiltonian application on GPU
# TODO Direct minimisation
# TODO Float32
# TODO meta GGA
# TODO Aluminium with LdosMixing
# TODO Anderson acceleration
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ runfile = joinpath(@__DIR__, "runtests_runner.jl")
# Trigger some precompilation and build steps
using ASEconvert
using CUDA
using AMDGPU
using DFTK
using Interpolations

Expand Down

0 comments on commit a9e42fd

Please sign in to comment.