diff --git a/Manifest.toml b/Manifest.toml index 6c5e435e02..65ef0b2d31 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,19 +1,32 @@ # This file is machine-generated - editing it directly is not advised [[AbstractFFTs]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] +deps = ["LinearAlgebra"] git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" version = "1.5.0" + [AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + + [AbstractFFTs.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + [[Adapt]] deps = ["LinearAlgebra", "Requires"] git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" version = "3.6.2" +weakdeps = ["StaticArrays"] + + [Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" [[ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" @@ -56,27 +69,20 @@ git-tree-sha1 = "5248d9c45712e51e27ba9b30eebec65658c6ce29" uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" version = "0.6.0+0" -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "e30f2f4e20f7f186dc36529910beaedc60cfa644" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.16.0" - -[[ChangesOfVariables]] -deps = ["InverseFunctions", "LinearAlgebra", "Test"] -git-tree-sha1 = "2fba81a302a7be671aefe194f0525ef231104e7f" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" -version = "0.1.8" - [[Compat]] -deps = ["Dates", "LinearAlgebra", "UUIDs"] +deps = ["UUIDs"] git-tree-sha1 = "e460f044ca8b99be31d35fe54fc33a5c33dd8ed7" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" version = "4.9.0" +weakdeps = ["Dates", "LinearAlgebra"] + + [Compat.extensions] + CompatLinearAlgebraExt = "LinearAlgebra" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = 
"e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.2+0" [[Crayons]] git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" @@ -116,14 +122,25 @@ uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" version = "0.9.3" [[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" [[ExprTools]] git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" version = "0.1.10" +[[FastmathOverlay]] +git-tree-sha1 = "f4babf7d075349104366b0f68c9492d3c82ed021" +repo-rev = "main" +repo-url = "https://github.com/vchuravy/FastmathOverlay.jl" +uuid = "32340bd4-0178-4ab8-8fce-9b94bc594446" +version = "0.1.0" + +[[FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + [[Future]] deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" @@ -142,9 +159,11 @@ version = "0.1.5" [[GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3" +git-tree-sha1 = "7b56a06e5b8ff4e4ecebe070cc90662497f16858" +repo-rev = "vc/mtv" +repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl.git" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.21.4" +version = "0.22.0" [[InlineStrings]] deps = ["Parsers"] @@ -156,12 +175,6 @@ version = "1.4.0" deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "68772f49f54b479fa88ace904f6127f0a3bb2e46" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.12" - [[InvertedIndices]] git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" @@ -178,10 +191,10 @@ uuid = "82899510-4779-5014-852e-03e436cf321d" version = "1.0.0" [[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" +deps = 
["Artifacts", "Preferences"] +git-tree-sha1 = "a7e91ef94114d5bc8952bcaa8d6ff952cf709808" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.4.1" +version = "1.4.2" [[KernelAbstractions]] deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] @@ -189,6 +202,12 @@ git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" version = "0.9.8" + [KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" + + [KernelAbstractions.weakdeps] + EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" + [[LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] git-tree-sha1 = "8695a49bfe05a2dc0feeefd06b4ca6361a018729" @@ -213,10 +232,12 @@ uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" [[LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" [[LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] @@ -225,20 +246,31 @@ uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" [[LinearAlgebra]] -deps = ["Libdl"] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] git-tree-sha1 = "c3ce8e7420b3a6e071e0fe4745f5d4300e37b13f" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" version = "0.3.24" + 
[LogExpFunctions.extensions] + LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" + + [LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" @@ -255,6 +287,7 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" [[Missings]] deps = ["DataAPI"] @@ -264,13 +297,21 @@ version = "1.1.0" [[MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" [[NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" [[OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +version = "0.8.1+0" [[OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] @@ -290,8 +331,9 @@ uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" version = "2.7.2" [[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.0" [[PooledArrays]] deps = ["DataAPI", "Future"] @@ -326,7 +368,7 @@ deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[Random]] -deps = 
["Serialization"] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[Random123]] @@ -354,6 +396,7 @@ version = "1.3.0" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" [[Scratch]] deps = ["Dates"] @@ -380,20 +423,30 @@ uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" version = "1.1.1" [[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3" +deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] +git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.3.0" +version = "2.3.1" + + [SpecialFunctions.extensions] + SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" + + [SpecialFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" [[StaticArrays]] -deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] git-tree-sha1 = "9cabadf6e7cd2349b6cf49f1915ad2028d65e881" uuid = "90137ffa-7385-5640-81b9-e52037218182" version = "1.6.2" +weakdeps = ["Statistics"] + + [StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" [[StaticArraysCore]] git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" @@ -403,15 +456,22 @@ version = "1.4.2" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" [[StringManipulation]] git-tree-sha1 = "46da2434b41f41ac3594ee9816ce5541c6096123" uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" version = "0.3.0" +[[SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" 
+version = "5.10.1+6" + [[TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" [[TableTraits]] deps = ["IteratorInterfaceExtensions"] @@ -428,6 +488,7 @@ version = "1.10.1" [[Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" [[Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] @@ -460,11 +521,19 @@ version = "0.1.3" [[Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" + +[[libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" [[nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" [[p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/Project.toml b/Project.toml index 1c9fea40d9..4335ffce2c 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +FastmathOverlay = "32340bd4-0178-4ab8-8fce-9b94bc594446" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" @@ -46,7 +47,7 @@ Crayons = "4" DataFrames = "1" ExprTools = "0.1" GPUArrays = "8.6" -GPUCompiler = "0.21" +GPUCompiler = "0.22" KernelAbstractions = "0.9.2" LLVM = "6" Preferences = "1" diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index c46f031bcd..284f63ca7a 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -1,6 +1,8 @@ ## gpucompiler interface implementation -struct CUDACompilerParams <: AbstractCompilerParams end +Base.@kwdef struct CUDACompilerParams <: AbstractCompilerParams + 
contract::Bool=false +end const CUDACompilerConfig = CompilerConfig{PTXCompilerTarget, CUDACompilerParams} const CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams} @@ -38,7 +40,19 @@ function GPUCompiler.link_libraries!(@nospecialize(job::CUDACompilerJob), mod::L return end -GPUCompiler.method_table(@nospecialize(job::CUDACompilerJob)) = method_table +import FastmathOverlay +if FastmathOverlay.functional() + function GPUCompiler.method_table_view(@nospecialize(job::CUDACompilerJob)) + if job.config.params.contract + mtv = FastmathOverlay.contract(job.world, method_table) + return mtv + else + return Core.Compiler.OverlayMethodTable(job.world, method_table) + end + end +else + GPUCompiler.method_table(@nospecialize(job::CUDACompilerJob)) = method_table +end GPUCompiler.kernel_state_type(job::CUDACompilerJob) = KernelState @@ -68,7 +82,7 @@ function compiler_config(dev; kwargs...) end return config end -@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, kwargs...) +@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, contract=false, kwargs...) # determine the toolchain (cached, because this is slow) if !isassigned(_toolchain) _toolchain[] = supported_toolchain() @@ -92,7 +106,7 @@ end # create GPUCompiler objects target = PTXCompilerTarget(; cap, ptx, debuginfo, kwargs...) 
- params = CUDACompilerParams() + params = CUDACompilerParams(; contract) CompilerConfig(target, params; kernel, name, always_inline) end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 5859de2d88..4b19d4327d 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -6,7 +6,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp ## high-level @cuda interface const MACRO_KWARGS = [:dynamic, :launch] -const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :contract] const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :shmem, :stream] @@ -306,6 +306,8 @@ The following keyword arguments are supported: supported on LLVM 4.0+) - `name`: override the name that the kernel will have in the generated code - `always_inline`: inline all function calls in the kernel +- `fastmath`: use less precise square roots and flush denormals +- `contract`: emit floating-point arithmetic with LLVM's `nsz contract` fast-math flags; only takes effect when FastmathOverlay is functional, otherwise it is ignored The output of this function is automatically cached, i.e. you can simply call `cufunction` in a hot path without degrading performance. 
New code will be generated automatically, when diff --git a/test/core/codegen.jl b/test/core/codegen.jl index e948972407..8ebeff0558 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -112,6 +112,34 @@ end @test cpu(input) == gpu(input) end +@testset "contract" begin + # FIXME: Overlay table is not used for top-level function + ir = sprint(io->CUDA.code_llvm(io, *, Tuple{Float64, Float64}, contract=true)) + @test_broken contains(ir, "fmul nsz contract double %0, %1") + + f(a, b) = a * b + g(a, b) = a + b + h(a, b) = a - b + + ir = sprint(io->CUDA.code_llvm(io, f, Tuple{Float64, Float64}, contract=true)) + @test_broken contains(ir, "fmul nsz contract double %0, %1") + + ir = sprint(io->CUDA.code_llvm(io, f, Tuple{Float32, Float32}, contract=true)) + @test contains(ir, "fmul nsz contract float %0, %1") + + ir = sprint(io->CUDA.code_llvm(io, g, Tuple{Float64, Float64}, contract=true)) + @test contains(ir, "fadd nsz contract double %0, %1") + + ir = sprint(io->CUDA.code_llvm(io, g, Tuple{Float32, Float32}, contract=true)) + @test contains(ir, "fadd nsz contract float %0, %1") + + ir = sprint(io->CUDA.code_llvm(io, h, Tuple{Float64, Float64}, contract=true)) + @test contains(ir, "fsub nsz contract double %0, %1") + + ir = sprint(io->CUDA.code_llvm(io, h, Tuple{Float32, Float32}, contract=true)) + @test contains(ir, "fsub nsz contract float %0, %1") +end + end ############################################################################################ @@ -157,6 +185,29 @@ end @test !occursin(".local", asm) end +@testset "fastmath" begin + function sqrt_kernel(x) + i = threadIdx().x + @inbounds x[i] = sqrt(x[i]) + return + end + + function div_kernel(x) + i = threadIdx().x + @fastmath @inbounds x[i] = 1 / x[i] + return + end + + asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}})) + @test occursin("sqrt.r", asm) + + asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) 
+ @test occursin("sqrt.approx.ftz", asm) + + asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) + @test occursin("div.approx.ftz", asm) +end + end ############################################################################################