From ccd1bc2ad1be8516f16d80b8cb6dffd79f5c6c85 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sun, 22 Sep 2024 17:10:44 +0000
Subject: [PATCH 1/2] Return values larger than 2 registers using a return area
 pointer

LLVM and Cranelift disagree about how to return values that don't fit
in the registers designated for return values. LLVM will force the
entire return value to be passed by return area pointer, while
Cranelift will look at each IR level return value independently and
decide to pass it in a register or not, which would result in the
return value being passed partially in registers and partially through
a return area pointer.

While Cranelift may need to be fixed as the LLVM behavior is generally
more correct with respect to the surface language, forcing this
behavior in rustc itself makes it easier for other backends to conform
to the Rust ABI and for the C ABI rustc already handles this behavior
anyway.

In addition LLVM's decision to pass the return value in registers or
using a return area pointer depends on how exactly the return type is
lowered to an LLVM IR type. For example `Option<u128>` can be lowered
as `{ i128, i128 }` in which case the x86_64 backend would use a return
area pointer, or it could be passed as `{ i32, i128 }` in which case
the x86_64 backend would pass it in registers by taking advantage of an
LLVM ABI extension that allows using 3 registers for the x86_64 sysv
call conv rather than the officially specified 2 registers.

This adjustment is only necessary for the Rust ABI as for other ABI's
the calling convention implementations in rustc_target already ensure
any return value which doesn't fit in the available amount of return
registers is passed in the right way for the current target.
---
 compiler/rustc_ty_utils/src/abi.rs | 43 ++++++++++++++++++++++++++++++
 tests/codegen/i128-x86-align.rs    | 32 ++++++++++------------
 tests/codegen/tuple-layout-opt.rs  |  8 +++---
 3 files changed, 61 insertions(+), 22 deletions(-)
diff --git a/compiler/rustc_ty_utils/src/abi.rs b/compiler/rustc_ty_utils/src/abi.rs
index f23c2cf2c07a7..536bcc0cd9e4e 100644
--- a/compiler/rustc_ty_utils/src/abi.rs
+++ b/compiler/rustc_ty_utils/src/abi.rs
@@ -726,6 +726,49 @@ fn fn_abi_adjust_for_abi<'tcx>(
                 };
             }
 
+            if arg_idx.is_none() && arg.layout.size > Pointer(AddressSpace::DATA).size(cx) * 2 {
+                // Return values larger than 2 registers using a return area
+                // pointer. LLVM and Cranelift disagree about how to return
+                // values that don't fit in the registers designated for return
+                // values. LLVM will force the entire return value to be passed
+                // by return area pointer, while Cranelift will look at each IR level
+                // return value independently and decide to pass it in a
+                // register or not, which would result in the return value
+                // being passed partially in registers and partially through a
+                // return area pointer.
+                //
+                // While Cranelift may need to be fixed as the LLVM behavior is
+                // generally more correct with respect to the surface language,
+                // forcing this behavior in rustc itself makes it easier for
+                // other backends to conform to the Rust ABI and for the C ABI
+                // rustc already handles this behavior anyway.
+                //
+                // In addition LLVM's decision to pass the return value in
+                // registers or using a return area pointer depends on how
+                // exactly the return type is lowered to an LLVM IR type. For
+                // example `Option<u128>` can be lowered as `{ i128, i128 }`
+                // in which case the x86_64 backend would use a return area
+                // pointer, or it could be passed as `{ i32, i128 }` in which
+                // case the x86_64 backend would pass it in registers by taking
+                // advantage of an LLVM ABI extension that allows using 3
+                // registers for the x86_64 sysv call conv rather than the
+                // officially specified 2 registers.
+                //
+                // FIXME: Technically we should look at the amount of available
+                // return registers rather than guessing that there are 2
+                // registers for return values. In practice only a couple of
+                // architectures have less than 2 return registers. None of
+                // which supported by Cranelift.
+                //
+                // NOTE: This adjustment is only necessary for the Rust ABI as
+                // for other ABI's the calling convention implementations in
+                // rustc_target already ensure any return value which doesn't
+                // fit in the available amount of return registers is passed in
+                // the right way for the current target.
+                arg.make_indirect();
+                return;
+            }
+
             match arg.layout.abi {
                 Abi::Aggregate { .. } => {}
 
diff --git a/tests/codegen/i128-x86-align.rs b/tests/codegen/i128-x86-align.rs
index 3e6ed2b8e16a2..6b1da445c40ca 100644
--- a/tests/codegen/i128-x86-align.rs
+++ b/tests/codegen/i128-x86-align.rs
@@ -19,13 +19,15 @@ pub struct ScalarPair {
 #[no_mangle]
 pub fn load(x: &ScalarPair) -> ScalarPair {
     // CHECK-LABEL: @load(
+    // CHECK-SAME: sret([32 x i8]) align 16 dereferenceable(32) %_0,
     // CHECK-SAME: align 16 dereferenceable(32) %x
     // CHECK:      [[A:%.*]] = load i32, ptr %x, align 16
     // CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr %x, i64 16
     // CHECK-NEXT: [[B:%.*]] = load i128, ptr [[GEP]], align 16
-    // CHECK-NEXT: [[IV1:%.*]] = insertvalue { i32, i128 } poison, i32 [[A]], 0
-    // CHECK-NEXT: [[IV2:%.*]] = insertvalue { i32, i128 } [[IV1]], i128 [[B]], 1
-    // CHECK-NEXT: ret { i32, i128 } [[IV2]]
+    // CHECK-NEXT: store i32 [[A]], ptr %_0, align 16
+    // CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr %_0, i64 16
+    // CHECK-NEXT: store i128 [[B]], ptr [[GEP]], align 16
+    // CHECK-NEXT: ret void
     *x
 }
 
@@ -53,29 +55,23 @@ pub fn alloca() {
 #[no_mangle]
 pub fn load_volatile(x: &ScalarPair) -> ScalarPair {
     // CHECK-LABEL: @load_volatile(
+    // CHECK-SAME: sret([32 x i8]) align 16 dereferenceable(32) %_0,
     // CHECK-SAME: align 16 dereferenceable(32) %x
-    // CHECK:      [[TMP:%.*]] = alloca [32 x i8], align 16
     // CHECK:      [[LOAD:%.*]] = load volatile %ScalarPair, ptr %x, align 16
-    // CHECK-NEXT: store %ScalarPair [[LOAD]], ptr [[TMP]], align 16
-    // CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP]], align 16
-    // CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16
-    // CHECK-NEXT: [[B:%.*]] = load i128, ptr [[GEP]], align 16
+    // CHECK-NEXT: store %ScalarPair [[LOAD]], ptr %_0, align 16
+    // CHECK-NEXT: ret void
     unsafe { std::intrinsics::volatile_load(x) }
 }
 
 #[no_mangle]
 pub fn transmute(x: ScalarPair) -> (std::mem::MaybeUninit<i128>, i128) {
-    // CHECK-LABEL: define { i128, i128 } @transmute(i32 noundef %x.0, i128 noundef %x.1)
-    // CHECK:       [[TMP:%.*]] = alloca [32 x i8], align 16
-    // CHECK-NEXT:  store i32 %x.0, ptr [[TMP]], align 16
-    // CHECK-NEXT:  [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16
+    // CHECK-LABEL: @transmute(
+    // CHECK-SAME:  sret([32 x i8]) align 16 dereferenceable(32) %_0,
+    // CHECK-SAME:  i32 noundef %x.0, i128 noundef %x.1
+    // CHECK:       store i32 %x.0, ptr %_0, align 16
+    // CHECK-NEXT:  [[GEP:%.*]] = getelementptr inbounds i8, ptr %_0, i64 16
     // CHECK-NEXT:  store i128 %x.1, ptr [[GEP]], align 16
-    // CHECK-NEXT:  [[LOAD1:%.*]] = load i128, ptr %_0, align 16
-    // CHECK-NEXT:  [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16
-    // CHECK-NEXT:  [[LOAD2:%.*]] = load i128, ptr [[GEP2]], align 16
-    // CHECK-NEXT:  [[IV1:%.*]] = insertvalue { i128, i128 } poison, i128 [[LOAD1]], 0
-    // CHECK-NEXT:  [[IV2:%.*]] = insertvalue { i128, i128 } [[IV1]], i128 [[LOAD2]], 1
-    // CHECK-NEXT:  ret { i128, i128 } [[IV2]]
+    // CHECK-NEXT:  ret void
     unsafe { std::mem::transmute(x) }
 }
 
diff --git a/tests/codegen/tuple-layout-opt.rs b/tests/codegen/tuple-layout-opt.rs
index 601563bc0613a..3202e4749c58e 100644
--- a/tests/codegen/tuple-layout-opt.rs
+++ b/tests/codegen/tuple-layout-opt.rs
@@ -19,28 +19,28 @@ pub fn test_ScalarZstFirst(_: ScalarZstFirst) -> ScalarZstFirst {
 }
 
 type ScalarPairZstLast = (u8, u128, ());
-// CHECK: define {{(dso_local )?}}{ i128, i8 } @test_ScalarPairZstLast(i128 %_1.0, i8 %_1.1)
+// CHECK: define {{(dso_local )?}}void @test_ScalarPairZstLast(ptr sret([32 x i8]) align 16 %_0, i128 %_1.0, i8 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairZstLast(_: ScalarPairZstLast) -> ScalarPairZstLast {
     loop {}
 }
 
 type ScalarPairZstFirst = ((), u8, u128);
-// CHECK: define {{(dso_local )?}}{ i8, i128 } @test_ScalarPairZstFirst(i8 %_1.0, i128 %_1.1)
+// CHECK: define {{(dso_local )?}}void @test_ScalarPairZstFirst(ptr sret([32 x i8]) align 16 %_0, i8 %_1.0, i128 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairZstFirst(_: ScalarPairZstFirst) -> ScalarPairZstFirst {
     loop {}
 }
 
 type ScalarPairLotsOfZsts = ((), u8, (), u128, ());
-// CHECK: define {{(dso_local )?}}{ i128, i8 } @test_ScalarPairLotsOfZsts(i128 %_1.0, i8 %_1.1)
+// CHECK: define {{(dso_local )?}}void @test_ScalarPairLotsOfZsts(ptr sret([32 x i8]) align 16 %_0, i128 %_1.0, i8 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairLotsOfZsts(_: ScalarPairLotsOfZsts) -> ScalarPairLotsOfZsts {
     loop {}
 }
 
 type ScalarPairLottaNesting = (((), ((), u8, (), u128, ())), ());
-// CHECK: define {{(dso_local )?}}{ i128, i8 } @test_ScalarPairLottaNesting(i128 %_1.0, i8 %_1.1)
+// CHECK: define {{(dso_local )?}}void @test_ScalarPairLottaNesting(ptr sret([32 x i8]) align 16 %_0, i128 %_1.0, i8 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairLottaNesting(_: ScalarPairLottaNesting) -> ScalarPairLottaNesting {
     loop {}

From 8ed77fd29e289a591df16390049a8e94f3c62b04 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sat, 12 Oct 2024 16:17:34 +0000
Subject: [PATCH 2/2] Update tests for 32bit targets

---
 tests/codegen/float/f128.rs              | 106 +++++++++++++++++------
 tests/codegen/float/f16.rs               |  26 ++++--
 tests/codegen/mir-aggregate-no-alloca.rs |  30 +++++--
 tests/codegen/range-attribute.rs         |  11 ++-
 tests/codegen/tuple-layout-opt.rs        |  26 ++++--
 tests/codegen/union-abi.rs               |  18 ++--
 6 files changed, 162 insertions(+), 55 deletions(-)

diff --git a/tests/codegen/float/f128.rs b/tests/codegen/float/f128.rs
index 80b572fbbc9f7..4af264101deed 100644
--- a/tests/codegen/float/f128.rs
+++ b/tests/codegen/float/f128.rs
@@ -1,7 +1,11 @@
 // 32-bit x86 returns `f32` and `f64` differently to avoid the x87 stack.
-//@ revisions: x86 other
+// 32-bit systems will return 128bit values using a return area pointer.
+//@ revisions: x86 bit32 bit64
 //@[x86] only-x86
-//@[other] ignore-x86
+//@[bit32] ignore-x86
+//@[bit32] only-32bit
+//@[bit64] ignore-x86
+//@[bit64] only-64bit
 
 // Verify that our intrinsics generate the correct LLVM calls for f128
 
@@ -52,42 +56,54 @@ pub fn f128_le(a: f128, b: f128) -> bool {
     a <= b
 }
 
-// CHECK-LABEL: fp128 @f128_neg(
+// x86-LABEL: void @f128_neg({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f128_neg({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f128_neg(
 #[no_mangle]
 pub fn f128_neg(a: f128) -> f128 {
     // CHECK: fneg fp128
     -a
 }
 
-// CHECK-LABEL: fp128 @f128_add(
+// x86-LABEL: void @f128_add({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f128_add({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f128_add(
 #[no_mangle]
 pub fn f128_add(a: f128, b: f128) -> f128 {
     // CHECK: fadd fp128 %{{.+}}, %{{.+}}
     a + b
 }
 
-// CHECK-LABEL: fp128 @f128_sub(
+// x86-LABEL: void @f128_sub({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f128_sub({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f128_sub(
 #[no_mangle]
 pub fn f128_sub(a: f128, b: f128) -> f128 {
     // CHECK: fsub fp128 %{{.+}}, %{{.+}}
     a - b
 }
 
-// CHECK-LABEL: fp128 @f128_mul(
+// x86-LABEL: void @f128_mul({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f128_mul({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f128_mul(
 #[no_mangle]
 pub fn f128_mul(a: f128, b: f128) -> f128 {
     // CHECK: fmul fp128 %{{.+}}, %{{.+}}
     a * b
 }
 
-// CHECK-LABEL: fp128 @f128_div(
+// x86-LABEL: void @f128_div({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f128_div({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f128_div(
 #[no_mangle]
 pub fn f128_div(a: f128, b: f128) -> f128 {
     // CHECK: fdiv fp128 %{{.+}}, %{{.+}}
     a / b
 }
 
-// CHECK-LABEL: fp128 @f128_rem(
+// x86-LABEL: void @f128_rem({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f128_rem({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f128_rem(
 #[no_mangle]
 pub fn f128_rem(a: f128, b: f128) -> f128 {
     // CHECK: frem fp128 %{{.+}}, %{{.+}}
@@ -143,44 +159,56 @@ pub fn f128_as_f16(a: f128) -> f16 {
     a as f16
 }
 
-// other-LABEL: float @f128_as_f32(
 // x86-LABEL: i32 @f128_as_f32(
+// bit32-LABEL: float @f128_as_f32(
+// bit64-LABEL: float @f128_as_f32(
 #[no_mangle]
 pub fn f128_as_f32(a: f128) -> f32 {
     // CHECK: fptrunc fp128 %{{.+}} to float
     a as f32
 }
 
-// other-LABEL: double @f128_as_f64(
 // x86-LABEL: void @f128_as_f64(
+// bit32-LABEL: double @f128_as_f64(
+// bit64-LABEL: double @f128_as_f64(
 #[no_mangle]
 pub fn f128_as_f64(a: f128) -> f64 {
     // CHECK: fptrunc fp128 %{{.+}} to double
     a as f64
 }
 
-// CHECK-LABEL: fp128 @f128_as_self(
+// x86-LABEL: void @f128_as_self({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f128_as_self({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f128_as_self(
 #[no_mangle]
 pub fn f128_as_self(a: f128) -> f128 {
-    // CHECK: ret fp128 %{{.+}}
+    // x86: store fp128 %a, ptr %_0, align 16
+    // bit32: store fp128 %a, ptr %_0, align 16
+    // bit64: ret fp128 %{{.+}}
     a as f128
 }
 
-// CHECK-LABEL: fp128 @f16_as_f128(
+// x86-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f16_as_f128(
 #[no_mangle]
 pub fn f16_as_f128(a: f16) -> f128 {
     // CHECK: fpext half %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @f32_as_f128(
+// x86-LABEL: void @f32_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f32_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f32_as_f128(
 #[no_mangle]
 pub fn f32_as_f128(a: f32) -> f128 {
     // CHECK: fpext float %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @f64_as_f128(
+// x86-LABEL: void @f64_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f64_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f64_as_f128(
 #[no_mangle]
 pub fn f64_as_f128(a: f64) -> f128 {
     // CHECK: fpext double %{{.+}} to fp128
@@ -216,7 +244,9 @@ pub fn f128_as_u64(a: f128) -> u64 {
     a as u64
 }
 
-// CHECK-LABEL: i128 @f128_as_u128(
+// x86-LABEL: void @f128_as_u128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f128_as_u128({{.*}}sret([16 x i8])
+// bit64-LABEL: i128 @f128_as_u128(
 #[no_mangle]
 pub fn f128_as_u128(a: f128) -> u128 {
     // CHECK: call i128 @llvm.fptoui.sat.i128.f128(fp128 %{{.+}})
@@ -250,7 +280,9 @@ pub fn f128_as_i64(a: f128) -> i64 {
     a as i64
 }
 
-// CHECK-LABEL: i128 @f128_as_i128(
+// x86-LABEL: void @f128_as_i128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f128_as_i128({{.*}}sret([16 x i8])
+// bit64-LABEL: i128 @f128_as_i128(
 #[no_mangle]
 pub fn f128_as_i128(a: f128) -> i128 {
     // CHECK: call i128 @llvm.fptosi.sat.i128.f128(fp128 %{{.+}})
@@ -259,70 +291,90 @@ pub fn f128_as_i128(a: f128) -> i128 {
 
 /* int to float conversions */
 
-// CHECK-LABEL: fp128 @u8_as_f128(
+// x86-LABEL: void @u8_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @u8_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @u8_as_f128(
 #[no_mangle]
 pub fn u8_as_f128(a: u8) -> f128 {
     // CHECK: uitofp i8 %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @u16_as_f128(
+// x86-LABEL: void @u16_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @u16_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @u16_as_f128(
 #[no_mangle]
 pub fn u16_as_f128(a: u16) -> f128 {
     // CHECK: uitofp i16 %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @u32_as_f128(
+// x86-LABEL: void @u32_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @u32_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @u32_as_f128(
 #[no_mangle]
 pub fn u32_as_f128(a: u32) -> f128 {
     // CHECK: uitofp i32 %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @u64_as_f128(
+// x86-LABEL: void @u64_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @u64_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @u64_as_f128(
 #[no_mangle]
 pub fn u64_as_f128(a: u64) -> f128 {
     // CHECK: uitofp i64 %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @u128_as_f128(
+// x86-LABEL: void @u128_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @u128_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @u128_as_f128(
 #[no_mangle]
 pub fn u128_as_f128(a: u128) -> f128 {
     // CHECK: uitofp i128 %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @i8_as_f128(
+// x86-LABEL: void @i8_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @i8_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @i8_as_f128(
 #[no_mangle]
 pub fn i8_as_f128(a: i8) -> f128 {
     // CHECK: sitofp i8 %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @i16_as_f128(
+// x86-LABEL: void @i16_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @i16_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @i16_as_f128(
 #[no_mangle]
 pub fn i16_as_f128(a: i16) -> f128 {
     // CHECK: sitofp i16 %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @i32_as_f128(
+// x86-LABEL: void @i32_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @i32_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @i32_as_f128(
 #[no_mangle]
 pub fn i32_as_f128(a: i32) -> f128 {
     // CHECK: sitofp i32 %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @i64_as_f128(
+// x86-LABEL: void @i64_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @i64_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @i64_as_f128(
 #[no_mangle]
 pub fn i64_as_f128(a: i64) -> f128 {
     // CHECK: sitofp i64 %{{.+}} to fp128
     a as f128
 }
 
-// CHECK-LABEL: fp128 @i128_as_f128(
+// x86-LABEL: void @i128_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @i128_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @i128_as_f128(
 #[no_mangle]
 pub fn i128_as_f128(a: i128) -> f128 {
     // CHECK: sitofp i128 %{{.+}} to fp128
diff --git a/tests/codegen/float/f16.rs b/tests/codegen/float/f16.rs
index 2910d7d3e928d..80931051f1885 100644
--- a/tests/codegen/float/f16.rs
+++ b/tests/codegen/float/f16.rs
@@ -1,7 +1,11 @@
 // 32-bit x86 returns `f32` and `f64` differently to avoid the x87 stack.
-//@ revisions: x86 other
+// 32-bit systems will return 128bit values using a return area pointer.
+//@ revisions: x86 bit32 bit64
 //@[x86] only-x86
-//@[other] ignore-x86
+//@[bit32] ignore-x86
+//@[bit32] only-32bit
+//@[bit64] ignore-x86
+//@[bit64] only-64bit
 
 // Verify that our intrinsics generate the correct LLVM calls for f16
 
@@ -145,23 +149,27 @@ pub fn f16_as_self(a: f16) -> f16 {
     a as f16
 }
 
-// other-LABEL: float @f16_as_f32(
 // x86-LABEL: i32 @f16_as_f32(
+// bit32-LABEL: float @f16_as_f32(
+// bit64-LABEL: float @f16_as_f32(
 #[no_mangle]
 pub fn f16_as_f32(a: f16) -> f32 {
     // CHECK: fpext half %{{.+}} to float
     a as f32
 }
 
-// other-LABEL: double @f16_as_f64(
 // x86-LABEL: void @f16_as_f64(
+// bit32-LABEL: double @f16_as_f64(
+// bit64-LABEL: double @f16_as_f64(
 #[no_mangle]
 pub fn f16_as_f64(a: f16) -> f64 {
     // CHECK: fpext half %{{.+}} to double
     a as f64
 }
 
-// CHECK-LABEL: fp128 @f16_as_f128(
+// x86-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
+// bit64-LABEL: fp128 @f16_as_f128(
 #[no_mangle]
 pub fn f16_as_f128(a: f16) -> f128 {
     // CHECK: fpext half %{{.+}} to fp128
@@ -218,7 +226,9 @@ pub fn f16_as_u64(a: f16) -> u64 {
     a as u64
 }
 
-// CHECK-LABEL: i128 @f16_as_u128(
+// x86-LABEL: void @f16_as_u128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f16_as_u128({{.*}}sret([16 x i8])
+// bit64-LABEL: i128 @f16_as_u128(
 #[no_mangle]
 pub fn f16_as_u128(a: f16) -> u128 {
     // CHECK: call i128 @llvm.fptoui.sat.i128.f16(half %{{.+}})
@@ -252,7 +262,9 @@ pub fn f16_as_i64(a: f16) -> i64 {
     a as i64
 }
 
-// CHECK-LABEL: i128 @f16_as_i128(
+// x86-LABEL: void @f16_as_i128({{.*}}sret([16 x i8])
+// bit32-LABEL: void @f16_as_i128({{.*}}sret([16 x i8])
+// bit64-LABEL: i128 @f16_as_i128(
 #[no_mangle]
 pub fn f16_as_i128(a: f16) -> i128 {
     // CHECK: call i128 @llvm.fptosi.sat.i128.f16(half %{{.+}})
diff --git a/tests/codegen/mir-aggregate-no-alloca.rs b/tests/codegen/mir-aggregate-no-alloca.rs
index c0e7e1a05e390..84eb565c85239 100644
--- a/tests/codegen/mir-aggregate-no-alloca.rs
+++ b/tests/codegen/mir-aggregate-no-alloca.rs
@@ -1,3 +1,7 @@
+// 32-bit systems will return 128bit values using a return area pointer.
+//@ revisions: bit32 bit64
+//@[bit32] only-32bit
+//@[bit64] only-64bit
 //@ compile-flags: -O -C no-prepopulate-passes -Z randomize-layout=no
 
 #![crate_type = "lib"]
@@ -98,26 +102,36 @@ pub fn make_struct_1(a: i32) -> Struct1 {
 
 pub struct Struct2Asc(i16, i64);
 
-// CHECK-LABEL: { i64, i16 } @make_struct_2_asc(i16 noundef %a, i64 noundef %b)
+// bit32-LABEL: void @make_struct_2_asc({{.*}} sret([16 x i8]) {{.*}} %s,
+// bit64-LABEL: { i64, i16 } @make_struct_2_asc(
+// CHECK-SAME: i16 noundef %a, i64 noundef %b)
 #[no_mangle]
 pub fn make_struct_2_asc(a: i16, b: i64) -> Struct2Asc {
     // CHECK-NOT: alloca
-    // CHECK: %[[TEMP0:.+]] = insertvalue { i64, i16 } poison, i64 %b, 0
-    // CHECK: %[[TEMP1:.+]] = insertvalue { i64, i16 } %[[TEMP0]], i16 %a, 1
-    // CHECK: ret { i64, i16 } %[[TEMP1]]
+    // bit32: %[[GEP:.+]] = getelementptr inbounds i8, ptr %s, i32 8
+    // bit32: store i16 %a, ptr %[[GEP]], align 8
+    // bit32: store i64 %b, ptr %s, align 8
+    // bit64: %[[TEMP0:.+]] = insertvalue { i64, i16 } poison, i64 %b, 0
+    // bit64: %[[TEMP1:.+]] = insertvalue { i64, i16 } %[[TEMP0]], i16 %a, 1
+    // bit64: ret { i64, i16 } %[[TEMP1]]
     let s = Struct2Asc(a, b);
     s
 }
 
 pub struct Struct2Desc(i64, i16);
 
-// CHECK-LABEL: { i64, i16 } @make_struct_2_desc(i64 noundef %a, i16 noundef %b)
+// bit32-LABEL: void @make_struct_2_desc({{.*}} sret([16 x i8]) {{.*}} %s,
+// bit64-LABEL: { i64, i16 } @make_struct_2_desc(
+// CHECK-SAME: i64 noundef %a, i16 noundef %b)
 #[no_mangle]
 pub fn make_struct_2_desc(a: i64, b: i16) -> Struct2Desc {
     // CHECK-NOT: alloca
-    // CHECK: %[[TEMP0:.+]] = insertvalue { i64, i16 } poison, i64 %a, 0
-    // CHECK: %[[TEMP1:.+]] = insertvalue { i64, i16 } %[[TEMP0]], i16 %b, 1
-    // CHECK: ret { i64, i16 } %[[TEMP1]]
+    // bit32: store i64 %a, ptr %s, align 8
+    // bit32: %[[GEP:.+]] = getelementptr inbounds i8, ptr %s, i32 8
+    // bit32: store i16 %b, ptr %[[GEP]], align 8
+    // bit64: %[[TEMP0:.+]] = insertvalue { i64, i16 } poison, i64 %a, 0
+    // bit64: %[[TEMP1:.+]] = insertvalue { i64, i16 } %[[TEMP0]], i16 %b, 1
+    // bit64: ret { i64, i16 } %[[TEMP1]]
     let s = Struct2Desc(a, b);
     s
 }
diff --git a/tests/codegen/range-attribute.rs b/tests/codegen/range-attribute.rs
index bb19bec0fb932..456b28fc53578 100644
--- a/tests/codegen/range-attribute.rs
+++ b/tests/codegen/range-attribute.rs
@@ -1,6 +1,10 @@
 // Checks that range metadata gets emitted on functions result and arguments
 // with scalar value.
 
+// 32-bit systems will return 128bit values using a return area pointer.
+//@ revisions: bit32 bit64
+//@[bit32] only-32bit
+//@[bit64] only-64bit
 //@ compile-flags: -O -C no-prepopulate-passes
 //@ min-llvm-version: 19
 
@@ -13,7 +17,8 @@ use std::num::NonZero;
 #[no_mangle]
 pub fn helper(_: usize) {}
 
-// CHECK: noundef range(i128 1, 0) i128 @nonzero_int(i128 noundef range(i128 1, 0) %x)
+// bit32: void @nonzero_int({{.*}} sret([16 x i8]) {{.*}}, i128 noundef range(i128 1, 0) %x)
+// bit64: noundef range(i128 1, 0) i128 @nonzero_int(i128 noundef range(i128 1, 0) %x)
 #[no_mangle]
 pub fn nonzero_int(x: NonZero<u128>) -> NonZero<u128> {
     x
@@ -43,7 +48,9 @@ pub enum Enum1 {
     C(u64),
 }
 
-// CHECK: { [[ENUM1_TYP:i[0-9]+]], i64 } @enum1_value([[ENUM1_TYP]] noundef range([[ENUM1_TYP]] 0, 3) %x.0, i64 noundef %x.1)
+// bit32: void @enum1_value({{.*}} sret([16 x i8]) {{[^,]*}}, [[ENUM1_TYP:i[0-9]+]]
+// bit64: { [[ENUM1_TYP:i[0-9]+]], i64 } @enum1_value([[ENUM1_TYP]]
+// CHECK-SAME: noundef range([[ENUM1_TYP]] 0, 3) %x.0, i64 noundef %x.1)
 #[no_mangle]
 pub fn enum1_value(x: Enum1) -> Enum1 {
     x
diff --git a/tests/codegen/tuple-layout-opt.rs b/tests/codegen/tuple-layout-opt.rs
index 3202e4749c58e..ce64b7b46a918 100644
--- a/tests/codegen/tuple-layout-opt.rs
+++ b/tests/codegen/tuple-layout-opt.rs
@@ -1,3 +1,7 @@
+// 32-bit systems will return 128bit values using a return area pointer.
+//@ revisions: bit32 bit64
+//@[bit32] only-32bit
+//@[bit64] only-64bit
 //@ compile-flags: -C no-prepopulate-passes -Copt-level=0
 
 // Test that tuples get optimized layout, in particular with a ZST in the last field (#63244)
@@ -5,42 +9,52 @@
 #![crate_type = "lib"]
 
 type ScalarZstLast = (u128, ());
-// CHECK: define {{(dso_local )?}}i128 @test_ScalarZstLast(i128 %_1)
+// bit32: define {{(dso_local )?}}void @test_ScalarZstLast({{.*}} sret([16 x i8]) {{.*}}, i128 %_1)
+// bit64: define {{(dso_local )?}}i128 @test_ScalarZstLast(i128 %_1)
 #[no_mangle]
 pub fn test_ScalarZstLast(_: ScalarZstLast) -> ScalarZstLast {
     loop {}
 }
 
 type ScalarZstFirst = ((), u128);
-// CHECK: define {{(dso_local )?}}i128 @test_ScalarZstFirst(i128 %_1)
+// bit32: define {{(dso_local )?}}void @test_ScalarZstFirst({{.*}} sret([16 x i8]) {{.*}}, i128 %_1)
+// bit64: define {{(dso_local )?}}i128 @test_ScalarZstFirst(i128 %_1)
 #[no_mangle]
 pub fn test_ScalarZstFirst(_: ScalarZstFirst) -> ScalarZstFirst {
     loop {}
 }
 
 type ScalarPairZstLast = (u8, u128, ());
-// CHECK: define {{(dso_local )?}}void @test_ScalarPairZstLast(ptr sret([32 x i8]) align 16 %_0, i128 %_1.0, i8 %_1.1)
+// bit32: define {{(dso_local )?}}void @test_ScalarPairZstLast(ptr sret([24 x i8]) align 8
+// bit64: define {{(dso_local )?}}void @test_ScalarPairZstLast(ptr sret([32 x i8]) align 16
+// CHECK-SAME: %_0, i128 %_1.0, i8 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairZstLast(_: ScalarPairZstLast) -> ScalarPairZstLast {
     loop {}
 }
 
 type ScalarPairZstFirst = ((), u8, u128);
-// CHECK: define {{(dso_local )?}}void @test_ScalarPairZstFirst(ptr sret([32 x i8]) align 16 %_0, i8 %_1.0, i128 %_1.1)
+// bit32: define {{(dso_local )?}}void @test_ScalarPairZstFirst(ptr sret([24 x i8]) align 8
+// bit64: define {{(dso_local )?}}void @test_ScalarPairZstFirst(ptr sret([32 x i8]) align 16
+// CHECK-SAME: %_0, i8 %_1.0, i128 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairZstFirst(_: ScalarPairZstFirst) -> ScalarPairZstFirst {
     loop {}
 }
 
 type ScalarPairLotsOfZsts = ((), u8, (), u128, ());
-// CHECK: define {{(dso_local )?}}void @test_ScalarPairLotsOfZsts(ptr sret([32 x i8]) align 16 %_0, i128 %_1.0, i8 %_1.1)
+// bit32: define {{(dso_local )?}}void @test_ScalarPairLotsOfZsts(ptr sret([24 x i8]) align 8
+// bit64: define {{(dso_local )?}}void @test_ScalarPairLotsOfZsts(ptr sret([32 x i8]) align 16
+// CHECK-SAME: %_0, i128 %_1.0, i8 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairLotsOfZsts(_: ScalarPairLotsOfZsts) -> ScalarPairLotsOfZsts {
     loop {}
 }
 
 type ScalarPairLottaNesting = (((), ((), u8, (), u128, ())), ());
-// CHECK: define {{(dso_local )?}}void @test_ScalarPairLottaNesting(ptr sret([32 x i8]) align 16 %_0, i128 %_1.0, i8 %_1.1)
+// bit32: define {{(dso_local )?}}void @test_ScalarPairLottaNesting(ptr sret([24 x i8]) align 8
+// bit64: define {{(dso_local )?}}void @test_ScalarPairLottaNesting(ptr sret([32 x i8]) align 16
+// CHECK-SAME: %_0, i128 %_1.0, i8 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairLottaNesting(_: ScalarPairLottaNesting) -> ScalarPairLottaNesting {
     loop {}
diff --git a/tests/codegen/union-abi.rs b/tests/codegen/union-abi.rs
index a1c081d7d61c2..b3c67a59730d9 100644
--- a/tests/codegen/union-abi.rs
+++ b/tests/codegen/union-abi.rs
@@ -1,9 +1,13 @@
 //@ ignore-emscripten vectors passed directly
 //@ compile-flags: -O -C no-prepopulate-passes
 // 32-bit x86 returns `f32` differently to avoid the x87 stack.
-//@ revisions: x86 other
+// 32-bit systems will return 128bit values using a return area pointer.
+//@ revisions: x86 bit32 bit64
 //@[x86] only-x86
-//@[other] ignore-x86
+//@[bit32] ignore-x86
+//@[bit32] only-32bit
+//@[bit64] ignore-x86
+//@[bit64] only-64bit
 
 // This test that using union forward the abi of the inner type, as
 // discussed in #54668
@@ -71,8 +75,9 @@ pub union UnionF32 {
     a: f32,
 }
 
-// other: define {{(dso_local )?}}float @test_UnionF32(float %_1)
 // x86: define {{(dso_local )?}}i32 @test_UnionF32(float %_1)
+// bit32: define {{(dso_local )?}}float @test_UnionF32(float %_1)
+// bit64: define {{(dso_local )?}}float @test_UnionF32(float %_1)
 #[no_mangle]
 pub fn test_UnionF32(_: UnionF32) -> UnionF32 {
     loop {}
@@ -83,8 +88,9 @@ pub union UnionF32F32 {
     b: f32,
 }
 
-// other: define {{(dso_local )?}}float @test_UnionF32F32(float %_1)
 // x86: define {{(dso_local )?}}i32 @test_UnionF32F32(float %_1)
+// bit32: define {{(dso_local )?}}float @test_UnionF32F32(float %_1)
+// bit64: define {{(dso_local )?}}float @test_UnionF32F32(float %_1)
 #[no_mangle]
 pub fn test_UnionF32F32(_: UnionF32F32) -> UnionF32F32 {
     loop {}
@@ -104,7 +110,9 @@ pub fn test_UnionF32U32(_: UnionF32U32) -> UnionF32U32 {
 pub union UnionU128 {
     a: u128,
 }
-// CHECK: define {{(dso_local )?}}i128 @test_UnionU128(i128 %_1)
+// x86: define {{(dso_local )?}}void @test_UnionU128({{.*}}sret([16 x i8]){{.*}}, i128 %_1)
+// bit32: define {{(dso_local )?}}void @test_UnionU128({{.*}}sret([16 x i8]){{.*}}, i128 %_1)
+// bit64: define {{(dso_local )?}}i128 @test_UnionU128(i128 %_1)
 #[no_mangle]
 pub fn test_UnionU128(_: UnionU128) -> UnionU128 {
     loop {}