Add fp4 support (currently static double quantization is not working) #3532
Draft: lanluo-nvidia wants to merge 2 commits into main from lluo/fp4_issue_debugging
Conversation
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/impl/addmm.py 2025-05-25 17:51:42.835275+00:00
+++ /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/impl/addmm.py 2025-05-25 17:52:07.703670+00:00
@@ -6,10 +6,11 @@
from torch_tensorrt.dynamo._SourceIR import SourceIR
from torch_tensorrt.dynamo.conversion import impl
from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
from torch_tensorrt.fx.types import TRTTensor
import os
+
def addmm(
ctx: ConversionContext,
target: Target,
source_ir: Optional[SourceIR],
--- /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py 2025-05-25 17:51:42.834275+00:00
+++ /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py 2025-05-25 17:52:08.266101+00:00
@@ -272,17 +272,23 @@
builder_config.set_memory_pool_limit(
trt.MemoryPoolType.DLA_GLOBAL_DRAM,
self.compilation_settings.dla_global_dram_size,
)
- if not self.compilation_settings.use_explicit_typing and dtype.float16 in self.compilation_settings.enabled_precisions:
+ if (
+ not self.compilation_settings.use_explicit_typing
+ and dtype.float16 in self.compilation_settings.enabled_precisions
+ ):
builder_config.set_flag(trt.BuilderFlag.FP16)
if dtype.int8 in self.compilation_settings.enabled_precisions:
builder_config.set_flag(trt.BuilderFlag.INT8)
- if not self.compilation_settings.use_explicit_typing and dtype.fp8 in self.compilation_settings.enabled_precisions:
+ if (
+ not self.compilation_settings.use_explicit_typing
+ and dtype.fp8 in self.compilation_settings.enabled_precisions
+ ):
builder_config.set_flag(trt.BuilderFlag.FP8)
if dtype.bfloat16 in self.compilation_settings.enabled_precisions:
builder_config.set_flag(trt.BuilderFlag.BF16)
--- /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/impl/permutation.py 2025-05-25 17:51:42.836275+00:00
+++ /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/impl/permutation.py 2025-05-25 17:52:08.286663+00:00
@@ -13,10 +13,11 @@
)
from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape
from torch_tensorrt.fx.types import TRTTensor
import os
+
def permute(
ctx: ConversionContext,
target: Target,
source_ir: Optional[SourceIR],
name: str,
--- /home/runner/work/TensorRT/TensorRT/tests/py/dynamo/models/test_models_export.py 2025-05-25 17:51:42.863275+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/py/dynamo/models/test_models_export.py 2025-05-25 17:52:13.684130+00:00
@@ -13,10 +13,11 @@
from packaging.version import Version
assertions = unittest.TestCase()
import os
+
@pytest.mark.unit
def test_resnet18(ir):
model = models.resnet18(pretrained=True).eval().to("cuda")
input = torch.randn((1, 3, 224, 224)).to("cuda")
@@ -208,10 +209,11 @@
)
@pytest.mark.unit
def test_base_fp4(ir):
import modelopt.torch.quantization as mtq
from modelopt.torch.quantization.utils import export_torch_mode
+
dtype = torch.float16
class SimpleNetwork(torch.nn.Module):
def __init__(self):
super(SimpleNetwork, self).__init__()
@@ -227,21 +229,20 @@
"""Simple calibration function for testing."""
model(input_tensor)
input_tensor = torch.ones(128, 64, dtype=dtype).cuda()
-
model = SimpleNetwork().eval().cuda()
model.linear1.weight = torch.nn.Parameter(torch.ones(32, 64, dtype=dtype).cuda())
model.linear1.bias = torch.nn.Parameter(torch.zeros(128, 32, dtype=dtype).cuda())
print(f"lan added amax: {input_tensor.abs().amax()=}")
print(f"lan added amax: {model.linear1.weight.abs().amax()=}")
expected_output = model(input_tensor)
- print(f"lan added model input: {input_tensor=}")
+ print(f"lan added model input: {input_tensor=}")
print(f"lan added model weight: {model.linear1.weight=}")
print(f"lan added model bias: {model.linear1.bias=}")
-
+
quant_cfg = mtq.NVFP4_DEFAULT_CFG
mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
# model has qdq nodes at this point
with torch.no_grad():
with export_torch_mode():
@@ -269,15 +270,21 @@
print("lan added disable_gemm is set, compring result with weights")
expected_output = model.linear1.weight
else:
print("lan added disable_gemm is not set, compring result with pytorch")
- print(f"lan added torch_tensorrt outputs_trt: {outputs_trt=} {outputs_trt.dtype=} {outputs_trt.shape=} {outputs_trt.abs().amax()=}")
- print(f"lan added expected output_pyt: {expected_output=} {expected_output.dtype=} {expected_output.shape=} {expected_output.abs().amax()=}")
+ print(
+ f"lan added torch_tensorrt outputs_trt: {outputs_trt=} {outputs_trt.dtype=} {outputs_trt.shape=} {outputs_trt.abs().amax()=}"
+ )
+ print(
+ f"lan added expected output_pyt: {expected_output=} {expected_output.dtype=} {expected_output.shape=} {expected_output.abs().amax()=}"
+ )
abs_diff = torch.abs(expected_output - outputs_trt)
- print(f"lan added max /mean abs_diff: {abs_diff.max().item()=} {abs_diff.mean()=}")
+ print(
+ f"lan added max /mean abs_diff: {abs_diff.max().item()=} {abs_diff.mean()=}"
+ )
print(f"lan added abs_diff: {abs_diff=}")
assert torch.allclose(expected_output, outputs_trt, rtol=0.8, atol=0.8)
@unittest.skipIf(
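For reference, the FP4 path this PR targets would presumably follow the same pattern as the FP16/FP8 branches reformatted in the _TRTInterpreter.py hunk above. Below is a minimal sketch of such a branch, factored into a standalone helper; both the dtype.fp4 member and trt.BuilderFlag.FP4 are assumptions here (the former would be a torch_tensorrt dtype added by this PR, the latter is only exposed by newer TensorRT 10.x builds) and neither is confirmed by the diff.

import tensorrt as trt

def maybe_enable_fp4(builder_config, compilation_settings, dtype):
    # Hypothetical helper mirroring the FP16/FP8 flag-setting branches above.
    # dtype.fp4 and trt.BuilderFlag.FP4 are assumed to exist; adjust to the
    # names the PR actually introduces.
    if (
        not compilation_settings.use_explicit_typing
        and dtype.fp4 in compilation_settings.enabled_precisions
    ):
        builder_config.set_flag(trt.BuilderFlag.FP4)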
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/impl/addmm.py 2025-05-28 16:06:33.359691+00:00
+++ /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/impl/addmm.py 2025-05-28 16:06:58.870610+00:00
@@ -6,10 +6,11 @@
from torch_tensorrt.dynamo._SourceIR import SourceIR
from torch_tensorrt.dynamo.conversion import impl
from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
from torch_tensorrt.fx.types import TRTTensor
import os
+
def addmm(
ctx: ConversionContext,
target: Target,
source_ir: Optional[SourceIR],
--- /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py 2025-05-28 16:06:33.358691+00:00
+++ /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py 2025-05-28 16:06:59.425511+00:00
@@ -272,17 +272,23 @@
builder_config.set_memory_pool_limit(
trt.MemoryPoolType.DLA_GLOBAL_DRAM,
self.compilation_settings.dla_global_dram_size,
)
- if not self.compilation_settings.use_explicit_typing and dtype.float16 in self.compilation_settings.enabled_precisions:
+ if (
+ not self.compilation_settings.use_explicit_typing
+ and dtype.float16 in self.compilation_settings.enabled_precisions
+ ):
builder_config.set_flag(trt.BuilderFlag.FP16)
if dtype.int8 in self.compilation_settings.enabled_precisions:
builder_config.set_flag(trt.BuilderFlag.INT8)
- if not self.compilation_settings.use_explicit_typing and dtype.fp8 in self.compilation_settings.enabled_precisions:
+ if (
+ not self.compilation_settings.use_explicit_typing
+ and dtype.fp8 in self.compilation_settings.enabled_precisions
+ ):
builder_config.set_flag(trt.BuilderFlag.FP8)
if dtype.bfloat16 in self.compilation_settings.enabled_precisions:
builder_config.set_flag(trt.BuilderFlag.BF16)
--- /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/impl/permutation.py 2025-05-28 16:06:33.360691+00:00
+++ /home/runner/work/TensorRT/TensorRT/py/torch_tensorrt/dynamo/conversion/impl/permutation.py 2025-05-28 16:06:59.489610+00:00
@@ -13,10 +13,11 @@
)
from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape
from torch_tensorrt.fx.types import TRTTensor
import os
+
def permute(
ctx: ConversionContext,
target: Target,
source_ir: Optional[SourceIR],
name: str,
Labels
cla signed
component: api [Python] - Issues re: Python API
component: build system - Issues re: Build system
component: conversion - Issues re: Conversion stage
component: converters - Issues re: Specific op converters
component: dynamo - Issues relating to the `torch.compile` or `torch._dynamo.export` paths
component: lowering - Issues re: The lowering / preprocessing passes
component: tests - Issues re: Tests
WIP - Work is in progress, pull request should not be merged yet
Description
Add fp4 support
Fixes # (issue)
Type of change
Please delete options that are not relevant and/or add your own.
Checklist:
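As context for the "Add fp4 support" description, here is a minimal usage sketch of the flow exercised by test_base_fp4 in the diff above: quantize a small linear model to NVFP4 with modelopt, export it, and compile with Torch-TensorRT. Only the modelopt calls (mtq.NVFP4_DEFAULT_CFG, mtq.quantize, export_torch_mode) appear in the diff; the compile arguments, in particular use_explicit_typing and min_block_size, are assumptions for illustration, not the PR's confirmed settings.

import torch
import torch_tensorrt
import modelopt.torch.quantization as mtq
from modelopt.torch.quantization.utils import export_torch_mode

class SimpleNetwork(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(64, 32, bias=True, dtype=torch.float16)

    def forward(self, x):
        return self.linear1(x)

input_tensor = torch.ones(128, 64, dtype=torch.float16).cuda()
model = SimpleNetwork().eval().cuda()

def calibrate_loop(model):
    # Simple one-shot calibration pass, as in the test above.
    model(input_tensor)

# Insert NVFP4 Q/DQ nodes (modelopt API as used in the diff).
quant_cfg = mtq.NVFP4_DEFAULT_CFG
mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)

with torch.no_grad():
    with export_torch_mode():
        exp_program = torch.export.export(model, (input_tensor,))
        # use_explicit_typing=True is an assumption about how the FP4 Q/DQ
        # nodes are meant to be lowered; the exact settings are not shown
        # in the diff above.
        trt_model = torch_tensorrt.dynamo.compile(
            exp_program,
            inputs=[input_tensor],
            use_explicit_typing=True,
            min_block_size=1,
        )
        outputs_trt = trt_model(input_tensor)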