Add fma and sub

2025-10-03 14:50:43 +00:00 · 2024-08-21 03:02:41 +02:00 · 2024-08-21 03:02:41 +02:00 · 798bbf06e1
commit 798bbf06e1
parent 6cd18bfdb8
2 changed files with 190 additions and 10 deletions
--- a/ptx_parser/src/ast.rs
+++ b/ptx_parser/src/ast.rs
@ -221,6 +221,25 @@ gen::generate_instruction_type!(
                src3: T,
            }
        },
+        Fma {  // PTX `fma` instruction: one destination and three source operands
+            type: { Type::from(data.type_) },  // built from the `type_` field of the ArithFloat payload
+            data: ArithFloat,
+            arguments<T>: {
+                dst: T,
+                src1: T,
+                src2: T,
+                src3: T,
+            }
+        },
+        Sub {  // PTX `sub` instruction: one destination and two source operands
+            type: { Type::from(data.type_()) },  // ArithDetails is queried through a `type_()` call rather than a field read
+            data: ArithDetails,  // carries either the Integer or the Float details built by the parser rules
+            arguments<T>: {
+                dst: T,
+                src1: T,
+                src2: T,
+            }
+        },
        Trap { }
    }
 );
--- a/ptx_parser/src/main.rs
+++ b/ptx_parser/src/main.rs
@ -1481,7 +1481,7 @@ derive_parser!(
    mul{.rnd}{.ftz}{.sat}.f32  d, a, b => {
        ast::Instruction::Mul {
            data: ast::MulDetails::Float (
-                ArithFloat {
+                ast::ArithFloat {
                    type_: f32,
                    rounding: rnd.map(Into::into),
                    flush_to_zero: Some(ftz),
@ -1494,7 +1494,7 @@ derive_parser!(
    mul{.rnd}.f64              d, a, b => {
        ast::Instruction::Mul {
            data: ast::MulDetails::Float (
-                ArithFloat {
+                ast::ArithFloat {
                    type_: f64,
                    rounding: rnd.map(Into::into),
                    flush_to_zero: None,
@ -1510,7 +1510,7 @@ derive_parser!(
    mul{.rnd}{.ftz}{.sat}.f16   d, a, b => {
        ast::Instruction::Mul {
            data: ast::MulDetails::Float (
-                ArithFloat {
+                ast::ArithFloat {
                    type_: f16,
                    rounding: rnd.map(Into::into),
                    flush_to_zero: Some(ftz),
@ -1523,7 +1523,7 @@ derive_parser!(
    mul{.rnd}{.ftz}{.sat}.f16x2 d, a, b => {
        ast::Instruction::Mul {
            data: ast::MulDetails::Float (
-                ArithFloat {
+                ast::ArithFloat {
                    type_: f16x2,
                    rounding: rnd.map(Into::into),
                    flush_to_zero: Some(ftz),
@ -1536,7 +1536,7 @@ derive_parser!(
    mul{.rnd}.bf16   d, a, b => {
        ast::Instruction::Mul {
            data: ast::MulDetails::Float (
-                ArithFloat {
+                ast::ArithFloat {
                    type_: bf16,
                    rounding: rnd.map(Into::into),
                    flush_to_zero: None,
@ -1549,7 +1549,7 @@ derive_parser!(
    mul{.rnd}.bf16x2 d, a, b => {
        ast::Instruction::Mul {
            data: ast::MulDetails::Float (
-                ArithFloat {
+                ast::ArithFloat {
                    type_: bf16x2,
                    rounding: rnd.map(Into::into),
                    flush_to_zero: None,
@ -1835,7 +1835,7 @@ derive_parser!(
    mad{.ftz}{.sat}.f32      d, a, b, c => {
        ast::Instruction::Mad {
            data: ast::MadDetails::Float(
-                ArithFloat {
+                ast::ArithFloat {
                    type_: f32,
                    rounding: None,
                    flush_to_zero: Some(ftz),
@ -1848,7 +1848,7 @@ derive_parser!(
    mad.rnd{.ftz}{.sat}.f32  d, a, b, c => {
        ast::Instruction::Mad {
            data: ast::MadDetails::Float(
-                ArithFloat {
+                ast::ArithFloat {
                    type_: f32,
                    rounding: Some(rnd.into()),
                    flush_to_zero: Some(ftz),
@ -1861,7 +1861,7 @@ derive_parser!(
    mad.rnd.f64              d, a, b, c => {
        ast::Instruction::Mad {
            data: ast::MadDetails::Float(
-                ArithFloat {
+                ast::ArithFloat {
                    type_: f64,
                    rounding: Some(rnd.into()),
                    flush_to_zero: None,
@ -1869,10 +1869,171 @@ derive_parser!(
                }
            ),
            arguments: MadArgs { dst: d, src1: a, src2: b, src3: c  }
-        }}
+        }
+    }
    .rnd: RawRoundingMode   = { .rn, .rz, .rm, .rp };
    ScalarType =        { .f32, .f64 };

+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma
+    fma.rnd{.ftz}{.sat}.f32  d, a, b, c => {  // f32 form: .rnd is required; .ftz and .sat are written as optional
+        ast::Instruction::Fma {
+            data: ast::ArithFloat {
+                type_: f32,
+                rounding: Some(rnd.into()),  // always Some because the pattern requires .rnd
+                flush_to_zero: Some(ftz),
+                saturate: sat
+            },
+            arguments: FmaArgs { dst: d, src1: a, src2: b, src3: c  }
+        }
+    }
+    fma.rnd.f64              d, a, b, c => {  // f64 form: only the rounding modifier appears in the pattern
+        ast::Instruction::Fma {
+            data: ast::ArithFloat {
+                type_: f64,
+                rounding: Some(rnd.into()),
+                flush_to_zero: None,  // the f64 pattern has no .ftz modifier
+                saturate: false  // the f64 pattern has no .sat modifier
+            },
+            arguments: FmaArgs { dst: d, src1: a, src2: b, src3: c  }
+        }
+    }
+    .rnd: RawRoundingMode = { .rn, .rz, .rm, .rp };  // rounding modifiers; presumably scoped to the fma rules directly above - confirm in derive_parser!
+    ScalarType =            { .f32, .f64 };  // type suffixes used by the two rules above
+
+    fma.rnd{.ftz}{.sat}.f16 d, a, b, c => {  // half-precision form; the other half-precision variants are listed below but commented out
+        ast::Instruction::Fma {
+            data: ast::ArithFloat {
+                type_: f16,
+                rounding: Some(rnd.into()),  // always Some because the pattern requires .rnd
+                flush_to_zero: Some(ftz),
+                saturate: sat
+            },
+            arguments: FmaArgs { dst: d, src1: a, src2: b, src3: c  }
+        }
+    }
+    //fma.rnd{.ftz}{.sat}.f16x2   d, a, b, c;
+    //fma.rnd{.ftz}.relu.f16      d, a, b, c;
+    //fma.rnd{.ftz}.relu.f16x2    d, a, b, c;
+    //fma.rnd{.relu}.bf16         d, a, b, c;
+    //fma.rnd{.relu}.bf16x2       d, a, b, c;
+    //fma.rnd.oob{.relu}.type     d, a, b, c;
+    .rnd: RawRoundingMode = { .rn };  // only .rn is listed for the half-precision fma rule
+    ScalarType =            { .f16 };  // only .f16 is listed; the commented-out forms above have no rule in this group
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sub
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub
+    sub.type       d, a, b => {  // integer form without saturation
+        ast::Instruction::Sub {
+            data: ast::ArithDetails::Integer(
+                ArithInteger {
+                    type_,  // shorthand init; presumably bound by the `.type` modifier declared below - confirm in derive_parser!
+                    saturate: false  // this pattern has no .sat modifier
+                }
+            ),
+            arguments: SubArgs { dst: d, src1: a, src2: b  }
+        }
+    }
+    sub.sat.s32  d, a, b => {  // saturating form; the pattern fixes the type to .s32
+        ast::Instruction::Sub {
+            data: ast::ArithDetails::Integer(
+                ArithInteger {
+                    type_: s32,
+                    saturate: true  // .sat is a mandatory part of this pattern
+                }
+            ),
+            arguments: SubArgs { dst: d, src1: a, src2: b  }
+        }
+    }
+    .type: ScalarType = { .u16, .u32, .u64,  // integer types listed for the `.type` modifier of the first rule
+                          .s16, .s32, .s64 };
+    ScalarType = { .s32 };  // presumably declares the literal .s32 used by the sub.sat.s32 pattern - confirm in derive_parser!
+
+    sub{.rnd}{.ftz}{.sat}.f32  d, a, b => {  // f32 form: every modifier in the pattern is written as optional
+        ast::Instruction::Sub {
+            data: ast::ArithDetails::Float(
+                ast::ArithFloat {
+                    type_: f32,
+                    rounding: rnd.map(Into::into),  // .rnd is optional here, so the binding is mapped instead of wrapped in Some
+                    flush_to_zero: Some(ftz),
+                    saturate: sat
+                }
+            ),
+            arguments: SubArgs { dst: d, src1: a, src2: b  }
+        }
+    }
+    sub{.rnd}.f64              d, a, b => {  // f64 form: only the optional rounding modifier appears in the pattern
+        ast::Instruction::Sub {
+            data: ast::ArithDetails::Float(
+                ast::ArithFloat {
+                    type_: f64,
+                    rounding: rnd.map(Into::into),
+                    flush_to_zero: None,  // the f64 pattern has no .ftz modifier
+                    saturate: false  // the f64 pattern has no .sat modifier
+                }
+            ),
+            arguments: SubArgs { dst: d, src1: a, src2: b  }
+        }
+    }
+    .rnd: RawRoundingMode = { .rn, .rz, .rm, .rp };  // rounding modifiers; presumably scoped to the two sub rules directly above - confirm in derive_parser!
+    ScalarType = { .f32, .f64 };  // type suffixes used by the two rules above
+
+    sub{.rnd}{.ftz}{.sat}.f16   d, a, b => {  // f16 form: .rnd, .ftz and .sat are all written as optional
+        ast::Instruction::Sub {
+            data: ast::ArithDetails::Float(
+                ast::ArithFloat {
+                    type_: f16,
+                    rounding: rnd.map(Into::into),  // .rnd is optional here, so the binding is mapped instead of wrapped in Some
+                    flush_to_zero: Some(ftz),
+                    saturate: sat
+                }
+            ),
+            arguments: SubArgs { dst: d, src1: a, src2: b  }
+        }
+    }
+    sub{.rnd}{.ftz}{.sat}.f16x2 d, a, b => {  // f16x2 form: same modifiers and field mapping as the f16 rule
+        ast::Instruction::Sub {
+            data: ast::ArithDetails::Float(
+                ast::ArithFloat {
+                    type_: f16x2,
+                    rounding: rnd.map(Into::into),
+                    flush_to_zero: Some(ftz),
+                    saturate: sat
+                }
+            ),
+            arguments: SubArgs { dst: d, src1: a, src2: b  }
+        }
+    }
+    sub{.rnd}.bf16   d, a, b => {  // bf16 form: only the optional rounding modifier appears in the pattern
+        ast::Instruction::Sub {
+            data: ast::ArithDetails::Float(
+                ast::ArithFloat {
+                    type_: bf16,
+                    rounding: rnd.map(Into::into),
+                    flush_to_zero: None,  // the bf16 pattern has no .ftz modifier
+                    saturate: false  // the bf16 pattern has no .sat modifier
+                }
+            ),
+            arguments: SubArgs { dst: d, src1: a, src2: b  }
+        }
+    }
+    sub{.rnd}.bf16x2 d, a, b => {  // bf16x2 form: same modifiers and field mapping as the bf16 rule
+        ast::Instruction::Sub {
+            data: ast::ArithDetails::Float(
+                ast::ArithFloat {
+                    type_: bf16x2,
+                    rounding: rnd.map(Into::into),
+                    flush_to_zero: None,
+                    saturate: false
+                }
+            ),
+            arguments: SubArgs { dst: d, src1: a, src2: b  }
+        }
+    }
+    .rnd: RawRoundingMode = { .rn };  // only .rn is listed for the half-precision sub rules
+    ScalarType = { .f16, .f16x2, .bf16, .bf16x2 };  // type suffixes used by the four rules above
+
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret
    ret{.uni} => {
        Instruction::Ret { data: RetData { uniform: uni } }