Fix faulty codegen for non-vectorized binary fp8 operations

AntonMoberg · AntonMoberg · commit efb2d90ed481 · 2025-03-10T12:47:07.000+01:00
Non-vectorized FP8 values are stored as __nv_fp8_[e5m2/e4m3] types. These
types do not support binary operations because internally FP8 values are
stored in 16-bit registers. This commit adds binary operator support by
doing the operations in __half instead of fp8 (i.e., cast up to 16-bit,
then cast down to 8-bit).
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
@@ -214,7 +214,7 @@ std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExp
   if (alloc_storage_scope_.count(buffer_var)) {
     scope = alloc_storage_scope_.at(buffer_var);
   }
-  bool is_vol = IsVolatile(buffer_var);
+  bool is_vol = IsVolatile(buffer_var) && !t.is_float8();
 
   auto ptr_cast = [this, is_vol, scope](DataType pointed_to) {
     std::ostringstream ptr_os;
@@ -840,7 +840,8 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
     std::string value = this->PrintExpr(op->value);
     std::string ref = this->GetBufferRef(value_dtype, op->buffer.get(), index_expr);
     this->PrintIndent();
-    stream << ref << " = " << value << ";\n";
+    stream << ref << " = ";
+    stream << value << ";\n";
   } else {
     arith::PVar<PrimExpr> base;
 
@@ -876,7 +877,16 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
         stream << '[';
         PrintVecElemLoad(index, index_expr.dtype(), i, stream);
         stream << "] = ";
-        PrintVecElemLoad(value, op->value.dtype(), i, stream);
+        if (op->value.dtype().is_float8()) {
+          ICHECK(value_dtype.lanes() == 2);
+          std::string fp8_type = op->value.dtype().is_e5m2_float8() ? "e5m2" : "e4m3";
+          static const char access[] = {'x', 'y'};
+          stream << "__nv_fp8_" << fp8_type << "(__half2(";
+          PrintVecElemLoad(value, op->value.dtype(), i, stream);
+          stream << ")." << access[i % 2] << ")";
+        } else {
+          PrintVecElemLoad(value, op->value.dtype(), i, stream);
+        }
         stream << ";\n";
       }
       EndScope(vec_scope);
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
@@ -536,26 +536,32 @@ void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr l
       std::string fp8_lanes = (t.lanes() == 4) ? "x4" : ((t.lanes() == 2) ? "x2" : "");
       ICHECK(t.is_e4m3_float8() || t.is_e5m2_float8());
       if (t.lanes() == 2) {
-        value_temp << "__nv_cvt_halfraw2_to_fp8x2(";
+        value_temp << "__nv_fp8x2_" << (t.is_e5m2_float8() ? "e5m2" : "e4m3") << "(";
       } else {
-        value_temp << "__nv_fp8x4(";
+        value_temp << "__nv_fp8x4_" << (t.is_e5m2_float8() ? "e5m2" : "e4m3") << "(";
       }
       for (int i = 0, lanes = t.lanes() / 2; i < lanes; ++i) {
-        if (i == 0) {
-          value_temp << "make_half2(";
+        if (isalpha(op[0]) || op[0] == '_') {
+          value_temp << op << "2"
+                     << "(__half2(";
+          PrintVecElemLoad(vlhs, lhs.dtype(), i * lanes, value_temp);
+          value_temp << "), __half2(";
+          PrintVecElemLoad(vrhs, rhs.dtype(), i * lanes, value_temp);
+          value_temp << "))";
+        } else {
+          value_temp << "__half2(";
+          PrintVecElemLoad(vlhs, lhs.dtype(), i * lanes, value_temp);
+          value_temp << ") " << op << " __half2(";
+          PrintVecElemLoad(vrhs, rhs.dtype(), i * lanes, value_temp);
+          value_temp << ")";
+        }
+
+        if (i != lanes - 1) {
+          value_temp << ", ";
         }
-        PrintVecElemLoad(vlhs, lhs.dtype(), i * lanes, value_temp);
-        value_temp << op;
-        PrintVecElemLoad(vrhs, rhs.dtype(), i * lanes, value_temp);
-        value_temp << ",";
-        PrintVecElemLoad(vlhs, lhs.dtype(), i * lanes + 1, value_temp);
-        value_temp << op;
-        PrintVecElemLoad(vrhs, rhs.dtype(), i * lanes + 1, value_temp);
-        value_temp << ")";
         if (i == lanes - 1) {
           if (t.lanes() == 2) {
-            value_temp << ", __NV_SATFINITE, " << (t.is_e5m2_float8() ? "__NV_E5M2" : "__NV_E4M3")
-                       << ")";
+            value_temp << ")";
           }
           PrintVecElemStore(sret, t, i, value_temp.str());
         }
@@ -612,7 +618,7 @@ void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i,
     os << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
   } else if (t.is_float8()) {
     os << "__nv_cvt_fp8x2_to_halfraw2(" << vec << ".__x,"
-       << (t.is_e5m2_float8() ? "__NV_E5M2" : "__NV_E4M3") << ")." << access[i % 2];
+       << (t.is_e5m2_float8() ? "__NV_E5M2" : "__NV_E4M3") << ")";
   } else if (t.lanes() > 4 && t.lanes() <= 8) {
     std::string type_name;
     if (t.bits() == 16) {
@@ -672,7 +678,7 @@ void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DataType t, int i,
   } else if (t.is_float8()) {
     // Since fp8 is a packed type (2 or 4 lanes), we only want call at end.
     ICHECK(i == (t.lanes() / 2) - 1);
-    stream << vec << ".__x = " << value << ";\n";
+    stream << vec << " = " << value << ";\n";
   } else if (t.lanes() > 4 && t.lanes() <= 8) {
     std::string type_name;
     if (t.bits() == 16) {
@@ -1740,5 +1746,115 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val
   return;
 }
 
+template <typename T>
+inline void PrintBinaryExpr(const T* op, const char* opstr,
+                            std::ostream& os,  // NOLINT(*)
+                            CodeGenCUDA* p) {
+  if (op->dtype.lanes() == 1) {
+    if (op->dtype.is_float8()) {
+      std::string fp8_type = (op->dtype.is_e5m2_float8() ? "__NV_E5M2" : "__NV_E4M3");
+      if (isalpha(opstr[0]) || opstr[0] == '_') {
+        os << "__nv_fp8_" << (op->dtype.is_e5m2_float8() ? "e5m2" : "e4m3") << "(";
+        os << opstr << "(";
+        os << "__half(__nv_cvt_fp8_to_halfraw(";
+        p->PrintExpr(op->a, os);
+        os << ".__x, " << fp8_type << ")), __half(__nv_cvt_fp8_to_halfraw(";
+        p->PrintExpr(op->b, os);
+        os << ".__x, " << fp8_type << ")))";
+        os << ")";
+      } else {
+        os << "__nv_fp8_" << (op->dtype.is_e5m2_float8() ? "e5m2" : "e4m3") << "(";
+        os << "__half(__nv_cvt_fp8_to_halfraw(";
+        p->PrintExpr(op->a, os);
+        os << ".__x, " << fp8_type << ")) " << opstr << " __half(__nv_cvt_fp8_to_halfraw(";
+        p->PrintExpr(op->b, os);
+        os << ".__x, " << fp8_type << ")))";
+      }
+    } else {
+      if (isalpha(opstr[0])) {
+        os << opstr << '(';
+        p->PrintExpr(op->a, os);
+        os << ", ";
+        p->PrintExpr(op->b, os);
+        os << ')';
+      } else {
+        os << '(';
+        p->PrintExpr(op->a, os);
+        os << ' ' << opstr << ' ';
+        p->PrintExpr(op->b, os);
+        os << ')';
+      }
+    }
+
+  } else {
+    p->PrintVecBinaryOp(opstr, op->dtype, op->a, op->b, os);
+  }
+}
+
+void CodeGenCUDA::VisitExpr_(const AddNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "+", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const SubNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "-", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const MulNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "*", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const DivNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "/", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const ModNode* op, std::ostream& os) {  // NOLINT(*)
+  if (op->dtype.is_int() || op->dtype.is_uint()) {
+    PrintBinaryExpr(op, "%", os, this);
+  } else {
+    ICHECK(op->dtype.is_float()) << "Expected floating point or integer dtype in Mod, but got "
+                                 << op->dtype;
+    if (op->dtype.bits() == 32) {
+      PrintBinaryExpr(op, "fmodf", os, this);
+    } else if (op->dtype.bits() == 64) {
+      PrintBinaryExpr(op, "fmod", os, this);
+    } else {
+      ICHECK(false)
+          << "Non single or double precision floating point in Mod, expected 32 or 64 bits but got "
+          << op->dtype.bits() << " bits.";
+    }
+  }
+}
+
+void CodeGenCUDA::VisitExpr_(const MinNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, op->dtype.is_float8() ? "__hmin" : "min", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const MaxNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, op->dtype.is_float8() ? "__hmax" : "max", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const EQNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "==", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const NENode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "!=", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const LTNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "<", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const LENode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "<=", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const GTNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, ">", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const GENode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, ">=", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const AndNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "&&", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const OrNode* op, std::ostream& os) {  // NOLINT(*)
+  PrintBinaryExpr(op, "||", os, this);
+}
+void CodeGenCUDA::VisitExpr_(const NotNode* op, std::ostream& os) {  // NOLINT(*)
+  os << '!';
+  PrintExpr(op->a, os);
+}
+
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/target/source/codegen_cuda.h b/src/target/source/codegen_cuda.h
@@ -68,6 +68,22 @@ class CodeGenCUDA final : public CodeGenC {
   void VisitExpr_(const FloatImmNode* op, std::ostream& os) final;
   void VisitExpr_(const CallNode* op, std::ostream& os) final;
   void VisitExpr_(const CastNode* op, std::ostream& os) final;
+  void VisitExpr_(const AddNode* op, std::ostream& os) final;
+  void VisitExpr_(const SubNode* op, std::ostream& os) final;
+  void VisitExpr_(const MulNode* op, std::ostream& os) final;
+  void VisitExpr_(const DivNode* op, std::ostream& os) final;
+  void VisitExpr_(const ModNode* op, std::ostream& os) final;
+  void VisitExpr_(const MinNode* op, std::ostream& os) final;
+  void VisitExpr_(const MaxNode* op, std::ostream& os) final;
+  void VisitExpr_(const EQNode* op, std::ostream& os) final;
+  void VisitExpr_(const NENode* op, std::ostream& os) final;
+  void VisitExpr_(const LTNode* op, std::ostream& os) final;
+  void VisitExpr_(const LENode* op, std::ostream& os) final;
+  void VisitExpr_(const GTNode* op, std::ostream& os) final;
+  void VisitExpr_(const GENode* op, std::ostream& os) final;
+  void VisitExpr_(const AndNode* op, std::ostream& os) final;
+  void VisitExpr_(const OrNode* op, std::ostream& os) final;
+  void VisitExpr_(const NotNode* op, std::ostream& os) final;
   void VisitStmt_(const EvaluateNode* op) final;
   void VisitStmt_(const AllocateNode* op) final;
   void VisitStmt_(const AttrStmtNode* op) final;