diff --git a/CMakeLists.txt b/CMakeLists.txt
index db3525a811..450a0d67de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -107,6 +107,7 @@ message(DEBUG "INS_ENB : ${INS_ENB}")
 option(GGML_CPU_HBM        "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64    "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI   "ggml: use KleidiAI optimized kernels if applicable" OFF)
+option(GGML_CPU_FFAST_MATH "ggml: use approximate math" OFF)
 option(GGML_SSE42          "ggml: enable SSE 4.2" ${INS_ENB})
 option(GGML_AVX            "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX_VNNI       "ggml: enable AVX-VNNI" OFF)
diff --git a/src/ggml-cpu/CMakeLists.txt b/src/ggml-cpu/CMakeLists.txt
index bf4fe79a95..9a4c684354 100644
--- a/src/ggml-cpu/CMakeLists.txt
+++ b/src/ggml-cpu/CMakeLists.txt
@@ -28,6 +28,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         ggml-cpu/binary-ops.cpp
         ggml-cpu/unary-ops.h
         ggml-cpu/unary-ops.cpp
+        ggml-cpu/unary-ops.inc
+        ggml-cpu/unary-ops-ffast-math.cpp
         ggml-cpu/simd-mappings.h
         ggml-cpu/vec.h
         ggml-cpu/vec.cpp
@@ -64,6 +66,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endif()
     endif()
 
+    if (GGML_CPU_FFAST_MATH AND NOT MSVC)
+        set_source_files_properties(ggml-cpu/unary-ops-ffast-math.cpp PROPERTIES COMPILE_FLAGS "-ffast-math $<$:-O3>")
+    endif()
+
     if (GGML_LLAMAFILE)
         target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
 
diff --git a/src/ggml-cpu/unary-ops-ffast-math.cpp b/src/ggml-cpu/unary-ops-ffast-math.cpp
new file mode 100644
index 0000000000..98ee1ec0c5
--- /dev/null
+++ b/src/ggml-cpu/unary-ops-ffast-math.cpp
@@ -0,0 +1,21 @@
+#include "unary-ops.inc"
+
+// This file is compiled with -ffast-math only if GGML_CPU_FFAST_MATH is enabled.
+// libmvec allows sin/cos to be vectorized, but not bit-identically to libm.
+// Backends (e.g. CUDA) aren't bit-identical either, but more people expect the CPU backend to be.
+
+static inline float op_sin(float x) {
+    return sinf(x);
+}
+
+static inline float op_cos(float x) {
+    return cosf(x);
+}
+
+void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sin>(params, dst);
+}
+
+void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_cos>(params, dst);
+}
diff --git a/src/ggml-cpu/unary-ops.cpp b/src/ggml-cpu/unary-ops.cpp
index 4fce569b3b..9b4b785f94 100644
--- a/src/ggml-cpu/unary-ops.cpp
+++ b/src/ggml-cpu/unary-ops.cpp
@@ -1,4 +1,4 @@
-#include "unary-ops.h"
+#include "unary-ops.inc"
 
 static inline float op_abs(float x) {
     return fabsf(x);
@@ -52,75 +52,10 @@ static inline float op_sqrt(float x) {
     return sqrtf(x);
 }
 
-static inline float op_sin(float x) {
-    return sinf(x);
-}
-
-static inline float op_cos(float x) {
-    return cosf(x);
-}
-
 static inline float op_log(float x) {
     return logf(x);
 }
 
-template <float (*op)(float), typename src0_t, typename dst_t>
-static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
-    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
-    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
-
-    for (int i = 0; i < n; i++) {
-        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
-    }
-}
-
-template <float (*op)(float), typename src0_t, typename dst_t>
-static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(dst_t));
-    GGML_ASSERT(nb00 == sizeof(src0_t));
-
-    const auto [ir0, ir1] = get_thread_range(params, src0);
-
-    for (int64_t ir = ir0; ir < ir1; ++ir) {
-        const int64_t i03 = ir/(ne02*ne01);
-        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        dst_t        * dst_ptr  = (dst_t        *) ((char       *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-        vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
-    }
-}
-
-// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
-template <float (*op)(float)>
-static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-
-    /*  */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
-        apply_unary_op<op, float, float>(params, dst);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
-        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
-    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
-        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
-    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
-        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type));
-        GGML_ABORT("fatal error");
-    }
-}
-
 void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
     unary_op<op_abs>(params, dst);
 }
@@ -173,14 +108,6 @@ void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor *
     unary_op<op_sqrt>(params, dst);
 }
 
-void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sin>(params, dst);
-}
-
-void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_cos>(params, dst);
-}
-
 void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
     unary_op<op_log>(params, dst);
 }
diff --git a/src/ggml-cpu/unary-ops.inc b/src/ggml-cpu/unary-ops.inc
new file mode 100644
index 0000000000..c0fba78743
--- /dev/null
+++ b/src/ggml-cpu/unary-ops.inc
@@ -0,0 +1,58 @@
+#include "unary-ops.h"
+
+template <float (*op)(float), typename src0_t, typename dst_t>
+static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+    }
+}
+
+template <float (*op)(float), typename src0_t, typename dst_t>
+static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        dst_t        * dst_ptr  = (dst_t        *) ((char       *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+        vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float)>
+static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /*  */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op<op, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
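
For context, the new path is opt-in: nothing changes unless the project is configured with the option enabled, for example with cmake -B build -DGGML_CPU_FFAST_MATH=ON and then cmake --build build. The standalone sketch below is not part of the patch; it is a minimal, hypothetical example of the loop shape that motivates isolating op_sin/op_cos in their own translation unit. Built with -O3 -ffast-math on an x86-64 glibc system, GCC can vectorize the scalar sinf calls into libmvec routines such as _ZGVbN4v_sinf; the results are close to libm's but not bit-identical, which is the trade-off GGML_CPU_FFAST_MATH opts into.

// Hypothetical standalone sketch (not from the patch), e.g. vec_sin_sketch.cpp.
// Compile with:  g++ -O3 -c vec_sin_sketch.cpp              -> scalar sinf calls
//                g++ -O3 -ffast-math -c vec_sin_sketch.cpp  -> typically vectorized libmvec calls (glibc/x86-64)
#include <math.h>
#include <stdint.h>

// Same shape as vec_unary_op<op_sin, float, float>: a contiguous float-to-float loop.
void vec_sin_f32(int64_t n, float * y, const float * x) {
    for (int64_t i = 0; i < n; i++) {
        y[i] = sinf(x[i]);
    }
}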