@@ -1469,10 +1469,19 @@ SIMSIMD_PUBLIC void simsimd_fma_bf16_skylake(
1469
1469
1470
1470
#if SIMSIMD_TARGET_SAPPHIRE
1471
1471
#pragma GCC push_options
1472
- #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512fp16")
1473
- #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512fp16"))), \
1472
+ #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512fp16", "f16c" )
1473
+ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512fp16,f16c "))), \
1474
1474
apply_to = function)
1475
1475
1476
+ /**
1477
+ * Using `_mm512_set1_ph((_Float16)1.f)` results in compilation warnings if we are pedantic.
1478
+ * https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-8/details-about-intrinsics-for-half-floats.html
1479
+ */
1480
+ SIMSIMD_INTERNAL __m512h _mm512_set1_ph_from_ps (float a ) {
1481
+ unsigned short h = _cvtss_sh (a , _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
1482
+ return (__m512h )_mm512_set1_epi16 (h );
1483
+ }
1484
+
1476
1485
SIMSIMD_PUBLIC void simsimd_sum_f16_sapphire (simsimd_f16_t const * a , simsimd_f16_t const * b , simsimd_size_t n ,
1477
1486
simsimd_f16_t * result ) {
1478
1487
__mmask32 mask = 0xFFFFFFFF ;
@@ -1500,7 +1509,7 @@ SIMSIMD_PUBLIC void simsimd_scale_f16_sapphire(simsimd_f16_t const *a, simsimd_s
1500
1509
simsimd_f16_t * result ) {
1501
1510
1502
1511
__mmask32 mask = 0xFFFFFFFF ;
1503
- __m512h alpha_vec = _mm512_set1_ph (( _Float16 ) alpha );
1512
+ __m512h alpha_vec = _mm512_set1_ph_from_ps ( alpha );
1504
1513
__m512h a_f16_vec , b_f16_vec ;
1505
1514
__m512h sum_f16_vec ;
1506
1515
simsimd_scale_f16_sapphire_cycle :
@@ -1540,8 +1549,8 @@ SIMSIMD_PUBLIC void simsimd_wsum_f16_sapphire( //
1540
1549
1541
1550
// The general case.
1542
1551
__mmask32 mask = 0xFFFFFFFF ;
1543
- __m512h alpha_vec = _mm512_set1_ph (( _Float16 ) alpha );
1544
- __m512h beta_vec = _mm512_set1_ph (( _Float16 ) beta );
1552
+ __m512h alpha_vec = _mm512_set1_ph_from_ps ( alpha );
1553
+ __m512h beta_vec = _mm512_set1_ph_from_ps ( beta );
1545
1554
__m512h a_f16_vec , b_f16_vec ;
1546
1555
__m512h a_scaled_f16_vec , sum_f16_vec ;
1547
1556
simsimd_wsum_f16_sapphire_cycle :
@@ -1568,8 +1577,8 @@ SIMSIMD_PUBLIC void simsimd_fma_f16_sapphire(
1568
1577
simsimd_distance_t alpha , simsimd_distance_t beta , simsimd_f16_t * result ) {
1569
1578
1570
1579
__mmask32 mask = 0xFFFFFFFF ;
1571
- __m512h alpha_vec = _mm512_set1_ph (( _Float16 ) alpha );
1572
- __m512h beta_vec = _mm512_set1_ph (( _Float16 ) beta );
1580
+ __m512h alpha_vec = _mm512_set1_ph_from_ps ( alpha );
1581
+ __m512h beta_vec = _mm512_set1_ph_from_ps ( beta );
1573
1582
__m512h a_f16_vec , b_f16_vec , c_f16_vec ;
1574
1583
__m512h ab_f16_vec , ab_scaled_f16_vec , sum_f16_vec ;
1575
1584
simsimd_fma_f16_sapphire_cycle :
@@ -1619,7 +1628,7 @@ SIMSIMD_PUBLIC void simsimd_sum_u8_sapphire(simsimd_u8_t const *a, simsimd_u8_t
1619
1628
SIMSIMD_PUBLIC void simsimd_scale_u8_sapphire (simsimd_u8_t const * a , simsimd_size_t n , simsimd_distance_t alpha ,
1620
1629
simsimd_u8_t * result ) {
1621
1630
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull ;
1622
- __m512h alpha_vec = _mm512_set1_ph (( _Float16 ) alpha );
1631
+ __m512h alpha_vec = _mm512_set1_ph_from_ps ( alpha );
1623
1632
__m512i a_u8_vec , b_u8_vec , sum_u8_vec ;
1624
1633
__m512h a_f16_low_vec , a_f16_high_vec ;
1625
1634
__m512h a_scaled_f16_low_vec , a_scaled_f16_high_vec , sum_f16_low_vec , sum_f16_high_vec ;
@@ -1670,8 +1679,8 @@ SIMSIMD_PUBLIC void simsimd_wsum_u8_sapphire( //
1670
1679
1671
1680
// The general case.
1672
1681
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull ;
1673
- __m512h alpha_vec = _mm512_set1_ph (( _Float16 ) alpha );
1674
- __m512h beta_vec = _mm512_set1_ph (( _Float16 ) beta );
1682
+ __m512h alpha_vec = _mm512_set1_ph_from_ps ( alpha );
1683
+ __m512h beta_vec = _mm512_set1_ph_from_ps ( beta );
1675
1684
__m512i a_u8_vec , b_u8_vec , sum_u8_vec ;
1676
1685
__m512h a_f16_low_vec , a_f16_high_vec , b_f16_low_vec , b_f16_high_vec ;
1677
1686
__m512h a_scaled_f16_low_vec , a_scaled_f16_high_vec , sum_f16_low_vec , sum_f16_high_vec ;
@@ -1739,7 +1748,7 @@ SIMSIMD_PUBLIC void simsimd_scale_i8_sapphire(simsimd_i8_t const *a, simsimd_siz
1739
1748
simsimd_i8_t * result ) {
1740
1749
1741
1750
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull ;
1742
- __m512h alpha_vec = _mm512_set1_ph (( _Float16 ) alpha );
1751
+ __m512h alpha_vec = _mm512_set1_ph_from_ps ( alpha );
1743
1752
__m512i a_i8_vec , sum_i8_vec ;
1744
1753
__m512h a_f16_low_vec , a_f16_high_vec ;
1745
1754
__m512h sum_f16_low_vec , sum_f16_high_vec ;
@@ -1791,8 +1800,8 @@ SIMSIMD_PUBLIC void simsimd_wsum_i8_sapphire( //
1791
1800
1792
1801
// The general case.
1793
1802
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull ;
1794
- __m512h alpha_vec = _mm512_set1_ph (( _Float16 ) alpha );
1795
- __m512h beta_vec = _mm512_set1_ph (( _Float16 ) beta );
1803
+ __m512h alpha_vec = _mm512_set1_ph_from_ps ( alpha );
1804
+ __m512h beta_vec = _mm512_set1_ph_from_ps ( beta );
1796
1805
__m512i a_i8_vec , b_i8_vec , sum_i8_vec ;
1797
1806
__m512h a_f16_low_vec , a_f16_high_vec , b_f16_low_vec , b_f16_high_vec ;
1798
1807
__m512h a_scaled_f16_low_vec , a_scaled_f16_high_vec , sum_f16_low_vec , sum_f16_high_vec ;
@@ -1836,8 +1845,8 @@ SIMSIMD_PUBLIC void simsimd_fma_i8_sapphire(
1836
1845
simsimd_distance_t alpha , simsimd_distance_t beta , simsimd_i8_t * result ) {
1837
1846
1838
1847
__mmask64 mask = 0xFFFFFFFFFFFFFFFF ;
1839
- __m512h alpha_vec = _mm512_set1_ph (( _Float16 ) alpha );
1840
- __m512h beta_vec = _mm512_set1_ph (( _Float16 ) beta );
1848
+ __m512h alpha_vec = _mm512_set1_ph_from_ps ( alpha );
1849
+ __m512h beta_vec = _mm512_set1_ph_from_ps ( beta );
1841
1850
__m512i a_i8_vec , b_i8_vec , c_i8_vec , sum_i8_vec ;
1842
1851
__m512h a_f16_low_vec , a_f16_high_vec , b_f16_low_vec , b_f16_high_vec ;
1843
1852
__m512h c_f16_low_vec , c_f16_high_vec , ab_f16_low_vec , ab_f16_high_vec ;
@@ -1889,8 +1898,8 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_sapphire(
1889
1898
simsimd_distance_t alpha , simsimd_distance_t beta , simsimd_u8_t * result ) {
1890
1899
1891
1900
__mmask64 mask = 0xFFFFFFFFFFFFFFFF ;
1892
- __m512h alpha_vec = _mm512_set1_ph (( _Float16 ) alpha );
1893
- __m512h beta_vec = _mm512_set1_ph (( _Float16 ) beta );
1901
+ __m512h alpha_vec = _mm512_set1_ph_from_ps ( alpha );
1902
+ __m512h beta_vec = _mm512_set1_ph_from_ps ( beta );
1894
1903
__m512i a_u8_vec , b_u8_vec , c_u8_vec , sum_u8_vec ;
1895
1904
__m512h a_f16_low_vec , a_f16_high_vec , b_f16_low_vec , b_f16_high_vec ;
1896
1905
__m512h c_f16_low_vec , c_f16_high_vec , ab_f16_low_vec , ab_f16_high_vec ;
0 commit comments