MatrixOps.hpp (forked from uTensor/uTensor)
#ifndef UTENSOR_MATRIX_OPS
#define UTENSOR_MATRIX_OPS
#include <cmath>
#include <cstdlib>
#include <limits>
#include "quantization_utils.hpp"
#include "tensor.hpp"
// tensorflow/tensorflow/core/kernels/reference_gemm.h
template <class T1, class T2, class T3>
void ReferenceGemmuImpl(bool transpose_a, bool transpose_b, bool transpose_c,
                        size_t m, size_t n, size_t k, const T1* a,
                        int32_t offset_a, size_t lda, const T2* b,
                        int32_t offset_b, size_t ldb, T3* c, int32_t shift_c,
                        int32_t offset_c, int32_t mult_c, size_t ldc) {
  // Pick row and contraction strides for each operand from its transpose flag.
  int a_i_stride = lda;
  int a_l_stride = 1;
  if (transpose_a) {
    a_i_stride = 1;
    a_l_stride = lda;
  }
  int b_j_stride = 1;
  int b_l_stride = ldb;
  if (transpose_b) {
    b_j_stride = ldb;
    b_l_stride = 1;
  }
  int c_i_stride = ldc;
  int c_j_stride = 1;
  if (transpose_c) {
    c_i_stride = 1;
    c_j_stride = ldc;
  }

  const int32_t highest = static_cast<int32_t>(std::numeric_limits<T3>::max());
  const int32_t lowest = static_cast<int32_t>(std::numeric_limits<T3>::min());
  const int32_t rounding = (shift_c < 1) ? 0 : (1 << (shift_c - 1));

  size_t i, j, l;
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      // Accumulate the offset-corrected dot product in 32 bits.
      int32_t total = 0;
      for (l = 0; l < k; l++) {
        const size_t a_index = ((i * a_i_stride) + (l * a_l_stride));
        const int32_t a_value = static_cast<int32_t>(a[a_index]) - offset_a;
        const size_t b_index = ((j * b_j_stride) + (l * b_l_stride));
        const int32_t b_value = static_cast<int32_t>(b[b_index]) - offset_b;
        total += (a_value * b_value);
      }
      // Requantize with rounding, then saturate to the limits of T3.
      const size_t c_index = ((i * c_i_stride) + (j * c_j_stride));
      int32_t output = ((((total + offset_c) * mult_c) + rounding) >> shift_c);
      if (output > highest) {
        output = highest;
      }
      if (output < lowest) {
        output = lowest;
      }
      c[c_index] = static_cast<T3>(output);
    }
  }
}
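// Example (a sketch, not part of the original file): multiplying a 1x2 by a
// 2x1 uint8 matrix, both quantized with a zero offset of 128, and no
// requantization (shift_c = 0, mult_c = 1, offset_c = 0):
//
//   const uint8_t a[2] = {130, 126};  // dequantized values: +2, -2
//   const uint8_t b[2] = {131, 125};  // dequantized values: +3, -3
//   int32_t c[1];
//   ReferenceGemmuImpl<uint8_t, uint8_t, int32_t>(
//       false, false, false, 1, 1, 2, a, 128, 2, b, 128, 1, c, 0, 0, 1, 1);
//   // c[0] == (130-128)*(131-128) + (126-128)*(125-128) == 6 + 6 == 12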
// NT: information loss if float_for_one_quantized_level < 1
template <class T>
float FloatForOneQuantizedLevel(float range_min, float range_max) {
  const int64_t highest = static_cast<int64_t>(std::numeric_limits<T>::max());
  const int64_t lowest = static_cast<int64_t>(std::numeric_limits<T>::lowest());
  const float float_for_one_quantized_level =
      (range_max - range_min) / (highest - lowest);
  return float_for_one_quantized_level;
}
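// Worked example (a sketch): for T = uint8_t over the range [-1.0f, 1.0f],
// highest = 255 and lowest = 0, so one quantized step represents
// (1.0f - (-1.0f)) / (255 - 0), roughly 0.00784f of real-valued range.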
template <class T1, class T2, class T3>
void QuantizationRangeForMultiplication(float min_a, float max_a, float min_b,
                                        float max_b, float* min_c,
                                        float* max_c) {
  const float a_float_for_one_quant_level =
      FloatForOneQuantizedLevel<T1>(min_a, max_a);
  const float b_float_for_one_quant_level =
      FloatForOneQuantizedLevel<T2>(min_b, max_b);
  const int64_t c_highest =
      static_cast<int64_t>(std::numeric_limits<T3>::max());
  const int64_t c_lowest =
      static_cast<int64_t>(std::numeric_limits<T3>::lowest());
  const float c_float_for_one_quant_level =
      a_float_for_one_quant_level * b_float_for_one_quant_level;
  // NT: this results in taking only the necessary quantized range.
  *min_c = c_float_for_one_quant_level * c_lowest;
  *max_c = c_float_for_one_quant_level * c_highest;
}
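// Worked example (a sketch): with T1 = T2 = uint8_t, T3 = int32_t, and both
// inputs spanning [-1.0f, 1.0f], each input step is ~0.00784f, so one output
// step is ~0.00784f * 0.00784f, about 6.15e-5f. Scaled by the int32_t limits
// this yields an output range of roughly [-132100.0f, +132100.0f].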
template <class T1, class T2, class Toutput>
void QuantizedMatMul(Tensor<T1> A, Tensor<T2> B, Tensor<Toutput>& C,
                     Tensor<float> mina, Tensor<float> minb,
                     Tensor<float> maxa, Tensor<float> maxb,
                     Tensor<float> outmin, Tensor<float> outmax,
                     bool transpose_a = false, bool transpose_b = false) {
  const float min_a = *(mina.getPointer({}));
  const float max_a = *(maxa.getPointer({}));
  const float min_b = *(minb.getPointer({}));
  const float max_b = *(maxb.getPointer({}));

  // Which axis of each operand is contracted (k) depends on its transpose
  // flag; the remaining axis supplies the output dimension.
  const int first = transpose_a ? 0 : 1;
  const int second = transpose_b ? 1 : 0;
  const int a_dim_remaining = 1 - first;
  const int b_dim_remaining = 1 - second;
  const size_t m = A.getShape()[a_dim_remaining];
  const size_t n = B.getShape()[b_dim_remaining];
  const size_t k = A.getShape()[first];

  // Auto tensor allocation: C is m x n once any transposes are applied.
  Shape c_shape;
  c_shape.push_back(m);
  c_shape.push_back(n);
  tensorChkAlloc(C, c_shape);

  // NT: what 0.0f quantizes to; depends on Eigen::NumTraits<T>::lowest()
  const int32_t offset_a = FloatToQuantizedUnclamped<T1>(0.0f, min_a, max_a);
  const int32_t offset_b = FloatToQuantizedUnclamped<T2>(0.0f, min_b, max_b);
  const int32_t offset_c = 0;
  const int32_t mult_c = 1;
  const int32_t shift_c = 0;

  T1* A_Data = A.getPointer({});
  T2* B_Data = B.getPointer({});
  Toutput* C_Data = C.getPointer({});

  const bool transpose_c = false;
  const size_t lda = A.getShape()[1];
  const size_t ldb = B.getShape()[1];
  const size_t ldc = n;

  ReferenceGemmuImpl<T1, T2, Toutput>(
      transpose_a, transpose_b, transpose_c, m, n, k, A_Data, offset_a, lda,
      B_Data, offset_b, ldb, C_Data, shift_c, offset_c, mult_c, ldc);

  // Propagate the float range of the quantized product to the output tensors.
  float min_c_value;
  float max_c_value;
  QuantizationRangeForMultiplication<T1, T2, Toutput>(
      min_a, max_a, min_b, max_b, &min_c_value, &max_c_value);
  *(outmin.getPointer({})) = min_c_value;
  *(outmax.getPointer({})) = max_c_value;
}
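// Usage sketch (hypothetical shapes and setup, not from the original file):
// multiply a quantized 2x3 A by a 3x4 B, producing a 2x4 C plus its float
// range. Assumes Tensor<T>'s shape constructor and getPointer behave as
// elsewhere in uTensor.
//
//   Tensor<uint8_t> A({2, 3}), B_t({3, 4});
//   Tensor<int32_t> C;
//   Tensor<float> min_a({1}), max_a({1}), min_b({1}), max_b({1});
//   Tensor<float> out_min({1}), out_max({1});
//   *(min_a.getPointer({})) = -1.0f; *(max_a.getPointer({})) = 1.0f;
//   *(min_b.getPointer({})) = -1.0f; *(max_b.getPointer({})) = 1.0f;
//   // ... fill A and B_t with quantized data ...
//   QuantizedMatMul<uint8_t, uint8_t, int32_t>(A, B_t, C, min_a, min_b,
//                                              max_a, max_b, out_min, out_max);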
#endif  // UTENSOR_MATRIX_OPS