Remove Q & DQ since Onnx wrapper model has them already (#405)
* Updated the gen_qnn_ctx_onnx_model.py script so that the wrapper ONNX model generated from a native QNN context binary includes Q and DQ nodes when the inputs/outputs are quantized (sketched below). The example application therefore no longer needs to quantize its inputs or dequantize its outputs.

* Removed the QuantizedData and DequantizedData helpers, which are no longer required.
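
For reference, the Q/DQ wrapping mentioned in the first bullet can be illustrated with the ONNX Python helpers. The snippet below is a minimal sketch, not the actual gen_qnn_ctx_onnx_model.py implementation: the tensor names, shapes, and the translation of the removed helpers' offsets (-116 and -86) into ONNX zero points (116 and 86) are assumptions made for this example.

# Minimal sketch: wrap a quantized-IO wrapper model with QuantizeLinear /
# DequantizeLinear so callers can pass and receive float32 data.
# Assumes a single uint8 input named "input" and a single uint8 output named
# "output"; all names, shapes, and quantization parameters are illustrative.
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

def wrap_io_with_q_dq(model: onnx.ModelProto) -> onnx.ModelProto:
    graph = model.graph

    # Quantization parameters; the ONNX zero points are the negated "offset"
    # values used by the removed QuantizedData/DequantizedData helpers.
    graph.initializer.extend([
        numpy_helper.from_array(np.array(0.015875209, dtype=np.float32), "input_scale"),
        numpy_helper.from_array(np.array(116, dtype=np.uint8), "input_zero_point"),
        numpy_helper.from_array(np.array(0.08069417, dtype=np.float32), "output_scale"),
        numpy_helper.from_array(np.array(86, dtype=np.uint8), "output_zero_point"),
    ])

    # Q node: float32 graph input -> the uint8 tensor the context-binary node consumes.
    q_node = helper.make_node(
        "QuantizeLinear",
        inputs=["input_f32", "input_scale", "input_zero_point"],
        outputs=["input"],
        name="quantize_input",
    )
    # DQ node: uint8 output of the context-binary node -> float32 graph output.
    dq_node = helper.make_node(
        "DequantizeLinear",
        inputs=["output", "output_scale", "output_zero_point"],
        outputs=["output_f32"],
        name="dequantize_output",
    )
    graph.node.insert(0, q_node)
    graph.node.append(dq_node)

    # Re-declare the graph IO as float32 so the application no longer has to
    # quantize inputs or dequantize outputs itself.
    del graph.input[:]
    graph.input.append(helper.make_tensor_value_info("input_f32", TensorProto.FLOAT, [1, 224, 224, 3]))
    del graph.output[:]
    graph.output.append(helper.make_tensor_value_info("output_f32", TensorProto.FLOAT, [1, 1000]))
    return model

Because only the graph boundary changes and the quantized context-binary node is untouched, main.cpp can feed float32 buffers to both the ORT-generated QDQ model and the QNN context-binary wrapper model, which is what allows the helpers above to be deleted.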
HectorSVC authored Apr 3, 2024
1 parent 3d09496 commit 8fcc97e
Showing 2 changed files with 12 additions and 66 deletions.
76 changes: 10 additions & 66 deletions c_cxx/QNN_EP/mobilenetv2_classification/main.cpp
@@ -22,50 +22,8 @@ bool CheckStatus(const OrtApi* g_ort, OrtStatus* status) {
   return true;
 }
 
-template <typename T_QuantType>
-void QuantizedData(T_QuantType* out, const float* in, int32_t offset, float scale, size_t num_elements) {
-  static_assert(std::is_unsigned<T_QuantType>::value, "QuantizedData supports unsigned only!");
-
-  if (nullptr == out || nullptr == in) {
-    throw Ort::Exception("Received a nullptr", OrtErrorCode::ORT_EP_FAIL);
-  }
-
-  size_t data_type_size_in_bytes = sizeof(T_QuantType);
-  size_t bit_width = data_type_size_in_bytes * 8;
-  double true_bit_width_max = pow(2, bit_width) - 1;
-  double encoding_min = offset * scale;
-  double encoding_max = (true_bit_width_max + offset) * scale;
-  double encoding_range = encoding_max - encoding_min;
-
-  for (size_t i = 0; i < num_elements; ++i) {
-    int quantized_value = static_cast<int>(round(true_bit_width_max * (in[i] - encoding_min) / encoding_range));
-    if (quantized_value < 0) {
-      quantized_value = 0;
-    } else if (quantized_value > (int)true_bit_width_max) {
-      quantized_value = (int)true_bit_width_max;
-    }
-    out[i] = static_cast<T_QuantType>(quantized_value);
-  }
-}
-
-
-template <typename T_QuantType>
-void DequantizedData(float* out, const T_QuantType* in, int32_t offset, float scale, size_t num_elements) {
-  static_assert(std::is_unsigned<T_QuantType>::value, "DequantizedData supports unsigned only!");
-
-  if (nullptr == out || nullptr == in) {
-    throw Ort::Exception("Received a nullptr", OrtErrorCode::ORT_EP_FAIL);
-  }
-
-  for (size_t i = 0; i < num_elements; i++) {
-    double quantized_value = static_cast<double>(in[i]);
-    double offset_double = static_cast<double>(offset);
-    out[i] = static_cast<float>((quantized_value + offset_double) * scale);
-  }
-}
-
 void run_ort_qnn_ep(const std::string& backend, const std::string& model_path, const std::string& input_path,
-                    bool generated_from_native_qnn, bool generate_ctx, bool float32_model) {
+                    bool generate_ctx, bool float32_model) {
   std::wstring model_path_wstr = std::wstring(model_path.begin(), model_path.end());
 
   const OrtApi* g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
@@ -195,19 +153,13 @@ void run_ort_qnn_ep(const std::string& backend, const std::string& model_path, const std::string& input_path,
   input_raw_file.read(reinterpret_cast<char*>(&input_data[0]), num_elements * sizeof(float));
 
   CheckStatus(g_ort, g_ort->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &memory_info));
-  // QNN native tool chain generated quantized model use quantized data as inputs & outputs
-  if (generated_from_native_qnn) {
-    size_t input_data_length = input_data_size * sizeof(uint8_t);
-    QuantizedData(quantized_input_data.data(), input_data.data(), -116, 0.015875209f, input_data_size);
-    CheckStatus(g_ort, g_ort->CreateTensorWithDataAsOrtValue(
-        memory_info, reinterpret_cast<void*>(quantized_input_data.data()), input_data_length,
-        input_node_dims[0].data(), input_node_dims[0].size(), input_types[0], &input_tensors[0]));
-  } else {  // Ort generate QDQ model still use float32 data as inputs & outputs
-    size_t input_data_length = input_data_size * sizeof(float);
-    CheckStatus(g_ort, g_ort->CreateTensorWithDataAsOrtValue(
-        memory_info, reinterpret_cast<void*>(input_data.data()), input_data_length,
-        input_node_dims[0].data(), input_node_dims[0].size(), input_types[0], &input_tensors[0]));
-  }
+  // A model generated by the native QNN toolchain uses quantized data as inputs & outputs by default.
+  // gen_qnn_ctx_onnx_model.py wraps it with Q and DQ nodes, so its inputs & outputs are float here,
+  // just like an ORT-generated QDQ model, which also uses float32 data as inputs & outputs.
+  size_t input_data_length = input_data_size * sizeof(float);
+  CheckStatus(g_ort, g_ort->CreateTensorWithDataAsOrtValue(
+      memory_info, reinterpret_cast<void*>(input_data.data()), input_data_length,
+      input_node_dims[0].data(), input_node_dims[0].size(), input_types[0], &input_tensors[0]));
   g_ort->ReleaseMemoryInfo(memory_info);
 
   CheckStatus(g_ort, g_ort->Run(session, nullptr, input_node_names.data(), (const OrtValue* const*)input_tensors.data(),
@@ -219,13 +171,7 @@ void run_ort_qnn_ep(const std::string& backend, const std::string& model_path, const std::string& input_path,
   void* output_buffer;
   CheckStatus(g_ort, g_ort->GetTensorMutableData(output_tensors[0], &output_buffer));
   float* float_buffer = nullptr;
-  if (generated_from_native_qnn) {
-    uint8_t* buffer = reinterpret_cast<uint8_t*>(output_buffer);
-    DequantizedData(output_data.data(), buffer, -86, 0.08069417f, output_data_size);
-    float_buffer = output_data.data();
-  } else {
-    float_buffer = reinterpret_cast<float*>(output_buffer);
-  }
+  float_buffer = reinterpret_cast<float*>(output_buffer);
 
   auto max = std::max_element(float_buffer, float_buffer + output_data_size);
   int max_index = static_cast<int>(std::distance(float_buffer, max));
@@ -278,7 +224,6 @@ int main(int argc, char* argv[]) {
   }
 
   std::string backend = "";
-  bool generated_from_native_qnn = false;
   bool float32_model = false;
   if (strcmp(argv[1], CPUBACKEDN) == 0) {
     backend = "QnnCpu.dll";
@@ -290,7 +235,6 @@
     backend = "QnnHtp.dll";
   } else if (strcmp(argv[1], QNNCTXBINARY) == 0) {
     backend = "QnnHtp.dll";
-    generated_from_native_qnn = true;
     if (generate_ctx) {
       std::cout << "--gen_ctx won't work with --qnn." << std::endl;
       return 1;
@@ -309,6 +253,6 @@ int main(int argc, char* argv[]) {
   std::string model_path(argv[2]);
   std::string input_path(argv[3]);
 
-  run_ort_qnn_ep(backend, model_path, input_path, generated_from_native_qnn, generate_ctx, float32_model);
+  run_ort_qnn_ep(backend, model_path, input_path, generate_ctx, float32_model);
   return 0;
 }
2 changes: 2 additions & 0 deletions c_cxx/QNN_EP/mobilenetv2_classification/run_qnn_ep_sample.bat
@@ -156,9 +156,11 @@ IF EXIST mobilenetv2-12_quant_shape.onnx_ctx.onnx (
 REM run mobilenetv2-12_net_qnn_ctx.onnx (generated from native QNN) with QNN HTP backend
 qnn_ep_sample.exe --qnn mobilenetv2-12_net_qnn_ctx.onnx kitten_input_nhwc.raw
 
+REM only works for v73 and higher
 REM run mobilenetv2-12_shape.onnx (float32 model) with QNN HTP backend with FP16 precision
 qnn_ep_sample.exe --fp32 mobilenetv2-12_shape.onnx kitten_input.raw
 
+REM only works for v73 and higher
 REM run mobilenetv2-12_shape_fp16.onnx (float16 model with float32 IO) with QNN HTP backend
 qnn_ep_sample.exe --fp16 mobilenetv2-12_shape_fp16.onnx kitten_input.raw
 
