Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into snnn/remove_nsync
Browse files Browse the repository at this point in the history
  • Loading branch information
snnn committed Apr 24, 2024
2 parents cb3f16b + a5182a2 commit d1f00f6
Show file tree
Hide file tree
Showing 31 changed files with 1,684 additions and 96 deletions.
6 changes: 4 additions & 2 deletions cmake/onnxruntime_providers_coreml.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,12 @@ endif()
if (APPLE)
file(GLOB
onnxruntime_providers_coreml_objcc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.h"
"${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.mm"
"${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.h"
"${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.mm"
"${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.h"
"${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.mm"
"${ONNXRUNTIME_ROOT}/core/providers/coreml/model/objc_str_utils.h"
"${ONNXRUNTIME_ROOT}/core/providers/coreml/model/objc_str_utils.mm"
)
else()
# add the Model implementation that uses the protobuf types but excludes any actual CoreML dependencies
Expand Down
4 changes: 3 additions & 1 deletion docs/OperatorKernels.md
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,9 @@ Do not modify directly.*
|||[16, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **Tind** = tensor(int32), tensor(int64)|
|||[13, 15]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **Tind** = tensor(int32), tensor(int64)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **Tind** = tensor(int32), tensor(int64)|
|ScatterND|*in* data:**T**<br> *in* indices:**tensor(int64)**<br> *in* updates:**T**<br> *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|ScatterND|*in* data:**T**<br> *in* indices:**tensor(int64)**<br> *in* updates:**T**<br> *out* output:**T**|18+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[16, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[13, 15]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Selu|*in* X:**T**<br> *out* Y:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)|
|SequenceAt|*in* input_sequence:**S**<br> *in* position:**I**<br> *out* tensor:**T**|11+|**I** = tensor(int32), tensor(int64)<br/> **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))<br/> **T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
Expand Down
59 changes: 31 additions & 28 deletions js/web/lib/wasm/jsep/webgpu/ops/attention.ts
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ const createInPlaceSoftmaxProgramInfo = (_context: ComputeContext, input: Tensor
let local_offset = local_idx * uniforms.elements_per_thread;
let offset = workgroup_id.x * uniforms.d_comp + local_offset;
var thread_max_vector = ${inputHelper.type.value}(-3.402823e+38f);
var thread_max_vector = ${f32Type}(-3.402823e+38f);
for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < uniforms.d_comp; i++) {
thread_max_vector = max(${f32Type}(x[offset + i]), thread_max_vector);
}
Expand All @@ -282,12 +282,12 @@ const createInPlaceSoftmaxProgramInfo = (_context: ComputeContext, input: Tensor
})()};
workgroupBarrier();
var max_value: f32 = -3.402823e+38f;
var max_value = -3.402823e+38f;
for (var i = 0u; i < ${WG}; i++) {
max_value = max(thread_max[i], max_value);
}
var sum_vector = ${inputHelper.type.value}(${0});
var sum_vector = ${f32Type}(${0});
for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < uniforms.d_comp; i++) {
sum_vector += exp(${f32Type}(x[offset + i]) - max_value);
}
Expand Down Expand Up @@ -333,9 +333,9 @@ const createInPlaceSoftmaxProgramInfo = (_context: ComputeContext, input: Tensor

const createAttentionProbsProgramInfo =
(_context: ComputeContext, q: TensorView, key: TensorView, relativePositionBias: TensorView|undefined,
parameters: AttentionParameters, attributes: AttentionAttrs) => {
const probsShape =
[parameters.batchSize, parameters.numHeads, parameters.sequenceLength, parameters.totalSequenceLength];
parameters: AttentionParameters, attributes: AttentionAttrs, pastSequenceLength: number) => {
const totalSequenceLength = pastSequenceLength + parameters.kvSequenceLength;
const probsShape = [parameters.batchSize, parameters.numHeads, parameters.sequenceLength, totalSequenceLength];

// TODO: handle mask

Expand All @@ -344,14 +344,13 @@ const createAttentionProbsProgramInfo =
const vectorizedHeadSize = parameters.headSize / components;
const TILE_SIZE = 12;
const dispatch = {
x: Math.ceil(parameters.totalSequenceLength / TILE_SIZE),
x: Math.ceil(totalSequenceLength / TILE_SIZE),
y: Math.ceil(parameters.sequenceLength / TILE_SIZE),
z: parameters.batchSize * parameters.numHeads
};
const programUniforms: ProgramUniform[] = [
{type: DataType.uint32, data: parameters.sequenceLength}, {type: DataType.uint32, data: vectorizedHeadSize},
{type: DataType.uint32, data: parameters.totalSequenceLength},
{type: DataType.uint32, data: parameters.numHeads}, {type: DataType.uint32, data: parameters.kvSequenceLength},
{type: DataType.uint32, data: totalSequenceLength}, {type: DataType.uint32, data: parameters.numHeads},
{type: q.dataType, data: alpha}
];

Expand All @@ -376,8 +375,7 @@ const createAttentionProbsProgramInfo =

const uniforms: UniformsArrayType = [
{name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'},
{name: 'num_heads', type: 'u32'}, {name: 'kv_sequence_length', type: 'u32'},
{name: 'alpha', type: dataType as UniformDataElementType}
{name: 'num_heads', type: 'u32'}, {name: 'alpha', type: dataType as UniformDataElementType}
];
return `
const beta: ${dataType} = 1.0;
Expand All @@ -394,7 +392,7 @@ const createAttentionProbsProgramInfo =
let m = workgroup_id.y * TILE_SIZE;
let n = workgroup_id.x * TILE_SIZE;
let qOffset = uniforms.M * uniforms.K * headIdx + m * uniforms.K;
let kOffset = uniforms.kv_sequence_length * uniforms.K * headIdx + n * uniforms.K;
let kOffset = uniforms.N * uniforms.K * headIdx + n * uniforms.K;
var value = ${qInput.type.value}(0);
for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) {
Expand Down Expand Up @@ -456,7 +454,9 @@ const createAttentionProbsProgramInfo =


const createVxAttentionScoreProgramInfo =
(_context: ComputeContext, probs: TensorView, v: TensorView, params: AttentionParameters) => {
(_context: ComputeContext, probs: TensorView, v: TensorView, params: AttentionParameters,
pastSequenceLength: number) => {
const totalSequenceLength = pastSequenceLength + params.kvSequenceLength;
const outputShape = [params.batchSize, params.sequenceLength, params.vHiddenSize];
const TILE_SIZE = 12;
const dispatch = {
Expand All @@ -465,7 +465,7 @@ const createVxAttentionScoreProgramInfo =
z: params.batchSize * params.numHeads
};
const programUniforms: ProgramUniform[] = [
{type: DataType.uint32, data: params.sequenceLength}, {type: DataType.uint32, data: params.totalSequenceLength},
{type: DataType.uint32, data: params.sequenceLength}, {type: DataType.uint32, data: totalSequenceLength},
{type: DataType.uint32, data: params.vHeadSize}, {type: DataType.uint32, data: params.numHeads},
{type: DataType.uint32, data: params.vHiddenSize}
];
Expand Down Expand Up @@ -537,24 +537,25 @@ export const applyAttention =
(context: ComputeContext, q: TensorView, k: TensorView, v: TensorView, _maskIndex: TensorView|undefined,
_past: TensorView|undefined, pastKey: TensorView|undefined, pastValue: TensorView|undefined,
relativePositionBias: TensorView|undefined, parameters: AttentionParameters, attributes: AttentionAttrs) => {
const outputPresentKey = context.outputCount > 1;
const outputPresentValue = context.outputCount > 2;
const pastSequenceLength = (outputPresentKey && outputPresentValue) ? parameters.pastSequenceLength : 0;
const totalSequenceLength = pastSequenceLength + parameters.kvSequenceLength;
// Concatenate pastKey and K to produce presentKey.
const presentKeyShape =
[parameters.batchSize, parameters.numHeads, parameters.totalSequenceLength, parameters.headSize];
const presentKeyShape = [parameters.batchSize, parameters.numHeads, totalSequenceLength, parameters.headSize];
const concatKeyInputs = pastKey ? [pastKey, k] : [k];
const key = (context.outputCount > 1 || pastKey) ?
context.compute(
createConcatProgramInfo(concatKeyInputs, 2, presentKeyShape, k.dataType),
{inputs: concatKeyInputs, outputs: [context.outputCount > 1 ? 1 : -1]})[0] :
k;
const key = outputPresentKey ? context.compute(
createConcatProgramInfo(concatKeyInputs, 2, presentKeyShape, k.dataType),
{inputs: concatKeyInputs, outputs: [1]})[0] :
k;

// Concatenate pastValue and V to produce presentValue.
const presentValueShape =
[parameters.batchSize, parameters.numHeads, parameters.totalSequenceLength, parameters.headSize];
const presentValueShape = [parameters.batchSize, parameters.numHeads, totalSequenceLength, parameters.headSize];
const concatValueInputs = pastValue ? [pastValue, v] : [v];
const value = (context.outputCount > 2 || pastValue) ?
const value = outputPresentValue ?
context.compute(
createConcatProgramInfo(concatValueInputs, 2, presentValueShape, v.dataType),
{inputs: concatValueInputs, outputs: [context.outputCount > 2 ? 2 : -1]})[0] :
{inputs: concatValueInputs, outputs: [2]})[0] :
v;
const inputsK = [q, key];
if (relativePositionBias) {
Expand All @@ -563,20 +564,22 @@ export const applyAttention =

// Run AttentionProbs
const probs = context.compute(
createAttentionProbsProgramInfo(context, q, key, relativePositionBias, parameters, attributes),
createAttentionProbsProgramInfo(
context, q, key, relativePositionBias, parameters, attributes, pastSequenceLength),
{inputs: inputsK, outputs: [-1]})[0];

// Run Softmax
context.compute(
createInPlaceSoftmaxProgramInfo(
context, probs, parameters.batchSize * parameters.numHeads * parameters.sequenceLength,
parameters.totalSequenceLength),
totalSequenceLength),
{inputs: [probs], outputs: []});

// Run AttentionScore
const inputsV = [probs, value];
context.compute(
createVxAttentionScoreProgramInfo(context, probs, value, parameters), {inputs: inputsV, outputs: [0]});
createVxAttentionScoreProgramInfo(context, probs, value, parameters, pastSequenceLength),
{inputs: inputsV, outputs: [0]});
};

const prepare = (context: ComputeContext, parameters: AttentionParameters) => {
Expand Down
2 changes: 1 addition & 1 deletion js/web/lib/wasm/jsep/webgpu/ops/common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ export const castToF32 = (dataType: string, components: number, value: string) =
return `f32(${value})`;
}

return `vec${components}f32(${value})`;
return `vec${components}<f32>(${value})`;
};

/**
Expand Down
4 changes: 3 additions & 1 deletion js/web/script/test-runner-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,9 @@ async function main() {
if (args.noSandbox) {
karmaArgs.push('--no-sandbox');
}
if (webgpu || webnn) {

// When using BrowserStack with Safari, we need NOT to use 'localhost' as the hostname.
if (!(browser.startsWith('BS_') && browser.includes('Safari'))) {
karmaArgs.push('--force-localhost');
}
if (webgpu) {
Expand Down
Loading

0 comments on commit d1f00f6

Please sign in to comment.