From c0033eb1e55a7af9ec45e1a8408d3d0b840d189e Mon Sep 17 00:00:00 2001
From: Pranshu Pant <32600304+pranshupant@users.noreply.github.com>
Date: Mon, 10 Jul 2023 15:32:50 -0400
Subject: [PATCH] Implement GELU as function op (#5277)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Description
These changes have been made to support the GELU operator as a function
op.

### Motivation and Context
Support for [GELU: Gaussian Error Linear
Unit](https://paperswithcode.com/method/gelu) activation function, which
was requested in #4933.
#4423 also mentions this under the new ops section of `Contributions
Welcome`.

As per the discussion in #4933, I have added GELU as a context-dependent
function op that uses the attribute `approximate` to select one of the
two possible function-body definitions.

The first function definition is the regular GELU:
`GELU(x) = x * Φ(x) = 0.5 * x * (1 + erf(x / sqrt(2)))`

The second is the fast approximation based on `tanh`:
`GELU(x) = 0.5 * x * (1 + Tanh(sqrt(2/π) * (x + 0.044715 * x^3)))`

This implementation uses the [PyTorch docs for
GELU](https://pytorch.org/docs/stable/generated/torch.nn.GELU.html?highlight=gelu#torch.nn.GELU)
as a reference.

PS: I also refactored `onnx/defs/math/defs.cc` to bring the operator
implementation of `mish` right next to its doc string.

---------

Signed-off-by: pranshupant <pranshupant@gmail.com>
Co-authored-by: G. Ramalingam <grama@microsoft.com>
---
 docs/Changelog.md                             |  42 +++++++
 docs/Operators.md                             |  96 ++++++++++++++++
 docs/TestCoverage.md                          |  52 ++++++++-
 onnx/backend/test/case/node/gelu.py           |  51 +++++++++
 .../data/node/test_gelu_default_1/model.onnx  | Bin 0 -> 93 bytes
 .../test_data_set_0/input_0.pb                | Bin 0 -> 21 bytes
 .../test_data_set_0/output_0.pb               | Bin 0 -> 21 bytes
 .../test_gelu_default_1_expanded/model.onnx   | Bin 0 -> 1429 bytes
 .../test_data_set_0/input_0.pb                | Bin 0 -> 21 bytes
 .../test_data_set_0/output_0.pb               | Bin 0 -> 21 bytes
 .../data/node/test_gelu_default_2/model.onnx  | Bin 0 -> 109 bytes
 .../test_data_set_0/input_0.pb                |   1 +
 .../test_data_set_0/output_0.pb               |   3 +
 .../test_gelu_default_2_expanded/model.onnx   | Bin 0 -> 1445 bytes
 .../test_data_set_0/input_0.pb                |   1 +
 .../test_data_set_0/output_0.pb               |   3 +
 .../data/node/test_gelu_tanh_1/model.onnx     | Bin 0 -> 114 bytes
 .../test_data_set_0/input_0.pb                | Bin 0 -> 21 bytes
 .../test_data_set_0/output_0.pb               | Bin 0 -> 21 bytes
 .../node/test_gelu_tanh_1_expanded/model.onnx | Bin 0 -> 2239 bytes
 .../test_data_set_0/input_0.pb                | Bin 0 -> 21 bytes
 .../test_data_set_0/output_0.pb               | Bin 0 -> 21 bytes
 .../data/node/test_gelu_tanh_2/model.onnx     | Bin 0 -> 130 bytes
 .../test_data_set_0/input_0.pb                |   1 +
 .../test_data_set_0/output_0.pb               | Bin 0 -> 254 bytes
 .../node/test_gelu_tanh_2_expanded/model.onnx | Bin 0 -> 2255 bytes
 .../test_data_set_0/input_0.pb                |   1 +
 .../test_data_set_0/output_0.pb               | Bin 0 -> 254 bytes
 onnx/defs/math/defs.cc                        | 108 ++++++++++++++++--
 onnx/defs/operator_sets.h                     |   2 +
 onnx/test/automatic_upgrade_test.py           |   6 +
 onnx/test/test_backend_onnxruntime.py         |   1 +
 32 files changed, 355 insertions(+), 13 deletions(-)
 create mode 100644 onnx/backend/test/case/node/gelu.py
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_1/model.onnx
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_1/test_data_set_0/input_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_1/test_data_set_0/output_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_1_expanded/model.onnx
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_1_expanded/test_data_set_0/input_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_1_expanded/test_data_set_0/output_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_2/model.onnx
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_2/test_data_set_0/input_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_2/test_data_set_0/output_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_2_expanded/model.onnx
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_2_expanded/test_data_set_0/input_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_default_2_expanded/test_data_set_0/output_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_1/model.onnx
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_1/test_data_set_0/input_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_1/test_data_set_0/output_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_1_expanded/model.onnx
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_1_expanded/test_data_set_0/input_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_1_expanded/test_data_set_0/output_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_2/model.onnx
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_2/test_data_set_0/input_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_2/test_data_set_0/output_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_2_expanded/model.onnx
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_2_expanded/test_data_set_0/input_0.pb
 create mode 100644 onnx/backend/test/data/node/test_gelu_tanh_2_expanded/test_data_set_0/output_0.pb

diff --git a/docs/Changelog.md b/docs/Changelog.md
index afeb3331fd2..e7c9a05dee4 100644
--- a/docs/Changelog.md
+++ b/docs/Changelog.md
@@ -23919,6 +23919,48 @@ This version of the operator has been available since version 20 of the default
 <dd>Constrain output types to be numerics.</dd>
 </dl>
 
+### <a name="Gelu-20"></a>**Gelu-20**</a>
+
+  Gelu takes one input data (Tensor<T>) and produces one
+  output data (Tensor<T>) where the gaussian error linear units function,
+  $y = 0.5 * x * (1 + erf(x/sqrt(2)))$ is applied to the tensor elementwise.
+  If the attribute "approximate" is set to "tanh", the function estimation,
+  $y = 0.5 * x * (1 + Tanh(sqrt(2/\pi) * (x + 0.044715 * x^3)))$ is used and applied
+  to the tensor elementwise.
+
+
+#### Version
+
+This version of the operator has been available since version 20 of the default ONNX operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>approximate</tt> : string (default is none)</dt>
+<dd>Gelu approximation algorithm: `"tanh"`, `"none"`(default).`"none"`: do not use approximation.`"tanh"`: use tanh approximation.</dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> (differentiable) : T</dt>
+<dd>Input tensor</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> (differentiable) : T</dt>
+<dd>Output tensor</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)</dt>
+<dd>Constrain input and output types to float tensors.</dd>
+</dl>
+
 ### <a name="GridSample-20"></a>**GridSample-20**</a>
 
   Given an input `X` and a flow-field `grid`, computes the output `Y` using `X` values and pixel locations from the `grid`.
diff --git a/docs/Operators.md b/docs/Operators.md
index 1e2eb08671c..0efd01da237 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -170,6 +170,7 @@ For an operator input/output's differentiability, it can be differentiable,
 |<a href="#Clip">Clip</a>|<a href="Changelog.md#Clip-13">13</a>, <a href="Changelog.md#Clip-12">12</a>, <a href="Changelog.md#Clip-11">11</a>, <a href="Changelog.md#Clip-6">6</a>, <a href="Changelog.md#Clip-1">1</a>|13|
 |<a href="#DynamicQuantizeLinear">DynamicQuantizeLinear</a>|<a href="Changelog.md#DynamicQuantizeLinear-11">11</a>|11|
 |<a href="#Elu">Elu</a>|<a href="Changelog.md#Elu-6">6</a>, <a href="Changelog.md#Elu-1">1</a>|18|
+|<a href="#Gelu">Gelu</a>|<a href="Changelog.md#Gelu-20">20</a>|20|
 |<a href="#GreaterOrEqual">GreaterOrEqual</a>|<a href="Changelog.md#GreaterOrEqual-16">16</a>, <a href="Changelog.md#GreaterOrEqual-12">12</a>|16|
 |<a href="#GroupNormalization">GroupNormalization</a>|<a href="Changelog.md#GroupNormalization-18">18</a>|18|
 |<a href="#HammingWindow">HammingWindow</a>|<a href="Changelog.md#HammingWindow-17">17</a>|17|
@@ -9410,6 +9411,101 @@ expect(
 </details>
 
 
+### <a name="Gelu"></a><a name="gelu">**Gelu**</a>
+
+  Gelu takes one input data (Tensor<T>) and produces one
+  output data (Tensor<T>) where the gaussian error linear units function,
+  $y = 0.5 * x * (1 + erf(x/sqrt(2)))$ is applied to the tensor elementwise.
+  If the attribute "approximate" is set to "tanh", the function estimation,
+  $y = 0.5 * x * (1 + Tanh(sqrt(2/\pi) * (x + 0.044715 * x^3)))$ is used and applied
+  to the tensor elementwise.
+
+
+#### Version
+
+This version of the operator has been available since version 20 of the default ONNX operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>approximate</tt> : string (default is none)</dt>
+<dd>Gelu approximation algorithm: `"tanh"`, `"none"`(default).`"none"`: do not use approximation.`"tanh"`: use tanh approximation.</dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> (differentiable) : T</dt>
+<dd>Input tensor</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> (differentiable) : T</dt>
+<dd>Output tensor</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)</dt>
+<dd>Constrain input and output types to float tensors.</dd>
+</dl>
+
+
+#### Examples
+
+<details>
+<summary>gelu_default</summary>
+
+```python
+node = onnx.helper.make_node("Gelu", inputs=["x"], outputs=["y"])
+
+x = np.array([-1, 0, 1]).astype(np.float32)
+# expected output [-0.15865526, 0., 0.84134474]
+y = (0.5 * x * (1 + np.vectorize(math.erf)(x / np.sqrt(2)))).astype(np.float32)
+expect(node, inputs=[x], outputs=[y], name="test_gelu_default_1")
+
+x = np.random.randn(3, 4, 5).astype(np.float32)
+# random input: the expected y is computed with the formula below
+y = (0.5 * x * (1 + np.vectorize(math.erf)(x / np.sqrt(2)))).astype(np.float32)
+expect(node, inputs=[x], outputs=[y], name="test_gelu_default_2")
+```
+
+</details>
+
+
+<details>
+<summary>gelu_tanh</summary>
+
+```python
+node = onnx.helper.make_node(
+    "Gelu", inputs=["x"], outputs=["y"], approximate="tanh"
+)
+
+x = np.array([-1, 0, 1]).astype(np.float32)
+# expected output [-0.158808, 0., 0.841192]
+y = (
+    0.5
+    * x
+    * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+).astype(np.float32)
+expect(node, inputs=[x], outputs=[y], name="test_gelu_tanh_1")
+
+x = np.random.randn(3, 4, 5).astype(np.float32)
+# random input: the expected y is computed with the formula below
+y = (
+    0.5
+    * x
+    * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+).astype(np.float32)
+expect(node, inputs=[x], outputs=[y], name="test_gelu_tanh_2")
+```
+
+</details>
+
+
 ### <a name="Gemm"></a><a name="gemm">**Gemm**</a>
 
   General Matrix multiplication:
diff --git a/docs/TestCoverage.md b/docs/TestCoverage.md
index f59159e2a4c..7b8d0cd2d9e 100644
--- a/docs/TestCoverage.md
+++ b/docs/TestCoverage.md
@@ -6,7 +6,7 @@
 * [Overall Test Coverage](#overall-test-coverage)
 # Node Test Coverage
 ## Summary
-Node tests have covered 173/186 (93.01%, 5 generators excluded) common operators.
+Node tests have covered 174/187 (93.05%, 5 generators excluded) common operators.
 
 Node tests have covered 0/0 (N/A) experimental operators.
 
@@ -6241,6 +6241,56 @@ expect(
 </details>
 
 
+### Gelu
+There are 2 test cases, listed as following:
+<details>
+<summary>gelu_default</summary>
+
+```python
+node = onnx.helper.make_node("Gelu", inputs=["x"], outputs=["y"])
+
+x = np.array([-1, 0, 1]).astype(np.float32)
+# expected output [-0.15865526, 0., 0.84134474]
+y = (0.5 * x * (1 + np.vectorize(math.erf)(x / np.sqrt(2)))).astype(np.float32)
+expect(node, inputs=[x], outputs=[y], name="test_gelu_default_1")
+
+x = np.random.randn(3, 4, 5).astype(np.float32)
+# random input: the expected y is computed with the formula below
+y = (0.5 * x * (1 + np.vectorize(math.erf)(x / np.sqrt(2)))).astype(np.float32)
+expect(node, inputs=[x], outputs=[y], name="test_gelu_default_2")
+```
+
+</details>
+<details>
+<summary>gelu_tanh</summary>
+
+```python
+node = onnx.helper.make_node(
+    "Gelu", inputs=["x"], outputs=["y"], approximate="tanh"
+)
+
+x = np.array([-1, 0, 1]).astype(np.float32)
+# expected output [-0.158808, 0., 0.841192]
+y = (
+    0.5
+    * x
+    * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+).astype(np.float32)
+expect(node, inputs=[x], outputs=[y], name="test_gelu_tanh_1")
+
+x = np.random.randn(3, 4, 5).astype(np.float32)
+# random input: the expected y is computed with the formula below
+y = (
+    0.5
+    * x
+    * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+).astype(np.float32)
+expect(node, inputs=[x], outputs=[y], name="test_gelu_tanh_2")
+```
+
+</details>
+
+
 ### Gemm
 There are 11 test cases, listed as following:
 <details>
diff --git a/onnx/backend/test/case/node/gelu.py b/onnx/backend/test/case/node/gelu.py
new file mode 100644
index 00000000000..cc93a4f5471
--- /dev/null
+++ b/onnx/backend/test/case/node/gelu.py
@@ -0,0 +1,51 @@
+# Copyright (c) ONNX Project Contributors
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import math
+
+import numpy as np
+
+import onnx
+from onnx.backend.test.case.base import Base
+from onnx.backend.test.case.node import expect
+
+
+class Gelu(Base):  # node test cases for the Gelu operator
+    @staticmethod
+    def export_gelu_tanh() -> None:
+        node = onnx.helper.make_node(
+            "Gelu", inputs=["x"], outputs=["y"], approximate="tanh"
+        )
+
+        x = np.array([-1, 0, 1]).astype(np.float32)
+        # expected output [-0.158808, 0., 0.841192]
+        y = (
+            0.5
+            * x
+            * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+        ).astype(np.float32)
+        expect(node, inputs=[x], outputs=[y], name="test_gelu_tanh_1")
+
+        x = np.random.randn(3, 4, 5).astype(np.float32)
+        # random input: the expected y is computed with the formula below
+        y = (
+            0.5
+            * x
+            * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+        ).astype(np.float32)
+        expect(node, inputs=[x], outputs=[y], name="test_gelu_tanh_2")
+
+    @staticmethod
+    def export_gelu_default() -> None:
+        node = onnx.helper.make_node("Gelu", inputs=["x"], outputs=["y"])
+
+        x = np.array([-1, 0, 1]).astype(np.float32)
+        # expected output [-0.15865526, 0., 0.84134474]
+        y = (0.5 * x * (1 + np.vectorize(math.erf)(x / np.sqrt(2)))).astype(np.float32)
+        expect(node, inputs=[x], outputs=[y], name="test_gelu_default_1")
+
+        x = np.random.randn(3, 4, 5).astype(np.float32)
+        # random input: the expected y is computed with the formula below
+        y = (0.5 * x * (1 + np.vectorize(math.erf)(x / np.sqrt(2)))).astype(np.float32)
+        expect(node, inputs=[x], outputs=[y], name="test_gelu_default_2")
diff --git a/onnx/backend/test/data/node/test_gelu_default_1/model.onnx b/onnx/backend/test/data/node/test_gelu_default_1/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..ada8f652bed5fdba31049c4a5363d16c902a76fe
GIT binary patch
literal 93
zcmd<!6yixrOwLZtOVKS!EiSQg<>KLDtPo<XRAO;Y%_$WU28qU}1DWwDscDI&IVJIi
fQT!k!TwEL+j6y72OdQNfK!HkR0VftN1_2QOPOTFj

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_default_1/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_gelu_default_1/test_data_set_0/input_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..8a9445744b63f66e76c3ef4fce746606ffc6f47e
GIT binary patch
literal 21
Ycmd;J7GQK@tnlJtU})IS00s^A03TBWEC2ui

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_default_1/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_gelu_default_1/test_data_set_0/output_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..b12e822f15a5648d6c9c8f16d2ac4470c3534a8f
GIT binary patch
literal 21
acmd;J7GQK@tn}h(D^uFX00ePK;r0M2Qv_iE

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_default_1_expanded/model.onnx b/onnx/backend/test/data/node/test_gelu_default_1_expanded/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..ffee90beaf2288ed808fd440b22f0db19dbe931c
GIT binary patch
literal 1429
zcmbtU%}c{D6i;l0`VbsJJbCJI;P-78WmEK^C~U&;60$YbvYK{lGPj$5%KlIfzKl);
zq44n1CV}_+cwf<u*d3|yM4RNu>dJP$0G`Fkt<IhFclqj1iB6Tvtvr)@Q7V(@M5fM+
zZB`h0r}8w8dWEU1GPZpH+jEsW-G&yAL&n%;m#ui%Vc>uY3w$9ih*Em0vM2|?_t``v
z?LZt8ARdiwF!T9irUD|W_i|1X@=q&+kD|c}yV}%+W(85HRxsp`l@ch)s1ZmB>`!Ob
zry%qP?1K_zlSpd-aZ0G#&<DlAbv8#v1LROryWh;xnh#INx8fv!*y_G4K$)-&C26Lf
z-m*C8yLXg0C9(O{vX+RmLrLWW?y6xN;14NtQWe&rq}fJyf-h?}aoDspU-tay07MZN
SA#6p#2p^WeAG#s1*8VpQeEQx1

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_default_1_expanded/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_gelu_default_1_expanded/test_data_set_0/input_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..8a9445744b63f66e76c3ef4fce746606ffc6f47e
GIT binary patch
literal 21
Ycmd;J7GQK@tnlJtU})IS00s^A03TBWEC2ui

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_default_1_expanded/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_gelu_default_1_expanded/test_data_set_0/output_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..b12e822f15a5648d6c9c8f16d2ac4470c3534a8f
GIT binary patch
literal 21
acmd;J7GQK@tn}h(D^uFX00ePK;r0M2Qv_iE

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_default_2/model.onnx b/onnx/backend/test/data/node/test_gelu_default_2/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..c03f4701e47a4232ff057b67b2fee68c6ba294ff
GIT binary patch
literal 109
zcmd<!6yixrOwLZtOVKS!EiSPN<>KLDtPo<XRAO;Y%_$WU28qU}1DWwDscDI&IVJH%
kQQ{yaLR<nIj6ytIOdQNW%mT!$NkEB8Y!Xf^Tnqvt0EV0s(EtDd

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_default_2/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_gelu_default_2/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..bae0ffd6324
--- /dev/null
+++ b/onnx/backend/test/data/node/test_gelu_default_2/test_data_set_0/input_0.pb
@@ -0,0 +1 @@
+BxJ�x��?h��>��z?�j@$�?�.z��8s?b��hdӽ�9�>(�>�%�?^�B?�0�=B�>]ת>�=�?R�iJ�>�Z�/d#��S'?�K]?��=��C@�(��Hm;= �?�2�?��?��>���>�Ec������!��� >*z�?��?�Oƾmǚ��6��&õ�gڿ��?�x�FKྙ[���	G?4�ο��Y�L=e��> �����k��QN�>.:�=�ݚ>�b"�6���
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_gelu_default_2/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_gelu_default_2/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..c55aea167f7
--- /dev/null
+++ b/onnx/backend/test/data/node/test_gelu_default_2/test_data_set_0/output_0.pb
@@ -0,0 +1,3 @@
+ByJ��?�K�>��Q?A�@t��?V$���I?�W��uB���>�d�=��?VQ?V��=q��>�~W>�Q�?�x�G>d�+�8^_�$��>�o2?�.�ޓ@�3ٽxD�<+7����?п�?24�=I�z>jL*��'A������=D�?D�?��
+�|��
+��B����KR�?h@�:U�Z����?T��;%��;�)����>�f�����a�J�>�s=��?>��*�S �
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_gelu_default_2_expanded/model.onnx b/onnx/backend/test/data/node/test_gelu_default_2_expanded/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..1988c1b6297e78111e1b62265dd3b2f27301d617
GIT binary patch
literal 1445
zcmbtUO-sW-5Z#y*;@Cr2L@%ClEcija^`f*EJtzvTP`oVLW}~5TlQz4xz4-(D5&fyo
zmeh+<7(8q;%QElHo3C)6H^*`^RXRB}sxqCg19;}=w<@>7-Nmc35|v7u8_^bOSxTL#
zM5I<vOjc-dC-XE8dxfry)TVU=jfKpuYC+`10b}f{%hq1dVPJs@3*M4n5<}_9%3wJ7
zz0al!X@~rb0P(2R4l$oUVk#h_IxTHlApevy{3v!vVb|Nb(4-(LRSLTNl2QUC88rea
zfrHuHxD*7=z&<E3Y!Yb=z%K|@8#$-gzsVNJsDbQCYWMy;t@+SGzO_fvhpq0)0+b2c
zQj%us=`D->fqlmiXCyYaS~el#>`+qqfV*lG`}jl3oK%I4D`~b-9rso>8$a2#G+&P0
b=x~Tak2hg2^zj6E8e@D|{q@idfJKMjk9hki

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_default_2_expanded/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_gelu_default_2_expanded/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..bae0ffd6324
--- /dev/null
+++ b/onnx/backend/test/data/node/test_gelu_default_2_expanded/test_data_set_0/input_0.pb
@@ -0,0 +1 @@
+BxJ�x��?h��>��z?�j@$�?�.z��8s?b��hdӽ�9�>(�>�%�?^�B?�0�=B�>]ת>�=�?R�iJ�>�Z�/d#��S'?�K]?��=��C@�(��Hm;= �?�2�?��?��>���>�Ec������!��� >*z�?��?�Oƾmǚ��6��&õ�gڿ��?�x�FKྙ[���	G?4�ο��Y�L=e��> �����k��QN�>.:�=�ݚ>�b"�6���
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_gelu_default_2_expanded/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_gelu_default_2_expanded/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..c55aea167f7
--- /dev/null
+++ b/onnx/backend/test/data/node/test_gelu_default_2_expanded/test_data_set_0/output_0.pb
@@ -0,0 +1,3 @@
+ByJ��?�K�>��Q?A�@t��?V$���I?�W��uB���>�d�=��?VQ?V��=q��>�~W>�Q�?�x�G>d�+�8^_�$��>�o2?�.�ޓ@�3ٽxD�<+7����?п�?24�=I�z>jL*��'A������=D�?D�?��
+�|��
+��B����KR�?h@�:U�Z����?T��;%��;�)����>�f�����a�J�>�s=��?>��*�S �
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_gelu_tanh_1/model.onnx b/onnx/backend/test/data/node/test_gelu_tanh_1/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..cbb06f2b052747f6aa0a281ec0e25ef28a8fb20c
GIT binary patch
literal 114
zcmd<!6yixrOwLZtOVKS!EiSQ&;!@#atPo<XRAO;Y%_-Fq<Kj*%C@9LW$jnVFNmXJg
zNzBVwz{o5l08$;F4pb5k5{fsB;s@#D;^N?76k_3G;$Th!3REHsII(as2#5dx&e$1(

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_tanh_1/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_gelu_tanh_1/test_data_set_0/input_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..8a9445744b63f66e76c3ef4fce746606ffc6f47e
GIT binary patch
literal 21
Ycmd;J7GQK@tnlJtU})IS00s^A03TBWEC2ui

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_tanh_1/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_gelu_tanh_1/test_data_set_0/output_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..0f554cc42e247392ae38456c23643f18032a9088
GIT binary patch
literal 21
bcmd;J7GQK@tn}iUFi&Y80}#YSgxdoED_sPK

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_tanh_1_expanded/model.onnx b/onnx/backend/test/data/node/test_gelu_tanh_1_expanded/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..254f702635caeb04cb5f0e8932f0bf4e1329481b
GIT binary patch
literal 2239
zcmbVOO-sW-5KU}^I0)8N1h2UW^{XC45X5Q_6_i$6uy_e+v$dt^rcF1sy$SvbFXBP`
zkKWvEo2Z1!te221J2UUSc{>|(fh^YCdYdwDo6|l&{8)xtvUN`V$dTX9!}!f*O=r)c
z!>-Fb>N$;w)p;vm&V}nYEYl8HpSz6b)?j|%`Vq}RR<{gI(~eKIH{CeYAONMDpbtr2
z1%&PPxd0-bSFJV`1#ieEHonN{9~_<%4joWQDJkqKKu9sTBYW8J%41L*(6H2+$>aIc
zN_dPwJ+Wl7uM8)}sYG(vBzAr#nU#%$qu5w&P{=4*?BD^U;1A$`8sMcX_d@>NO+$VP
zGndIW7Imc=(!cCnD8$P3KrN{h4{sa^Bc)$Q<WMkQoCDvpsvXs+XJtykqmb=I2gPpV
z_fIO}hty=hFm>iS^2Y;FNP!-4aWuXGRXZPxQI0yI;Jc1D?=KhP8<y0>!>ov#<&t$H
z9_zyJ<&s!@xwEiw((Q)9&@yDSkmXee)hUffn3TgKEHztGgqca-+IW}9+gP4Cc~a45
zTiB6TWA%OfOc=-ErAT-YeKf5`)@S}<+%HKE0H&!E1Ln+3O)QLlE}R;mWmmrdx0M{x

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_tanh_1_expanded/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_gelu_tanh_1_expanded/test_data_set_0/input_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..8a9445744b63f66e76c3ef4fce746606ffc6f47e
GIT binary patch
literal 21
Ycmd;J7GQK@tnlJtU})IS00s^A03TBWEC2ui

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_tanh_1_expanded/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_gelu_tanh_1_expanded/test_data_set_0/output_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..0f554cc42e247392ae38456c23643f18032a9088
GIT binary patch
literal 21
bcmd;J7GQK@tn}iUFi&Y80}#YSgxdoED_sPK

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_tanh_2/model.onnx b/onnx/backend/test/data/node/test_gelu_tanh_2/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..887e5c52023cb40139da847694442f1a46320594
GIT binary patch
literal 130
zcmd<!6yixrOwLZtOVKS!EiSRj;!@#atPo<XRAO;Y%_-Fq<Kj*%C@9LW$jnVFNmXJg
zNzBVwz{o5l08$;F4pb5k5{fs95(nuM;u7Fs6yo7x;$Q}179eI#0!mb3lW=0;Vh|7k
E0B&X)H2?qr

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_tanh_2/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_gelu_tanh_2/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..bae0ffd6324
--- /dev/null
+++ b/onnx/backend/test/data/node/test_gelu_tanh_2/test_data_set_0/input_0.pb
@@ -0,0 +1 @@
+BxJ�x��?h��>��z?�j@$�?�.z��8s?b��hdӽ�9�>(�>�%�?^�B?�0�=B�>]ת>�=�?R�iJ�>�Z�/d#��S'?�K]?��=��C@�(��Hm;= �?�2�?��?��>���>�Ec������!��� >*z�?��?�Oƾmǚ��6��&õ�gڿ��?�x�FKྙ[���	G?4�ο��Y�L=e��> �����k��QN�>.:�=�ݚ>�b"�6���
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_gelu_tanh_2/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_gelu_tanh_2/test_data_set_0/output_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..d9ba5e0c6fbe5ba8123405c2bcd15ff1fa98a415
GIT binary patch
literal 254
zcmV<a00I990|*2N1rPy30eMRB0T=_>KZr_(KK^!5KOdwGKme%cKl^(mzPy%6KMPoh
zy?+Bjz5JPqKIdelJ(G&8Kb%Y#KTN5JJ^YlIKCyjQK3Yn$KOXO|y|J!GJ_-CQzUk*z
zyj{EaKGkS4KP4G1z7v@bK(wmay>~>yJYYAYz1iQlKX|yVKN&Q#J+P~KJ`-vxzF^Wo
zz0Rruz6JiXJ=#2sKMg&IKemSozSjioy<j&Uz6iYHy)E*Yy&Y8ZKR!htz5rMhz7-q?
zzJ-b#KdN)Ey^|%iy_WzgzOvtdK5%OvzUl1`zMQRLyaNJ~KCp8SJ^QRbK2X#uzOW$$
EzM8pl;s5{u

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_tanh_2_expanded/model.onnx b/onnx/backend/test/data/node/test_gelu_tanh_2_expanded/model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..6456042ec3620ff3548aef75a16c2f9386c30af9
GIT binary patch
literal 2255
zcmbVOO-sW-5KU|fb?Kq5qIk_ks8*{WLP4wzqJq+B3l=XSZMU{0-L&bZA2-2Y;YB?7
z8~ktH+-;kP#FkkP2?@J1^WK}clcZL0x^6cdLYp~82CVS8hVJp+C2<2wyjxGxC!4md
zBa4iCHf@ro)e2~Xbv$Za*=|cu86F)lo3iXCN)Bx|AX$`Abxl!}vy$?zY6S&(NGc~%
z$2czo!mtO71L60Zjzf6CJG={xFH^FF!*j%;i)2y?3I`GpVhkSeAvC-?^2$Tvo1KL`
zUcU5*#{|hI7Hp2C;lwzZI15c;<QI}z-#IxAjpZPTjDp2J96${I5dBXBxOA1i&wje8
ziC1ps3eG`MSKB^WX6IrtRIZQYk_z$Q#t|@L`gM5@3G;<H;61AuL7g=9gz1gAY_BSk
z>^6M=tP*@k&GvIsCvF3GIuN-O;1L%_!y8aD@}U@&pvwz>sBrWCav{87QB63^D*v)v
z(_X+rT`0U<6bmnR;kV9vJ>MJanuzAIys1cbO2ZLG<=_a-c4v+-G3%QHcZv8Ki!&!q
zD)?-3JF;4+z6+lT(>S;k5ihJx=GE}_!at11Ex`e!OiIORw315jOXF9v&KD+sEtE8*
HWH!D5t*RZU

literal 0
HcmV?d00001

diff --git a/onnx/backend/test/data/node/test_gelu_tanh_2_expanded/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_gelu_tanh_2_expanded/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..bae0ffd6324
--- /dev/null
+++ b/onnx/backend/test/data/node/test_gelu_tanh_2_expanded/test_data_set_0/input_0.pb
@@ -0,0 +1 @@
+BxJ�x��?h��>��z?�j@$�?�.z��8s?b��hdӽ�9�>(�>�%�?^�B?�0�=B�>]ת>�=�?R�iJ�>�Z�/d#��S'?�K]?��=��C@�(��Hm;= �?�2�?��?��>���>�Ec������!��� >*z�?��?�Oƾmǚ��6��&õ�gڿ��?�x�FKྙ[���	G?4�ο��Y�L=e��> �����k��QN�>.:�=�ݚ>�b"�6���
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_gelu_tanh_2_expanded/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_gelu_tanh_2_expanded/test_data_set_0/output_0.pb
new file mode 100644
index 0000000000000000000000000000000000000000..d9ba5e0c6fbe5ba8123405c2bcd15ff1fa98a415
GIT binary patch
literal 254
zcmV<a00I990|*2N1rPy30eMRB0T=_>KZr_(KK^!5KOdwGKme%cKl^(mzPy%6KMPoh
zy?+Bjz5JPqKIdelJ(G&8Kb%Y#KTN5JJ^YlIKCyjQK3Yn$KOXO|y|J!GJ_-CQzUk*z
zyj{EaKGkS4KP4G1z7v@bK(wmay>~>yJYYAYz1iQlKX|yVKN&Q#J+P~KJ`-vxzF^Wo
zz0Rruz6JiXJ=#2sKMg&IKemSozSjioy<j&Uz6iYHy)E*Yy&Y8ZKR!htz5rMhz7-q?
zzJ-b#KdN)Ey^|%iy_WzgzOvtdK5%OvzUl1`zMQRLyaNJ~KCp8SJ^QRbK2X#uzOW$$
EzM8pl;s5{u

literal 0
HcmV?d00001

diff --git a/onnx/defs/math/defs.cc b/onnx/defs/math/defs.cc
index f9ba01ddbaa..830d2105c6e 100644
--- a/onnx/defs/math/defs.cc
+++ b/onnx/defs/math/defs.cc
@@ -470,6 +470,26 @@ mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x}))
 ```
 )DOC";
 
+ONNX_OPERATOR_SET_SCHEMA(
+    Mish,
+    18,
+    OpSchema()
+        .SetDoc(mish_ver18_doc)
+        .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
+        .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
+        .TypeConstraint(
+            "T",
+            {"tensor(float16)", "tensor(float)", "tensor(double)"},
+            "Constrain input X and output types to float tensors.")
+        .FunctionBody(R"ONNX(
+          {
+            Softplus_X = Softplus (X)
+            TanHSoftplusX = Tanh (Softplus_X)
+            Y = Mul (X, TanHSoftplusX)
+           }
+        )ONNX")
+        .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput));
+
 static const char* celu_ver12_doc = R"DOC(
 Continuously Differentiable Exponential Linear Units:
 Perform the linear unit element-wise on the input tensor X
@@ -538,24 +558,88 @@ ONNX_OPERATOR_SET_SCHEMA(
         .SetContextDependentFunctionBodyBuilder(BuildContextDependentFunctionBodyCelu)
         .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput));
 
+static const char* gelu_ver20_doc = R"DOC(
+Gelu takes one input data (Tensor<T>) and produces one
+output data (Tensor<T>) where the gaussian error linear units function,
+$y = 0.5 * x * (1 + erf(x/sqrt(2)))$ is applied to the tensor elementwise.
+If the attribute "approximate" is set to "tanh", the function estimation,
+$y = 0.5 * x * (1 + Tanh(sqrt(2/\pi) * (x + 0.044715 * x^3)))$ is used and applied
+to the tensor elementwise.
+
+)DOC";
+
+static std::string gelu_default_approx = "none";
+
+bool BuildContextDependentFunctionBodyGelu(
+    const FunctionBodyBuildContext& ctx,
+    const OpSchema& schema,
+    FunctionProto& functionProto) { // fills functionProto with the Gelu body selected by "approximate"
+  auto approx_attr_proto = ctx.getAttribute("approximate");
+  std::string approximate =
+      approx_attr_proto != nullptr && approx_attr_proto->has_s() ? approx_attr_proto->s() : gelu_default_approx;
+  FunctionBuilder builder(functionProto);
+
+  if (approximate == "tanh") { // y = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+    builder.Add(R"(
+              Half = Constant <value = float {0.5}>()
+              HalfCast = CastLike (Half, X)
+              One = Constant <value = float {1.0}>()
+              OneCast = CastLike (One, X)
+              TwoOverPi = Constant <value = float {0.63661977236}>()
+              TwoOverPiCast = CastLike (TwoOverPi, X)
+              C0 = Constant <value = float {0.044715}>()
+              C0Cast = CastLike (C0, X)
+              SqrtTwoOverPi = Sqrt (TwoOverPiCast)
+              Three = Constant <value = float {3.0}>()
+              ThreeCast = CastLike (Three, X)
+              XCubed = Pow (X, ThreeCast)
+              XCubedC0 = Mul (C0Cast, XCubed)
+              XC0XCubed = Sum (X, XCubedC0)
+              TanhInput = Mul (SqrtTwoOverPi, XC0XCubed)
+              ErfApprox = Tanh (TanhInput)
+              PhiApprox = Sum (OneCast, ErfApprox)
+              MultX = Mul (HalfCast, X)
+              Y = Mul (MultX, PhiApprox)
+              )");
+  } else { // every other value, including the fallback: y = 0.5 * x * (1 + erf(x / sqrt(2)))
+    builder.Add(R"(
+              Half = Constant <value = float {0.5}>()
+              HalfCast = CastLike (Half, X)
+              One = Constant <value = float {1.0}>()
+              OneCast = CastLike (One, X)
+              Two = Constant <value = float {2.0}>()
+              TwoCast = CastLike (Two, X)
+              SqrtTwo = Sqrt (TwoCast)
+              XSqrt = Div (X, SqrtTwo)
+              ErfXSqrt = Erf(XSqrt)
+              Phi = Sum (OneCast, ErfXSqrt)
+              MultX = Mul (HalfCast, X)
+              Y = Mul (MultX, Phi)
+              )");
+  }
+  schema.BuildFunction(functionProto); // NOTE(review): presumably finalizes functionProto from the schema; confirm
+  return true; // no failure path: values other than "tanh" are not rejected, they take the erf branch
+}
+
 ONNX_OPERATOR_SET_SCHEMA(
-    Mish,
-    18,
+    Gelu,
+    20,
     OpSchema()
-        .SetDoc(mish_ver18_doc)
+        .SetDoc(gelu_ver20_doc)
         .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
         .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
+        .Attr(
+            "approximate",
+            "Gelu approximation algorithm: `\"tanh\"`, `\"none\"`(default)."
+            "`\"none\"`: do not use approximation."
+            "`\"tanh\"`: use tanh approximation.",
+            AttributeProto::STRING,
+            gelu_default_approx)
         .TypeConstraint(
             "T",
-            {"tensor(float16)", "tensor(float)", "tensor(double)"},
-            "Constrain input X and output types to float tensors.")
-        .FunctionBody(R"ONNX(
-          {
-            Softplus_X = Softplus (X)
-            TanHSoftplusX = Tanh (Softplus_X)
-            Y = Mul (X, TanHSoftplusX)
-           }
-        )ONNX")
+            {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"},
+            "Constrain input and output types to float tensors.")
+        .SetContextDependentFunctionBodyBuilder(BuildContextDependentFunctionBodyGelu)
         .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput));
 
 static const char* Exp_ver13_doc = R"DOC(
diff --git a/onnx/defs/operator_sets.h b/onnx/defs/operator_sets.h
index 76473d2f9e0..a83adfd194f 100644
--- a/onnx/defs/operator_sets.h
+++ b/onnx/defs/operator_sets.h
@@ -1103,6 +1103,7 @@ class OpSet_Onnx_ver19 {
 
 // Forward declarations for ai.onnx version 20
 class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, GridSample);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, Gelu);
 class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, ConstantOfShape);
 
 // Iterate over schema from ai.onnx version 20
@@ -1110,6 +1111,7 @@ class OpSet_Onnx_ver20 {
  public:
   static void ForEachSchema(std::function<void(OpSchema&&)> fn) {
     fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, GridSample)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, Gelu)>());
     fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 20, ConstantOfShape)>());
   }
 };
diff --git a/onnx/test/automatic_upgrade_test.py b/onnx/test/automatic_upgrade_test.py
index 248f338ce82..0277e068a79 100644
--- a/onnx/test/automatic_upgrade_test.py
+++ b/onnx/test/automatic_upgrade_test.py
@@ -464,6 +464,12 @@ def test_GatherElements(self) -> None:
     def test_GatherND(self) -> None:
         self._test_op_upgrade("GatherND", 11, [[1, 2, 3], [1, 2, 3]], [[1, 2]])
 
+    def test_Gelu_approximate_tanh(self) -> None:
+        self._test_op_upgrade("Gelu", 20, attrs={"approximate": "tanh"})
+
+    def test_Gelu(self) -> None:
+        self._test_op_upgrade("Gelu", 20)
+
     def test_Gemm(self) -> None:
         self._test_op_upgrade("Gemm", 1, [[5, 4], [4, 3], [3]], [[5, 3]])
 
diff --git a/onnx/test/test_backend_onnxruntime.py b/onnx/test/test_backend_onnxruntime.py
index 06811d7c1d7..9a87309c15a 100644
--- a/onnx/test/test_backend_onnxruntime.py
+++ b/onnx/test/test_backend_onnxruntime.py
@@ -249,6 +249,7 @@ def run_node(cls, node, inputs, device=None, outputs_info=None, **kwargs):
         "|equal"
         "|identity"
         "|reshape"
+        "|gelu"
         ")"
     )