diff --git a/neural/neuralnetwork.pas b/neural/neuralnetwork.pas
index d71459d..edb6543 100644
--- a/neural/neuralnetwork.pas
+++ b/neural/neuralnetwork.pas
@@ -596,6 +596,26 @@ TNNetHardSwish = class(TNNetReLUBase)
       procedure Compute(); override;
   end;
+  /// Gaussian Error Linear Unit (GELU) activation function - This is an experimental layer. Do not use it.
+  // A smooth activation function popular in transformer models like BERT and GPT.
+  // Uses the tanh approximation formula: GELU(x) = 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))
+  // https://arxiv.org/abs/1606.08415
+  TNNetGELU = class(TNNetReLUBase)
+    public
+      procedure Compute(); override;
+      procedure Backpropagate(); override;
+  end;
+
+  /// Mish activation function - This is an experimental layer. Do not use it.
+  // A smooth, non-monotonic self-regularizing activation function.
+  // Mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
+  // https://arxiv.org/abs/1908.08681
+  TNNetMish = class(TNNetReLUBase)
+    public
+      procedure Compute(); override;
+      procedure Backpropagate(); override;
+  end;
+
   /// Swish activation function with maximum limit of 6
   TNNetSwish6 = class(TNNetReLUBase)
     public
       procedure Compute(); override;
   end;
@@ -3309,6 +3329,173 @@ procedure TNNetSwish.Compute();
   FForwardTime := FForwardTime + (Now() - StartTime);
 end;
+{ TNNetGELU }
+
+procedure TNNetGELU.Compute();
+var
+  SizeM1: integer;
+  LocalPrevOutput: TNNetVolume;
+  OutputCnt: integer;
+  StartTime: double;
+  x: TNeuralFloat;
+  x3: TNeuralFloat;
+  tanhArg: TNeuralFloat;
+  tanhVal: TNeuralFloat;
+  outputVal: TNeuralFloat;
+  cdf: TNeuralFloat;
+const
+  // sqrt(2/pi) ≈ 0.7978845608
+  SQRT_2_OVER_PI = 0.7978845608;
+  GELU_CONST = 0.044715;
+begin
+  StartTime := Now();
+  LocalPrevOutput := FPrevLayer.Output;
+  SizeM1 := LocalPrevOutput.Size - 1;
+
+  if (FOutput.Size = FOutputError.Size) and (FOutputErrorDeriv.Size = FOutput.Size) then
+  begin
+    for OutputCnt := 0 to SizeM1 do
+    begin
+      x := LocalPrevOutput.FData[OutputCnt];
+      x3 := x * x * x;
+      tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x3);
+      tanhVal := Tanh(tanhArg);
+      cdf := 0.5 * (1 + tanhVal);
+      outputVal := x * cdf;
+      FOutput.FData[OutputCnt] := outputVal;
+      // Derivative: GELU'(x) = cdf + x * pdf, where pdf is derivative of cdf
+      // pdf = 0.5 * (1 - tanh^2) * sqrt(2/pi) * (1 + 3*0.044715*x^2)
+      FOutputErrorDeriv.FData[OutputCnt] := cdf + 0.5 * x * (1 - tanhVal * tanhVal) *
+        SQRT_2_OVER_PI * (1 + 3 * GELU_CONST * x * x);
+    end;
+  end
+  else
+  begin
+    // can't calculate error on input layers.
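+    // (forward-only branch: when the error/derivative volumes are not sized for this layer,
+    //  e.g. when it sits right after an input layer, only the GELU value below is computed
+    //  and no derivative is stored)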
+    for OutputCnt := 0 to SizeM1 do
+    begin
+      x := LocalPrevOutput.FData[OutputCnt];
+      x3 := x * x * x;
+      tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x3);
+      FOutput.FData[OutputCnt] := 0.5 * x * (1 + Tanh(tanhArg));
+    end;
+  end;
+  FForwardTime := FForwardTime + (Now() - StartTime);
+end;
+
+procedure TNNetGELU.Backpropagate();
+var
+  StartTime: double;
+begin
+  StartTime := Now();
+  Inc(FBackPropCallCurrentCnt);
+  if FBackPropCallCurrentCnt < FDepartingBranchesCnt then exit;
+  TestBackPropCallCurrCnt();
+  // Apply chain rule: multiply error by derivative computed in Compute()
+  if (FOutput.Size = FOutputError.Size) and (FOutputErrorDeriv.Size = FOutput.Size) then
+  begin
+    FOutputError.Mul(FOutputErrorDeriv);
+  end;
+  FBackwardTime := FBackwardTime + (Now() - StartTime);
+  inherited BackpropagateNoTest();
+end;
+
+{ TNNetMish }
+
+procedure TNNetMish.Compute();
+var
+  SizeM1: integer;
+  LocalPrevOutput: TNNetVolume;
+  OutputCnt: integer;
+  StartTime: double;
+  x: TNeuralFloat;
+  softplus: TNeuralFloat;
+  expVal: TNeuralFloat;
+  tanhSP: TNeuralFloat;
+  outputVal: TNeuralFloat;
+  omega: TNeuralFloat;
+  delta: TNeuralFloat;
+begin
+  StartTime := Now();
+  LocalPrevOutput := FPrevLayer.Output;
+  SizeM1 := LocalPrevOutput.Size - 1;
+
+  if (FOutput.Size = FOutputError.Size) and (FOutputErrorDeriv.Size = FOutput.Size) then
+  begin
+    for OutputCnt := 0 to SizeM1 do
+    begin
+      x := LocalPrevOutput.FData[OutputCnt];
+      // Numerical stability: for large positive x, exp(x) overflows
+      // softplus(x) = ln(1 + exp(x)) ≈ x for large x
+      if x > 20 then
+      begin
+        // For large x: softplus(x) ≈ x, tanh(x) ≈ 1, sigmoid(x) ≈ 1
+        // Mish(x) ≈ x * 1 = x
+        // Mish'(x) ≈ 1 + x * sech^2(x) * 1 ≈ 1 (since sech^2(x) → 0 for large x)
+        FOutput.FData[OutputCnt] := x;
+        FOutputErrorDeriv.FData[OutputCnt] := 1.0;
+      end
+      else if x < -20 then
+      begin
+        // For very negative x: softplus(x) ≈ exp(x) ≈ 0, tanh(0) = 0
+        // Mish(x) ≈ 0
+        // Mish'(x) ≈ 0
+        FOutput.FData[OutputCnt] := 0;
+        FOutputErrorDeriv.FData[OutputCnt] := 0;
+      end
+      else
+      begin
+        expVal := Exp(x);
+        softplus := Ln(1 + expVal);
+        tanhSP := Tanh(softplus);
+        outputVal := x * tanhSP;
+        FOutput.FData[OutputCnt] := outputVal;
+        // Derivative: Mish'(x) = tanh(softplus(x)) + x * sigmoid(x) * (1 - tanh^2(softplus(x)))
+        //           = tanh(sp) + x * sech^2(sp) * sigmoid(x)
+        // Using omega = exp(x) and delta = 1 + exp(x)
+        // sigmoid(x) = omega / delta
+        omega := expVal;
+        delta := 1 + expVal;
+        FOutputErrorDeriv.FData[OutputCnt] := tanhSP + x * (1 - tanhSP * tanhSP) * omega / delta;
+      end;
+    end;
+  end
+  else
+  begin
+    // can't calculate error on input layers.
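+    // (forward-only branch: same Mish formula and the same softplus overflow guards as above,
+    //  but FOutputErrorDeriv is not written)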
+    for OutputCnt := 0 to SizeM1 do
+    begin
+      x := LocalPrevOutput.FData[OutputCnt];
+      // Numerical stability
+      if x > 20 then
+        softplus := x
+      else if x < -20 then
+        softplus := Exp(x)
+      else
+        softplus := Ln(1 + Exp(x));
+      FOutput.FData[OutputCnt] := x * Tanh(softplus);
+    end;
+  end;
+  FForwardTime := FForwardTime + (Now() - StartTime);
+end;
+
+procedure TNNetMish.Backpropagate();
+var
+  StartTime: double;
+begin
+  StartTime := Now();
+  Inc(FBackPropCallCurrentCnt);
+  if FBackPropCallCurrentCnt < FDepartingBranchesCnt then exit;
+  TestBackPropCallCurrCnt();
+  // Apply chain rule: multiply error by derivative computed in Compute()
+  if (FOutput.Size = FOutputError.Size) and (FOutputErrorDeriv.Size = FOutput.Size) then
+  begin
+    FOutputError.Mul(FOutputErrorDeriv);
+  end;
+  FBackwardTime := FBackwardTime + (Now() - StartTime);
+  inherited BackpropagateNoTest();
+end;
+
 { TNNetInterleaveChannels }
 procedure TNNetInterleaveChannels.SetPrevLayer(pPrevLayer: TNNetLayer);
@@ -12511,6 +12698,8 @@ function TNNet.CreateLayer(strData: string): TNNetLayer;
       'TNNetReLUP' : Result := TNNetReLUP.Create();
       'TNNetSwish' : Result := TNNetSwish.Create();
       'TNNetHardSwish' : Result := TNNetHardSwish.Create();
+      'TNNetGELU' : Result := TNNetGELU.Create();
+      'TNNetMish' : Result := TNNetMish.Create();
       'TNNetSwish6' : Result := TNNetSwish6.Create();
       'TNNetReLUSqrt': Result := TNNetReLUSqrt.Create();
       'TNNetReLUL' : Result := TNNetReLUL.Create(St[0], St[1], St[2]);
@@ -12622,6 +12811,8 @@ function TNNet.CreateLayer(strData: string): TNNetLayer;
       if S[0] = 'TNNetReLUP' then Result := TNNetReLUP.Create() else
       if S[0] = 'TNNetSwish' then Result := TNNetSwish.Create() else
       if S[0] = 'TNNetHardSwish' then Result := TNNetHardSwish.Create() else
+      if S[0] = 'TNNetGELU' then Result := TNNetGELU.Create() else
+      if S[0] = 'TNNetMish' then Result := TNNetMish.Create() else
      if S[0] = 'TNNetSwish6' then Result := TNNetSwish6.Create() else
       if S[0] = 'TNNetReLUSqrt' then Result := TNNetReLUSqrt.Create() else
       if S[0] = 'TNNetReLUL' then Result := TNNetReLUL.Create(St[0], St[1], St[2]) else
@@ -17358,3 +17549,4 @@ initialization
 end.
+
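For reference, the tanh-approximation GELU and the exact Mish formula implemented above produce roughly the values asserted by the tests that follow. A minimal standalone Free Pascal sketch (not part of the patch; it only restates the formulas from TNNetGELU.Compute and TNNetMish.Compute):

program ActivationReference;
{$mode objfpc}
uses Math;

function GELU(x: Double): Double;
begin
  // Tanh approximation with the same constants as TNNetGELU.Compute.
  Result := 0.5 * x * (1 + Tanh(0.7978845608 * (x + 0.044715 * x * x * x)));
end;

function Mish(x: Double): Double;
begin
  // Mish(x) = x * tanh(ln(1 + exp(x)))
  Result := x * Tanh(Ln(1 + Exp(x)));
end;

begin
  // Expected output (approximately):
  // GELU: 0.000  0.841  -0.159  1.955  -0.045
  // Mish: 0.000  0.865  -0.303  1.944  -0.252
  WriteLn('GELU: ', GELU(0.0):0:3, '  ', GELU(1.0):0:3, '  ', GELU(-1.0):0:3, '  ', GELU(2.0):0:3, '  ', GELU(-2.0):0:3);
  WriteLn('Mish: ', Mish(0.0):0:3, '  ', Mish(1.0):0:3, '  ', Mish(-1.0):0:3, '  ', Mish(2.0):0:3, '  ', Mish(-2.0):0:3);
end.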
diff --git a/tests/TestNeuralLayers.pas b/tests/TestNeuralLayers.pas
index 137619a..de5d36d 100644
--- a/tests/TestNeuralLayers.pas
+++ b/tests/TestNeuralLayers.pas
@@ -42,6 +42,12 @@ TTestNeuralLayers = class(TTestCase)
     procedure TestSwishActivation;
     procedure TestHyperbolicTangent;
     procedure TestSELUActivation;
+    procedure TestGELUActivation;
+    procedure TestMishActivation;
+    procedure TestGELUSaveLoad;
+    procedure TestMishSaveLoad;
+    procedure TestGELUBackpropagation;
+    procedure TestMishBackpropagation;
     // Additional pooling tests
     procedure TestMaxChannel;
     procedure TestAvgChannel;
@@ -904,6 +910,304 @@ procedure TTestNeuralLayers.TestSELUActivation;
   end;
 end;
+procedure TTestNeuralLayers.TestGELUActivation;
+var
+  NN: TNNet;
+  Input: TNNetVolume;
+  OutputLayer: TNNetLayer;
+  ExpectedGELU0, ExpectedGELU1, ExpectedGELUNeg1: TNeuralFloat;
+const
+  SQRT_2_OVER_PI = 0.7978845608;
+  GELU_CONST = 0.044715;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(5, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(5));
+    NN.AddLayer(TNNetGELU.Create());
+
+    // Test values: 0, 1, -1, 2, -2
+    Input.Raw[0] := 0.0;
+    Input.Raw[1] := 1.0;
+    Input.Raw[2] := -1.0;
+    Input.Raw[3] := 2.0;
+    Input.Raw[4] := -2.0;
+
+    NN.Compute(Input);
+
+    OutputLayer := NN.GetLastLayer;
+
+    // GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715*x^3)))
+    // GELU(0) = 0
+    ExpectedGELU0 := 0.0;
+    AssertEquals('GELU of 0 should be 0', ExpectedGELU0, OutputLayer.Output.Raw[0], 0.0001);
+
+    // GELU(1) ≈ 0.8413
+    ExpectedGELU1 := 0.5 * 1.0 * (1 + Tanh(SQRT_2_OVER_PI * (1.0 + GELU_CONST * 1.0)));
+    AssertEquals('GELU of 1 should match approximation', ExpectedGELU1, OutputLayer.Output.Raw[1], 0.001);
+    AssertTrue('GELU of 1 should be around 0.84', Abs(OutputLayer.Output.Raw[1] - 0.841) < 0.01);
+
+    // GELU(-1) ≈ -0.1587 (close to 0 but negative)
+    ExpectedGELUNeg1 := 0.5 * (-1.0) * (1 + Tanh(SQRT_2_OVER_PI * (-1.0 + GELU_CONST * (-1.0))));
+    AssertEquals('GELU of -1 should match approximation', ExpectedGELUNeg1, OutputLayer.Output.Raw[2], 0.001);
+    AssertTrue('GELU of -1 should be around -0.16', Abs(OutputLayer.Output.Raw[2] - (-0.159)) < 0.02);
+
+    // GELU(2) should be close to 2 (almost linear for large positive values)
+    AssertTrue('GELU of 2 should be close to 2', Abs(OutputLayer.Output.Raw[3] - 1.96) < 0.1);
+
+    // GELU(-2) should be very small (close to 0)
+    AssertTrue('GELU of -2 should be close to 0', Abs(OutputLayer.Output.Raw[4]) < 0.05);
+
+    // Note: GELU is not monotonic for negative inputs: GELU(-2) > GELU(-1) even though -2 < -1,
+    // because GELU dips below zero and then returns towards 0 for large negative values.
+    // Expected value order: GELU(-1) < GELU(-2) < GELU(0) < GELU(1) < GELU(2)
+    AssertTrue('GELU outputs should follow the expected value order',
+      (OutputLayer.Output.Raw[2] < OutputLayer.Output.Raw[4]) and
+      (OutputLayer.Output.Raw[4] < OutputLayer.Output.Raw[0]) and
+      (OutputLayer.Output.Raw[0] < OutputLayer.Output.Raw[1]) and
+      (OutputLayer.Output.Raw[1] < OutputLayer.Output.Raw[3]));
+
+  finally
+    NN.Free;
+    Input.Free;
+  end;
+end;
+
+procedure TTestNeuralLayers.TestMishActivation;
+var
+  NN: TNNet;
+  Input: TNNetVolume;
+  OutputLayer: TNNetLayer;
+  ExpectedMish0, ExpectedMish1, ExpectedMishNeg1: TNeuralFloat;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(5, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(5));
+    NN.AddLayer(TNNetMish.Create());
+
+    // Test values: 0, 1, -1, 2, -2
+    Input.Raw[0] := 0.0;
+    Input.Raw[1] := 1.0;
+    Input.Raw[2] := -1.0;
+    Input.Raw[3] := 2.0;
+    Input.Raw[4] := -2.0;
+
+    NN.Compute(Input);
+
+    OutputLayer := NN.GetLastLayer;
+
+    // Mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
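+    // Reference values from this formula: Mish(1) ≈ 0.865, Mish(-1) ≈ -0.303,
+    // Mish(2) ≈ 1.944, Mish(-2) ≈ -0.252 (asserted below).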
+    // Mish(0) = 0 * tanh(ln(2)) = 0
+    ExpectedMish0 := 0.0;
+    AssertEquals('Mish of 0 should be 0', ExpectedMish0, OutputLayer.Output.Raw[0], 0.0001);
+
+    // Mish(1) ≈ 0.8651
+    ExpectedMish1 := 1.0 * Tanh(Ln(1 + Exp(1.0)));
+    AssertEquals('Mish of 1 should match formula', ExpectedMish1, OutputLayer.Output.Raw[1], 0.001);
+    AssertTrue('Mish of 1 should be around 0.865', Abs(OutputLayer.Output.Raw[1] - 0.865) < 0.01);
+
+    // Mish(-1) ≈ -0.3034
+    ExpectedMishNeg1 := -1.0 * Tanh(Ln(1 + Exp(-1.0)));
+    AssertEquals('Mish of -1 should match formula', ExpectedMishNeg1, OutputLayer.Output.Raw[2], 0.001);
+    AssertTrue('Mish of -1 should be around -0.30', Abs(OutputLayer.Output.Raw[2] - (-0.303)) < 0.02);
+
+    // Mish(2) should be close to 2 (almost linear for large positive values)
+    AssertTrue('Mish of 2 should be close to 2', Abs(OutputLayer.Output.Raw[3] - 1.94) < 0.1);
+
+    // Mish(-2) ≈ -0.2525 (negative but not close to 0)
+    AssertTrue('Mish of -2 should be around -0.25', Abs(OutputLayer.Output.Raw[4] - (-0.252)) < 0.05);
+
+    // Test non-monotonicity for negative values (a characteristic of Mish)
+    // For very negative values, Mish approaches 0 from below
+    // Mish(-1) is more negative than Mish(-2) which is closer to 0
+    // So |Mish(-1)| > |Mish(-2)|
+    AssertTrue('Mish shows non-monotonic behavior for negative values',
+      Abs(OutputLayer.Output.Raw[2]) > Abs(OutputLayer.Output.Raw[4]));
+
+  finally
+    NN.Free;
+    Input.Free;
+  end;
+end;
+
+procedure TTestNeuralLayers.TestGELUSaveLoad;
+var
+  NN, NN2: TNNet;
+  Input: TNNetVolume;
+  StructStr: string;
+  Output1, Output2: TNeuralFloat;
+begin
+  NN := TNNet.Create();
+  NN2 := TNNet.Create();
+  Input := TNNetVolume.Create(3, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(3));
+    NN.AddLayer(TNNetFullConnectLinear.Create(2));
+    NN.AddLayer(TNNetGELU.Create());
+
+    Input.Raw[0] := 0.5;
+    Input.Raw[1] := -0.5;
+    Input.Raw[2] := 1.0;
+
+    NN.Compute(Input);
+    Output1 := NN.GetLastLayer.Output.Raw[0];
+
+    // Save and load
+    StructStr := NN.SaveToString();
+    NN2.LoadFromString(StructStr);
+
+    NN2.Compute(Input);
+    Output2 := NN2.GetLastLayer.Output.Raw[0];
+
+    AssertEquals('GELU output should be same after save/load', Output1, Output2, 0.0001);
+    AssertEquals('Layer count should match after load', NN.CountLayers(), NN2.CountLayers());
+
+  finally
+    NN.Free;
+    NN2.Free;
+    Input.Free;
+  end;
+end;
+
+procedure TTestNeuralLayers.TestMishSaveLoad;
+var
+  NN, NN2: TNNet;
+  Input: TNNetVolume;
+  StructStr: string;
+  Output1, Output2: TNeuralFloat;
+begin
+  NN := TNNet.Create();
+  NN2 := TNNet.Create();
+  Input := TNNetVolume.Create(3, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(3));
+    NN.AddLayer(TNNetFullConnectLinear.Create(2));
+    NN.AddLayer(TNNetMish.Create());
+
+    Input.Raw[0] := 0.5;
+    Input.Raw[1] := -0.5;
+    Input.Raw[2] := 1.0;
+
+    NN.Compute(Input);
+    Output1 := NN.GetLastLayer.Output.Raw[0];
+
+    // Save and load
+    StructStr := NN.SaveToString();
+    NN2.LoadFromString(StructStr);
+
+    NN2.Compute(Input);
+    Output2 := NN2.GetLastLayer.Output.Raw[0];
+
+    AssertEquals('Mish output should be same after save/load', Output1, Output2, 0.0001);
+    AssertEquals('Layer count should match after load', NN.CountLayers(), NN2.CountLayers());
+
+  finally
+    NN.Free;
+    NN2.Free;
+    Input.Free;
+  end;
+end;
+
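+// The two backpropagation tests below rely on the derivative that Compute() stores in
+// FOutputErrorDeriv being applied to FOutputError (chain rule) in Backpropagate(), so the
+// error on this single training sample is expected to fall (or at least end below 0.5, as asserted).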
+procedure TTestNeuralLayers.TestGELUBackpropagation;
+var
+  NN: TNNet;
+  Input, Target: TNNetVolume;
+  ErrorBefore, ErrorAfter: TNeuralFloat;
+  Epoch: integer;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(2, 1, 1);
+  Target := TNNetVolume.Create(1, 1, 1);
+  try
+    // Create a simple network with GELU activation
+    NN.AddLayer(TNNetInput.Create(2));
+    NN.AddLayer(TNNetFullConnectLinear.Create(4));
+    NN.AddLayer(TNNetGELU.Create());
+    NN.AddLayer(TNNetFullConnectLinear.Create(1));
+
+    NN.SetLearningRate(0.1, 0.0);
+
+    // XOR-like problem
+    Input.Raw[0] := 1.0;
+    Input.Raw[1] := 0.0;
+    Target.Raw[0] := 1.0;
+
+    // Compute initial error
+    NN.Compute(Input);
+    ErrorBefore := Abs(NN.GetLastLayer.Output.Raw[0] - Target.Raw[0]);
+
+    // Train for multiple epochs
+    for Epoch := 1 to 100 do
+    begin
+      NN.Compute(Input);
+      NN.Backpropagate(Target);
+    end;
+
+    // Compute final error
+    NN.Compute(Input);
+    ErrorAfter := Abs(NN.GetLastLayer.Output.Raw[0] - Target.Raw[0]);
+
+    // Error should decrease (learning is happening through backpropagation)
+    AssertTrue('GELU network should learn (error should decrease)',
+      (ErrorAfter < ErrorBefore) or (ErrorAfter < 0.5));
+
+  finally
+    NN.Free;
+    Input.Free;
+    Target.Free;
+  end;
+end;
+
+procedure TTestNeuralLayers.TestMishBackpropagation;
+var
+  NN: TNNet;
+  Input, Target: TNNetVolume;
+  ErrorBefore, ErrorAfter: TNeuralFloat;
+  Epoch: integer;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(2, 1, 1);
+  Target := TNNetVolume.Create(1, 1, 1);
+  try
+    // Create a simple network with Mish activation
+    NN.AddLayer(TNNetInput.Create(2));
+    NN.AddLayer(TNNetFullConnectLinear.Create(4));
+    NN.AddLayer(TNNetMish.Create());
+    NN.AddLayer(TNNetFullConnectLinear.Create(1));
+
+    NN.SetLearningRate(0.1, 0.0);
+
+    // XOR-like problem
+    Input.Raw[0] := 1.0;
+    Input.Raw[1] := 0.0;
+    Target.Raw[0] := 1.0;
+
+    // Compute initial error
+    NN.Compute(Input);
+    ErrorBefore := Abs(NN.GetLastLayer.Output.Raw[0] - Target.Raw[0]);
+
+    // Train for multiple epochs
+    for Epoch := 1 to 100 do
+    begin
+      NN.Compute(Input);
+      NN.Backpropagate(Target);
+    end;
+
+    // Compute final error
+    NN.Compute(Input);
+    ErrorAfter := Abs(NN.GetLastLayer.Output.Raw[0] - Target.Raw[0]);
+
+    // Error should decrease (learning is happening through backpropagation)
+    AssertTrue('Mish network should learn (error should decrease)',
+      (ErrorAfter < ErrorBefore) or (ErrorAfter < 0.5));
+
+  finally
+    NN.Free;
+    Input.Free;
+    Target.Free;
+  end;
+end;
+
 procedure TTestNeuralLayers.TestMaxChannel;
 var
   NN: TNNet;
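The numerical tests in the next file check the forward values against the formulas and then compare the analytic derivative stored in FOutputErrorDeriv with a central-difference estimate, f'(x) ≈ (f(x + eps) - f(x - eps)) / (2 * eps). A minimal standalone sketch of that check for GELU, using only the formulas already present in the patch (not part of the patch itself):

program GradientCheckSketch;
{$mode objfpc}
uses Math;

const
  SQRT_2_OVER_PI = 0.7978845608;
  GELU_CONST     = 0.044715;

function GELU(x: Double): Double;
begin
  Result := 0.5 * x * (1 + Tanh(SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x)));
end;

function GELUDerivative(x: Double): Double;
var
  t, cdf: Double;
begin
  // Same expression that TNNetGELU.Compute stores in FOutputErrorDeriv.
  t := Tanh(SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x));
  cdf := 0.5 * (1 + t);
  Result := cdf + 0.5 * x * (1 - t * t) * SQRT_2_OVER_PI * (1 + 3 * GELU_CONST * x * x);
end;

var
  x, eps, numerical, analytic: Double;
begin
  x := 1.0;
  eps := 0.0001;
  // Central difference: f'(x) ≈ (f(x + eps) - f(x - eps)) / (2 * eps)
  numerical := (GELU(x + eps) - GELU(x - eps)) / (2 * eps);
  analytic := GELUDerivative(x);
  // Both should be about 1.083 at x = 1; the difference should be tiny.
  WriteLn('numerical = ', numerical:0:6, '  analytic = ', analytic:0:6);
end.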
diff --git a/tests/TestNeuralNumerical.pas b/tests/TestNeuralNumerical.pas
index a280cad..34b7c16 100644
--- a/tests/TestNeuralNumerical.pas
+++ b/tests/TestNeuralNumerical.pas
@@ -34,6 +34,10 @@ TTestNeuralNumerical = class(TTestCase)
     procedure TestTanhNumericalRange;
     procedure TestSwishNumericalValues;
     procedure TestHardSwishNumericalValues;
+    procedure TestGELUNumericalValues;
+    procedure TestMishNumericalValues;
+    procedure TestGELUGradientCheck;
+    procedure TestMishGradientCheck;
     // Depthwise convolution numerical tests
     procedure TestDepthwiseConvNumerical;
@@ -728,6 +732,235 @@ procedure TTestNeuralNumerical.TestHardSwishNumericalValues;
   end;
 end;
+procedure TTestNeuralNumerical.TestGELUNumericalValues;
+var
+  NN: TNNet;
+  Input: TNNetVolume;
+  x, tanhArg, tanhVal, expected: TNeuralFloat;
+const
+  SQRT_2_OVER_PI = 0.7978845608;
+  GELU_CONST = 0.044715;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(7, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(7));
+    NN.AddLayer(TNNetGELU.Create());
+
+    // Test a range of values
+    Input.Raw[0] := 0.0;
+    Input.Raw[1] := 1.0;
+    Input.Raw[2] := -1.0;
+    Input.Raw[3] := 2.0;
+    Input.Raw[4] := -2.0;
+    Input.Raw[5] := 0.5;
+    Input.Raw[6] := -0.5;
+
+    NN.Compute(Input);
+
+    // GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715*x^3)))
+    // Test each value against the formula
+    x := 0.0;
+    tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x);
+    tanhVal := Tanh(tanhArg);
+    expected := 0.5 * x * (1 + tanhVal);
+    AssertEquals('GELU(0) should match formula', expected, NN.GetLastLayer.Output.Raw[0], 0.0001);
+
+    x := 1.0;
+    tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x);
+    tanhVal := Tanh(tanhArg);
+    expected := 0.5 * x * (1 + tanhVal);
+    AssertEquals('GELU(1) should match formula', expected, NN.GetLastLayer.Output.Raw[1], 0.0001);
+
+    x := -1.0;
+    tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x);
+    tanhVal := Tanh(tanhArg);
+    expected := 0.5 * x * (1 + tanhVal);
+    AssertEquals('GELU(-1) should match formula', expected, NN.GetLastLayer.Output.Raw[2], 0.0001);
+
+    // Verify known approximate values
+    AssertTrue('GELU(1) ≈ 0.841', Abs(NN.GetLastLayer.Output.Raw[1] - 0.841) < 0.01);
+    AssertTrue('GELU(-1) ≈ -0.159', Abs(NN.GetLastLayer.Output.Raw[2] - (-0.159)) < 0.01);
+    AssertTrue('GELU(2) ≈ 1.955', Abs(NN.GetLastLayer.Output.Raw[3] - 1.955) < 0.01);
+
+    // Verify asymptotic behavior
+    AssertTrue('GELU approaches identity for large positive x', NN.GetLastLayer.Output.Raw[3] > 1.9);
+    AssertTrue('GELU approaches 0 for large negative x', Abs(NN.GetLastLayer.Output.Raw[4]) < 0.1);
+
+  finally
+    NN.Free;
+    Input.Free;
+  end;
+end;
+
+procedure TTestNeuralNumerical.TestMishNumericalValues;
+var
+  NN: TNNet;
+  Input: TNNetVolume;
+  x, softplus, expected: TNeuralFloat;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(7, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(7));
+    NN.AddLayer(TNNetMish.Create());
+
+    // Test a range of values
+    Input.Raw[0] := 0.0;
+    Input.Raw[1] := 1.0;
+    Input.Raw[2] := -1.0;
+    Input.Raw[3] := 2.0;
+    Input.Raw[4] := -2.0;
+    Input.Raw[5] := 0.5;
+    Input.Raw[6] := -0.5;
+
+    NN.Compute(Input);
+
+    // Mish(x) = x * tanh(ln(1 + exp(x)))
+    // Test each value against the formula
+    x := 0.0;
+    softplus := Ln(1 + Exp(x));
+    expected := x * Tanh(softplus);
+    AssertEquals('Mish(0) should match formula', expected, NN.GetLastLayer.Output.Raw[0], 0.0001);
+
+    x := 1.0;
+    softplus := Ln(1 + Exp(x));
+    expected := x * Tanh(softplus);
+    AssertEquals('Mish(1) should match formula', expected, NN.GetLastLayer.Output.Raw[1], 0.0001);
+
+    x := -1.0;
+    softplus := Ln(1 + Exp(x));
+    expected := x * Tanh(softplus);
+    AssertEquals('Mish(-1) should match formula', expected, NN.GetLastLayer.Output.Raw[2], 0.0001);
+
+    // Verify known approximate values
+    AssertTrue('Mish(0) = 0', Abs(NN.GetLastLayer.Output.Raw[0]) < 0.0001);
+    AssertTrue('Mish(1) ≈ 0.865', Abs(NN.GetLastLayer.Output.Raw[1] - 0.865) < 0.01);
+    AssertTrue('Mish(-1) ≈ -0.303', Abs(NN.GetLastLayer.Output.Raw[2] - (-0.303)) < 0.01);
+
+    // Verify asymptotic behavior
+    AssertTrue('Mish approaches identity for large positive x', NN.GetLastLayer.Output.Raw[3] > 1.9);
+    AssertTrue('Mish is non-monotonic for negative x',
+      Abs(NN.GetLastLayer.Output.Raw[2]) > Abs(NN.GetLastLayer.Output.Raw[4]));
+
+  finally
+    NN.Free;
+    Input.Free;
+  end;
+end;
+
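+// The two gradient checks below create the input layer with pError = 1 so that the error and
+// derivative volumes are allocated; only then does Compute() store the analytic derivative in
+// FOutputErrorDeriv, which is compared against the central-difference estimate.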
+procedure TTestNeuralNumerical.TestGELUGradientCheck;
+var
+  NN: TNNet;
+  Input, InputPlus, InputMinus: TNNetVolume;
+  epsilon: TNeuralFloat;
+  numericalGrad, analyticalGrad: TNeuralFloat;
+  i: integer;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(3, 1, 1);
+  InputPlus := TNNetVolume.Create(3, 1, 1);
+  InputMinus := TNNetVolume.Create(3, 1, 1);
+  epsilon := 0.0001;
+  try
+    NN.AddLayer(TNNetInput.Create(3, 1, 1, 1)); // pError=1 resizes error volumes
+    NN.AddLayer(TNNetGELU.Create());
+
+    Input.Raw[0] := 0.5;
+    Input.Raw[1] := -0.5;
+    Input.Raw[2] := 1.0;
+
+    // Compute forward pass to get the derivative
+    NN.Compute(Input);
+
+    // Check gradient at each input position
+    for i := 0 to 2 do
+    begin
+      // Compute f(x + epsilon)
+      InputPlus.Copy(Input);
+      InputPlus.Raw[i] := Input.Raw[i] + epsilon;
+      NN.Compute(InputPlus);
+      numericalGrad := NN.GetLastLayer.Output.Raw[i];
+
+      // Compute f(x - epsilon)
+      InputMinus.Copy(Input);
+      InputMinus.Raw[i] := Input.Raw[i] - epsilon;
+      NN.Compute(InputMinus);
+      numericalGrad := (numericalGrad - NN.GetLastLayer.Output.Raw[i]) / (2 * epsilon);
+
+      // Get analytical gradient from the layer's error derivative
+      NN.Compute(Input);
+      analyticalGrad := NN.GetLastLayer.OutputErrorDeriv.Raw[i];
+
+      // Compare numerical and analytical gradients
+      AssertTrue('GELU gradient check at position ' + IntToStr(i),
+        Abs(numericalGrad - analyticalGrad) < 0.01);
+    end;
+
+  finally
+    NN.Free;
+    Input.Free;
+    InputPlus.Free;
+    InputMinus.Free;
+  end;
+end;
+
+procedure TTestNeuralNumerical.TestMishGradientCheck;
+var
+  NN: TNNet;
+  Input, InputPlus, InputMinus: TNNetVolume;
+  epsilon: TNeuralFloat;
+  numericalGrad, analyticalGrad: TNeuralFloat;
+  i: integer;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(3, 1, 1);
+  InputPlus := TNNetVolume.Create(3, 1, 1);
+  InputMinus := TNNetVolume.Create(3, 1, 1);
+  epsilon := 0.0001;
+  try
+    NN.AddLayer(TNNetInput.Create(3, 1, 1, 1)); // pError=1 resizes error volumes
+    NN.AddLayer(TNNetMish.Create());
+
+    Input.Raw[0] := 0.5;
+    Input.Raw[1] := -0.5;
+    Input.Raw[2] := 1.0;
+
+    // Compute forward pass to get the derivative
+    NN.Compute(Input);
+
+    // Check gradient at each input position
+    for i := 0 to 2 do
+    begin
+      // Compute f(x + epsilon)
+      InputPlus.Copy(Input);
+      InputPlus.Raw[i] := Input.Raw[i] + epsilon;
+      NN.Compute(InputPlus);
+      numericalGrad := NN.GetLastLayer.Output.Raw[i];
+
+      // Compute f(x - epsilon)
+      InputMinus.Copy(Input);
+      InputMinus.Raw[i] := Input.Raw[i] - epsilon;
+      NN.Compute(InputMinus);
+      numericalGrad := (numericalGrad - NN.GetLastLayer.Output.Raw[i]) / (2 * epsilon);
+
+      // Get analytical gradient from the layer's error derivative
+      NN.Compute(Input);
+      analyticalGrad := NN.GetLastLayer.OutputErrorDeriv.Raw[i];
+
+      // Compare numerical and analytical gradients
+      AssertTrue('Mish gradient check at position ' + IntToStr(i),
+        Abs(numericalGrad - analyticalGrad) < 0.01);
+    end;
+
+  finally
+    NN.Free;
+    Input.Free;
+    InputPlus.Free;
+    InputMinus.Free;
+  end;
+end;
+
 procedure TTestNeuralNumerical.TestDepthwiseConvNumerical;
 var
   NN: TNNet;
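For reference, once the patch is applied the new activations are registered in TNNet.CreateLayer and can be dropped into a model like any other activation layer in the API. An illustrative fragment only (layer sizes are arbitrary; assumes the usual neuralnetwork/neuralvolume units are in the uses clause):

var
  NN: TNNet;
begin
  NN := TNNet.Create();
  NN.AddLayer(TNNetInput.Create(16));
  NN.AddLayer(TNNetFullConnectLinear.Create(32));
  NN.AddLayer(TNNetGELU.Create());   // or TNNetMish.Create()
  NN.AddLayer(TNNetFullConnectLinear.Create(10));
  // ... train / evaluate as usual ...
  NN.Free;
end;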