diff --git a/neural/neuralnetwork.pas b/neural/neuralnetwork.pas
index d71459d..edb6543 100644
--- a/neural/neuralnetwork.pas
+++ b/neural/neuralnetwork.pas
@@ -596,6 +596,26 @@ TNNetHardSwish = class(TNNetReLUBase)
       procedure Compute(); override;
   end;
+  /// Gaussian Error Linear Unit (GELU) activation function - This is an experimental layer. Do not use it.
+  // A smooth activation function popular in transformer models like BERT and GPT.
+  // Uses the tanh approximation formula: GELU(x) = 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))
+  // https://arxiv.org/abs/1606.08415
+  TNNetGELU = class(TNNetReLUBase)
+    public
+      procedure Compute(); override;
+      procedure Backpropagate(); override;
+  end;
+
+  /// Mish activation function - This is an experimental layer. Do not use it.
+  // A smooth, non-monotonic self-regularizing activation function.
+  // Mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
+  // https://arxiv.org/abs/1908.08681
+  TNNetMish = class(TNNetReLUBase)
+    public
+      procedure Compute(); override;
+      procedure Backpropagate(); override;
+  end;
+
   /// Swish activation function with maximum limit of 6
   TNNetSwish6 = class(TNNetReLUBase)
     public
       procedure Compute(); override;
   end;
@@ -3309,6 +3329,173 @@ procedure TNNetSwish.Compute();
   FForwardTime := FForwardTime + (Now() - StartTime);
 end;
+{ TNNetGELU }
+
+procedure TNNetGELU.Compute();
+var
+  SizeM1: integer;
+  LocalPrevOutput: TNNetVolume;
+  OutputCnt: integer;
+  StartTime: double;
+  x: TNeuralFloat;
+  x3: TNeuralFloat;
+  tanhArg: TNeuralFloat;
+  tanhVal: TNeuralFloat;
+  outputVal: TNeuralFloat;
+  cdf: TNeuralFloat;
+const
+  // sqrt(2/pi) ≈ 0.7978845608
+  SQRT_2_OVER_PI = 0.7978845608;
+  GELU_CONST = 0.044715;
+begin
+  StartTime := Now();
+  LocalPrevOutput := FPrevLayer.Output;
+  SizeM1 := LocalPrevOutput.Size - 1;
+
+  if (FOutput.Size = FOutputError.Size) and (FOutputErrorDeriv.Size = FOutput.Size) then
+  begin
+    for OutputCnt := 0 to SizeM1 do
+    begin
+      x := LocalPrevOutput.FData[OutputCnt];
+      x3 := x * x * x;
+      tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x3);
+      tanhVal := Tanh(tanhArg);
+      cdf := 0.5 * (1 + tanhVal);
+      outputVal := x * cdf;
+      FOutput.FData[OutputCnt] := outputVal;
+      // Derivative: GELU'(x) = cdf + x * pdf, where pdf is derivative of cdf
+      // pdf = 0.5 * (1 - tanh^2) * sqrt(2/pi) * (1 + 3*0.044715*x^2)
+      FOutputErrorDeriv.FData[OutputCnt] := cdf + 0.5 * x * (1 - tanhVal * tanhVal) *
+        SQRT_2_OVER_PI * (1 + 3 * GELU_CONST * x * x);
+    end;
+  end
+  else
+  begin
+    // can't calculate error on input layers.
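+    // (forward-only branch: when the error/derivative volumes are not sized for this layer,
+    //  e.g. when it sits right after an input layer, only the GELU value below is computed
+    //  and no derivative is stored)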
+    for OutputCnt := 0 to SizeM1 do
+    begin
+      x := LocalPrevOutput.FData[OutputCnt];
+      x3 := x * x * x;
+      tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x3);
+      FOutput.FData[OutputCnt] := 0.5 * x * (1 + Tanh(tanhArg));
+    end;
+  end;
+  FForwardTime := FForwardTime + (Now() - StartTime);
+end;
+
+procedure TNNetGELU.Backpropagate();
+var
+  StartTime: double;
+begin
+  StartTime := Now();
+  Inc(FBackPropCallCurrentCnt);
+  if FBackPropCallCurrentCnt < FDepartingBranchesCnt then exit;
+  TestBackPropCallCurrCnt();
+  // Apply chain rule: multiply error by derivative computed in Compute()
+  if (FOutput.Size = FOutputError.Size) and (FOutputErrorDeriv.Size = FOutput.Size) then
+  begin
+    FOutputError.Mul(FOutputErrorDeriv);
+  end;
+  FBackwardTime := FBackwardTime + (Now() - StartTime);
+  inherited BackpropagateNoTest();
+end;
+
+{ TNNetMish }
+
+procedure TNNetMish.Compute();
+var
+  SizeM1: integer;
+  LocalPrevOutput: TNNetVolume;
+  OutputCnt: integer;
+  StartTime: double;
+  x: TNeuralFloat;
+  softplus: TNeuralFloat;
+  expVal: TNeuralFloat;
+  tanhSP: TNeuralFloat;
+  outputVal: TNeuralFloat;
+  omega: TNeuralFloat;
+  delta: TNeuralFloat;
+begin
+  StartTime := Now();
+  LocalPrevOutput := FPrevLayer.Output;
+  SizeM1 := LocalPrevOutput.Size - 1;
+
+  if (FOutput.Size = FOutputError.Size) and (FOutputErrorDeriv.Size = FOutput.Size) then
+  begin
+    for OutputCnt := 0 to SizeM1 do
+    begin
+      x := LocalPrevOutput.FData[OutputCnt];
+      // Numerical stability: for large positive x, exp(x) overflows
+      // softplus(x) = ln(1 + exp(x)) ≈ x for large x
+      if x > 20 then
+      begin
+        // For large x: softplus(x) ≈ x, tanh(x) ≈ 1, sigmoid(x) ≈ 1
+        // Mish(x) ≈ x * 1 = x
+        // Mish'(x) ≈ 1 + x * sech^2(x) * 1 ≈ 1 (since sech^2(x) → 0 for large x)
+        FOutput.FData[OutputCnt] := x;
+        FOutputErrorDeriv.FData[OutputCnt] := 1.0;
+      end
+      else if x < -20 then
+      begin
+        // For very negative x: softplus(x) ≈ exp(x) ≈ 0, tanh(0) = 0
+        // Mish(x) ≈ 0
+        // Mish'(x) ≈ 0
+        FOutput.FData[OutputCnt] := 0;
+        FOutputErrorDeriv.FData[OutputCnt] := 0;
+      end
+      else
+      begin
+        expVal := Exp(x);
+        softplus := Ln(1 + expVal);
+        tanhSP := Tanh(softplus);
+        outputVal := x * tanhSP;
+        FOutput.FData[OutputCnt] := outputVal;
+        // Derivative: Mish'(x) = tanh(softplus(x)) + x * sigmoid(x) * (1 - tanh^2(softplus(x)))
+        //           = tanh(sp) + x * sech^2(sp) * sigmoid(x)
+        // Using omega = exp(x) and delta = 1 + exp(x)
+        // sigmoid(x) = omega / delta
+        omega := expVal;
+        delta := 1 + expVal;
+        FOutputErrorDeriv.FData[OutputCnt] := tanhSP + x * (1 - tanhSP * tanhSP) * omega / delta;
+      end;
+    end;
+  end
+  else
+  begin
+    // can't calculate error on input layers.
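+    // (forward-only branch: same Mish formula and the same softplus overflow guards as above,
+    //  but FOutputErrorDeriv is not written)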
+    for OutputCnt := 0 to SizeM1 do
+    begin
+      x := LocalPrevOutput.FData[OutputCnt];
+      // Numerical stability
+      if x > 20 then
+        softplus := x
+      else if x < -20 then
+        softplus := Exp(x)
+      else
+        softplus := Ln(1 + Exp(x));
+      FOutput.FData[OutputCnt] := x * Tanh(softplus);
+    end;
+  end;
+  FForwardTime := FForwardTime + (Now() - StartTime);
+end;
+
+procedure TNNetMish.Backpropagate();
+var
+  StartTime: double;
+begin
+  StartTime := Now();
+  Inc(FBackPropCallCurrentCnt);
+  if FBackPropCallCurrentCnt < FDepartingBranchesCnt then exit;
+  TestBackPropCallCurrCnt();
+  // Apply chain rule: multiply error by derivative computed in Compute()
+  if (FOutput.Size = FOutputError.Size) and (FOutputErrorDeriv.Size = FOutput.Size) then
+  begin
+    FOutputError.Mul(FOutputErrorDeriv);
+  end;
+  FBackwardTime := FBackwardTime + (Now() - StartTime);
+  inherited BackpropagateNoTest();
+end;
+
 { TNNetInterleaveChannels }
 procedure TNNetInterleaveChannels.SetPrevLayer(pPrevLayer: TNNetLayer);
@@ -12511,6 +12698,8 @@ function TNNet.CreateLayer(strData: string): TNNetLayer;
       'TNNetReLUP' : Result := TNNetReLUP.Create();
       'TNNetSwish' : Result := TNNetSwish.Create();
       'TNNetHardSwish' : Result := TNNetHardSwish.Create();
+      'TNNetGELU' : Result := TNNetGELU.Create();
+      'TNNetMish' : Result := TNNetMish.Create();
       'TNNetSwish6' : Result := TNNetSwish6.Create();
       'TNNetReLUSqrt': Result := TNNetReLUSqrt.Create();
       'TNNetReLUL' : Result := TNNetReLUL.Create(St[0], St[1], St[2]);
@@ -12622,6 +12811,8 @@ function TNNet.CreateLayer(strData: string): TNNetLayer;
       if S[0] = 'TNNetReLUP' then Result := TNNetReLUP.Create() else
       if S[0] = 'TNNetSwish' then Result := TNNetSwish.Create() else
       if S[0] = 'TNNetHardSwish' then Result := TNNetHardSwish.Create() else
+      if S[0] = 'TNNetGELU' then Result := TNNetGELU.Create() else
+      if S[0] = 'TNNetMish' then Result := TNNetMish.Create() else
      if S[0] = 'TNNetSwish6' then Result := TNNetSwish6.Create() else
       if S[0] = 'TNNetReLUSqrt' then Result := TNNetReLUSqrt.Create() else
       if S[0] = 'TNNetReLUL' then Result := TNNetReLUL.Create(St[0], St[1], St[2]) else
@@ -17358,3 +17549,4 @@ initialization
 end.
+
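For reference, the tanh-approximation GELU and the exact Mish formula implemented above produce roughly the values asserted by the tests that follow. A minimal standalone Free Pascal sketch (not part of the patch; it only restates the formulas from TNNetGELU.Compute and TNNetMish.Compute):

program ActivationReference;
{$mode objfpc}
uses Math;

function GELU(x: Double): Double;
begin
  // Tanh approximation with the same constants as TNNetGELU.Compute.
  Result := 0.5 * x * (1 + Tanh(0.7978845608 * (x + 0.044715 * x * x * x)));
end;

function Mish(x: Double): Double;
begin
  // Mish(x) = x * tanh(ln(1 + exp(x)))
  Result := x * Tanh(Ln(1 + Exp(x)));
end;

begin
  // Expected output (approximately):
  // GELU: 0.000  0.841  -0.159  1.955  -0.045
  // Mish: 0.000  0.865  -0.303  1.944  -0.252
  WriteLn('GELU: ', GELU(0.0):0:3, '  ', GELU(1.0):0:3, '  ', GELU(-1.0):0:3, '  ', GELU(2.0):0:3, '  ', GELU(-2.0):0:3);
  WriteLn('Mish: ', Mish(0.0):0:3, '  ', Mish(1.0):0:3, '  ', Mish(-1.0):0:3, '  ', Mish(2.0):0:3, '  ', Mish(-2.0):0:3);
end.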
diff --git a/tests/TestNeuralLayers.pas b/tests/TestNeuralLayers.pas
index 137619a..de5d36d 100644
--- a/tests/TestNeuralLayers.pas
+++ b/tests/TestNeuralLayers.pas
@@ -42,6 +42,12 @@ TTestNeuralLayers = class(TTestCase)
     procedure TestSwishActivation;
     procedure TestHyperbolicTangent;
     procedure TestSELUActivation;
+    procedure TestGELUActivation;
+    procedure TestMishActivation;
+    procedure TestGELUSaveLoad;
+    procedure TestMishSaveLoad;
+    procedure TestGELUBackpropagation;
+    procedure TestMishBackpropagation;
     // Additional pooling tests
     procedure TestMaxChannel;
     procedure TestAvgChannel;
@@ -904,6 +910,304 @@ procedure TTestNeuralLayers.TestSELUActivation;
   end;
 end;
+procedure TTestNeuralLayers.TestGELUActivation;
+var
+  NN: TNNet;
+  Input: TNNetVolume;
+  OutputLayer: TNNetLayer;
+  ExpectedGELU0, ExpectedGELU1, ExpectedGELUNeg1: TNeuralFloat;
+const
+  SQRT_2_OVER_PI = 0.7978845608;
+  GELU_CONST = 0.044715;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(5, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(5));
+    NN.AddLayer(TNNetGELU.Create());
+
+    // Test values: 0, 1, -1, 2, -2
+    Input.Raw[0] := 0.0;
+    Input.Raw[1] := 1.0;
+    Input.Raw[2] := -1.0;
+    Input.Raw[3] := 2.0;
+    Input.Raw[4] := -2.0;
+
+    NN.Compute(Input);
+
+    OutputLayer := NN.GetLastLayer;
+
+    // GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715*x^3)))
+    // GELU(0) = 0
+    ExpectedGELU0 := 0.0;
+    AssertEquals('GELU of 0 should be 0', ExpectedGELU0, OutputLayer.Output.Raw[0], 0.0001);
+
+    // GELU(1) ≈ 0.8413
+    ExpectedGELU1 := 0.5 * 1.0 * (1 + Tanh(SQRT_2_OVER_PI * (1.0 + GELU_CONST * 1.0)));
+    AssertEquals('GELU of 1 should match approximation', ExpectedGELU1, OutputLayer.Output.Raw[1], 0.001);
+    AssertTrue('GELU of 1 should be around 0.84', Abs(OutputLayer.Output.Raw[1] - 0.841) < 0.01);
+
+    // GELU(-1) ≈ -0.1587 (close to 0 but negative)
+    ExpectedGELUNeg1 := 0.5 * (-1.0) * (1 + Tanh(SQRT_2_OVER_PI * (-1.0 + GELU_CONST * (-1.0))));
+    AssertEquals('GELU of -1 should match approximation', ExpectedGELUNeg1, OutputLayer.Output.Raw[2], 0.001);
+    AssertTrue('GELU of -1 should be around -0.16', Abs(OutputLayer.Output.Raw[2] - (-0.159)) < 0.02);
+
+    // GELU(2) should be close to 2 (almost linear for large positive values)
+    AssertTrue('GELU of 2 should be close to 2', Abs(OutputLayer.Output.Raw[3] - 1.96) < 0.1);
+
+    // GELU(-2) should be very small (close to 0)
+    AssertTrue('GELU of -2 should be close to 0', Abs(OutputLayer.Output.Raw[4]) < 0.05);
+
+    // Note: GELU is not monotonic for negative inputs: GELU(-2) > GELU(-1) even though -2 < -1,
+    // because GELU dips below zero and then returns towards 0 for large negative values.
+    // Expected value order: GELU(-1) < GELU(-2) < GELU(0) < GELU(1) < GELU(2)
+    AssertTrue('GELU outputs should follow the expected value order',
+      (OutputLayer.Output.Raw[2] < OutputLayer.Output.Raw[4]) and
+      (OutputLayer.Output.Raw[4] < OutputLayer.Output.Raw[0]) and
+      (OutputLayer.Output.Raw[0] < OutputLayer.Output.Raw[1]) and
+      (OutputLayer.Output.Raw[1] < OutputLayer.Output.Raw[3]));
+
+  finally
+    NN.Free;
+    Input.Free;
+  end;
+end;
+
+procedure TTestNeuralLayers.TestMishActivation;
+var
+  NN: TNNet;
+  Input: TNNetVolume;
+  OutputLayer: TNNetLayer;
+  ExpectedMish0, ExpectedMish1, ExpectedMishNeg1: TNeuralFloat;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(5, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(5));
+    NN.AddLayer(TNNetMish.Create());
+
+    // Test values: 0, 1, -1, 2, -2
+    Input.Raw[0] := 0.0;
+    Input.Raw[1] := 1.0;
+    Input.Raw[2] := -1.0;
+    Input.Raw[3] := 2.0;
+    Input.Raw[4] := -2.0;
+
+    NN.Compute(Input);
+
+    OutputLayer := NN.GetLastLayer;
+
+    // Mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
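+    // Reference values from this formula: Mish(1) ≈ 0.865, Mish(-1) ≈ -0.303,
+    // Mish(2) ≈ 1.944, Mish(-2) ≈ -0.252 (asserted below).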
+    // Mish(0) = 0 * tanh(ln(2)) = 0
+    ExpectedMish0 := 0.0;
+    AssertEquals('Mish of 0 should be 0', ExpectedMish0, OutputLayer.Output.Raw[0], 0.0001);
+
+    // Mish(1) ≈ 0.8651
+    ExpectedMish1 := 1.0 * Tanh(Ln(1 + Exp(1.0)));
+    AssertEquals('Mish of 1 should match formula', ExpectedMish1, OutputLayer.Output.Raw[1], 0.001);
+    AssertTrue('Mish of 1 should be around 0.865', Abs(OutputLayer.Output.Raw[1] - 0.865) < 0.01);
+
+    // Mish(-1) ≈ -0.3034
+    ExpectedMishNeg1 := -1.0 * Tanh(Ln(1 + Exp(-1.0)));
+    AssertEquals('Mish of -1 should match formula', ExpectedMishNeg1, OutputLayer.Output.Raw[2], 0.001);
+    AssertTrue('Mish of -1 should be around -0.30', Abs(OutputLayer.Output.Raw[2] - (-0.303)) < 0.02);
+
+    // Mish(2) should be close to 2 (almost linear for large positive values)
+    AssertTrue('Mish of 2 should be close to 2', Abs(OutputLayer.Output.Raw[3] - 1.94) < 0.1);
+
+    // Mish(-2) ≈ -0.2525 (negative but not close to 0)
+    AssertTrue('Mish of -2 should be around -0.25', Abs(OutputLayer.Output.Raw[4] - (-0.252)) < 0.05);
+
+    // Test non-monotonicity for negative values (a characteristic of Mish)
+    // For very negative values, Mish approaches 0 from below
+    // Mish(-1) is more negative than Mish(-2) which is closer to 0
+    // So |Mish(-1)| > |Mish(-2)|
+    AssertTrue('Mish shows non-monotonic behavior for negative values',
+      Abs(OutputLayer.Output.Raw[2]) > Abs(OutputLayer.Output.Raw[4]));
+
+  finally
+    NN.Free;
+    Input.Free;
+  end;
+end;
+
+procedure TTestNeuralLayers.TestGELUSaveLoad;
+var
+  NN, NN2: TNNet;
+  Input: TNNetVolume;
+  StructStr: string;
+  Output1, Output2: TNeuralFloat;
+begin
+  NN := TNNet.Create();
+  NN2 := TNNet.Create();
+  Input := TNNetVolume.Create(3, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(3));
+    NN.AddLayer(TNNetFullConnectLinear.Create(2));
+    NN.AddLayer(TNNetGELU.Create());
+
+    Input.Raw[0] := 0.5;
+    Input.Raw[1] := -0.5;
+    Input.Raw[2] := 1.0;
+
+    NN.Compute(Input);
+    Output1 := NN.GetLastLayer.Output.Raw[0];
+
+    // Save and load
+    StructStr := NN.SaveToString();
+    NN2.LoadFromString(StructStr);
+
+    NN2.Compute(Input);
+    Output2 := NN2.GetLastLayer.Output.Raw[0];
+
+    AssertEquals('GELU output should be same after save/load', Output1, Output2, 0.0001);
+    AssertEquals('Layer count should match after load', NN.CountLayers(), NN2.CountLayers());
+
+  finally
+    NN.Free;
+    NN2.Free;
+    Input.Free;
+  end;
+end;
+
+procedure TTestNeuralLayers.TestMishSaveLoad;
+var
+  NN, NN2: TNNet;
+  Input: TNNetVolume;
+  StructStr: string;
+  Output1, Output2: TNeuralFloat;
+begin
+  NN := TNNet.Create();
+  NN2 := TNNet.Create();
+  Input := TNNetVolume.Create(3, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(3));
+    NN.AddLayer(TNNetFullConnectLinear.Create(2));
+    NN.AddLayer(TNNetMish.Create());
+
+    Input.Raw[0] := 0.5;
+    Input.Raw[1] := -0.5;
+    Input.Raw[2] := 1.0;
+
+    NN.Compute(Input);
+    Output1 := NN.GetLastLayer.Output.Raw[0];
+
+    // Save and load
+    StructStr := NN.SaveToString();
+    NN2.LoadFromString(StructStr);
+
+    NN2.Compute(Input);
+    Output2 := NN2.GetLastLayer.Output.Raw[0];
+
+    AssertEquals('Mish output should be same after save/load', Output1, Output2, 0.0001);
+    AssertEquals('Layer count should match after load', NN.CountLayers(), NN2.CountLayers());
+
+  finally
+    NN.Free;
+    NN2.Free;
+    Input.Free;
+  end;
+end;
+
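+// The two backpropagation tests below rely on the derivative that Compute() stores in
+// FOutputErrorDeriv being applied to FOutputError (chain rule) in Backpropagate(), so the
+// error on this single training sample is expected to fall (or at least end below 0.5, as asserted).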
+procedure TTestNeuralLayers.TestGELUBackpropagation;
+var
+  NN: TNNet;
+  Input, Target: TNNetVolume;
+  ErrorBefore, ErrorAfter: TNeuralFloat;
+  Epoch: integer;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(2, 1, 1);
+  Target := TNNetVolume.Create(1, 1, 1);
+  try
+    // Create a simple network with GELU activation
+    NN.AddLayer(TNNetInput.Create(2));
+    NN.AddLayer(TNNetFullConnectLinear.Create(4));
+    NN.AddLayer(TNNetGELU.Create());
+    NN.AddLayer(TNNetFullConnectLinear.Create(1));
+
+    NN.SetLearningRate(0.1, 0.0);
+
+    // XOR-like problem
+    Input.Raw[0] := 1.0;
+    Input.Raw[1] := 0.0;
+    Target.Raw[0] := 1.0;
+
+    // Compute initial error
+    NN.Compute(Input);
+    ErrorBefore := Abs(NN.GetLastLayer.Output.Raw[0] - Target.Raw[0]);
+
+    // Train for multiple epochs
+    for Epoch := 1 to 100 do
+    begin
+      NN.Compute(Input);
+      NN.Backpropagate(Target);
+    end;
+
+    // Compute final error
+    NN.Compute(Input);
+    ErrorAfter := Abs(NN.GetLastLayer.Output.Raw[0] - Target.Raw[0]);
+
+    // Error should decrease (learning is happening through backpropagation)
+    AssertTrue('GELU network should learn (error should decrease)',
+      (ErrorAfter < ErrorBefore) or (ErrorAfter < 0.5));
+
+  finally
+    NN.Free;
+    Input.Free;
+    Target.Free;
+  end;
+end;
+
+procedure TTestNeuralLayers.TestMishBackpropagation;
+var
+  NN: TNNet;
+  Input, Target: TNNetVolume;
+  ErrorBefore, ErrorAfter: TNeuralFloat;
+  Epoch: integer;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(2, 1, 1);
+  Target := TNNetVolume.Create(1, 1, 1);
+  try
+    // Create a simple network with Mish activation
+    NN.AddLayer(TNNetInput.Create(2));
+    NN.AddLayer(TNNetFullConnectLinear.Create(4));
+    NN.AddLayer(TNNetMish.Create());
+    NN.AddLayer(TNNetFullConnectLinear.Create(1));
+
+    NN.SetLearningRate(0.1, 0.0);
+
+    // XOR-like problem
+    Input.Raw[0] := 1.0;
+    Input.Raw[1] := 0.0;
+    Target.Raw[0] := 1.0;
+
+    // Compute initial error
+    NN.Compute(Input);
+    ErrorBefore := Abs(NN.GetLastLayer.Output.Raw[0] - Target.Raw[0]);
+
+    // Train for multiple epochs
+    for Epoch := 1 to 100 do
+    begin
+      NN.Compute(Input);
+      NN.Backpropagate(Target);
+    end;
+
+    // Compute final error
+    NN.Compute(Input);
+    ErrorAfter := Abs(NN.GetLastLayer.Output.Raw[0] - Target.Raw[0]);
+
+    // Error should decrease (learning is happening through backpropagation)
+    AssertTrue('Mish network should learn (error should decrease)',
+      (ErrorAfter < ErrorBefore) or (ErrorAfter < 0.5));
+
+  finally
+    NN.Free;
+    Input.Free;
+    Target.Free;
+  end;
+end;
+
 procedure TTestNeuralLayers.TestMaxChannel;
 var
   NN: TNNet;
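The numerical tests in the next file check the forward values against the formulas and then compare the analytic derivative stored in FOutputErrorDeriv with a central-difference estimate, f'(x) ≈ (f(x + eps) - f(x - eps)) / (2 * eps). A minimal standalone sketch of that check for GELU, using only the formulas already present in the patch (not part of the patch itself):

program GradientCheckSketch;
{$mode objfpc}
uses Math;

const
  SQRT_2_OVER_PI = 0.7978845608;
  GELU_CONST     = 0.044715;

function GELU(x: Double): Double;
begin
  Result := 0.5 * x * (1 + Tanh(SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x)));
end;

function GELUDerivative(x: Double): Double;
var
  t, cdf: Double;
begin
  // Same expression that TNNetGELU.Compute stores in FOutputErrorDeriv.
  t := Tanh(SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x));
  cdf := 0.5 * (1 + t);
  Result := cdf + 0.5 * x * (1 - t * t) * SQRT_2_OVER_PI * (1 + 3 * GELU_CONST * x * x);
end;

var
  x, eps, numerical, analytic: Double;
begin
  x := 1.0;
  eps := 0.0001;
  // Central difference: f'(x) ≈ (f(x + eps) - f(x - eps)) / (2 * eps)
  numerical := (GELU(x + eps) - GELU(x - eps)) / (2 * eps);
  analytic := GELUDerivative(x);
  // Both should be about 1.083 at x = 1; the difference should be tiny.
  WriteLn('numerical = ', numerical:0:6, '  analytic = ', analytic:0:6);
end.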
diff --git a/tests/TestNeuralNumerical.pas b/tests/TestNeuralNumerical.pas
index a280cad..34b7c16 100644
--- a/tests/TestNeuralNumerical.pas
+++ b/tests/TestNeuralNumerical.pas
@@ -34,6 +34,10 @@ TTestNeuralNumerical = class(TTestCase)
     procedure TestTanhNumericalRange;
     procedure TestSwishNumericalValues;
     procedure TestHardSwishNumericalValues;
+    procedure TestGELUNumericalValues;
+    procedure TestMishNumericalValues;
+    procedure TestGELUGradientCheck;
+    procedure TestMishGradientCheck;
     // Depthwise convolution numerical tests
     procedure TestDepthwiseConvNumerical;
@@ -728,6 +732,235 @@ procedure TTestNeuralNumerical.TestHardSwishNumericalValues;
   end;
 end;
+procedure TTestNeuralNumerical.TestGELUNumericalValues;
+var
+  NN: TNNet;
+  Input: TNNetVolume;
+  x, tanhArg, tanhVal, expected: TNeuralFloat;
+const
+  SQRT_2_OVER_PI = 0.7978845608;
+  GELU_CONST = 0.044715;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(7, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(7));
+    NN.AddLayer(TNNetGELU.Create());
+
+    // Test a range of values
+    Input.Raw[0] := 0.0;
+    Input.Raw[1] := 1.0;
+    Input.Raw[2] := -1.0;
+    Input.Raw[3] := 2.0;
+    Input.Raw[4] := -2.0;
+    Input.Raw[5] := 0.5;
+    Input.Raw[6] := -0.5;
+
+    NN.Compute(Input);
+
+    // GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715*x^3)))
+    // Test each value against the formula
+    x := 0.0;
+    tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x);
+    tanhVal := Tanh(tanhArg);
+    expected := 0.5 * x * (1 + tanhVal);
+    AssertEquals('GELU(0) should match formula', expected, NN.GetLastLayer.Output.Raw[0], 0.0001);
+
+    x := 1.0;
+    tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x);
+    tanhVal := Tanh(tanhArg);
+    expected := 0.5 * x * (1 + tanhVal);
+    AssertEquals('GELU(1) should match formula', expected, NN.GetLastLayer.Output.Raw[1], 0.0001);
+
+    x := -1.0;
+    tanhArg := SQRT_2_OVER_PI * (x + GELU_CONST * x * x * x);
+    tanhVal := Tanh(tanhArg);
+    expected := 0.5 * x * (1 + tanhVal);
+    AssertEquals('GELU(-1) should match formula', expected, NN.GetLastLayer.Output.Raw[2], 0.0001);
+
+    // Verify known approximate values
+    AssertTrue('GELU(1) ≈ 0.841', Abs(NN.GetLastLayer.Output.Raw[1] - 0.841) < 0.01);
+    AssertTrue('GELU(-1) ≈ -0.159', Abs(NN.GetLastLayer.Output.Raw[2] - (-0.159)) < 0.01);
+    AssertTrue('GELU(2) ≈ 1.955', Abs(NN.GetLastLayer.Output.Raw[3] - 1.955) < 0.01);
+
+    // Verify asymptotic behavior
+    AssertTrue('GELU approaches identity for large positive x', NN.GetLastLayer.Output.Raw[3] > 1.9);
+    AssertTrue('GELU approaches 0 for large negative x', Abs(NN.GetLastLayer.Output.Raw[4]) < 0.1);
+
+  finally
+    NN.Free;
+    Input.Free;
+  end;
+end;
+
+procedure TTestNeuralNumerical.TestMishNumericalValues;
+var
+  NN: TNNet;
+  Input: TNNetVolume;
+  x, softplus, expected: TNeuralFloat;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(7, 1, 1);
+  try
+    NN.AddLayer(TNNetInput.Create(7));
+    NN.AddLayer(TNNetMish.Create());
+
+    // Test a range of values
+    Input.Raw[0] := 0.0;
+    Input.Raw[1] := 1.0;
+    Input.Raw[2] := -1.0;
+    Input.Raw[3] := 2.0;
+    Input.Raw[4] := -2.0;
+    Input.Raw[5] := 0.5;
+    Input.Raw[6] := -0.5;
+
+    NN.Compute(Input);
+
+    // Mish(x) = x * tanh(ln(1 + exp(x)))
+    // Test each value against the formula
+    x := 0.0;
+    softplus := Ln(1 + Exp(x));
+    expected := x * Tanh(softplus);
+    AssertEquals('Mish(0) should match formula', expected, NN.GetLastLayer.Output.Raw[0], 0.0001);
+
+    x := 1.0;
+    softplus := Ln(1 + Exp(x));
+    expected := x * Tanh(softplus);
+    AssertEquals('Mish(1) should match formula', expected, NN.GetLastLayer.Output.Raw[1], 0.0001);
+
+    x := -1.0;
+    softplus := Ln(1 + Exp(x));
+    expected := x * Tanh(softplus);
+    AssertEquals('Mish(-1) should match formula', expected, NN.GetLastLayer.Output.Raw[2], 0.0001);
+
+    // Verify known approximate values
+    AssertTrue('Mish(0) = 0', Abs(NN.GetLastLayer.Output.Raw[0]) < 0.0001);
+    AssertTrue('Mish(1) ≈ 0.865', Abs(NN.GetLastLayer.Output.Raw[1] - 0.865) < 0.01);
+    AssertTrue('Mish(-1) ≈ -0.303', Abs(NN.GetLastLayer.Output.Raw[2] - (-0.303)) < 0.01);
+
+    // Verify asymptotic behavior
+    AssertTrue('Mish approaches identity for large positive x', NN.GetLastLayer.Output.Raw[3] > 1.9);
+    AssertTrue('Mish is non-monotonic for negative x',
+      Abs(NN.GetLastLayer.Output.Raw[2]) > Abs(NN.GetLastLayer.Output.Raw[4]));
+
+  finally
+    NN.Free;
+    Input.Free;
+  end;
+end;
+
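+// The two gradient checks below create the input layer with pError = 1 so that the error and
+// derivative volumes are allocated; only then does Compute() store the analytic derivative in
+// FOutputErrorDeriv, which is compared against the central-difference estimate.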
+procedure TTestNeuralNumerical.TestGELUGradientCheck;
+var
+  NN: TNNet;
+  Input, InputPlus, InputMinus: TNNetVolume;
+  epsilon: TNeuralFloat;
+  numericalGrad, analyticalGrad: TNeuralFloat;
+  i: integer;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(3, 1, 1);
+  InputPlus := TNNetVolume.Create(3, 1, 1);
+  InputMinus := TNNetVolume.Create(3, 1, 1);
+  epsilon := 0.0001;
+  try
+    NN.AddLayer(TNNetInput.Create(3, 1, 1, 1)); // pError=1 resizes error volumes
+    NN.AddLayer(TNNetGELU.Create());
+
+    Input.Raw[0] := 0.5;
+    Input.Raw[1] := -0.5;
+    Input.Raw[2] := 1.0;
+
+    // Compute forward pass to get the derivative
+    NN.Compute(Input);
+
+    // Check gradient at each input position
+    for i := 0 to 2 do
+    begin
+      // Compute f(x + epsilon)
+      InputPlus.Copy(Input);
+      InputPlus.Raw[i] := Input.Raw[i] + epsilon;
+      NN.Compute(InputPlus);
+      numericalGrad := NN.GetLastLayer.Output.Raw[i];
+
+      // Compute f(x - epsilon)
+      InputMinus.Copy(Input);
+      InputMinus.Raw[i] := Input.Raw[i] - epsilon;
+      NN.Compute(InputMinus);
+      numericalGrad := (numericalGrad - NN.GetLastLayer.Output.Raw[i]) / (2 * epsilon);
+
+      // Get analytical gradient from the layer's error derivative
+      NN.Compute(Input);
+      analyticalGrad := NN.GetLastLayer.OutputErrorDeriv.Raw[i];
+
+      // Compare numerical and analytical gradients
+      AssertTrue('GELU gradient check at position ' + IntToStr(i),
+        Abs(numericalGrad - analyticalGrad) < 0.01);
+    end;
+
+  finally
+    NN.Free;
+    Input.Free;
+    InputPlus.Free;
+    InputMinus.Free;
+  end;
+end;
+
+procedure TTestNeuralNumerical.TestMishGradientCheck;
+var
+  NN: TNNet;
+  Input, InputPlus, InputMinus: TNNetVolume;
+  epsilon: TNeuralFloat;
+  numericalGrad, analyticalGrad: TNeuralFloat;
+  i: integer;
+begin
+  NN := TNNet.Create();
+  Input := TNNetVolume.Create(3, 1, 1);
+  InputPlus := TNNetVolume.Create(3, 1, 1);
+  InputMinus := TNNetVolume.Create(3, 1, 1);
+  epsilon := 0.0001;
+  try
+    NN.AddLayer(TNNetInput.Create(3, 1, 1, 1)); // pError=1 resizes error volumes
+    NN.AddLayer(TNNetMish.Create());
+
+    Input.Raw[0] := 0.5;
+    Input.Raw[1] := -0.5;
+    Input.Raw[2] := 1.0;
+
+    // Compute forward pass to get the derivative
+    NN.Compute(Input);
+
+    // Check gradient at each input position
+    for i := 0 to 2 do
+    begin
+      // Compute f(x + epsilon)
+      InputPlus.Copy(Input);
+      InputPlus.Raw[i] := Input.Raw[i] + epsilon;
+      NN.Compute(InputPlus);
+      numericalGrad := NN.GetLastLayer.Output.Raw[i];
+
+      // Compute f(x - epsilon)
+      InputMinus.Copy(Input);
+      InputMinus.Raw[i] := Input.Raw[i] - epsilon;
+      NN.Compute(InputMinus);
+      numericalGrad := (numericalGrad - NN.GetLastLayer.Output.Raw[i]) / (2 * epsilon);
+
+      // Get analytical gradient from the layer's error derivative
+      NN.Compute(Input);
+      analyticalGrad := NN.GetLastLayer.OutputErrorDeriv.Raw[i];
+
+      // Compare numerical and analytical gradients
+      AssertTrue('Mish gradient check at position ' + IntToStr(i),
+        Abs(numericalGrad - analyticalGrad) < 0.01);
+    end;
+
+  finally
+    NN.Free;
+    Input.Free;
+    InputPlus.Free;
+    InputMinus.Free;
+  end;
+end;
+
 procedure TTestNeuralNumerical.TestDepthwiseConvNumerical;
 var
   NN: TNNet;
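For reference, once the patch is applied the new activations are registered in TNNet.CreateLayer and can be dropped into a model like any other activation layer in the API. An illustrative fragment only (layer sizes are arbitrary; assumes the usual neuralnetwork/neuralvolume units are in the uses clause):

var
  NN: TNNet;
begin
  NN := TNNet.Create();
  NN.AddLayer(TNNetInput.Create(16));
  NN.AddLayer(TNNetFullConnectLinear.Create(32));
  NN.AddLayer(TNNetGELU.Create());   // or TNNetMish.Create()
  NN.AddLayer(TNNetFullConnectLinear.Create(10));
  // ... train / evaluate as usual ...
  NN.Free;
end;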