diff --git a/alpine/ChargedParticles.hpp b/alpine/ChargedParticles.hpp
index 4c516fd48..99302498d 100644
--- a/alpine/ChargedParticles.hpp
+++ b/alpine/ChargedParticles.hpp
@@ -360,7 +360,8 @@ class ChargedParticles : public ippl::ParticleBase<PLayout> {
             if (dev > loadbalancethreshold_m) {
                 local = 1;
             }
-            MPI_Allgather(&local, 1, MPI_INT, res.data(), 1, MPI_INT, ippl::Comm->getCommunicator());
+            MPI_Allgather(&local, 1, MPI_INT, res.data(), 1, MPI_INT,
+                          ippl::Comm->getCommunicator());
 
             for (unsigned int i = 0; i < res.size(); i++) {
                 if (res[i] == 1) {
@@ -375,7 +376,8 @@ class ChargedParticles : public ippl::ParticleBase<PLayout> {
         std::vector<double> imb(ippl::Comm->size());
         double equalPart = (double)totalP / ippl::Comm->size();
         double dev       = (std::abs((double)this->getLocalNum() - equalPart) / totalP) * 100.0;
-        MPI_Gather(&dev, 1, MPI_DOUBLE, imb.data(), 1, MPI_DOUBLE, 0, ippl::Comm->getCommunicator());
+        MPI_Gather(&dev, 1, MPI_DOUBLE, imb.data(), 1, MPI_DOUBLE, 0,
+                   ippl::Comm->getCommunicator());
 
         if (ippl::Comm->rank() == 0) {
             std::stringstream fname;
@@ -605,7 +607,8 @@ class ChargedParticles : public ippl::ParticleBase<PLayout> {
         kinEnergy *= 0.5;
         double gkinEnergy = 0.0;
 
-        MPI_Reduce(&kinEnergy, &gkinEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, ippl::Comm->getCommunicator());
+        MPI_Reduce(&kinEnergy, &gkinEnergy, 1, MPI_DOUBLE, MPI_SUM, 0,
+                   ippl::Comm->getCommunicator());
 
         const int nghostE = E_m.getNghost();
         auto Eview        = E_m.getView();
diff --git a/alpine/PenningTrap.cpp b/alpine/PenningTrap.cpp
index 0bf3519bf..aeda5668c 100644
--- a/alpine/PenningTrap.cpp
+++ b/alpine/PenningTrap.cpp
@@ -164,14 +164,14 @@ int main(int argc, char* argv[]) {
         auto start            = std::chrono::high_resolution_clock::now();
         Vector_t<int, Dim> nr = {std::atoi(argv[1]), std::atoi(argv[2]), std::atoi(argv[3])};
 
-        static IpplTimings::TimerRef mainTimer           = IpplTimings::getTimer("total");
-        static IpplTimings::TimerRef particleCreation    = IpplTimings::getTimer("particlesCreation");
-        static IpplTimings::TimerRef dumpDataTimer       = IpplTimings::getTimer("dumpData");
-        static IpplTimings::TimerRef PTimer              = IpplTimings::getTimer("pushVelocity");
-        static IpplTimings::TimerRef RTimer              = IpplTimings::getTimer("pushPosition");
-        static IpplTimings::TimerRef updateTimer         = IpplTimings::getTimer("update");
-        static IpplTimings::TimerRef DummySolveTimer     = IpplTimings::getTimer("solveWarmup");
-        static IpplTimings::TimerRef SolveTimer          = IpplTimings::getTimer("Solve");
+        static IpplTimings::TimerRef mainTimer        = IpplTimings::getTimer("total");
+        static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation");
+        static IpplTimings::TimerRef dumpDataTimer    = IpplTimings::getTimer("dumpData");
+        static IpplTimings::TimerRef PTimer           = IpplTimings::getTimer("pushVelocity");
+        static IpplTimings::TimerRef RTimer           = IpplTimings::getTimer("pushPosition");
+        static IpplTimings::TimerRef updateTimer      = IpplTimings::getTimer("update");
+        static IpplTimings::TimerRef DummySolveTimer  = IpplTimings::getTimer("solveWarmup");
+        static IpplTimings::TimerRef SolveTimer       = IpplTimings::getTimer("Solve");
         static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("loadBalance");
 
         IpplTimings::startTimer(mainTimer);
@@ -179,7 +179,8 @@ int main(int argc, char* argv[]) {
         size_type totalP      = std::atol(argv[4]);
         const unsigned int nt = std::atoi(argv[5]);
 
-        msg << "Penning Trap " << endl << "nt " << nt << " Np= " << totalP << " grid = " << nr << endl;
+        msg << "Penning Trap " << endl
+            << "nt " << nt << " Np= " << totalP << " grid = " << nr << endl;
 
         using bunch_type = ChargedParticles<PLayout_t<double, Dim>, double, Dim>;
 
@@ -289,7 +290,8 @@ int main(int argc, char* argv[]) {
         size_type nloc            = (size_type)(factor * totalP);
         size_type Total_particles = 0;
 
-        MPI_Allreduce(&nloc, &Total_particles, 1, MPI_UNSIGNED_LONG, MPI_SUM, ippl::Comm->getCommunicator());
+        MPI_Allreduce(&nloc, &Total_particles, 1, MPI_UNSIGNED_LONG, MPI_SUM,
+                      ippl::Comm->getCommunicator());
 
         int rest = (int)(totalP - Total_particles);
 
@@ -300,7 +302,7 @@ int main(int argc, char* argv[]) {
         Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100 * ippl::Comm->rank()));
         Kokkos::parallel_for(
             nloc, generate_random<Vector_t<double, Dim>, Kokkos::Random_XorShift64_Pool<>, Dim>(
-                    P->R.getView(), P->P.getView(), rand_pool64, mu, sd, minU, maxU));
+                      P->R.getView(), P->P.getView(), rand_pool64, mu, sd, minU, maxU));
 
         Kokkos::fence();
         ippl::Comm->barrier();
@@ -355,7 +357,8 @@ int main(int argc, char* argv[]) {
                         -(Rview(j)[0] - 0.5 * rmax[0]) * (V0 / (2 * Kokkos::pow(rmax[2], 2)));
                     double Eext_y =
                         -(Rview(j)[1] - 0.5 * rmax[1]) * (V0 / (2 * Kokkos::pow(rmax[2], 2)));
-                    double Eext_z = (Rview(j)[2] - 0.5 * rmax[2]) * (V0 / (Kokkos::pow(rmax[2], 2)));
+                    double Eext_z =
+                        (Rview(j)[2] - 0.5 * rmax[2]) * (V0 / (Kokkos::pow(rmax[2], 2)));
 
                     Eview(j)[0] += Eext_x;
                     Eview(j)[1] += Eext_y;
@@ -410,21 +413,22 @@ int main(int argc, char* argv[]) {
                         -(R2view(j)[0] - 0.5 * rmax[0]) * (V0 / (2 * Kokkos::pow(rmax[2], 2)));
                     double Eext_y =
                         -(R2view(j)[1] - 0.5 * rmax[1]) * (V0 / (2 * Kokkos::pow(rmax[2], 2)));
-                    double Eext_z = (R2view(j)[2] - 0.5 * rmax[2]) * (V0 / (Kokkos::pow(rmax[2], 2)));
+                    double Eext_z =
+                        (R2view(j)[2] - 0.5 * rmax[2]) * (V0 / (Kokkos::pow(rmax[2], 2)));
 
                     E2view(j)[0] += Eext_x;
                     E2view(j)[1] += Eext_y;
                     E2view(j)[2] += Eext_z;
-                    P2view(j)[0] =
-                        DrInv
-                        * (P2view(j)[0]
-                        + alpha
-                                * (E2view(j)[0] + P2view(j)[1] * Bext + alpha * Bext * E2view(j)[1]));
-                    P2view(j)[1] =
-                        DrInv
-                        * (P2view(j)[1]
-                        + alpha
-                                * (E2view(j)[1] - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]));
+                    P2view(j)[0] = DrInv
+                                   * (P2view(j)[0]
+                                      + alpha
+                                            * (E2view(j)[0] + P2view(j)[1] * Bext
+                                               + alpha * Bext * E2view(j)[1]));
+                    P2view(j)[1] = DrInv
+                                   * (P2view(j)[1]
+                                      + alpha
+                                            * (E2view(j)[1] - P2view(j)[0] * Bext
+                                               - alpha * Bext * E2view(j)[0]));
                     P2view(j)[2] += alpha * E2view(j)[2];
                 });
             IpplTimings::stopTimer(PTimer);
@@ -437,7 +441,8 @@ int main(int argc, char* argv[]) {
             msg << "Finished time step: " << it + 1 << " time: " << P->time_m << endl;
 
             if (checkSignalHandler()) {
-                msg << "Aborting timestepping loop due to signal " << interruptSignalReceived << endl;
+                msg << "Aborting timestepping loop due to signal " << interruptSignalReceived
+                    << endl;
                 break;
             }
         }
diff --git a/src/Solver/P3MSolver.h b/src/Solver/P3MSolver.h
index c57299566..110cfa472 100644
--- a/src/Solver/P3MSolver.h
+++ b/src/Solver/P3MSolver.h
@@ -88,6 +88,7 @@ namespace ippl {
 
         CxField_t rhotr_m;
         CxField_t grntr_m;
+        CxField_t tempFieldComplex_m;
 
         // fields that facilitate the calculation in greensFunction()
         IField_t grnIField_m[Dim];
diff --git a/src/Solver/P3MSolver.hpp b/src/Solver/P3MSolver.hpp
index a6b730483..e0b8fd841 100644
--- a/src/Solver/P3MSolver.hpp
+++ b/src/Solver/P3MSolver.hpp
@@ -125,6 +125,7 @@ namespace ippl {
         grn_m.initialize(*mesh_mp, *layout_mp);
         rhotr_m.initialize(*meshComplex_m, *layoutComplex_m);
         grntr_m.initialize(*meshComplex_m, *layoutComplex_m);
+        tempFieldComplex_m.initialize(*meshComplex_m, *layoutComplex_m);
 
         // create the FFT object
         fft_m = std::make_unique<FFT_t>(*layout_mp, *layoutComplex_m, this->params_m);
@@ -232,19 +233,85 @@ namespace ippl {
 
         // multiply FFT(rho2)*FFT(green)
         // convolution becomes multiplication in FFT
-        rhotr_m = rhotr_m * grntr_m;
+        rhotr_m = -rhotr_m * grntr_m;
+
+        using index_array_type = typename RangePolicy<Dim>::index_array_type;
+        if ((out == Base::GRAD) || (out == Base::SOL_AND_GRAD)) {
+            // Compute gradient in Fourier space and then
+            // take inverse FFT.
+
+            const Trhs pi              = Kokkos::numbers::pi_v<Trhs>;
+            Kokkos::complex<Trhs> imag = {0.0, 1.0};
+
+            auto view               = rhotr_m.getView();
+            const int nghost        = rhotr_m.getNghost();
+            auto tempview           = tempFieldComplex_m.getView();
+            auto viewRhs            = this->rhs_mp->getView();
+            auto viewLhs            = this->lhs_mp->getView();
+            const int nghostL       = this->lhs_mp->getNghost();
+            const auto& lDomComplex = layoutComplex_m->getLocalNDIndex();
+
+            // define some member variables in local scope for the parallel_for
+            Vector_t hsize     = hr_m;
+            Vector<int, Dim> N = nr_m;
+            Vector_t origin    = mesh_mp->getOrigin();
+
+            for (size_t gd = 0; gd < Dim; ++gd) {
+                ippl::parallel_for(
+                    "Gradient FFTPeriodicPoissonSolver", getRangePolicy(view, nghost),
+                    KOKKOS_LAMBDA(const index_array_type& args) {
+                        Vector<int, Dim> iVec = args - nghost;
+
+                        for (unsigned d = 0; d < Dim; ++d) {
+                            iVec[d] += lDomComplex[d].first();
+                        }
+
+                        Vector_t kVec;
+
+                        for (size_t d = 0; d < Dim; ++d) {
+                            const Trhs Len = N[d] * hsize[d];
+                            bool shift     = (iVec[d] > (N[d] / 2));
+                            bool notMid    = (iVec[d] != (N[d] / 2));
+                            // For the noMid part see
+                            // https://math.mit.edu/~stevenj/fft-deriv.pdf Algorithm 1
+                            kVec[d] = notMid * 2 * pi / Len * (iVec[d] - shift * N[d]);
+                        }
+
+                        Trhs Dr = 0;
+                        for (unsigned d = 0; d < Dim; ++d) {
+                            Dr += kVec[d] * kVec[d];
+                        }
+
+                        apply(tempview, args) = apply(view, args);
+
+                        bool isNotZero = (Dr != 0.0);
+
+                        apply(tempview, args) *= -(isNotZero * imag * kVec[gd]);
+                    });
+
+                fft_m->transform(-1, *this->rhs_mp, tempFieldComplex_m);
+
+                ippl::parallel_for(
+                    "Assign Gradient FFTPeriodicPoissonSolver", getRangePolicy(viewLhs, nghostL),
+                    KOKKOS_LAMBDA(const index_array_type& args) {
+                        apply(viewLhs, args)[gd] = apply(viewRhs, args);
+                    });
+            }
 
-        // inverse FFT of the product and store the electrostatic potential in rho2_mr
-        fft_m->transform(-1, *(this->rhs_mp), rhotr_m);
+            // normalization is double counted due to 2 transforms
+            *(this->lhs_mp) = *(this->lhs_mp) * nr_m[0] * nr_m[1] * nr_m[2];
+            // discretization of integral requires h^3 factor
+            *(this->lhs_mp) = *(this->lhs_mp) * hr_m[0] * hr_m[1] * hr_m[2];
+        }
 
-        // normalization is double counted due to 2 transforms
-        *(this->rhs_mp) = *(this->rhs_mp) * nr_m[0] * nr_m[1] * nr_m[2];
-        // discretization of integral requires h^3 factor
-        *(this->rhs_mp) = *(this->rhs_mp) * hr_m[0] * hr_m[1] * hr_m[2];
+        if ((out == Base::SOL) || (out == Base::SOL_AND_GRAD)) {
+            // inverse FFT of the product and store the electrostatic potential in rho2_mr
+            fft_m->transform(-1, *(this->rhs_mp), rhotr_m);
 
-        // if we want gradient of phi = Efield
-        if (out == Base::SOL_AND_GRAD || out == Base::GRAD) {
-            *(this->lhs_mp) = -grad(*this->rhs_mp);
+            // normalization is double counted due to 2 transforms
+            *(this->rhs_mp) = *(this->rhs_mp) * nr_m[0] * nr_m[1] * nr_m[2];
+            // discretization of integral requires h^3 factor
+            *(this->rhs_mp) = *(this->rhs_mp) * hr_m[0] * hr_m[1] * hr_m[2];
         }
     };
 
@@ -255,6 +322,9 @@ namespace ippl {
     void P3MSolver<FieldLHS, FieldRHS>::greensFunction() {
         grn_m = 0.0;
 
+        // define pi
+        Trhs pi = Kokkos::numbers::pi_v<Trhs>;
+
         // This alpha parameter is a choice for the Green's function
         // it controls the "range" of the Green's function (e.g.
         // for the P3M collision modelling method, it indicates
@@ -274,8 +344,6 @@ namespace ippl {
         const int nghost                 = grn_m.getNghost();
         const auto& ldom                 = layout_mp->getLocalNDIndex();
 
-        constexpr Trhs ke = 2.532638e8;
-
         // Kokkos parallel for loop to find (0,0,0) point and regularize
         Kokkos::parallel_for(
             "Assign Green's function ", ippl::getRangePolicy(view, nghost),
@@ -288,7 +356,7 @@ namespace ippl {
                 const bool isOrig = (ig == 0 && jg == 0 && kg == 0);
 
                 Trhs r        = Kokkos::real(Kokkos::sqrt(view(i, j, k)));
-                view(i, j, k) = (!isOrig) * ke * (Kokkos::erf(alpha * r) / r);
+                view(i, j, k) = (!isOrig) * (-1.0 / (4.0 * pi)) * (Kokkos::erf(alpha * r) / r);
             });
 
         // perform the FFT of the Green's function for the convolution