From d1c60acd6fc20e5d68ac150536aa14540d9cc42e Mon Sep 17 00:00:00 2001 From: "Ellen M. Price" Date: Tue, 17 Dec 2024 12:32:26 -0600 Subject: [PATCH] Notation tweak, add gradient binning support --- assets/barnes_fortney_fig4.png | 4 +- assets/barnes_fortney_fig9.png | 4 +- assets/timing_versus_batman.png | 4 +- assets/validation_with_batman.png | 4 +- demos/gradients.py | 7 +- demos/spherical.py | 4 +- pyproject.toml | 2 +- setup.py | 18 +- src/greenlantern/__init__.py | 2 +- src/greenlantern/ext/ellipsoid_dual.c | 20 +- src/greenlantern/ext/greenlantern_context.c | 7 + .../ext/include/greenlantern_context.h | 1 + src/greenlantern/ext/kernels/ellipsoid.cl | 22 +- .../ext/kernels/ellipsoid_dual.cl | 246 ++++++++++++++---- 14 files changed, 254 insertions(+), 91 deletions(-) diff --git a/assets/barnes_fortney_fig4.png b/assets/barnes_fortney_fig4.png index a86f4c5..2ec6591 100644 --- a/assets/barnes_fortney_fig4.png +++ b/assets/barnes_fortney_fig4.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:946b5aca68e45768500478e681abf99c408ac4988b727b81ee221bce01723185 -size 76474 +oid sha256:792d1d6f28455d0fb02ca8ec3b2df6a422eed6e2d00d0dd10cd198186f8e7fa3 +size 76355 diff --git a/assets/barnes_fortney_fig9.png b/assets/barnes_fortney_fig9.png index e76fda7..289c535 100644 --- a/assets/barnes_fortney_fig9.png +++ b/assets/barnes_fortney_fig9.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28094b4295badc3b3453d2ccbe9afdf2466421c5158c404a1dcba47026bf9a0f -size 81400 +oid sha256:ead09a4469bc0dbbf05e5d7126c4abc64d8eb27d875e75244fd1fb7e91c50aac +size 80030 diff --git a/assets/timing_versus_batman.png b/assets/timing_versus_batman.png index 7ba1975..0c126a5 100644 --- a/assets/timing_versus_batman.png +++ b/assets/timing_versus_batman.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1175ae75cbb7e83d68009050da5bbf76ca84abf68d73c4c3541be4db0a31cf2 -size 23266 +oid sha256:cc1b41a161c12feb735763db450fbe11bd11d6cd4daec47003af7654e4adb9c7 +size 23230 diff --git a/assets/validation_with_batman.png b/assets/validation_with_batman.png index b5fffd9..e748839 100644 --- a/assets/validation_with_batman.png +++ b/assets/validation_with_batman.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab5fd8942cf894a739c42d735c4817d0d9f037830167ba4ccf2c18f9c3e72553 -size 110146 +oid sha256:573beda1b701869631d5a1deada26c7426aa309d5b99c43fc02d1fc40a162146 +size 110457 diff --git a/demos/gradients.py b/demos/gradients.py index 7a21433..10652bd 100644 --- a/demos/gradients.py +++ b/demos/gradients.py @@ -14,7 +14,7 @@ ctx = pocky.Context.default() ctx1 = greenlantern.Context(ctx) -nt = 1000 +nt = 100 time = np.linspace(-0.3, 0.3, nt) time = pocky.BufferPair(ctx, time.astype(np.float32)) time.copy_to_device() @@ -29,7 +29,10 @@ dflux = np.empty((12, nt), dtype=np.float32) dflux = pocky.BufferPair(ctx, dflux) -ctx1.ellipsoid_transit_flux_dual(time, params, flux=flux, dflux=dflux) +ctx1.ellipsoid_transit_flux_dual(time, params, flux=flux, dflux=dflux, binsize=0.01*Porb) plt.plot(time.host, dflux.host.T, lw=2) +ctx1.ellipsoid_transit_flux_dual(time, params, flux=flux, dflux=dflux) +plt.plot(time.host, dflux.host.T, lw=1, ls='dashed') + plt.show() diff --git a/demos/spherical.py b/demos/spherical.py index b832242..309c481 100644 --- a/demos/spherical.py +++ b/demos/spherical.py @@ -60,7 +60,7 @@ params = np.array([[rr, rr, rr, semimajor, t0, beta, zeta, eta, xi, q1, q2, Porb, ecc, omega]], dtype=np.float32) params = pocky.BufferPair(ctx, params) - ctx1.ellipsoid_transit_flux(time_dev, params, flux=flux, eccentric=True) + ctx1.ellipsoid_transit_flux(time_dev, params, flux=flux, eccentric=True, locked=False) axs[0].plot(time, model_flux, lw=2, ls='solid', alpha=0.6, c=f'C{i}') axs[0].plot(time, flux.host[0], lw=2, ls='dashed', alpha=1, c=f'C{i}') @@ -68,7 +68,7 @@ axs[1].scatter(time, 1e6 * (flux.host[0] - model_flux), alpha=0.05, s=8, c=f'C{i}', rasterized=True) - label = rf'$R_p / R_\star = {rr:.2f}, d_\star = {semimajor:.0f}, i = {90.-beta_deg:.0f}^\circ, u_1 = {u1:.2f}, u_2 = {u2:.2f}$' + label = rf'$R_p / R_\star = {rr:.2f}, d / R_\star = {semimajor:.0f}, i = {90.-beta_deg:.0f}^\circ, u_1 = {u1:.2f}, u_2 = {u2:.2f}$' handles.append(Line2D([0], [0], c=f'C{i}', lw=2, label=label)) axs[1].set_xlabel(r'Time $t$ (code units)') diff --git a/pyproject.toml b/pyproject.toml index 9ea432d..14ed1b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools", - "numpy<2", + "numpy", "pocky @ git+https://github.com/emprice/pocky.git@main", ] diff --git a/setup.py b/setup.py index d458039..fee00d7 100644 --- a/setup.py +++ b/setup.py @@ -16,11 +16,13 @@ def gen_kernel_paths(): yield kernel_name, kernel_input kernel_paths = list(gen_kernel_paths()) -kernel_frag_template = textwrap.dedent('''\ -const char kernel_{kernel_name}[] = R"===( -{kernel_content} -)===";\n\n''') -kernel_agg_template = textwrap.dedent('''\ + +def make_kernel_frag(kernel_name, kernel_content): + return f'const char kernel_{kernel_name}[] = \n"' + \ + '\\n"\n"'.join(kernel_content.replace('"', '\\"').split('\n')) + '\\n";\n\n' + +def make_kernel_agg(num_kernels, kernel_vars): + return textwrap.dedent(f'''\ const cl_uint num_kernel_frags = {num_kernels}; const char *kernel_frags[] = {{ {kernel_vars} }}; ''') @@ -31,12 +33,10 @@ def run(self): for kernel_name, kernel_input in kernel_paths: with open(kernel_input, 'r') as kernel_in: kernel_content = kernel_in.read() - kernel_frag = kernel_frag_template.format( - kernel_content=kernel_content, kernel_name=kernel_name) + kernel_frag = make_kernel_frag(kernel_name, kernel_content) kernel_def.write(kernel_frag) - kernel_agg = kernel_agg_template.format( - num_kernels=len(kernel_paths), + kernel_agg = make_kernel_agg(len(kernel_paths), kernel_vars=', '.join([f'kernel_{name}' for name, _ in kernel_paths])) kernel_def.write(kernel_agg) diff --git a/src/greenlantern/__init__.py b/src/greenlantern/__init__.py index 1658032..8013d22 100644 --- a/src/greenlantern/__init__.py +++ b/src/greenlantern/__init__.py @@ -1,3 +1,3 @@ from .ext import * -__version__ = '1.5' +__version__ = '1.6' diff --git a/src/greenlantern/ext/ellipsoid_dual.c b/src/greenlantern/ext/ellipsoid_dual.c index b9a8a80..e42480a 100644 --- a/src/greenlantern/ext/ellipsoid_dual.c +++ b/src/greenlantern/ext/ellipsoid_dual.c @@ -22,8 +22,8 @@ PyObject *ellipsoid_transit_flux_dual(greenlantern_context_object *context, cl_event event; cl_ushort locked; - long worksz[1], dfluxsz[2]; - size_t kernsz[1], locsz[2]; + long worksz[2], dfluxsz[2]; + size_t kernsz[2], locsz[2]; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!|$O!O!O!O!O!", keys, pocky_api->bufpair_type, &time, pocky_api->bufpair_type, ¶ms, @@ -56,6 +56,7 @@ PyObject *ellipsoid_transit_flux_dual(greenlantern_context_object *context, /* Construct the global dimensions for the kernel */ worksz[0] = PyArray_DIM((PyArrayObject *) time->host, 0); + worksz[1] = 1; /* Create a flux buffer if needed */ if ((flux == NULL) && @@ -92,7 +93,7 @@ PyObject *ellipsoid_transit_flux_dual(greenlantern_context_object *context, /* Choose the appropriate kernel */ if (!binsize) kernel = context->kernels.ellipsoid_transit_flux_dual; - else kernel = NULL; // FIXME context->kernels.ellipsoid_transit_flux_binned_dual; + else kernel = context->kernels.ellipsoid_transit_flux_binned_dual; /* Copy data to the device */ if (time->dirty == Py_True) @@ -130,13 +131,13 @@ PyObject *ellipsoid_transit_flux_dual(greenlantern_context_object *context, locked = (locked_flag == Py_True) ? 1 : 0; clSetKernelArg(kernel, 2, sizeof(cl_ushort), &locked); - locsz[0] = 32; - if (!binsize) { clSetKernelArg(kernel, 3, sizeof(cl_mem), &(flux->device)); clSetKernelArg(kernel, 4, sizeof(cl_mem), &(dflux->device)); - locsz[1] = 1; + + locsz[0] = 32; /* number of threads working on summation */ + locsz[1] = 1; /* number of points to bin together (unused) */ } else { @@ -144,13 +145,16 @@ PyObject *ellipsoid_transit_flux_dual(greenlantern_context_object *context, clSetKernelArg(kernel, 3, sizeof(cl_float), &binsize_val); clSetKernelArg(kernel, 4, sizeof(cl_mem), &(flux->device)); clSetKernelArg(kernel, 5, sizeof(cl_mem), &(dflux->device)); - locsz[1] = 17; /* number of points to bin together */ + + locsz[0] = 16; /* number of threads working on summation */ + locsz[1] = 17; /* number of points to bin together */ } kernsz[0] = worksz[0] * locsz[0]; + kernsz[1] = worksz[1] * locsz[1]; err = clEnqueueNDRangeKernel(queue, kernel, - 1, NULL, kernsz, locsz, 0, NULL, &event); + 2, NULL, kernsz, locsz, 0, NULL, &event); if (err != CL_SUCCESS) { snprintf(buf, BUFSIZ, greenlantern_ocl_fmt_internal, diff --git a/src/greenlantern/ext/greenlantern_context.c b/src/greenlantern/ext/greenlantern_context.c index 7daf7df..4b2fd65 100644 --- a/src/greenlantern/ext/greenlantern_context.c +++ b/src/greenlantern/ext/greenlantern_context.c @@ -64,6 +64,11 @@ int greenlantern_context_init(greenlantern_context_object *context, if (err != CL_SUCCESS) return -1; clRetainKernel(context->kernels.ellipsoid_transit_flux_dual); + err = pocky_api->opencl_kernel_lookup_by_name(num_kernels, kernels, + "ellipsoid_transit_flux_binned_dual", &(context->kernels.ellipsoid_transit_flux_binned_dual)); + if (err != CL_SUCCESS) return -1; + clRetainKernel(context->kernels.ellipsoid_transit_flux_binned_dual); + /* Release any remaining kernel handles */ for (idx = 0; idx < num_kernels; ++idx) clReleaseKernel(kernels[idx]); free(kernels); @@ -82,7 +87,9 @@ void greenlantern_context_dealloc(greenlantern_context_object *self) clReleaseKernel(self->kernels.ellipsoid_transit_flux_binned_vector); clReleaseKernel(self->kernels.ellipsoid_eccentric_transit_flux_vector); clReleaseKernel(self->kernels.ellipsoid_eccentric_transit_flux_binned_vector); + clReleaseKernel(self->kernels.ellipsoid_transit_flux_dual); + clReleaseKernel(self->kernels.ellipsoid_transit_flux_binned_dual); /* Release other handles */ clReleaseProgram(self->program); diff --git a/src/greenlantern/ext/include/greenlantern_context.h b/src/greenlantern/ext/include/greenlantern_context.h index 075e8f2..8a64f97 100644 --- a/src/greenlantern/ext/include/greenlantern_context.h +++ b/src/greenlantern/ext/include/greenlantern_context.h @@ -18,6 +18,7 @@ typedef struct cl_kernel ellipsoid_eccentric_transit_flux_binned_vector; cl_kernel ellipsoid_transit_flux_dual; + cl_kernel ellipsoid_transit_flux_binned_dual; } kernels; /**< Named kernel handles */ } diff --git a/src/greenlantern/ext/kernels/ellipsoid.cl b/src/greenlantern/ext/kernels/ellipsoid.cl index a37dac5..b55455e 100644 --- a/src/greenlantern/ext/kernels/ellipsoid.cl +++ b/src/greenlantern/ext/kernels/ellipsoid.cl @@ -66,24 +66,24 @@ float ellipsoid_transit_flux_locked_workgroup_body(int gid, int gsz, float2 alpha, float2 beta, ang_params_t p, float3 ax, float ds, float u1, float u2) { - float afac1 = ax.x * (beta.y * (-alpha.x * p.r3 + alpha.y * p.r2) + beta.x * p.r1); - float bfac1 = ax.y * (beta.y * ( alpha.x * p.r6 - alpha.y * p.r5) - beta.x * p.r4); - float cfac1 = ax.z * (beta.y * (-alpha.x * p.r9 + alpha.y * p.r8) + beta.x * p.r7); + float afac1 = ax.x * (beta.y * (alpha.x * p.r3 + alpha.y * p.r2) - beta.x * p.r1); + float bfac1 = ax.y * (beta.y * (alpha.x * p.r6 + alpha.y * p.r5) - beta.x * p.r4); + float cfac1 = ax.z * (beta.y * (alpha.x * p.r9 + alpha.y * p.r8) - beta.x * p.r7); - float afac2 = ax.x * ( alpha.x * p.r2 + alpha.y * p.r3); - float bfac2 = ax.y * (-alpha.x * p.r5 - alpha.y * p.r6); - float cfac2 = ax.z * ( alpha.x * p.r8 + alpha.y * p.r9); + float afac2 = ax.x * (alpha.y * p.r3 - alpha.x * p.r2); + float bfac2 = ax.y * (alpha.y * p.r6 - alpha.x * p.r5); + float cfac2 = ax.z * (alpha.y * p.r9 - alpha.x * p.r8); - float abfac = ax.x * ax.y * (beta.y * p.r7 + beta.x * (alpha.x * p.r9 - alpha.y * p.r8)); - float acfac = ax.x * ax.z * (beta.y * p.r4 + beta.x * (alpha.x * p.r6 - alpha.y * p.r5)); - float bcfac = ax.y * ax.z * (beta.y * p.r1 + beta.x * (alpha.x * p.r3 - alpha.y * p.r2)); + float abfac = ax.x * ax.y * (beta.y * p.r7 + beta.x * (alpha.x * p.r9 + alpha.y * p.r8)); + float acfac = ax.x * ax.z * (beta.y * p.r4 + beta.x * (alpha.x * p.r6 + alpha.y * p.r5)); + float bcfac = ax.y * ax.z * (beta.y * p.r1 + beta.x * (alpha.x * p.r3 + alpha.y * p.r2)); float dx = sqrt(afac1 * afac1 + bfac1 * bfac1 + cfac1 * cfac1); float dy1 = (afac1 * afac2 + bfac1 * bfac2 + cfac1 * cfac2) / dx; float dy2 = sqrt(abfac * abfac + acfac * acfac + bcfac * bcfac) / dx; - float x0 = -ds * alpha.x * beta.y; + float x0 = ds * alpha.x * beta.y; float y0 = ds * alpha.y; float3 Ival = 0; @@ -133,7 +133,7 @@ float ellipsoid_transit_flux_unlocked_workgroup_body(int gid, int gsz, float dy2 = sqrt(abfac * abfac + acfac * acfac + bcfac * bcfac) / dx; float x0 = ds * alpha.x * beta.y; - float y0 = -ds * alpha.y; + float y0 = ds * alpha.y; float3 Ival = 0; diff --git a/src/greenlantern/ext/kernels/ellipsoid_dual.cl b/src/greenlantern/ext/kernels/ellipsoid_dual.cl index 20820bc..4bbf250 100644 --- a/src/greenlantern/ext/kernels/ellipsoid_dual.cl +++ b/src/greenlantern/ext/kernels/ellipsoid_dual.cl @@ -204,28 +204,28 @@ dual_flux_t ellipsoid_transit_flux_dual_locked_workgroup_body(int gid, float diff_abfac[NP1], diff_acfac[NP1], diff_bcfac[NP1]; /* derivatives w.r.t. a, b, c */ - diff_afac1[0] = beta.y * (-alpha.x * p.r3 + alpha.y * p.r2) + beta.x * p.r1; - diff_bfac1[1] = beta.y * ( alpha.x * p.r6 - alpha.y * p.r5) - beta.x * p.r4; - diff_cfac1[2] = beta.y * (-alpha.x * p.r9 + alpha.y * p.r8) + beta.x * p.r7; + diff_afac1[0] = beta.y * (alpha.x * p.r3 + alpha.y * p.r2) - beta.x * p.r1; + diff_bfac1[1] = beta.y * (alpha.x * p.r6 + alpha.y * p.r5) - beta.x * p.r4; + diff_cfac1[2] = beta.y * (alpha.x * p.r9 + alpha.y * p.r8) - beta.x * p.r7; diff_afac1[1] = diff_afac1[2] = 0; diff_bfac1[0] = diff_bfac1[2] = 0; diff_cfac1[0] = diff_cfac1[1] = 0; - diff_afac2[0] = alpha.x * p.r2 + alpha.y * p.r3; - diff_bfac2[1] = -alpha.x * p.r5 - alpha.y * p.r6; - diff_cfac2[2] = alpha.x * p.r8 + alpha.y * p.r9; + diff_afac2[0] = alpha.y * p.r3 - alpha.x * p.r2; + diff_bfac2[1] = alpha.y * p.r6 - alpha.x * p.r5; + diff_cfac2[2] = alpha.y * p.r9 - alpha.x * p.r8; diff_afac2[1] = diff_afac2[2] = 0; diff_bfac2[0] = diff_bfac2[2] = 0; diff_cfac2[0] = diff_cfac2[1] = 0; - diff_abfac[0] = ax.y * (beta.y * p.r7 + beta.x * (alpha.x * p.r9 - alpha.y * p.r8)); - diff_abfac[1] = ax.x * (beta.y * p.r7 + beta.x * (alpha.x * p.r9 - alpha.y * p.r8)); - diff_acfac[0] = ax.z * (beta.y * p.r4 + beta.x * (alpha.x * p.r6 - alpha.y * p.r5)); - diff_acfac[2] = ax.x * (beta.y * p.r4 + beta.x * (alpha.x * p.r6 - alpha.y * p.r5)); - diff_bcfac[1] = ax.z * (beta.y * p.r1 + beta.x * (alpha.x * p.r3 - alpha.y * p.r2)); - diff_bcfac[2] = ax.y * (beta.y * p.r1 + beta.x * (alpha.x * p.r3 - alpha.y * p.r2)); + diff_abfac[0] = ax.y * (beta.y * p.r7 + beta.x * (alpha.x * p.r9 + alpha.y * p.r8)); + diff_abfac[1] = ax.x * (beta.y * p.r7 + beta.x * (alpha.x * p.r9 + alpha.y * p.r8)); + diff_acfac[0] = ax.z * (beta.y * p.r4 + beta.x * (alpha.x * p.r6 + alpha.y * p.r5)); + diff_acfac[2] = ax.x * (beta.y * p.r4 + beta.x * (alpha.x * p.r6 + alpha.y * p.r5)); + diff_bcfac[1] = ax.z * (beta.y * p.r1 + beta.x * (alpha.x * p.r3 + alpha.y * p.r2)); + diff_bcfac[2] = ax.y * (beta.y * p.r1 + beta.x * (alpha.x * p.r3 + alpha.y * p.r2)); diff_abfac[2] = diff_acfac[1] = diff_bcfac[0] = 0; @@ -248,44 +248,44 @@ dual_flux_t ellipsoid_transit_flux_dual_locked_workgroup_body(int gid, diff_abfac[3] = diff_acfac[3] = diff_bcfac[3] = 0; /* derivatives w.r.t. alpha */ - diff_afac1[4] = ax.x * (beta.y * ( alpha.y * p.r3 + alpha.x * p.r2) + beta.x * p.r1); - diff_bfac1[4] = ax.y * (beta.y * (-alpha.y * p.r6 - alpha.x * p.r5) - beta.x * p.r4); - diff_cfac1[4] = ax.z * (beta.y * ( alpha.y * p.r9 + alpha.x * p.r8) + beta.x * p.r7); + diff_afac1[4] = ax.x * (beta.y * (-alpha.y * p.r3 + alpha.x * p.r2) - beta.x * p.r1); + diff_bfac1[4] = ax.y * (beta.y * (-alpha.y * p.r6 + alpha.x * p.r5) - beta.x * p.r4); + diff_cfac1[4] = ax.z * (beta.y * (-alpha.y * p.r9 + alpha.x * p.r8) - beta.x * p.r7); - diff_afac2[4] = ax.x * (-alpha.y * p.r2 + alpha.x * p.r3); - diff_bfac2[4] = ax.y * ( alpha.y * p.r5 - alpha.x * p.r6); - diff_cfac2[4] = ax.z * (-alpha.y * p.r8 + alpha.x * p.r9); + diff_afac2[4] = ax.x * (alpha.x * p.r3 + alpha.y * p.r2); + diff_bfac2[4] = ax.y * (alpha.x * p.r6 + alpha.y * p.r5); + diff_cfac2[4] = ax.z * (alpha.x * p.r9 + alpha.y * p.r8); - diff_abfac[4] = ax.x * ax.y * (beta.y * p.r7 + beta.x * (-alpha.y * p.r9 - alpha.x * p.r8)); - diff_acfac[4] = ax.x * ax.z * (beta.y * p.r4 + beta.x * (-alpha.y * p.r6 - alpha.x * p.r5)); - diff_bcfac[4] = ax.y * ax.z * (beta.y * p.r1 + beta.x * (-alpha.y * p.r3 - alpha.x * p.r2)); + diff_abfac[4] = ax.x * ax.y * (beta.y * p.r7 + beta.x * (-alpha.y * p.r9 + alpha.x * p.r8)); + diff_acfac[4] = ax.x * ax.z * (beta.y * p.r4 + beta.x * (-alpha.y * p.r6 + alpha.x * p.r5)); + diff_bcfac[4] = ax.y * ax.z * (beta.y * p.r1 + beta.x * (-alpha.y * p.r3 + alpha.x * p.r2)); /* derivatives w.r.t. beta */ - diff_afac1[5] = ax.x * (beta.x * (-alpha.x * p.r3 + alpha.y * p.r2) - beta.y * p.r1); - diff_bfac1[5] = ax.y * (beta.x * ( alpha.x * p.r6 - alpha.y * p.r5) + beta.y * p.r4); - diff_cfac1[5] = ax.z * (beta.x * (-alpha.x * p.r9 + alpha.y * p.r8) - beta.y * p.r7); + diff_afac1[5] = ax.x * (beta.x * (alpha.x * p.r3 + alpha.y * p.r2) + beta.y * p.r1); + diff_bfac1[5] = ax.y * (beta.x * (alpha.x * p.r6 + alpha.y * p.r5) + beta.y * p.r4); + diff_cfac1[5] = ax.z * (beta.x * (alpha.x * p.r9 + alpha.y * p.r8) + beta.y * p.r7); diff_afac2[5] = diff_bfac2[5] = diff_cfac2[5] = 0; - diff_abfac[5] = ax.x * ax.y * (beta.x * p.r7 - beta.y * (alpha.x * p.r9 - alpha.y * p.r8)); - diff_acfac[5] = ax.x * ax.z * (beta.x * p.r4 - beta.y * (alpha.x * p.r6 - alpha.y * p.r5)); - diff_bcfac[5] = ax.y * ax.z * (beta.x * p.r1 - beta.y * (alpha.x * p.r3 - alpha.y * p.r2)); + diff_abfac[5] = ax.x * ax.y * (beta.x * p.r7 - beta.y * (alpha.x * p.r9 + alpha.y * p.r8)); + diff_acfac[5] = ax.x * ax.z * (beta.x * p.r4 - beta.y * (alpha.x * p.r6 + alpha.y * p.r5)); + diff_bcfac[5] = ax.y * ax.z * (beta.x * p.r1 - beta.y * (alpha.x * p.r3 + alpha.y * p.r2)); /* derivatives w.r.t. zeta, eta, xi */ __attribute__((opencl_unroll_hint)) for (int i = 0; i < 3; ++i) { - diff_afac1[i+6] = ax.x * (beta.y * (-alpha.x * p.dr3[i] + alpha.y * p.dr2[i]) + beta.x * p.dr1[i]); - diff_bfac1[i+6] = ax.y * (beta.y * ( alpha.x * p.dr6[i] - alpha.y * p.dr5[i]) - beta.x * p.dr4[i]); - diff_cfac1[i+6] = ax.z * (beta.y * (-alpha.x * p.dr9[i] + alpha.y * p.dr8[i]) + beta.x * p.dr7[i]); + diff_afac1[i+6] = ax.x * (beta.y * (alpha.x * p.dr3[i] + alpha.y * p.dr2[i]) - beta.x * p.dr1[i]); + diff_bfac1[i+6] = ax.y * (beta.y * (alpha.x * p.dr6[i] + alpha.y * p.dr5[i]) - beta.x * p.dr4[i]); + diff_cfac1[i+6] = ax.z * (beta.y * (alpha.x * p.dr9[i] + alpha.y * p.dr8[i]) - beta.x * p.dr7[i]); - diff_afac2[i+6] = ax.x * ( alpha.x * p.dr2[i] + alpha.y * p.dr3[i]); - diff_bfac2[i+6] = ax.y * (-alpha.x * p.dr5[i] - alpha.y * p.dr6[i]); - diff_cfac2[i+6] = ax.z * ( alpha.x * p.dr8[i] + alpha.y * p.dr9[i]); + diff_afac2[i+6] = ax.x * (alpha.y * p.dr3[i] - alpha.x * p.dr2[i]); + diff_bfac2[i+6] = ax.y * (alpha.y * p.dr6[i] - alpha.x * p.dr5[i]); + diff_cfac2[i+6] = ax.z * (alpha.y * p.dr9[i] - alpha.x * p.dr8[i]); - diff_abfac[i+6] = ax.x * ax.y * (beta.y * p.dr7[i] + beta.x * (alpha.x * p.dr9[i] - alpha.y * p.dr8[i])); - diff_acfac[i+6] = ax.x * ax.z * (beta.y * p.dr4[i] + beta.x * (alpha.x * p.dr6[i] - alpha.y * p.dr5[i])); - diff_bcfac[i+6] = ax.y * ax.z * (beta.y * p.dr1[i] + beta.x * (alpha.x * p.dr3[i] - alpha.y * p.dr2[i])); + diff_abfac[i+6] = ax.x * ax.y * (beta.y * p.dr7[i] + beta.x * (alpha.x * p.dr9[i] + alpha.y * p.dr8[i])); + diff_acfac[i+6] = ax.x * ax.z * (beta.y * p.dr4[i] + beta.x * (alpha.x * p.dr6[i] + alpha.y * p.dr5[i])); + diff_bcfac[i+6] = ax.y * ax.z * (beta.y * p.dr1[i] + beta.x * (alpha.x * p.dr3[i] + alpha.y * p.dr2[i])); } q.dx = sqrt(afac1 * afac1 + bfac1 * bfac1 + cfac1 * cfac1); @@ -308,12 +308,12 @@ dual_flux_t ellipsoid_transit_flux_dual_locked_workgroup_body(int gid, bcfac * diff_bcfac[i]) / dy2_tmp / q.dx - q.dy2 * q.diff_dx[i] / q.dx; } - q.x0 = -ds * alpha.x * beta.y; - q.y0 = ds * alpha.y; + q.x0 = ds * alpha.x * beta.y; + q.y0 = ds * alpha.y; - q.diff_x0[3] = -alpha.x * beta.y; - q.diff_x0[4] = ds * alpha.y * beta.y; - q.diff_x0[5] = -ds * alpha.x * beta.x; + q.diff_x0[3] = alpha.x * beta.y; + q.diff_x0[4] = -ds * alpha.y * beta.y; + q.diff_x0[5] = ds * alpha.x * beta.x; q.diff_y0[3] = alpha.y; q.diff_y0[4] = ds * alpha.x; @@ -478,15 +478,15 @@ dual_flux_t ellipsoid_transit_flux_dual_unlocked_workgroup_body(int gid, bcfac * diff_bcfac[i]) / dy2_tmp / q.dx - q.dy2 * q.diff_dx[i] / q.dx; } - q.x0 = ds * alpha.x * beta.y; - q.y0 = -ds * alpha.y; + q.x0 = ds * alpha.x * beta.y; + q.y0 = ds * alpha.y; - q.diff_x0[3] = alpha.x * beta.y; + q.diff_x0[3] = alpha.x * beta.y; q.diff_x0[4] = -ds * alpha.y * beta.y; q.diff_x0[5] = ds * alpha.x * beta.x; - q.diff_y0[3] = -alpha.y; - q.diff_y0[4] = -ds * alpha.x; + q.diff_y0[3] = alpha.y; + q.diff_y0[4] = ds * alpha.x; q.diff_x0[0] = q.diff_x0[1] = q.diff_x0[2] = q.diff_x0[6] = q.diff_x0[7] = q.diff_x0[8] = 0; @@ -652,7 +652,6 @@ kernel void ellipsoid_transit_flux_dual(global const float * restrict time, { if (alpha.x > 0) { - local_dflux[0] = Ival_tot.deriv[0]; /* dIval / da */ local_dflux[1] = Ival_tot.deriv[1]; /* dIval / db */ local_dflux[2] = Ival_tot.deriv[2]; /* dIval / dc */ @@ -680,15 +679,164 @@ kernel void ellipsoid_transit_flux_dual(global const float * restrict time, { flux[sid] = 1; __attribute__((opencl_unroll_hint)) - for (int i = 0; i < LDA; ++i) - dflux[i*ssz+sid] = 0; + for (int i = 0; i < LDA; ++i) local_dflux[i] = 0; } } + work_group_barrier(CLK_LOCAL_MEM_FENCE); for (int off = gid; off < LDA; off += gsz) dflux[off*ssz+sid] = -M_1_PI * local_dflux[off]; } +kernel void ellipsoid_transit_flux_binned_dual(global const float * restrict time, + global const float *params, ushort locked, float dt_bin, global float *flux, + global float * restrict dflux) +{ + int sid = get_global_id(0); /* sample index */ + int ssz = get_global_size(0); /* number of samples */ + + int gid1 = get_local_id(0); /* workgroup dim 0, summation */ + int gsz1 = get_local_size(0); + + int gid2 = get_local_id(1); /* workgroup dim 1, binning */ + int gsz2 = get_local_size(1); + + sid /= gsz1; ssz /= gsz1; + + local float local_params[LDA]; + local float u1, u2, ds, alpha_ang_mid, dalpha_bin; + local float2 beta, zeta, eta, xi; + local float3 ax; + local float porb, q1, sqrt_q1, q2; + + /* pre-load the parameters for this work group */ + for (int off = gid1; (gid2 == 0) && (off < LDA); off += gsz1) + local_params[off] = params[off]; + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + if ((gid1 == 0) && (gid2 == 0)) + { + float a = local_params[0]; /* semiaxis along x */ + float b = local_params[1]; /* semiaxis along y */ + float c = local_params[2]; /* semiaxis along z */ + ax = (float3)(a, b, c); + + ds = local_params[3]; /* distance from ellipsoid to disk */ + porb = local_params[11]; /* orbital period */ + float t0 = local_params[4]; /* midtransit offset in time */ + + float t = time[sid]; + float this_alpha = 2 * M_PI * (t / porb); + float alpha0 = 2 * M_PI * (t0 / porb); + dalpha_bin = 2 * M_PI * (dt_bin / porb); + + alpha_ang_mid = this_alpha - alpha0; + + { + /* precompute trig for beta */ + float beta_ang = local_params[5]; /* complement of inclination */ + float sin_beta, cos_beta; + sin_beta = sincos(beta_ang, &cos_beta); + beta = (float2)(cos_beta, sin_beta); + } + + { + /* precompute trig for zeta */ + float zeta_ang = local_params[6]; /* orientation angle 1 */ + float sin_zeta, cos_zeta; + sin_zeta = sincos(zeta_ang, &cos_zeta); + zeta = (float2)(cos_zeta, sin_zeta); + } + + { + /* precompute trig for eta */ + float eta_ang = local_params[7]; /* orientation angle 2 */ + float sin_eta, cos_eta; + sin_eta = sincos(eta_ang, &cos_eta); + eta = (float2)(cos_eta, sin_eta); + } + + { + /* precompute trig for xi */ + float xi_ang = local_params[8]; /* orientation angle 3 */ + float sin_xi, cos_xi; + sin_xi = sincos(xi_ang, &cos_xi); + xi = (float2)(cos_xi, sin_xi); + } + + /* convert q limb darkening to u limb darkening */ + q1 = local_params[9]; /* limb darkening q1 */ + q2 = local_params[10]; /* limb darkening q2 */ + sqrt_q1 = sqrt(q1); + u1 = 2 * sqrt_q1 * q2; + u2 = sqrt_q1 * (1 - 2 * q2); + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + float h_alpha = dalpha_bin / (gsz2 - 1); + float alpha_ang = alpha_ang_mid + + (gid2 - (gsz2 - 1) / 2) * h_alpha; + + /* precompute trig for alpha */ + float sin_alpha, cos_alpha; + sin_alpha = sincos(alpha_ang, &cos_alpha); + float2 alpha = (float2)(cos_alpha, sin_alpha); + + dual_ang_params_t p = fill_dual_angular_parameters(zeta, eta, xi); + dual_flux_t Ival = (locked) ? + ellipsoid_transit_flux_dual_locked_workgroup_body(gid1, gsz1, alpha, beta, p, ax, ds, u1, u2) : + ellipsoid_transit_flux_dual_unlocked_workgroup_body(gid1, gsz1, alpha, beta, p, ax, ds, u1, u2); + + float local_dflux[LDA]; + + if (alpha.x > 0) + { + local_dflux[0] = Ival.deriv[0]; /* dIval / da */ + local_dflux[1] = Ival.deriv[1]; /* dIval / db */ + local_dflux[2] = Ival.deriv[2]; /* dIval / dc */ + local_dflux[3] = Ival.deriv[3]; /* dIval / dds */ + + /* dIval / dt0 from dIval / dalpha */ + local_dflux[4] = -2 * M_PI * Ival.deriv[4] / porb; + + local_dflux[5] = Ival.deriv[5]; /* dIval / dbeta */ + local_dflux[6] = Ival.deriv[6]; /* dIval / dzeta */ + local_dflux[7] = Ival.deriv[7]; /* dIval / deta */ + local_dflux[8] = Ival.deriv[8]; /* dIval / dxi */ + + /* dIval / dq1 and dIval / dq2 */ + local_dflux[9] = (sqrt_q1 == 0) ? 0 : ((q2 * Ival.deriv[9] + + 0.5 * (1 - 2 * q2) * Ival.deriv[10]) / sqrt_q1); + local_dflux[10] = 2 * sqrt_q1 * (Ival.deriv[9] - Ival.deriv[10]); + + /* dIval / dporb from dIval / dalpha */ + local_dflux[11] = -alpha_ang * Ival.deriv[4] / porb; + } + else + { + Ival.val = 0; + __attribute__((opencl_unroll_hint)) + for (int i = 0; i < LDA; ++i) local_dflux[i] = 0; + } + + float weight = ((gid2 == 0) || (gid2 == gsz2 - 1)) ? 1 : ((gid2 & 0x1) ? 4 : 2); + weight /= 3 * (gsz2 - 1); + + Ival.val *= weight; + __attribute__((opencl_unroll_hint)) + for (int i = 0; i < NP1 + 2; ++i) local_dflux[i] *= weight; + + dual_flux_t Ival_tot; + Ival_tot.val = work_group_reduce_add(Ival.val); + __attribute__((opencl_unroll_hint)) + for (int i = 0; i < NP1 + 2; ++i) + Ival_tot.deriv[i] = work_group_reduce_add(local_dflux[i]); + + if ((gid1 == 0) && (gid2 == 0)) flux[sid] = 1 - M_1_PI * Ival_tot.val; + for (int off = gid1; (gid2 == 0) && (off < LDA); off += gsz1) + dflux[off*ssz+sid] = -M_1_PI * Ival_tot.deriv[off]; +} + #undef NP1 #undef LDA #undef NMAX