mapping_acc.impala

// Derives the launch configuration for a stencil from its mask range.
//
// Takes the mask range in x and y (as a pair) and the coarsening factor.
// Returns ((threads_x, threads_y, 1), (blocks_x, blocks_y)): the thread-block
// shape and, per dimension, the number of such blocks needed to cover the
// mask range.
fn @compute_config((range_mask_x: i32, range_mask_y: i32), coarsening_factor: i32) -> ((i32, i32, i32), (i32, i32)) {
    let max_threads_y = 4;   // must be 1 for CPU on Mac OS X
    let max_threads_x = 128; // use 256 for AMD GPUs; 512 for Intel MIC

    // Picks the block height: `limit` whenever the vertical range exceeds 1,
    // otherwise the first n produced by unroll(1, limit + 1) for which
    // n * coarsening_factor reaches the vertical range (falling back to `limit`).
    fn @pick_threads_y(limit: i32) -> i32 {
        if range_mask_y <= 1 {
            for n in unroll(1, limit + 1) {
                if n * coarsening_factor >= range_mask_y { return(n) }
            }
        }
        limit
    }

    let threads_y = pick_threads_y(max_threads_y);
    // the x extent gets whatever is left of the thread budget
    let threads_x = max_threads_x / threads_y;

    // round_up presumably rounds to the next multiple of its second argument,
    // which makes these ceiling divisions - confirm against its definition
    let block_shape = (threads_x, threads_y, 1);
    let block_count = (round_up(range_mask_x, threads_x) / threads_x,
                       round_up(range_mask_y, threads_y) / threads_y);

    (block_shape, block_count)
}

// Returns a writer for `img`: the closure stores `val` at linear index `idx`
// of the image buffer, reinterpreted as an address-space-1 array of T.
fn @set_pixel_fn[T](img: Img) = @|idx: i32, val: T| {
    let pixels = bitcast[&mut addrspace(1)[T]](img.buf.data);
    pixels(idx) = val;
};
// Returns a reader for `img`: the closure loads the element at linear index
// `idx` of the image buffer, reinterpreted as an address-space-1 array of T.
fn @get_pixel_fn[T](img: Img) = @|idx: i32| {
    let pixels = bitcast[&addrspace(1)[T]](img.buf.data);
    pixels(idx)
};
// Returns a writer for an address-space-3 array: the closure stores `val`
// at index `idx` of `data`.
fn @set_pixel_shared_fn[T](data: &mut addrspace(3)[T]) = @|idx: i32, val: T| {
    data(idx) = val;
};
// Returns a reader for an address-space-3 array: the closure loads the
// element at index `idx` of `data`.
fn @get_pixel_shared_fn[T](data: &addrspace(3)[T]) = @|idx: i32| {
    data(idx)
};
// Reader handed to the boundary-handling accessors. It currently just forwards
// every load to the reader produced by get_pixel_fn; the ldg-based variant is
// kept only as commented-out code.
fn @get_pixel_ldg_fn[T](img: Img) = @|idx: i32| get_pixel_fn[T](img)(idx);
//fn @get_ldg_fn[T]() = bitcast[fn(&addrspace(1)T) -> T, fn(&addrspace(1)f32) -> f32](if is_nvvm() { nvvm_ldg_f32 } else { cuda_ldg_f32 })
//fn @get_pixel_ldg_fn[T](img: Img) = if has_ldg() { @|idx: i32| get_ldg_fn[T]()(&bitcast[&addrspace(1)[T]](img.buf.data)(idx)) } else { get_pixel_fn[T](img) }

// Builds the basic stencil driver for the accelerator.
//
// The returned closure allocates device images for `out` and every image in
// `imgs`, uploads the inputs, runs `body` once per work item (times the
// coarsening factor) in a single kernel over the whole output, and finally
// releases the inputs, copies the result back into `out` and releases the
// device output.
//
// body               - stencil, called as body(x, y, out_acc, accs)
// bh_lower, bh_upper - boundary functions forwarded to get_acc_bh for every
//                      input image that has a non-zero extent
fn @iteration[T](body: StencilFn[T]) = @|out: Img, imgs: ImgList, bh_lower: BoundaryFn[T], bh_upper: BoundaryFn[T]| {
    let device   = accelerator(device_id);
    let max_ext  = get_max_ext(imgs);
    let dev_out  = alloc_img[T](out, device.alloc);
    let dev_imgs = for host_img, _ext in img_list_map(imgs) {
        let dev_img = alloc_img[T](host_img, device.alloc);
        copy_img(host_img, dev_img);
        dev_img
    };

    let coarsening_factor = 1;
    let grid       = (out.width, out.height / coarsening_factor, 1);
    let (block, _) = compute_config(max_ext, coarsening_factor);

    for benchmark_acc(device) {
        for item in device.exec(grid, block) {
            let rows_per_block = item.bdimy();
            let x  = item.tidx() + item.bdimx() * item.bidx();
            let y0 = item.tidy() + item.bdimy() * item.bidy() * coarsening_factor;

            // the region is not known here, hence Boundary::Unknown for
            // every image that needs boundary handling
            let inputs = for img, (ext_x, ext_y) in img_list2acc_map[T](dev_imgs) {
                if ext_x != 0 || ext_y != 0 {
                    get_acc_bh[T](img, set_pixel_fn[T](img), get_pixel_ldg_fn[T](img), (Boundary::Unknown, Boundary::Unknown), bh_lower, bh_upper)
                } else {
                    get_acc[T](img, set_pixel_fn[T](img), get_pixel_fn[T](img))
                }
            };
            let output = get_acc[T](dev_out, set_pixel_fn[T](dev_out), get_pixel_fn[T](dev_out));

            // each work item handles coarsening_factor rows, one block height apart
            for step in unroll(0, coarsening_factor) {
                @body(x, y0 + step * rows_per_block, output, inputs);
            }
        }
    }

    for idx in range(0, dev_imgs.size) {
        release(dev_imgs.get(idx).buf);
    }
    copy_img(dev_out, out);
    release(dev_out.buf);
};

// Builds the stencil driver that specializes boundary handling per region.
//
// Same data movement as the basic driver (allocate, upload, run, download,
// release), but the input accessors are created with a concrete region
// instead of Boundary::Unknown. Two strategies exist, chosen by the local
// `big_kernel` flag:
//   - big kernel:  one launch over the whole image; every block picks its
//                  region from its block index
//   - otherwise:   one launch per region delivered by iterate_2dregion
//
// body               - stencil, called as body(x, y, out_acc, accs)
// bh_lower, bh_upper - boundary functions forwarded to get_acc_bh
fn @iteration_bounds[T](body: StencilFn[T]) = @|out: Img, imgs: ImgList, bh_lower: BoundaryFn[T], bh_upper: BoundaryFn[T]| {
    let device   = accelerator(device_id);
    let max_ext  = get_max_ext(imgs);
    let dev_out  = alloc_img[T](out, device.alloc);
    let dev_imgs = for host_img, _ext in img_list_map(imgs) {
        let dev_img = alloc_img[T](host_img, device.alloc);
        copy_img(host_img, dev_img);
        dev_img
    };

    // (bx, by): number of blocks required for boundary handling
    let coarsening_factor = 1;
    let (block as (block_x, block_y, _), (bx, by)) = compute_config(max_ext, coarsening_factor);

    // define if we want to generate one big kernel or multiple kernels
    let big_kernel = false;

    if big_kernel {
        let grid = (out.width, out.height / coarsening_factor, 1);

        for benchmark_acc(device) {
            for item in device.exec(grid, block) {
                let bdim_y  = item.bdimy();
                let bid_x   = item.bidx();
                let bid_y   = item.bidy();
                let nblk_x  = item.nblkx();
                let nblk_y  = item.nblky();
                let gid_x   = item.tidx() + item.bdimx() * item.bidx();
                let gid_y   = item.tidy() + item.bdimy() * item.bidy() * coarsening_factor;
                let out_acc = get_acc[T](dev_out, set_pixel_fn[T](dev_out), get_pixel_fn[T](dev_out));

                // input accessors specialized for one region
                fn @accs_for(region: Region) -> AccList[T] {
                    for img, (ext_x, ext_y) in img_list2acc_map[T](dev_imgs) {
                        if ext_x != 0 || ext_y != 0 {
                            get_acc_bh[T](img, set_pixel_fn[T](img), get_pixel_ldg_fn[T](img), region, bh_lower, bh_upper)
                        } else {
                            get_acc[T](img, set_pixel_fn[T](img), get_pixel_fn[T](img))
                        }
                    }
                }

                // runs the (coarsened) stencil with accessors for `region`;
                // every call site below passes a constant region
                fn @run_region(region: Region) -> () {
                    for i in unroll(0, coarsening_factor) {
                        @body(gid_x, gid_y + i * bdim_y, out_acc, accs_for(region));
                    }
                }

                let at_left   = bid_x < bx;
                let at_right  = bid_x >= nblk_x - bx;
                let at_top    = bid_y < by;
                let at_bottom = bid_y >= nblk_y - by;

                // priorities: top before bottom; within a row band left before
                // right; in the middle band right before left
                if at_top {
                    if at_left {
                        run_region((Boundary::Lower, Boundary::Lower));   // top-left
                    } else if at_right {
                        run_region((Boundary::Upper, Boundary::Lower));   // top-right
                    } else {
                        run_region((Boundary::Center, Boundary::Lower));  // top
                    }
                } else if at_bottom {
                    if at_left {
                        run_region((Boundary::Lower, Boundary::Upper));   // bottom-left
                    } else if at_right {
                        run_region((Boundary::Upper, Boundary::Upper));   // bottom-right
                    } else {
                        run_region((Boundary::Center, Boundary::Upper));  // bottom
                    }
                } else if at_right {
                    run_region((Boundary::Upper, Boundary::Center));      // right
                } else if at_left {
                    run_region((Boundary::Lower, Boundary::Center));      // left
                } else {
                    run_region((Boundary::Center, Boundary::Center));     // center
                }
            }
        }
    } else {
        let span_lower = (0, 0);
        let span_upper = (out.width, out.height / coarsening_factor);
        let border     = (bx * block_x, by * block_y);

        for ((x_lower, x_upper), (y_lower, y_upper)), region in iterate_2dregion(span_lower, span_upper, border) {
            // one launch per region, sized to that region only
            let grid = (x_upper - x_lower, y_upper - y_lower, 1);

            let inputs = for img, (ext_x, ext_y) in img_list2acc_map[T](dev_imgs) {
                if ext_x != 0 || ext_y != 0 {
                    get_acc_bh[T](img, set_pixel_fn[T](img), get_pixel_ldg_fn[T](img), region, bh_lower, bh_upper)
                } else {
                    get_acc[T](img, set_pixel_fn[T](img), get_pixel_fn[T](img))
                }
            };
            let output = get_acc[T](dev_out, set_pixel_fn[T](dev_out), get_pixel_fn[T](dev_out));

            for benchmark_acc(device) {
                for item in device.exec(grid, block) {
                    let bdim_y = item.bdimy();
                    // shift the launch-local index by the region's lower bounds
                    let gid_x  = x_lower                     + item.tidx() + item.bdimx() * item.bidx();
                    let gid_y  = y_lower * coarsening_factor + item.tidy() + item.bdimy() * item.bidy() * coarsening_factor;

                    for i in unroll(0, coarsening_factor) {
                        @body(gid_x, gid_y + i * bdim_y, output, inputs);
                    }
                }
            }
        }
    }

    for idx in range(0, dev_imgs.size) {
        release(dev_imgs.get(idx).buf);
    }
    copy_img(dev_out, out);
    release(dev_out.buf);
};

// Builds the stencil driver that stages input tiles through address-space-3
// scratchpads obtained from reserve_shared.
//
// The returned closure allocates device images for `out` and every image in
// `imgs`, uploads the inputs and launches one kernel over the whole output.
// Inside the kernel, every image with a non-zero extent gets a scratchpad of
// (block_x + 2 * ext_x) x (coarsening_factor * block_y + 2 * ext_y) elements
// that the block fills cooperatively through a boundary-handling accessor.
// After a barrier, `body` runs in block-local coordinates: images with an
// extent are read from their scratchpad, all others directly from the device
// image via an offset accessor. Finally the inputs are released, the result
// is copied back into `out`, and the device output is released.
//
// body               - stencil, called as body(tid_x, tid_y, out_acc, accs)
// bh_lower, bh_upper - boundary functions forwarded to get_acc_bh while staging
fn @iteration_advanced[T](body: StencilFn[T]) = @|out: Img, imgs: ImgList, bh_lower: BoundaryFn[T], bh_upper: BoundaryFn[T]| {
    let acc = accelerator(device_id);
    let ext = get_max_ext(imgs);
    let out_gpu  = alloc_img[T](out, acc.alloc);
    let imgs_gpu = for img, _ext in img_list_map(imgs) {
        let img_gpu = alloc_img[T](img, acc.alloc);
        copy_img(img, img_gpu);
        img_gpu
    };

    let coarsening_factor = 1;
    let (block as (block_x, block_y, _), _) = compute_config(ext, coarsening_factor);
    let grid = (out.width, out.height / coarsening_factor, 1);

    for benchmark_acc(acc) {
        for work_item in acc.exec(grid, block) {
            let tid_x  = work_item.tidx();
            let tid_y  = work_item.tidy();
            let bid_x  = work_item.bidx();
            let bid_y  = work_item.bidy();
            let bdim_x = work_item.bdimx();
            let bdim_y = work_item.bdimy();
            let gid_x  = work_item.tidx() + work_item.bdimx() * work_item.bidx();
            let gid_y  = work_item.tidy() + work_item.bdimy() * work_item.bidy() * coarsening_factor;

            // quick hack: lists have at max 3 images
            // per-image scratchpad and its dimensions, indexed by image position;
            // only entries of images with a non-zero extent are ever written
            let mut spms : [&mut addrspace(3)[T] * 3];
            let mut spms_stride : [i32 * 3];
            let mut spms_height : [i32 * 3];

            for i in unroll(0, imgs_gpu.size) {
                let img = imgs_gpu.get(i);
                let (ext_x, ext_y) = imgs_gpu.ext(i);
                if ext_x != 0 || ext_y != 0 {
                    // tile of the block plus a halo of ext_x / ext_y on each side
                    let spm_stride =                     block_x + 2 * ext_x;
                    let spm_height = coarsening_factor * block_y + 2 * ext_y;
                    let spm = reserve_shared[T](spm_stride * spm_height);
                    spms(i) = spm;
                    spms_stride(i) = spm_stride;
                    spms_height(i) = spm_height;

                    // compute number of steps required to stage data to shared memory
                    // x: fixed at two block widths; y: coarsened rows plus
                    // ceil(2 * ext_y / block_y) extra rows of blocks for the halo
                    let steps_x   = 2;
                    let offset_y  = if (2 * ext_y) % block_y == 0 { 0 } else { 1 };
                    let steps_y   = coarsening_factor + (2 * ext_y) / block_y + offset_y;

                    for y in unroll(0, steps_y) {
                        // lid_*: position inside the scratchpad, idx_*: position in
                        // the image, shifted up/left by the extent
                        let lid_y = tid_y         + y*bdim_y;
                        let idx_y = gid_y - ext_y + y*bdim_y;
                        for x in unroll(0, steps_x) {
                            let lid_x = tid_x         + x*bdim_x;
                            let idx_x = gid_x - ext_x + x*bdim_x;

                            // skip positions that fall outside the scratchpad
                            if lid_x < spm_stride && lid_y < spm_height {
                                let gpu_acc = get_acc_bh[T](img, set_pixel_fn[T](img), get_pixel_ldg_fn[T](img), (Boundary::Unknown, Boundary::Unknown), bh_lower, bh_upper); // TODO: set region!
                                let spm_acc = get_acc_memory[T](set_pixel_shared_fn[T](spm), get_pixel_shared_fn[T](spm), spm_stride, spm_height);
                                spm_acc.write(lid_x, lid_y, gpu_acc.read(idx_x, idx_y));
                            }
                        }
                    }
                }
            }

            // all staging writes are issued before any work item reads a scratchpad
            acc.barrier();

            for i in unroll(0, coarsening_factor) {
                // Position of the current image within imgs_gpu. It is restarted
                // for every coarsening step: a counter shared across steps would
                // keep growing and index spms/spms_stride/spms_height past the
                // image count as soon as coarsening_factor > 1.
                // NOTE(review): assumes img_list2acc_map visits the images in
                // the same order as imgs_gpu.get(0..size) - confirm.
                let mut id = -1;

                // index space: block
                let accs = for img, (ext_x, ext_y) in img_list2acc_map[T](imgs_gpu) {
                    id++;
                    if ext_x == 0 && ext_y == 0 {
                        get_acc_offset[T](img, set_pixel_fn[T](img), get_pixel_fn[T](img), bdim_x * bid_x, bdim_y * bid_y * coarsening_factor + i * bdim_y)
                    } else {
                        get_acc_offset_memory[T](set_pixel_shared_fn[T](spms(id)), get_pixel_shared_fn[T](spms(id)), spms_stride(id), spms_height(id), ext_x, ext_y + i * bdim_y)
                    }
                };
                let out_acc = get_acc_offset[T](out_gpu, set_pixel_fn[T](out_gpu), get_pixel_fn[T](out_gpu), bdim_x * bid_x, bdim_y * bid_y * coarsening_factor + i * bdim_y);
                @body(tid_x, tid_y, out_acc, accs);
            }
        }
    }

    for i in range(0, imgs_gpu.size) {
        release(imgs_gpu.get(i).buf);
    }
    copy_img(out_gpu, out);
    release(out_gpu.buf);
};

// Builds the driver for separable stencils without region specialization.
//
// The returned closure uploads `arr`, then launches two kernels over the whole
// output: the first applies `mask_col` to the input and writes a temporary
// device image, the second applies `mask_row` to that temporary and writes the
// device output. The result is copied back into `out` and all three device
// buffers are released.
//
// body               - separable stencil, called as
//                      body(x, y, dst_acc, src_acc, mask, is_row)
// bh_lower, bh_upper - boundary functions forwarded to get_acc_bh
fn @iteration_sep[T](body: StencilSepFn[T]) = @|out: Img, arr: Img, mask_row: MaskSep, mask_col: MaskSep, bh_lower: BoundaryFn[T], bh_upper: BoundaryFn[T]| {
    let device  = accelerator(device_id);
    let dev_arr = alloc_img[T](arr, device.alloc);
    let dev_out = alloc_img[T](out, device.alloc);
    let dev_tmp = alloc_img[T](out, device.alloc);
    copy_img(arr, dev_arr);

    let coarsening_factor = 1;
    let (block_row, _) = compute_config((mask_row.size / 2, 1), coarsening_factor);
    let (block_col, _) = compute_config((1, mask_col.size / 2), coarsening_factor);
    let grid = (out.width, out.height / coarsening_factor, 1);

    // pass 1: column mask, dev_arr -> dev_tmp
    for benchmark_acc(device) {
        for item in device.exec(grid, block_col) {
            let rows_per_block = item.bdimy();
            let x  = item.tidx() + item.bdimx() * item.bidx();
            let y0 = item.tidy() + item.bdimy() * item.bidy() * coarsening_factor;

            let dst = get_acc[T](dev_tmp, set_pixel_fn[T](dev_tmp), get_pixel_fn[T](dev_tmp));
            let src = get_acc_bh[T](dev_arr, set_pixel_fn[T](dev_arr), get_pixel_ldg_fn[T](dev_arr), (Boundary::Unknown, Boundary::Unknown), bh_lower, bh_upper);

            for step in unroll(0, coarsening_factor) {
                @body(x, y0 + step * rows_per_block, dst, src, mask_col, false);
            }
        }
    }

    // pass 2: row mask, dev_tmp -> dev_out
    for benchmark_acc(device) {
        for item in device.exec(grid, block_row) {
            let rows_per_block = item.bdimy();
            let x  = item.tidx() + item.bdimx() * item.bidx();
            let y0 = item.tidy() + item.bdimy() * item.bidy() * coarsening_factor;

            let dst = get_acc[T](dev_out, set_pixel_fn[T](dev_out), get_pixel_fn[T](dev_out));
            let src = get_acc_bh[T](dev_tmp, set_pixel_fn[T](dev_tmp), get_pixel_ldg_fn[T](dev_tmp), (Boundary::Unknown, Boundary::Unknown), bh_lower, bh_upper);

            for step in unroll(0, coarsening_factor) {
                @body(x, y0 + step * rows_per_block, dst, src, mask_row, true);
            }
        }
    }

    copy_img(dev_out, out);
    release(dev_arr.buf);
    release(dev_out.buf);
    release(dev_tmp.buf);
};

// Builds the driver for separable stencils with per-region boundary handling.
//
// Data flow: `arr` is uploaded, the column mask is applied to it into a
// temporary device image, the row mask is applied to the temporary into the
// device output, which is then copied back into `out`; all three device
// buffers are released at the end. The local `big_kernel` flag selects how
// regions are handled:
//   - big kernel:  one launch per pass; every block picks its region from its
//                  block index (column pass: bid_y, row pass: bid_x)
//   - otherwise:   one launch per region delivered by iterate_1dregion
//
// body               - separable stencil, called as
//                      body(x, y, dst_acc, src_acc, mask, is_row)
// bh_lower, bh_upper - boundary functions forwarded to get_acc_bh
fn @iteration_sep_bounds[T](body: StencilSepFn[T]) = @|out: Img, arr: Img, mask_row: MaskSep, mask_col: MaskSep, bh_lower: BoundaryFn[T], bh_upper: BoundaryFn[T]| {
    let device  = accelerator(device_id);
    let dev_arr = alloc_img[T](arr, device.alloc);
    let dev_out = alloc_img[T](out, device.alloc);
    let dev_tmp = alloc_img[T](out, device.alloc);
    copy_img(arr, dev_arr);

    // bx / by: number of blocks required for boundary handling in x / y
    let coarsening_factor = 1;
    let (block_row as (block_row_x, _block_row_y, _), (bx, _)) = compute_config((mask_row.size / 2, 1), coarsening_factor);
    let (block_col as (_block_col_x, block_col_y, _), (_, by)) = compute_config((1, mask_col.size / 2), coarsening_factor);

    // define if we want to generate one big kernel or multiple kernels
    let big_kernel = false;

    if big_kernel {
        let grid = (out.width, out.height / coarsening_factor, 1);

        // column component: dev_arr -> dev_tmp
        for benchmark_acc(device) {
            let tmp_dst = get_acc[T](dev_tmp, set_pixel_fn[T](dev_tmp), get_pixel_fn[T](dev_tmp));
            for item in device.exec(grid, block_col) {
                let bdim_y = item.bdimy();
                let bid_y  = item.bidy();
                let nblk_y = item.nblky();
                let gid_x  = item.tidx() + item.bdimx() * item.bidx();
                let gid_y  = item.tidy() + item.bdimy() * item.bidy() * coarsening_factor;

                // column pass with the input accessor specialized for `region`
                fn @run_col(region: Region) -> () {
                    let arr_src = get_acc_bh[T](dev_arr, set_pixel_fn[T](dev_arr), get_pixel_ldg_fn[T](dev_arr), region, bh_lower, bh_upper);
                    for i in unroll(0, coarsening_factor) {
                        @body(gid_x, gid_y + i * bdim_y, tmp_dst, arr_src, mask_col, false);
                    }
                }

                if bid_y < by {
                    run_col((Boundary::Center, Boundary::Lower));   // top
                } else if bid_y >= nblk_y - by {
                    run_col((Boundary::Center, Boundary::Upper));   // bottom
                } else {
                    run_col((Boundary::Center, Boundary::Center));  // center
                }
            }
        }

        // row component: dev_tmp -> dev_out
        for benchmark_acc(device) {
            for item in device.exec(grid, block_row) {
                let bdim_y  = item.bdimy();
                let bid_x   = item.bidx();
                let nblk_x  = item.nblkx();
                let gid_x   = item.tidx() + item.bdimx() * item.bidx();
                let gid_y   = item.tidy() + item.bdimy() * item.bidy() * coarsening_factor;
                let out_dst = get_acc[T](dev_out, set_pixel_fn[T](dev_out), get_pixel_fn[T](dev_out));

                // row pass with the temporary's accessor specialized for `region`
                fn @run_row(region: Region) -> () {
                    let tmp_src = get_acc_bh[T](dev_tmp, set_pixel_fn[T](dev_tmp), get_pixel_ldg_fn[T](dev_tmp), region, bh_lower, bh_upper);
                    for i in unroll(0, coarsening_factor) {
                        @body(gid_x, gid_y + i * bdim_y, out_dst, tmp_src, mask_row, true);
                    }
                }

                if bid_x < bx {
                    run_row((Boundary::Lower, Boundary::Center));   // left
                } else if bid_x >= nblk_x - bx {
                    run_row((Boundary::Upper, Boundary::Center));   // right
                } else {
                    run_row((Boundary::Center, Boundary::Center));  // center
                }
            }
        }
    } else {
        // column component: one launch per vertical region, dev_arr -> dev_tmp
        for (y_lower, y_upper), boundary_col in iterate_1dregion(0, out.height / coarsening_factor, by * block_col_y) {
            let grid = (out.width, y_upper - y_lower, 1);

            let tmp_dst = get_acc[T](dev_tmp, set_pixel_fn[T](dev_tmp), get_pixel_fn[T](dev_tmp));
            let arr_src = get_acc_bh[T](dev_arr, set_pixel_fn[T](dev_arr), get_pixel_ldg_fn[T](dev_arr), (Boundary::Center, boundary_col), bh_lower, bh_upper);

            for benchmark_acc(device) {
                for item in device.exec(grid, block_col) {
                    let bdim_y = item.bdimy();
                    let gid_x  =                               item.tidx() + item.bdimx() * item.bidx();
                    // shift the launch-local row by the region's lower bound
                    let gid_y  = y_lower * coarsening_factor + item.tidy() + item.bdimy() * item.bidy() * coarsening_factor;

                    for i in unroll(0, coarsening_factor) {
                        @body(gid_x, gid_y + i * bdim_y, tmp_dst, arr_src, mask_col, false);
                    }
                }
            }
        }

        // row component: one launch per horizontal region, dev_tmp -> dev_out
        for (x_lower, x_upper), boundary_row in iterate_1dregion(0, out.width, bx * block_row_x) {
            let grid = (x_upper - x_lower, out.height / coarsening_factor, 1);

            let out_dst = get_acc[T](dev_out, set_pixel_fn[T](dev_out), get_pixel_fn[T](dev_out));
            let tmp_src = get_acc_bh[T](dev_tmp, set_pixel_fn[T](dev_tmp), get_pixel_ldg_fn[T](dev_tmp), (boundary_row, Boundary::Center), bh_lower, bh_upper);

            for benchmark_acc(device) {
                for item in device.exec(grid, block_row) {
                    let bdim_y = item.bdimy();
                    // shift the launch-local column by the region's lower bound
                    let gid_x  = x_lower + item.tidx() + item.bdimx() * item.bidx();
                    let gid_y  =           item.tidy() + item.bdimy() * item.bidy() * coarsening_factor;

                    for i in unroll(0, coarsening_factor) {
                        @body(gid_x, gid_y + i * bdim_y, out_dst, tmp_src, mask_row, true);
                    }
                }
            }
        }
    }

    copy_img(dev_out, out);
    release(dev_arr.buf);
    release(dev_out.buf);
    release(dev_tmp.buf);
};

// Builds the driver for separable stencils that runs both passes in a single
// kernel, staging data through two scratchpads obtained from reserve_shared.
//
// Per block the kernel
//   1. copies the block's tile of `arr` plus a halo of range_row columns and
//      range_col rows on each side into spm_col,
//   2. applies the column mask from spm_col into spm_row - for the tile itself
//      and for the left and right halo columns,
//   3. applies the row mask from spm_row into the device output.
// Barriers separate the three phases. Afterwards the result is copied back
// into `out` and both device buffers are released.
//
// body               - separable stencil, called as
//                      body(x, y, dst_acc, src_acc, mask, is_row)
// bh_lower, bh_upper - boundary functions forwarded to get_acc_bh while staging
fn @iteration_sep_advanced[T](body: StencilSepFn[T]) = @|out: Img, arr: Img, mask_row: MaskSep, mask_col: MaskSep, bh_lower: BoundaryFn[T], bh_upper: BoundaryFn[T]| {
    let acc = accelerator(device_id);
    let arr_gpu = alloc_img[T](arr, acc.alloc);
    let out_gpu = alloc_img[T](out, acc.alloc);
    copy_img(arr, arr_gpu);

    let coarsening_factor = 1;
    let (block as (block_x, block_y, _), _) = compute_config((mask_row.size / 2, mask_col.size / 2), coarsening_factor);
    let grid = (out.width, out.height / coarsening_factor, 1);

    // compute number of steps required to stage data to shared memory
    // x: fixed at two block widths; y: coarsened rows plus
    // ceil((mask_col.size - 1) / block_y) extra rows of blocks for the halo
    let range_row = mask_row.size / 2;
    let range_col = mask_col.size / 2;
    let offset_y  = if (mask_col.size-1) % block_y == 0 { 0 } else { 1 };
    let steps_x   = 2;
    let steps_y   = coarsening_factor + (mask_col.size-1) / block_y + offset_y;

    for benchmark_acc(acc) {
        for work_item in acc.exec(grid, block) {
            let tid_x  = work_item.tidx();
            let tid_y  = work_item.tidy();
            let bid_x  = work_item.bidx();
            let bid_y  = work_item.bidy();
            let bdim_x = work_item.bdimx();
            let bdim_y = work_item.bdimy();
            let gid_x  = work_item.tidx() + work_item.bdimx() * work_item.bidx();
            let gid_y  = work_item.tidy() + work_item.bdimy() * work_item.bidy() * coarsening_factor;

            // spm_col: tile plus halo in both dimensions (input of the column pass)
            // spm_row: same width but without the vertical halo (output of the
            //          column pass, input of the row pass)
            let spm_stride     =                     block_x + 2 * range_row;
            let spm_height_col = coarsening_factor * block_y + 2 * range_col;
            let spm_height_row = coarsening_factor * block_y;
            let spm_col = reserve_shared[T](spm_stride * spm_height_col);
            let spm_row = reserve_shared[T](spm_stride * spm_height_row);

            // phase 1: stage the tile and its halo from arr_gpu into spm_col
            for y in unroll(0, steps_y) {
                // lid_*: position inside spm_col, idx_*: position in the image,
                // shifted up/left by the mask range
                let lid_y = tid_y             + y*bdim_y;
                let idx_y = gid_y - range_col + y*bdim_y;
                for x in unroll(0, steps_x) {
                    let lid_x = tid_x             + x*bdim_x;
                    let idx_x = gid_x - range_row + x*bdim_x;

                    // skip positions that fall outside the scratchpad
                    if lid_x < spm_stride && lid_y < spm_height_col {
                        let gpu_acc = get_acc_bh[T](arr_gpu, set_pixel_fn[T](arr_gpu), get_pixel_ldg_fn[T](arr_gpu), (Boundary::Unknown, Boundary::Unknown), bh_lower, bh_upper); // TODO: set region!
                        let spm_acc = get_acc_memory[T](set_pixel_shared_fn[T](spm_col), get_pixel_shared_fn[T](spm_col), spm_stride, spm_height_col);
                        spm_acc.write(lid_x, lid_y, gpu_acc.read(idx_x, idx_y));
                    }
                }
            }

            // spm_col must be completely staged before the column pass reads it
            acc.barrier();

            // phase 2: column pass, spm_col -> spm_row
            for i in unroll(0, coarsening_factor) {
                let is_row = false;
                {
                    // index space: block
                    // accessors are offset by range_row columns, i.e. past the left halo
                    let out_acc = get_acc_offset_memory[T](set_pixel_shared_fn[T](spm_row), get_pixel_shared_fn[T](spm_row), spm_stride, spm_height_row, range_row,             i * bdim_y);
                    let arr_acc = get_acc_offset_memory[T](set_pixel_shared_fn[T](spm_col), get_pixel_shared_fn[T](spm_col), spm_stride, spm_height_col, range_row, range_col + i * bdim_y);
                    @body(tid_x, tid_y, out_acc, arr_acc, mask_col, is_row);
                }
                if tid_x < range_row {
                    // left halo
                    // the first range_row work items of a row additionally process
                    // the columns at x offset 0
                    let out_acc = get_acc_offset_memory[T](set_pixel_shared_fn[T](spm_row), get_pixel_shared_fn[T](spm_row), spm_stride, spm_height_row, 0,             i * bdim_y);
                    let arr_acc = get_acc_offset_memory[T](set_pixel_shared_fn[T](spm_col), get_pixel_shared_fn[T](spm_col), spm_stride, spm_height_col, 0, range_col + i * bdim_y);
                    @body(tid_x, tid_y, out_acc, arr_acc, mask_col, is_row);
                }
                if tid_x >= bdim_x - range_row {
                    // right halo
                    // the last range_row work items of a row additionally process
                    // the columns at x offset 2 * range_row
                    let out_acc = get_acc_offset_memory[T](set_pixel_shared_fn[T](spm_row), get_pixel_shared_fn[T](spm_row), spm_stride, spm_height_row, range_row + range_row,             i * bdim_y);
                    let arr_acc = get_acc_offset_memory[T](set_pixel_shared_fn[T](spm_col), get_pixel_shared_fn[T](spm_col), spm_stride, spm_height_col, range_row + range_row, range_col + i * bdim_y);
                    @body(tid_x, tid_y, out_acc, arr_acc, mask_col, is_row);
                }
            }

            // spm_row must be completely written before the row pass reads it
            acc.barrier();

            // phase 3: row pass, spm_row -> out_gpu
            for i in unroll(0, coarsening_factor) {
                // index space: block
                // out_acc is offset to the block's origin in the output image,
                // arr_acc past the left halo of spm_row
                let is_row  = true;
                let out_acc =        get_acc_offset[T](out_gpu, set_pixel_fn[T](out_gpu), get_pixel_fn[T](out_gpu), bdim_x * bid_x, bdim_y * bid_y * coarsening_factor + i * bdim_y);
                let arr_acc = get_acc_offset_memory[T](set_pixel_shared_fn[T](spm_row), get_pixel_shared_fn[T](spm_row), spm_stride, spm_height_row, range_row, i * bdim_y);
                @body(tid_x, tid_y, out_acc, arr_acc, mask_row, is_row);
            }
        }
    }

    copy_img(out_gpu, out);
    release(arr_gpu.buf);
    release(out_gpu.buf);
};