diff --git a/src/cdef.rs b/src/cdef.rs index 1d60e92b1..dc6d62efa 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -5,6 +5,7 @@ use crate::include::common::bitdepth::LeftPixelRow2px; use crate::include::common::intops::apply_sign; use crate::include::common::intops::iclip; use crate::include::common::intops::ulog2; +use crate::src::cpu::CpuFlags; use crate::src::tables::dav1d_cdef_directions; use bitflags::bitflags; use libc::ptrdiff_t; @@ -12,12 +13,6 @@ use std::cmp; use std::ffi::c_int; use std::ffi::c_uint; -#[cfg(feature = "asm")] -use cfg_if::cfg_if; - -#[cfg(feature = "asm")] -use crate::src::cpu::{rav1d_get_cpu_flags, CpuFlags}; - #[cfg(feature = "asm")] use crate::include::common::bitdepth::BPC; @@ -1030,98 +1025,6 @@ unsafe fn cdef_find_dir_rust( return best_dir; } -#[inline(always)] -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64"),))] -unsafe fn cdef_dsp_init_x86(c: *mut Rav1dCdefDSPContext) { - let flags = rav1d_get_cpu_flags(); - - match BD::BPC { - BPC::BPC8 => { - if !flags.contains(CpuFlags::SSE2) { - return; - } - - (*c).fb[0] = dav1d_cdef_filter_8x8_8bpc_sse2; - (*c).fb[1] = dav1d_cdef_filter_4x8_8bpc_sse2; - (*c).fb[2] = dav1d_cdef_filter_4x4_8bpc_sse2; - - if !flags.contains(CpuFlags::SSSE3) { - return; - } - - (*c).dir = dav1d_cdef_dir_8bpc_ssse3; - (*c).fb[0] = dav1d_cdef_filter_8x8_8bpc_ssse3; - (*c).fb[1] = dav1d_cdef_filter_4x8_8bpc_ssse3; - (*c).fb[2] = dav1d_cdef_filter_4x4_8bpc_ssse3; - - if !flags.contains(CpuFlags::SSE41) { - return; - } - - (*c).dir = dav1d_cdef_dir_8bpc_sse4; - (*c).fb[0] = dav1d_cdef_filter_8x8_8bpc_sse4; - (*c).fb[1] = dav1d_cdef_filter_4x8_8bpc_sse4; - (*c).fb[2] = dav1d_cdef_filter_4x4_8bpc_sse4; - - #[cfg(target_arch = "x86_64")] - { - if !flags.contains(CpuFlags::AVX2) { - return; - } - - (*c).dir = dav1d_cdef_dir_8bpc_avx2; - (*c).fb[0] = dav1d_cdef_filter_8x8_8bpc_avx2; - (*c).fb[1] = dav1d_cdef_filter_4x8_8bpc_avx2; - (*c).fb[2] = dav1d_cdef_filter_4x4_8bpc_avx2; - - if !flags.contains(CpuFlags::AVX512ICL) { - return; - } - - (*c).fb[0] = dav1d_cdef_filter_8x8_8bpc_avx512icl; - (*c).fb[1] = dav1d_cdef_filter_4x8_8bpc_avx512icl; - (*c).fb[2] = dav1d_cdef_filter_4x4_8bpc_avx512icl; - } - } - BPC::BPC16 => { - if !flags.contains(CpuFlags::SSSE3) { - return; - } - - (*c).dir = dav1d_cdef_dir_16bpc_ssse3; - (*c).fb[0] = dav1d_cdef_filter_8x8_16bpc_ssse3; - (*c).fb[1] = dav1d_cdef_filter_4x8_16bpc_ssse3; - (*c).fb[2] = dav1d_cdef_filter_4x4_16bpc_ssse3; - - if !flags.contains(CpuFlags::SSE41) { - return; - } - - (*c).dir = dav1d_cdef_dir_16bpc_sse4; - - #[cfg(target_arch = "x86_64")] - { - if !flags.contains(CpuFlags::AVX2) { - return; - } - - (*c).dir = dav1d_cdef_dir_16bpc_avx2; - (*c).fb[0] = dav1d_cdef_filter_8x8_16bpc_avx2; - (*c).fb[1] = dav1d_cdef_filter_4x8_16bpc_avx2; - (*c).fb[2] = dav1d_cdef_filter_4x4_16bpc_avx2; - - if !flags.contains(CpuFlags::AVX512ICL) { - return; - } - - (*c).fb[0] = dav1d_cdef_filter_8x8_16bpc_avx512icl; - (*c).fb[1] = dav1d_cdef_filter_4x8_16bpc_avx512icl; - (*c).fb[2] = dav1d_cdef_filter_4x4_16bpc_avx512icl; - } - } - }; -} - #[inline(always)] #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64"),))] unsafe extern "C" fn cdef_filter_8x8_neon_erased( @@ -1274,37 +1177,150 @@ unsafe extern "C" fn cdef_filter_4x4_neon_erased( } } -#[inline(always)] -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64"),))] -unsafe fn cdef_dsp_init_arm(c: *mut Rav1dCdefDSPContext) { - let flags = rav1d_get_cpu_flags(); +impl Rav1dCdefDSPContext { + pub const fn default() -> Self { + Self { + dir: cdef_find_dir_c_erased::, + fb: [ + cdef_filter_block_8x8_c_erased::, + cdef_filter_block_4x8_c_erased::, + cdef_filter_block_4x4_c_erased::, + ], + } + } + + #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] + #[inline(always)] + const fn init_x86(mut self, flags: CpuFlags) -> Self { + match BD::BPC { + BPC::BPC8 => { + if !flags.contains(CpuFlags::SSE2) { + return self; + } + + self.fb[0] = dav1d_cdef_filter_8x8_8bpc_sse2; + self.fb[1] = dav1d_cdef_filter_4x8_8bpc_sse2; + self.fb[2] = dav1d_cdef_filter_4x4_8bpc_sse2; + + if !flags.contains(CpuFlags::SSSE3) { + return self; + } + + self.dir = dav1d_cdef_dir_8bpc_ssse3; + self.fb[0] = dav1d_cdef_filter_8x8_8bpc_ssse3; + self.fb[1] = dav1d_cdef_filter_4x8_8bpc_ssse3; + self.fb[2] = dav1d_cdef_filter_4x4_8bpc_ssse3; + + if !flags.contains(CpuFlags::SSE41) { + return self; + } + + self.dir = dav1d_cdef_dir_8bpc_sse4; + self.fb[0] = dav1d_cdef_filter_8x8_8bpc_sse4; + self.fb[1] = dav1d_cdef_filter_4x8_8bpc_sse4; + self.fb[2] = dav1d_cdef_filter_4x4_8bpc_sse4; + + #[cfg(target_arch = "x86_64")] + { + if !flags.contains(CpuFlags::AVX2) { + return self; + } + + self.dir = dav1d_cdef_dir_8bpc_avx2; + self.fb[0] = dav1d_cdef_filter_8x8_8bpc_avx2; + self.fb[1] = dav1d_cdef_filter_4x8_8bpc_avx2; + self.fb[2] = dav1d_cdef_filter_4x4_8bpc_avx2; + + if !flags.contains(CpuFlags::AVX512ICL) { + return self; + } + + self.fb[0] = dav1d_cdef_filter_8x8_8bpc_avx512icl; + self.fb[1] = dav1d_cdef_filter_4x8_8bpc_avx512icl; + self.fb[2] = dav1d_cdef_filter_4x4_8bpc_avx512icl; + } + } + BPC::BPC16 => { + if !flags.contains(CpuFlags::SSSE3) { + return self; + } + + self.dir = dav1d_cdef_dir_16bpc_ssse3; + self.fb[0] = dav1d_cdef_filter_8x8_16bpc_ssse3; + self.fb[1] = dav1d_cdef_filter_4x8_16bpc_ssse3; + self.fb[2] = dav1d_cdef_filter_4x4_16bpc_ssse3; + + if !flags.contains(CpuFlags::SSE41) { + return self; + } - if !flags.contains(CpuFlags::NEON) { - return; + self.dir = dav1d_cdef_dir_16bpc_sse4; + + #[cfg(target_arch = "x86_64")] + { + if !flags.contains(CpuFlags::AVX2) { + return self; + } + + self.dir = dav1d_cdef_dir_16bpc_avx2; + self.fb[0] = dav1d_cdef_filter_8x8_16bpc_avx2; + self.fb[1] = dav1d_cdef_filter_4x8_16bpc_avx2; + self.fb[2] = dav1d_cdef_filter_4x4_16bpc_avx2; + + if !flags.contains(CpuFlags::AVX512ICL) { + return self; + } + + self.fb[0] = dav1d_cdef_filter_8x8_16bpc_avx512icl; + self.fb[1] = dav1d_cdef_filter_4x8_16bpc_avx512icl; + self.fb[2] = dav1d_cdef_filter_4x4_16bpc_avx512icl; + } + } + }; + + self } - (*c).dir = match BD::BPC { - BPC::BPC8 => dav1d_cdef_find_dir_8bpc_neon, - BPC::BPC16 => dav1d_cdef_find_dir_16bpc_neon, - }; - (*c).fb[0] = cdef_filter_8x8_neon_erased::; - (*c).fb[1] = cdef_filter_4x8_neon_erased::; - (*c).fb[2] = cdef_filter_4x4_neon_erased::; -} + #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] + #[inline(always)] + const fn init_arm(mut self, flags: CpuFlags) -> Self { + if !flags.contains(CpuFlags::NEON) { + return self; + } + + self.dir = match BD::BPC { + BPC::BPC8 => dav1d_cdef_find_dir_8bpc_neon, + BPC::BPC16 => dav1d_cdef_find_dir_16bpc_neon, + }; + self.fb[0] = cdef_filter_8x8_neon_erased::; + self.fb[1] = cdef_filter_4x8_neon_erased::; + self.fb[2] = cdef_filter_4x4_neon_erased::; -#[cold] -pub unsafe fn rav1d_cdef_dsp_init(c: *mut Rav1dCdefDSPContext) { - (*c).dir = cdef_find_dir_c_erased::; - (*c).fb[0] = cdef_filter_block_8x8_c_erased::; - (*c).fb[1] = cdef_filter_block_4x8_c_erased::; - (*c).fb[2] = cdef_filter_block_4x4_c_erased::; + self + } - #[cfg(feature = "asm")] - cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - cdef_dsp_init_x86::(c); - } else if #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] { - cdef_dsp_init_arm::(c); + #[inline(always)] + const fn init(self, flags: CpuFlags) -> Self { + #[cfg(feature = "asm")] + { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + return self.init_x86::(flags); + } + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + { + return self.init_arm::(flags); + } + } + + #[allow(unreachable_code)] // Reachable on some #[cfg]s. + { + let _ = flags; + self } } + + pub const fn new(flags: CpuFlags) -> Self { + Self::default::().init::(flags) + } } diff --git a/src/cdef_apply.rs b/src/cdef_apply.rs index 7489c5541..9659dac82 100644 --- a/src/cdef_apply.rs +++ b/src/cdef_apply.rs @@ -171,7 +171,6 @@ pub(crate) unsafe fn rav1d_cdef_brow( BPC::BPC8 => 0, BPC::BPC16 => f.cur.p.bpc - 8, }; - let dsp = &*f.dsp; let mut edges: CdefEdgeFlags = if by_start > 0 { CdefEdgeFlags::HAVE_BOTTOM | CdefEdgeFlags::HAVE_TOP } else { @@ -304,7 +303,7 @@ pub(crate) unsafe fn rav1d_cdef_brow( let mut variance = 0; let dir = if y_pri_lvl != 0 || uv_pri_lvl != 0 { - (dsp.cdef.dir)( + (f.dsp.cdef.dir)( bptrs[0].cast(), f.cur.stride[0], &mut variance, @@ -370,7 +369,7 @@ pub(crate) unsafe fn rav1d_cdef_brow( if y_pri_lvl != 0 { let adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance); if adj_y_pri_lvl != 0 || y_sec_lvl != 0 { - dsp.cdef.fb[0]( + f.dsp.cdef.fb[0]( bptrs[0].cast(), f.cur.stride[0], lr_bak[bit as usize][0].as_mut_ptr().cast(), @@ -385,7 +384,7 @@ pub(crate) unsafe fn rav1d_cdef_brow( ); } } else if y_sec_lvl != 0 { - dsp.cdef.fb[0]( + f.dsp.cdef.fb[0]( bptrs[0].cast(), f.cur.stride[0], (lr_bak[bit as usize][0]).as_mut_ptr().cast(), @@ -469,7 +468,7 @@ pub(crate) unsafe fn rav1d_cdef_brow( bot = bptrs[pl].offset((8 >> ss_ver) * uv_stride); } - dsp.cdef.fb[uv_idx as usize]( + f.dsp.cdef.fb[uv_idx as usize]( bptrs[pl].cast(), f.cur.stride[1], lr_bak[bit as usize][pl].as_mut_ptr().cast(), diff --git a/src/cpu.rs b/src/cpu.rs index ed30728dc..20124cf72 100644 --- a/src/cpu.rs +++ b/src/cpu.rs @@ -180,7 +180,6 @@ static rav1d_cpu_flags: AtomicU32 = AtomicU32::new(0); /// so it shouldn't be performance sensitive. static rav1d_cpu_flags_mask: AtomicU32 = AtomicU32::new(!0); -#[cfg(feature = "asm")] #[inline(always)] pub(crate) fn rav1d_get_cpu_flags() -> CpuFlags { let flags = diff --git a/src/decode.rs b/src/decode.rs index 65b3e2802..8162e0b9e 100644 --- a/src/decode.rs +++ b/src/decode.rs @@ -1,6 +1,4 @@ use crate::include::common::attributes::ctz; -use crate::include::common::bitdepth::BitDepth16; -use crate::include::common::bitdepth::BitDepth8; use crate::include::common::bitdepth::DynPixel; use crate::include::common::bitdepth::BPC; use crate::include::common::intops::apply_sign64; @@ -21,7 +19,6 @@ use crate::include::dav1d::headers::RAV1D_MAX_SEGMENTS; use crate::include::dav1d::headers::RAV1D_PRIMARY_REF_NONE; use crate::src::align::Align16; use crate::src::align::AlignedVec64; -use crate::src::cdef::rav1d_cdef_dsp_init; use crate::src::cdf::rav1d_cdf_thread_alloc; use crate::src::cdf::rav1d_cdf_thread_copy; use crate::src::cdf::rav1d_cdf_thread_init_static; @@ -62,10 +59,10 @@ use crate::src::error::Rav1dError::EINVAL; use crate::src::error::Rav1dError::ENOMEM; use crate::src::error::Rav1dError::ENOPROTOOPT; use crate::src::error::Rav1dResult; -use crate::src::filmgrain::Rav1dFilmGrainDSPContext; use crate::src::internal::Bxy; use crate::src::internal::Rav1dContext; use crate::src::internal::Rav1dContextTaskType; +use crate::src::internal::Rav1dDSPContext; use crate::src::internal::Rav1dFrameData; use crate::src::internal::Rav1dTaskContext; use crate::src::internal::Rav1dTaskContext_scratch_pal; @@ -75,8 +72,6 @@ use crate::src::internal::TileStateRef; use crate::src::intra_edge::EdgeFlags; use crate::src::intra_edge::EdgeIndex; use crate::src::intra_edge::IntraEdges; -use crate::src::ipred::rav1d_intra_pred_dsp_init; -use crate::src::itx::rav1d_itx_dsp_init; use crate::src::levels::mv; use crate::src::levels::Av1Block; use crate::src::levels::BlockLevel; @@ -116,9 +111,6 @@ use crate::src::lf_mask::rav1d_create_lf_mask_inter; use crate::src::lf_mask::rav1d_create_lf_mask_intra; use crate::src::lf_mask::Av1RestorationUnit; use crate::src::log::Rav1dLog as _; -use crate::src::loopfilter::rav1d_loop_filter_dsp_init; -use crate::src::looprestoration::rav1d_loop_restoration_dsp_init; -use crate::src::mc::rav1d_mc_dsp_init; use crate::src::mem::rav1d_alloc_aligned; use crate::src::mem::rav1d_free_aligned; use crate::src::mem::rav1d_freep_aligned; @@ -140,18 +132,13 @@ use crate::src::picture::rav1d_thread_picture_ref; use crate::src::picture::rav1d_thread_picture_unref; use crate::src::picture::Rav1dThreadPicture; use crate::src::qm::dav1d_qm_tbl; -use crate::src::r#ref::rav1d_ref_create_using_pool; -use crate::src::r#ref::rav1d_ref_dec; -use crate::src::r#ref::rav1d_ref_inc; use crate::src::recon::debug_block_info; use crate::src::refmvs::rav1d_refmvs_find; use crate::src::refmvs::rav1d_refmvs_init_frame; -use crate::src::refmvs::rav1d_refmvs_save_tmvs; use crate::src::refmvs::rav1d_refmvs_tile_sbrow_init; use crate::src::refmvs::refmvs_block; use crate::src::refmvs::refmvs_mvpair; use crate::src::refmvs::refmvs_refpair; -use crate::src::refmvs::refmvs_temporal_block; use crate::src::refmvs::RefMvsFrame; use crate::src::tables::cfl_allowed_mask; use crate::src::tables::dav1d_al_part_ctx; @@ -184,7 +171,6 @@ use std::ffi::c_uint; use std::ffi::c_void; use std::iter; use std::mem; -use std::ptr; use std::ptr::addr_of_mut; use std::slice; use std::sync::atomic::AtomicI32; @@ -862,20 +848,11 @@ fn get_prev_frame_segid( ) -> u8 { assert!(frame_hdr.primary_ref_frame != RAV1D_PRIMARY_REF_NONE); - // Need checked casts here because an overflowing cast - // would give a too large `len` to [`std::slice::from_raw_parts`], which would UB. - let w4 = usize::try_from(w4).unwrap(); - let h4 = usize::try_from(h4).unwrap(); - let stride = usize::try_from(stride).unwrap(); - let mut prev_seg_id = 8; - let offset = b.y as usize * stride as usize + b.x as usize; - let len = h4 as usize * stride; - let ref_seg_map = ref_seg_map.index(offset..offset + len); - - assert!(w4 <= stride); - for ref_seg_map in ref_seg_map.chunks_exact(stride) { - prev_seg_id = ref_seg_map[..w4] + for y in 0..h4 as usize { + let offset = (b.y as usize + y) * stride as usize + b.x as usize; + prev_seg_id = ref_seg_map + .index(offset..offset + w4 as usize) .iter() .copied() .fold(prev_seg_id, cmp::min); @@ -3921,9 +3898,10 @@ pub(crate) unsafe fn rav1d_decode_tile_sbrow( } if c.tc.len() > 1 && frame_hdr.use_ref_frame_mvs != 0 { - let rf = f.rf.as_mut_dav1d(); - (c.refmvs_dsp.load_tmvs)( - &rf, + c.refmvs_dsp.load_tmvs( + &f.rf, + &f.mvs, + &f.ref_mvs, ts.tiling.row, ts.tiling.col_start >> 1, ts.tiling.col_end >> 1, @@ -4031,10 +4009,10 @@ pub(crate) unsafe fn rav1d_decode_tile_sbrow( && c.tc.len() > 1 && f.frame_hdr().frame_type.is_inter_or_switch() { - rav1d_refmvs_save_tmvs( - &c.refmvs_dsp, + c.refmvs_dsp.save_tmvs( &t.rt, &f.rf, + &f.mvs, ts.tiling.col_start >> 1, ts.tiling.col_end >> 1, t.b.y >> 1, @@ -4330,7 +4308,6 @@ pub(crate) unsafe fn rav1d_decode_frame_init( seq_hdr, frame_hdr, &f.refpoc, - f.mvs, &f.refrefpoc, &f.ref_mvs, c.tc.len() as u32, @@ -4526,24 +4503,25 @@ unsafe fn rav1d_decode_frame_main(c: &Rav1dContext, f: &mut Rav1dFrameData) -> R t.b.y = sby << 4 + seq_hdr.sb128; let by_end = t.b.y + f.sb_step >> 1; if frame_hdr.use_ref_frame_mvs != 0 { - let rf = f.rf.as_mut_dav1d(); - (c.refmvs_dsp.load_tmvs)(&rf, tile_row as c_int, 0, f.bw >> 1, t.b.y >> 1, by_end); - } - for col in 0..cols { - t.ts = tile_row * cols + col; - rav1d_decode_tile_sbrow(c, &mut t, f).map_err(|()| EINVAL)?; - } - if f.frame_hdr().frame_type.is_inter_or_switch() { - rav1d_refmvs_save_tmvs( - &c.refmvs_dsp, - &t.rt, + c.refmvs_dsp.load_tmvs( &f.rf, + &f.mvs, + &f.ref_mvs, + tile_row as c_int, 0, f.bw >> 1, t.b.y >> 1, by_end, ); } + for col in 0..cols { + t.ts = tile_row * cols + col; + rav1d_decode_tile_sbrow(c, &mut t, f).map_err(|()| EINVAL)?; + } + if f.frame_hdr().frame_type.is_inter_or_switch() { + c.refmvs_dsp + .save_tmvs(&t.rt, &f.rf, &f.mvs, 0, f.bw >> 1, t.b.y >> 1, by_end); + } // loopfilter + cdef + restoration (f.bd_fn().filter_sbrow)(c, f, &mut t, sby); @@ -4569,7 +4547,7 @@ pub(crate) unsafe fn rav1d_decode_frame_exit( if f.refp[i].p.frame_hdr.is_some() { rav1d_thread_picture_unref(&mut f.refp[i]); } - rav1d_ref_dec(&mut f.ref_mvs_ref[i]); + let _ = mem::take(&mut f.ref_mvs[i]); } rav1d_picture_unref_internal(&mut f.cur); rav1d_thread_picture_unref(&mut f.sr_cur); @@ -4588,7 +4566,7 @@ pub(crate) unsafe fn rav1d_decode_frame_exit( let _ = mem::take(&mut f.cur_segmap); let _ = mem::take(&mut f.prev_segmap); - rav1d_ref_dec(&mut f.mvs_ref); + let _ = mem::take(&mut f.mvs); let _ = mem::take(&mut f.seq_hdr); let _ = mem::take(&mut f.frame_hdr); f.tiles.clear(); @@ -4705,9 +4683,6 @@ pub unsafe fn rav1d_submit_frame(c: &mut Rav1dContext) -> Rav1dResult { f.seq_hdr = c.seq_hdr.clone(); f.frame_hdr = mem::take(&mut c.frame_hdr); let seq_hdr = &***f.seq_hdr.as_ref().unwrap(); - f.dsp = &mut c.dsp[seq_hdr.hbd as usize]; - - let bpc = 8 + 2 * seq_hdr.hbd; unsafe fn on_error(f: &mut Rav1dFrameData, c: &Rav1dContext, out: *mut Rav1dThreadPicture) { f.task_thread.error = AtomicI32::new(1); @@ -4719,12 +4694,12 @@ pub unsafe fn rav1d_submit_frame(c: &mut Rav1dContext) -> Rav1dResult { if f.refp[i].p.frame_hdr.is_some() { rav1d_thread_picture_unref(&mut f.refp[i]); } - rav1d_ref_dec(&mut f.ref_mvs_ref[i]); + let _ = mem::take(&mut f.ref_mvs[i]); } rav1d_thread_picture_unref(out); rav1d_picture_unref_internal(&mut f.cur); rav1d_thread_picture_unref(&mut f.sr_cur); - rav1d_ref_dec(&mut f.mvs_ref); + let _ = mem::take(&mut f.mvs); let _ = mem::take(&mut f.seq_hdr); let _ = mem::take(&mut f.frame_hdr); *c.cached_error_props.lock().unwrap() = c.in_0.m.clone(); @@ -4733,45 +4708,15 @@ pub unsafe fn rav1d_submit_frame(c: &mut Rav1dContext) -> Rav1dResult { f.task_thread.finished.store(true, Ordering::SeqCst); } - // TODO(kkysen) Rather than lazy initializing this, - // we should probably initialize all the fn ptrs - // when `c` is allocated during [`rav1d_open`]. - if !(*f.dsp).initialized { - let dsp = &mut c.dsp[seq_hdr.hbd as usize]; - dsp.initialized = true; - - match bpc { - #[cfg(feature = "bitdepth_8")] - 8 => { - rav1d_cdef_dsp_init::(&mut dsp.cdef); - rav1d_intra_pred_dsp_init::(&mut dsp.ipred); - rav1d_itx_dsp_init::(&mut dsp.itx, bpc); - rav1d_loop_filter_dsp_init::(&mut dsp.lf); - rav1d_loop_restoration_dsp_init::(&mut dsp.lr, bpc); - rav1d_mc_dsp_init::(&mut dsp.mc); - dsp.fg = Rav1dFilmGrainDSPContext::new::(); - } - #[cfg(feature = "bitdepth_16")] - 10 | 12 => { - rav1d_cdef_dsp_init::(&mut dsp.cdef); - rav1d_intra_pred_dsp_init::(&mut dsp.ipred); - rav1d_itx_dsp_init::(&mut dsp.itx, bpc); - rav1d_loop_filter_dsp_init::(&mut dsp.lf); - rav1d_loop_restoration_dsp_init::(&mut dsp.lr, bpc); - rav1d_mc_dsp_init::(&mut dsp.mc); - dsp.fg = Rav1dFilmGrainDSPContext::new::(); - } - _ => { - writeln!( - c.logger, - "Compiled without support for {}-bit decoding", - 8 + 2 * seq_hdr.hbd - ); - on_error(f, c, out); - return Err(ENOPROTOOPT); - } + let bpc = 8 + 2 * seq_hdr.hbd; + match Rav1dDSPContext::get(bpc) { + Some(dsp) => f.dsp = dsp, + None => { + writeln!(c.logger, "Compiled without support for {bpc}-bit decoding",); + on_error(f, c, out); + return Err(ENOPROTOOPT); } - } + }; fn scale_fac(ref_sz: i32, this_sz: i32) -> i32 { ((ref_sz << 14) + (this_sz >> 1)) / this_sz @@ -4909,18 +4854,12 @@ pub unsafe fn rav1d_submit_frame(c: &mut Rav1dContext) -> Rav1dResult { // ref_mvs if frame_hdr.frame_type.is_inter_or_switch() || frame_hdr.allow_intrabc { - f.mvs_ref = rav1d_ref_create_using_pool( - c.refmvs_pool, - ::core::mem::size_of::() - * f.sb128h as usize - * 16 - * (f.b4_stride >> 1) as usize, + // TODO fallible allocation + f.mvs = Some( + (0..f.sb128h as usize * 16 * (f.b4_stride >> 1) as usize) + .map(|_| Default::default()) + .collect(), ); - if f.mvs_ref.is_null() { - on_error(f, c, out); - return Err(ENOMEM); - } - f.mvs = (*f.mvs_ref).data.cast::(); if !frame_hdr.allow_intrabc { for i in 0..7 { f.refpoc[i] = f.refp[i].p.frame_hdr.as_ref().unwrap().frame_offset as c_uint; @@ -4933,24 +4872,19 @@ pub unsafe fn rav1d_submit_frame(c: &mut Rav1dContext) -> Rav1dResult { let refidx = frame_hdr.refidx[i] as usize; let ref_w = (ref_coded_width[i] + 7 >> 3) << 1; let ref_h = (f.refp[i].p.p.h + 7 >> 3) << 1; - if !c.refs[refidx].refmvs.is_null() && ref_w == f.bw && ref_h == f.bh { - f.ref_mvs_ref[i] = c.refs[refidx].refmvs; - rav1d_ref_inc(f.ref_mvs_ref[i]); - f.ref_mvs[i] = (*c.refs[refidx].refmvs) - .data - .cast::(); + if ref_w == f.bw && ref_h == f.bh { + f.ref_mvs[i] = c.refs[refidx].refmvs.clone(); } else { - f.ref_mvs[i] = ptr::null_mut(); - f.ref_mvs_ref[i] = ptr::null_mut(); + f.ref_mvs[i] = None; } f.refrefpoc[i] = c.refs[refidx].refpoc; } } else { - f.ref_mvs_ref.fill_with(ptr::null_mut); + f.ref_mvs.fill_with(Default::default); } } else { - f.mvs_ref = ptr::null_mut(); - f.ref_mvs_ref.fill_with(ptr::null_mut); + f.mvs = None; + f.ref_mvs.fill_with(Default::default); } // segmap @@ -5015,12 +4949,9 @@ pub unsafe fn rav1d_submit_frame(c: &mut Rav1dContext) -> Rav1dResult { } c.refs[i].segmap = f.cur_segmap.clone(); - rav1d_ref_dec(&mut c.refs[i].refmvs); + let _ = mem::take(&mut c.refs[i].refmvs); if !frame_hdr.allow_intrabc { - c.refs[i].refmvs = f.mvs_ref; - if !f.mvs_ref.is_null() { - rav1d_ref_inc(f.mvs_ref); - } + c.refs[i].refmvs = f.mvs.clone(); } c.refs[i].refpoc = f.refpoc; } @@ -5037,7 +4968,7 @@ pub unsafe fn rav1d_submit_frame(c: &mut Rav1dContext) -> Rav1dResult { } let _ = mem::take(&mut c.cdf[i]); let _ = mem::take(&mut c.refs[i].segmap); - rav1d_ref_dec(&mut c.refs[i].refmvs); + let _ = mem::take(&mut c.refs[i].refmvs); } } on_error(f, c, out); diff --git a/src/disjoint_mut.rs b/src/disjoint_mut.rs index 2247ec5bc..f15a15652 100644 --- a/src/disjoint_mut.rs +++ b/src/disjoint_mut.rs @@ -368,6 +368,7 @@ impl SliceBounds for RangeToInclusive {} impl DisjointMutIndex<[T]> for usize { type Output = <[T] as Index>::Output; + #[cfg_attr(debug_assertions, track_caller)] unsafe fn get_mut(self, slice: *mut [T]) -> *mut Self::Output { // SAFETY: The safety precondition for this trait method requires that // we can immutably dereference `slice`. @@ -378,7 +379,7 @@ impl DisjointMutIndex<[T]> for usize { // an allocation of sufficient length. unsafe { (slice as *mut T).add(self) } } else { - panic!("{:?} was not a valid index", &self); + panic!("index out of bounds: the len is {len} but the index is {self}"); } } } @@ -389,6 +390,7 @@ where { type Output = <[T] as Index>>::Output; + #[cfg_attr(debug_assertions, track_caller)] unsafe fn get_mut(self, slice: *mut [T]) -> *mut Self::Output { // SAFETY: The safety precondition for this trait method // requires that we can immutably dereference `slice`. @@ -402,7 +404,16 @@ where let data = unsafe { (slice as *mut T).add(start) }; ptr::slice_from_raw_parts_mut(data, end - start) } else { - panic!("{:?} was not a valid index", &self); + if start > end { + panic!("slice index starts at {start} but ends at {end}"); + } + if end > len { + panic!("range end index {end} out of range for slice of length {len}"); + } + if start >= len { + panic!("range start index {start} out of range for slice of length {len}") + } + unreachable!(); } } } @@ -434,8 +445,10 @@ mod release { mod debug { use super::*; use std::backtrace::Backtrace; + use std::backtrace::BacktraceStatus; use std::fmt::Debug; use std::ops::Bound; + use std::panic::Location; use std::sync::Mutex; use std::thread; use std::thread::ThreadId; @@ -443,21 +456,53 @@ mod debug { #[derive(Debug)] struct DisjointMutBounds { bounds: Bounds, - - #[allow(unused)] + mutable: bool, + location: &'static Location<'static>, backtrace: Backtrace, - #[allow(unused)] thread: ThreadId, } impl DisjointMutBounds { - fn new(bounds: Bounds) -> Self { + #[track_caller] + pub fn new(bounds: Bounds, mutable: bool) -> Self { Self { bounds, + mutable, + location: Location::caller(), backtrace: Backtrace::capture(), thread: thread::current().id(), } } + + pub fn check_overlaps(&self, existing: &Self) { + if !self.bounds.overlaps(&existing.bounds) { + return; + } + // Example: + // + // overlapping DisjointMut: + // current: &mut _[0..2] on ThreadId(2) at src/disjoint_mut.rs:855:24 + // existing: & _[0..1] on ThreadId(2) at src/disjoint_mut.rs:854:24 + panic!("\toverlapping DisjointMut:\n current: {self}\nexisting: {existing}"); + } + } + + impl Display for DisjointMutBounds { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + let Self { + bounds, + mutable, + location, + backtrace, + thread, + } = self; + let mutable = if *mutable { "&mut" } else { " &" }; + write!(f, "{mutable} _[{bounds}] on {thread:?} at {location}")?; + if backtrace.status() == BacktraceStatus::Captured { + write!(f, ":\nstack backtrace:\n{backtrace}")?; + } + Ok(()) + } } #[derive(Default)] @@ -476,57 +521,28 @@ mod debug { } } - #[track_caller] - fn check_overlaps( - current_bounds: &Bounds, - current_mutable: bool, - existing: &DisjointMutBounds, - existing_mutable: bool, - ) { - let DisjointMutBounds { - bounds: existing_bounds, - backtrace: existing_backtrace, - thread: existing_thread, - } = existing; - if !current_bounds.overlaps(existing_bounds) { - return; - } - let current_thread = thread::current().id(); - let [current_mutable, existing_mutable] = - [current_mutable, existing_mutable].map(|mutable| if mutable { "&mut" } else { "&" }); - // Example: - // - // &mut _[0..8] on ThreadId(3) overlaps with existing &mut _[0..8] on ThreadId(2): - // stack backtrace: - // 0: rav1d::src::disjoint_mut::debug::DisjointMutBounds::new - // at ./src/disjoint_mut.rs:443:28 - panic!("{current_mutable} _[{current_bounds}] on {current_thread:?} overlaps with existing {existing_mutable} _[{existing_bounds}] on {existing_thread:?}:\nstack backtrace:\n{existing_backtrace}"); - } - impl DisjointMut { #[track_caller] fn add_mut_bounds(&self, bounds: Bounds) { - for b in self.bounds.immutable.lock().unwrap().iter() { - check_overlaps(&bounds, true, b, false); + let current = DisjointMutBounds::new(bounds, true); + for existing in self.bounds.immutable.lock().unwrap().iter() { + current.check_overlaps(existing); } let mut mut_bounds = self.bounds.mutable.lock().unwrap(); - for b in mut_bounds.iter() { - check_overlaps(&bounds, true, b, true); + for existing in mut_bounds.iter() { + current.check_overlaps(existing); } - mut_bounds.push(DisjointMutBounds::new(bounds)); + mut_bounds.push(current); } #[track_caller] fn add_immut_bounds(&self, bounds: Bounds) { + let current = DisjointMutBounds::new(bounds, false); let mut_bounds = self.bounds.mutable.lock().unwrap(); - for b in mut_bounds.iter() { - check_overlaps(&bounds, false, b, true); + for existing in mut_bounds.iter() { + current.check_overlaps(existing); } - self.bounds - .immutable - .lock() - .unwrap() - .push(DisjointMutBounds::new(bounds)); + self.bounds.immutable.lock().unwrap().push(current); } fn remove_bound(&self, bounds: &Bounds, mutable: bool) { diff --git a/src/enum_map.rs b/src/enum_map.rs index 97a9e2776..e51503d56 100644 --- a/src/enum_map.rs +++ b/src/enum_map.rs @@ -59,7 +59,7 @@ where { /// Create an [`EnumMap`] with default values when `V: ` [`DefaultValue`]. #[allow(dead_code)] // TODO(kkysen) remove when used - pub const fn default() -> Self { + const fn default() -> Self { Self { array: [V::DEFAULT; N], _phantom: PhantomData, diff --git a/src/filmgrain.rs b/src/filmgrain.rs index 31b90292a..43b348fad 100644 --- a/src/filmgrain.rs +++ b/src/filmgrain.rs @@ -11,6 +11,7 @@ use crate::include::dav1d::headers::Dav1dFilmGrainData; use crate::include::dav1d::headers::Rav1dFilmGrainData; use crate::include::dav1d::headers::Rav1dPixelLayoutSubSampled; use crate::src::assume::assume; +use crate::src::cpu::CpuFlags; use crate::src::enum_map::enum_map; use crate::src::enum_map::enum_map_ty; use crate::src::enum_map::DefaultValue; @@ -28,7 +29,7 @@ use std::ops::Shr; use to_method::To; #[cfg(feature = "asm")] -use crate::{include::common::bitdepth::bd_fn, src::cpu::rav1d_get_cpu_flags, src::cpu::CpuFlags}; +use crate::include::common::bitdepth::bd_fn; pub const GRAIN_WIDTH: usize = 82; pub const GRAIN_HEIGHT: usize = 73; @@ -1091,7 +1092,7 @@ unsafe fn fguv_32x32xn_neon() -> Self { + pub const fn default() -> Self { Self { generate_grain_y: generate_grain_y::Fn::new(generate_grain_y_c_erased::), generate_grain_uv: enum_map!(Rav1dPixelLayoutSubSampled => generate_grain_uv::Fn; match key { @@ -1190,11 +1191,10 @@ impl Rav1dFilmGrainDSPContext { self } - fn init(self) -> Self { + #[inline(always)] + const fn init(self, flags: CpuFlags) -> Self { #[cfg(feature = "asm")] { - let flags = rav1d_get_cpu_flags(); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { return self.init_x86::(flags); @@ -1206,11 +1206,13 @@ impl Rav1dFilmGrainDSPContext { } #[allow(unreachable_code)] // Reachable on some #[cfg]s. - self + { + let _ = flags; + self + } } - #[cold] - pub fn new() -> Self { - Self::new_c::().init::() + pub const fn new(flags: CpuFlags) -> Self { + Self::default::().init::(flags) } } diff --git a/src/internal.rs b/src/internal.rs index 526fdc0ca..e2efad0d1 100644 --- a/src/internal.rs +++ b/src/internal.rs @@ -27,6 +27,8 @@ use crate::src::align::*; use crate::src::cdef::Rav1dCdefDSPContext; use crate::src::cdf::CdfContext; use crate::src::cdf::CdfThreadContext; +use crate::src::cpu::rav1d_get_cpu_flags; +use crate::src::cpu::CpuFlags; use crate::src::disjoint_mut::DisjointMut; use crate::src::disjoint_mut::DisjointMutArcSlice; use crate::src::env::BlockContext; @@ -54,7 +56,6 @@ use crate::src::mem::Rav1dMemPool; use crate::src::msac::MsacContext; use crate::src::picture::PictureFlags; use crate::src::picture::Rav1dThreadPicture; -use crate::src::r#ref::Rav1dRef; use crate::src::recon::backup_ipred_edge_fn; use crate::src::recon::copy_pal_block_fn; use crate::src::recon::filter_sbrow_fn; @@ -106,6 +107,7 @@ use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Condvar; use std::sync::Mutex; +use std::sync::OnceLock; use std::thread::JoinHandle; #[repr(C)] @@ -120,6 +122,55 @@ pub(crate) struct Rav1dDSPContext { pub initialized: bool, } +impl Rav1dDSPContext { + pub const fn _default() -> Self { + Self { + fg: Rav1dFilmGrainDSPContext::default::(), + ipred: Rav1dIntraPredDSPContext::default::(), + mc: Rav1dMCDSPContext::default::(), + itx: Rav1dInvTxfmDSPContext::default::(), + lf: Rav1dLoopFilterDSPContext::default::(), + cdef: Rav1dCdefDSPContext::default::(), + lr: Rav1dLoopRestorationDSPContext::default::(), + initialized: true, + } + } + + pub const fn new(flags: CpuFlags, bpc: c_int) -> Self { + Self { + fg: Rav1dFilmGrainDSPContext::new::(flags), + ipred: Rav1dIntraPredDSPContext::new::(flags), + mc: Rav1dMCDSPContext::new::(flags), + itx: Rav1dInvTxfmDSPContext::new::(flags, bpc), + lf: Rav1dLoopFilterDSPContext::new::(flags), + cdef: Rav1dCdefDSPContext::new::(flags), + lr: Rav1dLoopRestorationDSPContext::new::(flags, bpc), + initialized: true, + } + } + + pub fn get(bpc: c_int) -> Option<&'static Self> { + static BPC8: OnceLock = OnceLock::new(); + static BPC10: OnceLock = OnceLock::new(); + static BPC12: OnceLock = OnceLock::new(); + Some(match bpc { + 8 => BPC8.get_or_init(|| { + let flags = rav1d_get_cpu_flags(); + Self::new::(flags, bpc) + }), + 10 => BPC10.get_or_init(|| { + let flags = rav1d_get_cpu_flags(); + Self::new::(flags, bpc) + }), + 12 => BPC12.get_or_init(|| { + let flags = rav1d_get_cpu_flags(); + Self::new::(flags, bpc) + }), + _ => return None, + }) + } +} + #[derive(Clone, Default)] pub(crate) struct Rav1dTileGroupHeader { pub start: c_int, @@ -225,7 +276,7 @@ pub(crate) struct TaskThreadData { pub(crate) struct Rav1dContext_refs { pub p: Rav1dThreadPicture, pub segmap: Option>, - pub refmvs: *mut Rav1dRef, + pub refmvs: Option>, pub refpoc: [c_uint; 7], } @@ -286,12 +337,9 @@ pub struct Rav1dContext { pub(crate) task_thread: Arc, // reference/entropy state - pub(crate) refmvs_pool: *mut Rav1dMemPool, pub(crate) refs: [Rav1dContext_refs; 8], - pub(crate) cdf_pool: *mut Rav1dMemPool, - pub(crate) cdf: [CdfThreadContext; 8], + pub(crate) cdf: [CdfThreadContext; 8], // Previously pooled - pub(crate) dsp: [Rav1dDSPContext; 3], /* 8, 10, 12 bits/component */ pub(crate) refmvs_dsp: Rav1dRefmvsDSPContext, pub(crate) allocator: Rav1dPicAllocator, @@ -553,6 +601,7 @@ impl TxLpfRightEdge { } /// loopfilter +#[derive(Default)] #[repr(C)] pub struct Rav1dFrameContext_lf { pub level: DisjointMut>, @@ -737,10 +786,8 @@ pub(crate) struct Rav1dFrameData { pub cur: Rav1dPicture, // after super-resolution upscaling pub sr_cur: Rav1dThreadPicture, - pub mvs_ref: *mut Rav1dRef, - pub mvs: *mut refmvs_temporal_block, - pub ref_mvs: [*mut refmvs_temporal_block; 7], - pub ref_mvs_ref: [*mut Rav1dRef; 7], + pub mvs: Option>, // Previously pooled. + pub ref_mvs: [Option>; 7], pub cur_segmap: Option>, // Previously pooled. pub prev_segmap: Option>, pub refpoc: [c_uint; 7], @@ -757,7 +804,7 @@ pub(crate) struct Rav1dFrameData { pub ts: *mut Rav1dTileState, pub n_ts: c_int, - pub dsp: *const Rav1dDSPContext, + pub dsp: &'static Rav1dDSPContext, pub ipred_edge_sz: c_int, pub ipred_edge: [*mut DynPixel; 3], diff --git a/src/ipred.rs b/src/ipred.rs index b7f1d2915..46bcf743a 100644 --- a/src/ipred.rs +++ b/src/ipred.rs @@ -6,6 +6,7 @@ use crate::include::common::bitdepth::BPC; use crate::include::common::intops::apply_sign; use crate::include::common::intops::iclip; use crate::include::dav1d::headers::Rav1dPixelLayoutSubSampled; +use crate::src::cpu::CpuFlags; use crate::src::enum_map::enum_map; use crate::src::enum_map::enum_map_ty; use crate::src::enum_map::DefaultValue; @@ -14,6 +15,7 @@ use crate::src::levels::DC_PRED; use crate::src::levels::FILTER_PRED; use crate::src::levels::HOR_PRED; use crate::src::levels::LEFT_DC_PRED; +use crate::src::levels::N_IMPL_INTRA_PRED_MODES; use crate::src::levels::PAETH_PRED; use crate::src::levels::SMOOTH_H_PRED; use crate::src::levels::SMOOTH_PRED; @@ -40,7 +42,7 @@ use std::slice; use strum::FromRepr; #[cfg(feature = "asm")] -use crate::{include::common::bitdepth::bd_fn, src::cpu::rav1d_get_cpu_flags, src::cpu::CpuFlags}; +use crate::include::common::bitdepth::bd_fn; #[cfg(all(feature = "asm", target_arch = "x86_64"))] use crate::include::common::bitdepth::bpc_fn; @@ -2046,210 +2048,254 @@ mod neon { } } -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64"),))] -#[inline(always)] -fn intra_pred_dsp_init_x86(c: &mut Rav1dIntraPredDSPContext) { - let flags = rav1d_get_cpu_flags(); - - if !flags.contains(CpuFlags::SSSE3) { - return; +impl Rav1dIntraPredDSPContext { + pub const fn default() -> Self { + Self { + intra_pred: { + let mut a = [DefaultValue::DEFAULT; N_IMPL_INTRA_PRED_MODES]; + a[DC_PRED as usize] = + angular_ipred::Fn::new(ipred_dc_c_erased::); + a[DC_128_PRED as usize] = angular_ipred::Fn::new(ipred_dc_128_c_erased::); + a[TOP_DC_PRED as usize] = + angular_ipred::Fn::new(ipred_dc_c_erased::); + a[LEFT_DC_PRED as usize] = + angular_ipred::Fn::new(ipred_dc_c_erased::); + a[HOR_PRED as usize] = angular_ipred::Fn::new(ipred_h_c_erased::); + a[VERT_PRED as usize] = angular_ipred::Fn::new(ipred_v_c_erased::); + a[PAETH_PRED as usize] = angular_ipred::Fn::new(ipred_paeth_c_erased::); + a[SMOOTH_PRED as usize] = angular_ipred::Fn::new(ipred_smooth_c_erased::); + a[SMOOTH_V_PRED as usize] = angular_ipred::Fn::new(ipred_smooth_v_c_erased::); + a[SMOOTH_H_PRED as usize] = angular_ipred::Fn::new(ipred_smooth_h_c_erased::); + a[Z1_PRED as usize] = angular_ipred::Fn::new(ipred_z_c_erased::); + a[Z2_PRED as usize] = angular_ipred::Fn::new(ipred_z_c_erased::); + a[Z3_PRED as usize] = angular_ipred::Fn::new(ipred_z_c_erased::); + a[FILTER_PRED as usize] = angular_ipred::Fn::new(ipred_filter_c_erased::); + a + }, + cfl_ac: enum_map!(Rav1dPixelLayoutSubSampled => cfl_ac::Fn; match key { + I420 => cfl_ac::Fn::new(cfl_ac_c_erased::), + I422 => cfl_ac::Fn::new(cfl_ac_c_erased::), + I444 => cfl_ac::Fn::new(cfl_ac_c_erased::), + }), + cfl_pred: { + // Not all elements are initialized with fns, + // so we default initialize first so that there is no unitialized memory. + // The defaults just call `unimplemented!()`, + // which shouldn't slow down the other code paths at all. + let mut a = [DefaultValue::DEFAULT; 6]; + a[DC_PRED as usize] = + cfl_pred::Fn::new(ipred_cfl_c_erased::); + a[DC_128_PRED as usize] = cfl_pred::Fn::new(ipred_cfl_128_c_erased::); + a[TOP_DC_PRED as usize] = + cfl_pred::Fn::new(ipred_cfl_c_erased::); + a[LEFT_DC_PRED as usize] = + cfl_pred::Fn::new(ipred_cfl_c_erased::); + a + }, + pal_pred: pal_pred::Fn::new(pal_pred_c_erased::), + } } - c.intra_pred[DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc, ssse3); - c.intra_pred[DC_128_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_128, ssse3); - c.intra_pred[TOP_DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_top, ssse3); - c.intra_pred[LEFT_DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_left, ssse3); - c.intra_pred[HOR_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_h, ssse3); - c.intra_pred[VERT_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_v, ssse3); - c.intra_pred[PAETH_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_paeth, ssse3); - c.intra_pred[SMOOTH_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth, ssse3); - c.intra_pred[SMOOTH_H_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_h, ssse3); - c.intra_pred[SMOOTH_V_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_v, ssse3); - c.intra_pred[Z1_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z1, ssse3); - c.intra_pred[Z2_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z2, ssse3); - c.intra_pred[Z3_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z3, ssse3); - c.intra_pred[FILTER_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_filter, ssse3); - - c.cfl_pred[DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl, ssse3); - c.cfl_pred[DC_128_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_128, ssse3); - c.cfl_pred[TOP_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_top, ssse3); - c.cfl_pred[LEFT_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_left, ssse3); - - c.cfl_ac = enum_map!(Rav1dPixelLayoutSubSampled => cfl_ac::Fn; match key { - I420 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_420, ssse3), - I422 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_422, ssse3), - I444 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_444, ssse3), - }); - - c.pal_pred = bd_fn!(pal_pred::decl_fn, BD, pal_pred, ssse3); - - #[cfg(target_arch = "x86_64")] - { - if !flags.contains(CpuFlags::AVX2) { - return; + #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] + #[inline(always)] + const fn init_x86(mut self, flags: CpuFlags) -> Self { + if !flags.contains(CpuFlags::SSSE3) { + return self; } - c.intra_pred[DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc, avx2); - c.intra_pred[DC_128_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_128, avx2); - c.intra_pred[TOP_DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_top, avx2); - c.intra_pred[LEFT_DC_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_left, avx2); - c.intra_pred[HOR_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_h, avx2); - c.intra_pred[VERT_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_v, avx2); - c.intra_pred[PAETH_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_paeth, avx2); - c.intra_pred[SMOOTH_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth, avx2); - c.intra_pred[SMOOTH_H_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_h, avx2); - c.intra_pred[SMOOTH_V_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_v, avx2); - c.intra_pred[Z1_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z1, avx2); - c.intra_pred[Z2_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z2, avx2); - c.intra_pred[Z3_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z3, avx2); - c.intra_pred[FILTER_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_filter, avx2); - - c.cfl_pred[DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl, avx2); - c.cfl_pred[DC_128_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_128, avx2); - c.cfl_pred[TOP_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_top, avx2); - c.cfl_pred[LEFT_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_left, avx2); - - c.cfl_ac = enum_map!(Rav1dPixelLayoutSubSampled => cfl_ac::Fn; match key { - I420 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_420, avx2), - I422 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_422, avx2), - I444 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_444, avx2), + self.intra_pred[DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc, ssse3); + self.intra_pred[DC_128_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_128, ssse3); + self.intra_pred[TOP_DC_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_top, ssse3); + self.intra_pred[LEFT_DC_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_left, ssse3); + self.intra_pred[HOR_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_h, ssse3); + self.intra_pred[VERT_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_v, ssse3); + self.intra_pred[PAETH_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_paeth, ssse3); + self.intra_pred[SMOOTH_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth, ssse3); + self.intra_pred[SMOOTH_H_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_h, ssse3); + self.intra_pred[SMOOTH_V_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_v, ssse3); + self.intra_pred[Z1_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z1, ssse3); + self.intra_pred[Z2_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z2, ssse3); + self.intra_pred[Z3_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z3, ssse3); + self.intra_pred[FILTER_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_filter, ssse3); + + self.cfl_pred[DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl, ssse3); + self.cfl_pred[DC_128_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_128, ssse3); + self.cfl_pred[TOP_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_top, ssse3); + self.cfl_pred[LEFT_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_left, ssse3); + + self.cfl_ac = enum_map!(Rav1dPixelLayoutSubSampled => cfl_ac::Fn; match key { + I420 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_420, ssse3), + I422 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_422, ssse3), + I444 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_444, ssse3), }); - c.pal_pred = bd_fn!(pal_pred::decl_fn, BD, pal_pred, avx2); + self.pal_pred = bd_fn!(pal_pred::decl_fn, BD, pal_pred, ssse3); - if !flags.contains(CpuFlags::AVX512ICL) { - return; - } + #[cfg(target_arch = "x86_64")] + { + if !flags.contains(CpuFlags::AVX2) { + return self; + } - if BD::BPC == BPC::BPC8 { - c.intra_pred[DC_PRED as usize] = - bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_dc, avx512icl); - c.intra_pred[DC_128_PRED as usize] = - bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_dc_128, avx512icl); - c.intra_pred[TOP_DC_PRED as usize] = - bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_dc_top, avx512icl); - c.intra_pred[LEFT_DC_PRED as usize] = - bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_dc_left, avx512icl); - c.intra_pred[HOR_PRED as usize] = - bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_h, avx512icl); - c.intra_pred[VERT_PRED as usize] = - bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_v, avx512icl); - } + self.intra_pred[DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc, avx2); + self.intra_pred[DC_128_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_128, avx2); + self.intra_pred[TOP_DC_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_top, avx2); + self.intra_pred[LEFT_DC_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_left, avx2); + self.intra_pred[HOR_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_h, avx2); + self.intra_pred[VERT_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_v, avx2); + self.intra_pred[PAETH_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_paeth, avx2); + self.intra_pred[SMOOTH_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth, avx2); + self.intra_pred[SMOOTH_H_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_h, avx2); + self.intra_pred[SMOOTH_V_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_v, avx2); + self.intra_pred[Z1_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z1, avx2); + self.intra_pred[Z2_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z2, avx2); + self.intra_pred[Z3_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_z3, avx2); + self.intra_pred[FILTER_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_filter, avx2); + + self.cfl_pred[DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl, avx2); + self.cfl_pred[DC_128_PRED as usize] = + bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_128, avx2); + self.cfl_pred[TOP_DC_PRED as usize] = + bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_top, avx2); + self.cfl_pred[LEFT_DC_PRED as usize] = + bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_left, avx2); + + self.cfl_ac = enum_map!(Rav1dPixelLayoutSubSampled => cfl_ac::Fn; match key { + I420 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_420, avx2), + I422 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_422, avx2), + I444 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_444, avx2), + }); + + self.pal_pred = bd_fn!(pal_pred::decl_fn, BD, pal_pred, avx2); + + if !flags.contains(CpuFlags::AVX512ICL) { + return self; + } - c.intra_pred[PAETH_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_paeth, avx512icl); - c.intra_pred[SMOOTH_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth, avx512icl); - c.intra_pred[SMOOTH_H_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_h, avx512icl); - c.intra_pred[SMOOTH_V_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_v, avx512icl); - c.intra_pred[FILTER_PRED as usize] = - bd_fn!(angular_ipred::decl_fn, BD, ipred_filter, avx512icl); - - c.pal_pred = bd_fn!(pal_pred::decl_fn, BD, pal_pred, avx512icl); - } -} + if let BPC::BPC8 = BD::BPC { + self.intra_pred[DC_PRED as usize] = + bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_dc, avx512icl); + self.intra_pred[DC_128_PRED as usize] = + bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_dc_128, avx512icl); + self.intra_pred[TOP_DC_PRED as usize] = + bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_dc_top, avx512icl); + self.intra_pred[LEFT_DC_PRED as usize] = + bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_dc_left, avx512icl); + self.intra_pred[HOR_PRED as usize] = + bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_h, avx512icl); + self.intra_pred[VERT_PRED as usize] = + bpc_fn!(angular_ipred::decl_fn, 8 bpc, ipred_v, avx512icl); + } -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64"),))] -#[inline(always)] -fn intra_pred_dsp_init_arm(c: &mut Rav1dIntraPredDSPContext) { - let flags = rav1d_get_cpu_flags(); + self.intra_pred[PAETH_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_paeth, avx512icl); + self.intra_pred[SMOOTH_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth, avx512icl); + self.intra_pred[SMOOTH_H_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_h, avx512icl); + self.intra_pred[SMOOTH_V_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_v, avx512icl); + self.intra_pred[FILTER_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_filter, avx512icl); + + self.pal_pred = bd_fn!(pal_pred::decl_fn, BD, pal_pred, avx512icl); + } - if !flags.contains(CpuFlags::NEON) { - return; + self } - c.intra_pred[DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc, neon); - c.intra_pred[DC_128_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_128, neon); - c.intra_pred[TOP_DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_top, neon); - c.intra_pred[LEFT_DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_left, neon); - c.intra_pred[HOR_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_h, neon); - c.intra_pred[VERT_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_v, neon); - c.intra_pred[PAETH_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_paeth, neon); - c.intra_pred[SMOOTH_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth, neon); - c.intra_pred[SMOOTH_V_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_v, neon); - c.intra_pred[SMOOTH_H_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_h, neon); - #[cfg(target_arch = "aarch64")] - { - use self::neon::ipred_z_neon_erased; - - c.intra_pred[Z1_PRED as usize] = angular_ipred::Fn::new(ipred_z_neon_erased::); - c.intra_pred[Z2_PRED as usize] = angular_ipred::Fn::new(ipred_z_neon_erased::); - c.intra_pred[Z3_PRED as usize] = angular_ipred::Fn::new(ipred_z_neon_erased::); - } - c.intra_pred[FILTER_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_filter, neon); + #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] + #[inline(always)] + const fn init_arm(mut self, flags: CpuFlags) -> Self { + if !flags.contains(CpuFlags::NEON) { + return self; + } - c.cfl_pred[DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl, neon); - c.cfl_pred[DC_128_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_128, neon); - c.cfl_pred[TOP_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_top, neon); - c.cfl_pred[LEFT_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_left, neon); + self.intra_pred[DC_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_dc, neon); + self.intra_pred[DC_128_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_128, neon); + self.intra_pred[TOP_DC_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_top, neon); + self.intra_pred[LEFT_DC_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_dc_left, neon); + self.intra_pred[HOR_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_h, neon); + self.intra_pred[VERT_PRED as usize] = bd_fn!(angular_ipred::decl_fn, BD, ipred_v, neon); + self.intra_pred[PAETH_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_paeth, neon); + self.intra_pred[SMOOTH_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth, neon); + self.intra_pred[SMOOTH_V_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_v, neon); + self.intra_pred[SMOOTH_H_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_smooth_h, neon); + #[cfg(target_arch = "aarch64")] + { + use self::neon::ipred_z_neon_erased; + + self.intra_pred[Z1_PRED as usize] = + angular_ipred::Fn::new(ipred_z_neon_erased::); + self.intra_pred[Z2_PRED as usize] = + angular_ipred::Fn::new(ipred_z_neon_erased::); + self.intra_pred[Z3_PRED as usize] = + angular_ipred::Fn::new(ipred_z_neon_erased::); + } + self.intra_pred[FILTER_PRED as usize] = + bd_fn!(angular_ipred::decl_fn, BD, ipred_filter, neon); + + self.cfl_pred[DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl, neon); + self.cfl_pred[DC_128_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_128, neon); + self.cfl_pred[TOP_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_top, neon); + self.cfl_pred[LEFT_DC_PRED as usize] = bd_fn!(cfl_pred::decl_fn, BD, ipred_cfl_left, neon); + + self.cfl_ac = enum_map!(Rav1dPixelLayoutSubSampled => cfl_ac::Fn; match key { + I420 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_420, neon), + I422 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_422, neon), + I444 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_444, neon), + }); - c.cfl_ac = enum_map!(Rav1dPixelLayoutSubSampled => cfl_ac::Fn; match key { - I420 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_420, neon), - I422 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_422, neon), - I444 => bd_fn!(cfl_ac::decl_fn, BD, ipred_cfl_ac_444, neon), - }); + self.pal_pred = bd_fn!(pal_pred::decl_fn, BD, pal_pred, neon); - c.pal_pred = bd_fn!(pal_pred::decl_fn, BD, pal_pred, neon); -} + self + } -#[cold] -pub(crate) fn rav1d_intra_pred_dsp_init(c: &mut Rav1dIntraPredDSPContext) { - c.intra_pred[DC_PRED as usize] = - angular_ipred::Fn::new(ipred_dc_c_erased::); - c.intra_pred[DC_128_PRED as usize] = angular_ipred::Fn::new(ipred_dc_128_c_erased::); - c.intra_pred[TOP_DC_PRED as usize] = - angular_ipred::Fn::new(ipred_dc_c_erased::); - c.intra_pred[LEFT_DC_PRED as usize] = - angular_ipred::Fn::new(ipred_dc_c_erased::); - c.intra_pred[HOR_PRED as usize] = angular_ipred::Fn::new(ipred_h_c_erased::); - c.intra_pred[VERT_PRED as usize] = angular_ipred::Fn::new(ipred_v_c_erased::); - c.intra_pred[PAETH_PRED as usize] = angular_ipred::Fn::new(ipred_paeth_c_erased::); - c.intra_pred[SMOOTH_PRED as usize] = angular_ipred::Fn::new(ipred_smooth_c_erased::); - c.intra_pred[SMOOTH_V_PRED as usize] = angular_ipred::Fn::new(ipred_smooth_v_c_erased::); - c.intra_pred[SMOOTH_H_PRED as usize] = angular_ipred::Fn::new(ipred_smooth_h_c_erased::); - c.intra_pred[Z1_PRED as usize] = angular_ipred::Fn::new(ipred_z_c_erased::); - c.intra_pred[Z2_PRED as usize] = angular_ipred::Fn::new(ipred_z_c_erased::); - c.intra_pred[Z3_PRED as usize] = angular_ipred::Fn::new(ipred_z_c_erased::); - c.intra_pred[FILTER_PRED as usize] = angular_ipred::Fn::new(ipred_filter_c_erased::); - - c.cfl_ac = enum_map!(Rav1dPixelLayoutSubSampled => cfl_ac::Fn; match key { - I420 => cfl_ac::Fn::new(cfl_ac_c_erased::), - I422 => cfl_ac::Fn::new(cfl_ac_c_erased::), - I444 => cfl_ac::Fn::new(cfl_ac_c_erased::), - }); - - // Not all elements are initialized with fns, - // so we default initialize first so that there is no unitialized memory. - // The defaults just call `unimplemented!()`, - // which shouldn't slow down the other code paths at all. - c.cfl_pred = [DefaultValue::DEFAULT; 6]; - c.cfl_pred[DC_PRED as usize] = - cfl_pred::Fn::new(ipred_cfl_c_erased::); - c.cfl_pred[DC_128_PRED as usize] = cfl_pred::Fn::new(ipred_cfl_128_c_erased::); - c.cfl_pred[TOP_DC_PRED as usize] = - cfl_pred::Fn::new(ipred_cfl_c_erased::); - c.cfl_pred[LEFT_DC_PRED as usize] = - cfl_pred::Fn::new(ipred_cfl_c_erased::); - - c.pal_pred = pal_pred::Fn::new(pal_pred_c_erased::); - - #[cfg(feature = "asm")] - cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - use crate::src::ipred::intra_pred_dsp_init_x86; - - intra_pred_dsp_init_x86::(c); - } else if #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] { - use crate::src::ipred::intra_pred_dsp_init_arm; - - intra_pred_dsp_init_arm::(c); + #[inline(always)] + const fn init(self, flags: CpuFlags) -> Self { + #[cfg(feature = "asm")] + { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + return self.init_x86::(flags); + } + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + { + return self.init_arm::(flags); + } + } + + #[allow(unreachable_code)] // Reachable on some #[cfg]s. + { + let _ = flags; + self } } + + pub const fn new(flags: CpuFlags) -> Self { + Self::default::().init::(flags) + } } diff --git a/src/itx.rs b/src/itx.rs index a2310f5d7..0dae95214 100644 --- a/src/itx.rs +++ b/src/itx.rs @@ -3,6 +3,7 @@ use crate::include::common::bitdepth::BitDepth; use crate::include::common::bitdepth::DynCoef; use crate::include::common::bitdepth::DynPixel; use crate::include::common::intops::iclip; +use crate::src::cpu::CpuFlags; use crate::src::levels::ADST_ADST; use crate::src::levels::ADST_DCT; use crate::src::levels::ADST_FLIPADST; @@ -47,12 +48,6 @@ use std::cmp; use std::ffi::c_int; use std::ffi::c_void; -#[cfg(feature = "asm")] -use crate::src::cpu::{rav1d_get_cpu_flags, CpuFlags}; - -#[cfg(feature = "asm")] -use cfg_if::cfg_if; - #[cfg(feature = "asm")] use crate::include::common::bitdepth::bd_fn; @@ -531,7 +526,7 @@ macro_rules! assign_itx_fn { use paste::paste; paste! { - (*$c).itxfm_add[[] as usize][$type_enum as usize] + $c.itxfm_add[[] as usize][$type_enum as usize] = Some(bd_fn!(BD, [< inv_txfm_add_ $type _ $w x $h >], $ext)); } }}; @@ -540,7 +535,7 @@ macro_rules! assign_itx_fn { use paste::paste; paste! { - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][$type_enum as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][$type_enum as usize] = Some(bd_fn!(BD, [< inv_txfm_add_ $type _ $w x $h >], $ext)); } }}; @@ -552,7 +547,7 @@ macro_rules! assign_itx_bpc_fn { use paste::paste; paste! { - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][$type_enum as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][$type_enum as usize] = Some(bpc_fn!($bpc bpc, [< inv_txfm_add_ $type _ $w x $h >], $ext)); } }}; @@ -561,7 +556,7 @@ macro_rules! assign_itx_bpc_fn { use paste::paste; paste! { - (*$c).itxfm_add[[] as usize][$type_enum as usize] + $c.itxfm_add[[] as usize][$type_enum as usize] = Some(bpc_fn!($bpc bpc, [< inv_txfm_add_ $type _ $w x $h >], $ext)); } }}; @@ -616,7 +611,6 @@ macro_rules! assign_itx2_fn { } #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] -#[rustfmt::skip] macro_rules! assign_itx12_bpc_fn { ($c:ident, $w:literal, $h:literal, $bpc:literal bpc, $ext:ident) => {{ assign_itx2_bpc_fn!($c, $w, $h, $bpc bpc, $ext); @@ -630,7 +624,6 @@ macro_rules! assign_itx12_bpc_fn { assign_itx_bpc_fn!($c, $w, $h, flipadst_adst, ADST_FLIPADST, $bpc bpc, $ext); assign_itx_bpc_fn!($c, $w, $h, flipadst_flipadst, FLIPADST_FLIPADST, $bpc bpc, $ext); assign_itx_bpc_fn!($c, $w, $h, identity_dct, V_DCT, $bpc bpc, $ext); - }}; ($c:ident, $pfx:ident, $w:literal, $h:literal, $bpc:literal bpc, $ext:ident) => {{ @@ -649,7 +642,6 @@ macro_rules! assign_itx12_bpc_fn { } #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -#[rustfmt::skip] macro_rules! assign_itx12_fn { ($c:ident, $BD:ty, $w:literal, $h:literal, $ext:ident) => {{ assign_itx2_fn!($c, BD, $w, $h, $ext); @@ -675,7 +667,16 @@ macro_rules! assign_itx12_fn { assign_itx_fn!($c, BD, $pfx, $w, $h, adst_flipadst, FLIPADST_ADST, $ext); assign_itx_fn!($c, BD, $pfx, $w, $h, flipadst_dct, DCT_FLIPADST, $ext); assign_itx_fn!($c, BD, $pfx, $w, $h, flipadst_adst, ADST_FLIPADST, $ext); - assign_itx_fn!($c, BD, $pfx, $w, $h, flipadst_flipadst, FLIPADST_FLIPADST, $ext); + assign_itx_fn!( + $c, + BD, + $pfx, + $w, + $h, + flipadst_flipadst, + FLIPADST_FLIPADST, + $ext + ); assign_itx_fn!($c, BD, $pfx, $w, $h, identity_dct, V_DCT, $ext); }}; } @@ -718,227 +719,12 @@ macro_rules! assign_itx16_fn { }}; } -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] -#[inline(always)] -#[rustfmt::skip] -unsafe fn itx_dsp_init_x86(c: *mut Rav1dInvTxfmDSPContext, bpc: c_int) { - - let flags = rav1d_get_cpu_flags(); - - if !flags.contains(CpuFlags::SSE2) { - return; - } - - assign_itx_fn!(c, BD, 4, 4, wht_wht, WHT_WHT, sse2); - - if !flags.contains(CpuFlags::SSSE3) { - return; - } - - if BD::BITDEPTH == 8 { - assign_itx16_bpc_fn!(c, 4, 4, 8 bpc, ssse3); - assign_itx16_bpc_fn!(c, R, 4, 8, 8 bpc, ssse3); - assign_itx16_bpc_fn!(c, R, 8, 4, 8 bpc, ssse3); - assign_itx16_bpc_fn!(c, 8, 8, 8 bpc, ssse3); - assign_itx16_bpc_fn!(c, R, 4, 16, 8 bpc, ssse3); - assign_itx16_bpc_fn!(c, R, 16, 4, 8 bpc, ssse3); - assign_itx16_bpc_fn!(c, R, 8, 16, 8 bpc, ssse3); - assign_itx16_bpc_fn!(c, R, 16, 8, 8 bpc, ssse3); - assign_itx12_bpc_fn!(c, 16, 16, 8 bpc, ssse3); - assign_itx2_bpc_fn! (c, R, 8, 32, 8 bpc, ssse3); - assign_itx2_bpc_fn! (c, R, 32, 8, 8 bpc, ssse3); - assign_itx2_bpc_fn! (c, R, 16, 32, 8 bpc, ssse3); - assign_itx2_bpc_fn! (c, R, 32, 16, 8 bpc, ssse3); - assign_itx2_bpc_fn! (c, 32, 32, 8 bpc, ssse3); - assign_itx1_bpc_fn! (c, R, 16, 64, 8 bpc, ssse3); - assign_itx1_bpc_fn! (c, R, 32, 64, 8 bpc, ssse3); - assign_itx1_bpc_fn! (c, R, 64, 16, 8 bpc, ssse3); - assign_itx1_bpc_fn! (c, R, 64, 32, 8 bpc, ssse3); - assign_itx1_bpc_fn! (c, 64, 64, 8 bpc, ssse3); - } - - if !flags.contains(CpuFlags::SSE41) { - return; - } - - if BD::BITDEPTH == 16 { - if bpc == 10 { - assign_itx16_bpc_fn!(c, 4, 4, 16 bpc, sse4); - assign_itx16_bpc_fn!(c, R, 4, 8, 16 bpc, sse4); - assign_itx16_bpc_fn!(c, R, 4, 16, 16 bpc, sse4); - assign_itx16_bpc_fn!(c, R, 8, 4, 16 bpc, sse4); - assign_itx16_bpc_fn!(c, 8, 8, 16 bpc, sse4); - assign_itx16_bpc_fn!(c, R, 8, 16, 16 bpc, sse4); - assign_itx16_bpc_fn!(c, R, 16, 4, 16 bpc, sse4); - assign_itx16_bpc_fn!(c, R, 16, 8, 16 bpc, sse4); - assign_itx12_bpc_fn!(c, 16, 16, 16 bpc, sse4); - assign_itx2_bpc_fn! (c, R, 8, 32, 16 bpc, sse4); - assign_itx2_bpc_fn! (c, R, 16, 32, 16 bpc, sse4); - assign_itx2_bpc_fn! (c, R, 32, 8, 16 bpc, sse4); - assign_itx2_bpc_fn! (c, R, 32, 16, 16 bpc, sse4); - assign_itx2_bpc_fn! (c, 32, 32, 16 bpc, sse4); - assign_itx1_bpc_fn! (c, R, 16, 64, 16 bpc, sse4); - assign_itx1_bpc_fn! (c, R, 32, 64, 16 bpc, sse4); - assign_itx1_bpc_fn! (c, R, 64, 16, 16 bpc, sse4); - assign_itx1_bpc_fn! (c, R, 64, 32, 16 bpc, sse4); - assign_itx1_bpc_fn! (c, 64, 64, 16 bpc, sse4); - } - } - - #[cfg(target_arch = "x86_64")] - { - if !flags.contains(CpuFlags::AVX2) { - return; - } - - assign_itx_fn!(c, BD, 4, 4, wht_wht, WHT_WHT, avx2); - - if BD::BITDEPTH == 8 { - assign_itx16_bpc_fn!(c, 4, 4, 8 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 4, 8, 8 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 4, 16, 8 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 8, 4, 8 bpc, avx2); - assign_itx16_bpc_fn!(c, 8, 8, 8 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 8, 16, 8 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 16, 4, 8 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 16, 8, 8 bpc, avx2); - assign_itx12_bpc_fn!(c, 16, 16, 8 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 8, 32, 8 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 16, 32, 8 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 32, 8, 8 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 32, 16, 8 bpc, avx2); - assign_itx2_bpc_fn! (c, 32, 32, 8 bpc, avx2); - assign_itx1_bpc_fn! (c, R, 16, 64, 8 bpc, avx2); - assign_itx1_bpc_fn! (c, R, 32, 64, 8 bpc, avx2); - assign_itx1_bpc_fn! (c, R, 64, 16, 8 bpc, avx2); - assign_itx1_bpc_fn! (c, R, 64, 32, 8 bpc, avx2); - assign_itx1_bpc_fn! (c, 64, 64, 8 bpc, avx2); - } else { - if bpc == 10 { - assign_itx16_bpc_fn!(c, 4, 4, 10 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 4, 8, 10 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 4, 16, 10 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 8, 4, 10 bpc, avx2); - assign_itx16_bpc_fn!(c, 8, 8, 10 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 8, 16, 10 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 16, 4, 10 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 16, 8, 10 bpc, avx2); - assign_itx12_bpc_fn!(c, 16, 16, 10 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 8, 32, 10 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 16, 32, 10 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 32, 8, 10 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 32, 16, 10 bpc, avx2); - assign_itx2_bpc_fn! (c, 32, 32, 10 bpc, avx2); - assign_itx1_bpc_fn! (c, R, 16, 64, 10 bpc, avx2); - assign_itx1_bpc_fn! (c, R, 32, 64, 10 bpc, avx2); - assign_itx1_bpc_fn! (c, R, 64, 16, 10 bpc, avx2); - assign_itx1_bpc_fn! (c, R, 64, 32, 10 bpc, avx2); - assign_itx1_bpc_fn! (c, 64, 64, 10 bpc, avx2); - } else { - assign_itx16_bpc_fn!(c, 4, 4, 12 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 4, 8, 12 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 4, 16, 12 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 8, 4, 12 bpc, avx2); - assign_itx16_bpc_fn!(c, 8, 8, 12 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 8, 16, 12 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 16, 4, 12 bpc, avx2); - assign_itx16_bpc_fn!(c, R, 16, 8, 12 bpc, avx2); - assign_itx12_bpc_fn!(c, 16, 16, 12 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 8, 32, 12 bpc, avx2); - assign_itx2_bpc_fn! (c, R, 32, 8, 12 bpc, avx2); - assign_itx_bpc_fn! (c, R, 16, 32, identity_identity, IDTX, 12 bpc, avx2); - assign_itx_bpc_fn! (c, R, 32, 16, identity_identity, IDTX, 12 bpc, avx2); - assign_itx_bpc_fn! (c, 32, 32, identity_identity, IDTX, 12 bpc, avx2); - } - } - - if !flags.contains(CpuFlags::AVX512ICL) { - return; - } - - if BD::BITDEPTH == 8 { - assign_itx16_bpc_fn!(c, 4, 4, 8 bpc, avx512icl); // no wht - assign_itx16_bpc_fn!(c, R, 4, 8, 8 bpc, avx512icl); - assign_itx16_bpc_fn!(c, R, 4, 16, 8 bpc, avx512icl); - assign_itx16_bpc_fn!(c, R, 8, 4, 8 bpc, avx512icl); - assign_itx16_bpc_fn!(c, 8, 8, 8 bpc, avx512icl); - assign_itx16_bpc_fn!(c, R, 8, 16, 8 bpc, avx512icl); - assign_itx16_bpc_fn!(c, R, 16, 4, 8 bpc, avx512icl); - assign_itx16_bpc_fn!(c, R, 16, 8, 8 bpc, avx512icl); - assign_itx12_bpc_fn!(c, 16, 16, 8 bpc, avx512icl); - assign_itx2_bpc_fn! (c, R, 8, 32, 8 bpc, avx512icl); - assign_itx2_bpc_fn! (c, R, 16, 32, 8 bpc, avx512icl); - assign_itx2_bpc_fn! (c, R, 32, 8, 8 bpc, avx512icl); - assign_itx2_bpc_fn! (c, R, 32, 16, 8 bpc, avx512icl); - assign_itx2_bpc_fn! (c, 32, 32, 8 bpc, avx512icl); - assign_itx1_bpc_fn! (c, R, 16, 64, 8 bpc, avx512icl); - assign_itx1_bpc_fn! (c, R, 32, 64, 8 bpc, avx512icl); - assign_itx1_bpc_fn! (c, R, 64, 16, 8 bpc, avx512icl); - assign_itx1_bpc_fn! (c, R, 64, 32, 8 bpc, avx512icl); - assign_itx1_bpc_fn! (c, 64, 64, 8 bpc, avx512icl); - } else { - if bpc == 10 { - assign_itx16_bpc_fn!(c, 8, 8, 10 bpc, avx512icl); - assign_itx16_bpc_fn!(c, R, 8, 16, 10 bpc, avx512icl); - assign_itx16_bpc_fn!(c, R, 16, 8, 10 bpc, avx512icl); - assign_itx12_bpc_fn!(c, 16, 16, 10 bpc, avx512icl); - assign_itx2_bpc_fn! (c, R, 8, 32, 10 bpc, avx512icl); - assign_itx2_bpc_fn! (c, R, 16, 32, 10 bpc, avx512icl); - assign_itx2_bpc_fn! (c, R, 32, 8, 10 bpc, avx512icl); - assign_itx2_bpc_fn! (c, R, 32, 16, 10 bpc, avx512icl); - assign_itx2_bpc_fn! (c, 32, 32, 10 bpc, avx512icl); - assign_itx1_bpc_fn! (c, R, 16, 64, 10 bpc, avx512icl); - assign_itx1_bpc_fn! (c, R, 32, 64, 10 bpc, avx512icl); - assign_itx1_bpc_fn! (c, R, 64, 16, 10 bpc, avx512icl); - assign_itx1_bpc_fn! (c, R, 64, 32, 10 bpc, avx512icl); - assign_itx1_bpc_fn! (c, 64, 64, 10 bpc, avx512icl); - } - } - } -} - -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -#[inline(always)] -#[rustfmt::skip] -unsafe fn itx_dsp_init_arm(c: *mut Rav1dInvTxfmDSPContext, bpc: c_int) { - let flags = rav1d_get_cpu_flags(); - - if !flags.contains(CpuFlags::NEON) { - return; - } - - if BD::BITDEPTH == 16 && bpc != 10 { - return; - } - - assign_itx_fn! (c, BD, 4, 4, wht_wht, WHT_WHT, neon); - assign_itx16_fn!(c, BD, 4, 4, neon); - assign_itx16_fn!(c, BD, R, 4, 8, neon); - assign_itx16_fn!(c, BD, R, 4, 16, neon); - assign_itx16_fn!(c, BD, R, 8, 4, neon); - assign_itx16_fn!(c, BD, 8, 8, neon); - assign_itx16_fn!(c, BD, R, 8, 16, neon); - assign_itx16_fn!(c, BD, R, 16, 4, neon); - assign_itx16_fn!(c, BD, R, 16, 8, neon); - assign_itx12_fn!(c, BD, 16, 16, neon); - assign_itx2_fn! (c, BD, R, 8, 32, neon); - assign_itx2_fn! (c, BD, R, 16, 32, neon); - assign_itx2_fn! (c, BD, R, 32, 8, neon); - assign_itx2_fn! (c, BD, R, 32, 16, neon); - assign_itx2_fn! (c, BD, 32, 32, neon); - assign_itx1_fn! (c, BD, R, 16, 64, neon); - assign_itx1_fn! (c, BD, R, 32, 64, neon); - assign_itx1_fn! (c, BD, R, 64, 16, neon); - assign_itx1_fn! (c, BD, R, 64, 32, neon); - assign_itx1_fn! (c, BD, 64, 64, neon); -} - macro_rules! assign_itx_all_fn64 { ($c:ident, $BD:ty, $w:literal, $h:literal) => {{ use paste::paste; paste! { - (*$c).itxfm_add[[] as usize][DCT_DCT as usize] = + $c.itxfm_add[[] as usize][DCT_DCT as usize] = Some([< inv_txfm_add_dct_dct_ $w x $h _c_erased >]::); } }}; @@ -947,7 +733,7 @@ macro_rules! assign_itx_all_fn64 { use paste::paste; paste! { - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][DCT_DCT as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][DCT_DCT as usize] = Some([< inv_txfm_add_dct_dct_ $w x $h _c_erased >]::); } }}; @@ -959,7 +745,7 @@ macro_rules! assign_itx_all_fn32 { assign_itx_all_fn64!($c, BD, $w, $h); paste! { - (*$c).itxfm_add[[] as usize][IDTX as usize] + $c.itxfm_add[[] as usize][IDTX as usize] = Some([< inv_txfm_add_identity_identity_ $w x $h _c_erased >]::); } }}; @@ -969,7 +755,7 @@ macro_rules! assign_itx_all_fn32 { assign_itx_all_fn64!($c, BD, $w, $h, $pfx); paste! { - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][IDTX as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][IDTX as usize] = Some([< inv_txfm_add_identity_identity_ $w x $h _c_erased >]::); } }}; @@ -981,25 +767,25 @@ macro_rules! assign_itx_all_fn16 { assign_itx_all_fn32!($c, BD, $w, $h); paste! { - (*$c).itxfm_add[[] as usize][DCT_ADST as usize] + $c.itxfm_add[[] as usize][DCT_ADST as usize] = Some([< inv_txfm_add_adst_dct_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][ADST_DCT as usize] + $c.itxfm_add[[] as usize][ADST_DCT as usize] = Some([< inv_txfm_add_dct_adst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][ADST_ADST as usize] + $c.itxfm_add[[] as usize][ADST_ADST as usize] = Some([< inv_txfm_add_adst_adst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][ADST_FLIPADST as usize] + $c.itxfm_add[[] as usize][ADST_FLIPADST as usize] = Some([< inv_txfm_add_flipadst_adst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][FLIPADST_ADST as usize] + $c.itxfm_add[[] as usize][FLIPADST_ADST as usize] = Some([< inv_txfm_add_adst_flipadst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][DCT_FLIPADST as usize] + $c.itxfm_add[[] as usize][DCT_FLIPADST as usize] = Some([< inv_txfm_add_flipadst_dct_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][FLIPADST_DCT as usize] + $c.itxfm_add[[] as usize][FLIPADST_DCT as usize] = Some([< inv_txfm_add_dct_flipadst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][FLIPADST_FLIPADST as usize] + $c.itxfm_add[[] as usize][FLIPADST_FLIPADST as usize] = Some([< inv_txfm_add_flipadst_flipadst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][H_DCT as usize] + $c.itxfm_add[[] as usize][H_DCT as usize] = Some([< inv_txfm_add_dct_identity_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][V_DCT as usize] + $c.itxfm_add[[] as usize][V_DCT as usize] = Some([< inv_txfm_add_identity_dct_ $w x $h _c_erased >]::); } }}; @@ -1009,25 +795,25 @@ macro_rules! assign_itx_all_fn16 { assign_itx_all_fn32!($c, BD, $w, $h, $pfx); paste! { - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][DCT_ADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][DCT_ADST as usize] = Some([< inv_txfm_add_adst_dct_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][ADST_DCT as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][ADST_DCT as usize] = Some([< inv_txfm_add_dct_adst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][ADST_ADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][ADST_ADST as usize] = Some([< inv_txfm_add_adst_adst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][ADST_FLIPADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][ADST_FLIPADST as usize] = Some([< inv_txfm_add_flipadst_adst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][FLIPADST_ADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][FLIPADST_ADST as usize] = Some([< inv_txfm_add_adst_flipadst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][DCT_FLIPADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][DCT_FLIPADST as usize] = Some([< inv_txfm_add_flipadst_dct_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][FLIPADST_DCT as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][FLIPADST_DCT as usize] = Some([< inv_txfm_add_dct_flipadst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][FLIPADST_FLIPADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][FLIPADST_FLIPADST as usize] = Some([< inv_txfm_add_flipadst_flipadst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][H_DCT as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][H_DCT as usize] = Some([< inv_txfm_add_dct_identity_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][V_DCT as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][V_DCT as usize] = Some([< inv_txfm_add_identity_dct_ $w x $h _c_erased >]::); } }}; @@ -1039,13 +825,13 @@ macro_rules! assign_itx_all_fn84 { assign_itx_all_fn16!($c, BD, $w, $h); paste! { - (*$c).itxfm_add[[] as usize][H_FLIPADST as usize] + $c.itxfm_add[[] as usize][H_FLIPADST as usize] = Some([< inv_txfm_add_flipadst_identity_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][V_FLIPADST as usize] + $c.itxfm_add[[] as usize][V_FLIPADST as usize] = Some([< inv_txfm_add_identity_flipadst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][H_ADST as usize] + $c.itxfm_add[[] as usize][H_ADST as usize] = Some([< inv_txfm_add_adst_identity_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[] as usize][V_ADST as usize] + $c.itxfm_add[[] as usize][V_ADST as usize] = Some([< inv_txfm_add_identity_adst_ $w x $h _c_erased >]::); } }}; @@ -1055,51 +841,296 @@ macro_rules! assign_itx_all_fn84 { assign_itx_all_fn16!($c, BD, $w, $h, $pfx); paste! { - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][H_FLIPADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][H_FLIPADST as usize] = Some([< inv_txfm_add_flipadst_identity_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][V_FLIPADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][V_FLIPADST as usize] = Some([< inv_txfm_add_identity_flipadst_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][H_ADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][H_ADST as usize] = Some([< inv_txfm_add_adst_identity_ $w x $h _c_erased >]::); - (*$c).itxfm_add[[<$pfx TX_ $w X $h>] as usize][V_ADST as usize] + $c.itxfm_add[[<$pfx TX_ $w X $h>] as usize][V_ADST as usize] = Some([< inv_txfm_add_identity_adst_ $w x $h _c_erased >]::); } }}; } -#[cold] -#[rustfmt::skip] -pub unsafe fn rav1d_itx_dsp_init(c: *mut Rav1dInvTxfmDSPContext, mut _bpc: c_int) { - - - (*c).itxfm_add[TX_4X4 as usize][WHT_WHT as usize] - = Some(inv_txfm_add_wht_wht_4x4_c_erased::); - assign_itx_all_fn84!(c, BD, 4, 4 ); - assign_itx_all_fn84!(c, BD, 4, 8, R); - assign_itx_all_fn84!(c, BD, 4, 16, R); - assign_itx_all_fn84!(c, BD, 8, 4, R); - assign_itx_all_fn84!(c, BD, 8, 8 ); - assign_itx_all_fn84!(c, BD, 8, 16, R); - assign_itx_all_fn32!(c, BD, 8, 32, R); - assign_itx_all_fn84!(c, BD, 16, 4, R); - assign_itx_all_fn84!(c, BD, 16, 8, R); - assign_itx_all_fn16!(c, BD, 16, 16 ); - assign_itx_all_fn32!(c, BD, 16, 32, R); - assign_itx_all_fn64!(c, BD, 16, 64, R); - assign_itx_all_fn32!(c, BD, 32, 8, R); - assign_itx_all_fn32!(c, BD, 32, 16, R); - assign_itx_all_fn32!(c, BD, 32, 32 ); - assign_itx_all_fn64!(c, BD, 32, 64, R); - assign_itx_all_fn64!(c, BD, 64, 16, R); - assign_itx_all_fn64!(c, BD, 64, 32, R); - assign_itx_all_fn64!(c, BD, 64, 64 ); - - #[cfg(feature = "asm")] - cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - itx_dsp_init_x86::(c, _bpc); - } else if #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] { - itx_dsp_init_arm::(c, _bpc); +impl Rav1dInvTxfmDSPContext { + pub const fn default() -> Self { + let mut c = Self { + itxfm_add: [[None; N_TX_TYPES_PLUS_LL]; N_RECT_TX_SIZES], + }; + + c.itxfm_add[TX_4X4 as usize][WHT_WHT as usize] = + Some(inv_txfm_add_wht_wht_4x4_c_erased::); + + #[rustfmt::skip] + const fn assign(mut c: Rav1dInvTxfmDSPContext) -> Rav1dInvTxfmDSPContext { + assign_itx_all_fn84!(c, BD, 4, 4 ); + assign_itx_all_fn84!(c, BD, 4, 8, R); + assign_itx_all_fn84!(c, BD, 4, 16, R); + assign_itx_all_fn84!(c, BD, 8, 4, R); + assign_itx_all_fn84!(c, BD, 8, 8 ); + assign_itx_all_fn84!(c, BD, 8, 16, R); + assign_itx_all_fn32!(c, BD, 8, 32, R); + assign_itx_all_fn84!(c, BD, 16, 4, R); + assign_itx_all_fn84!(c, BD, 16, 8, R); + assign_itx_all_fn16!(c, BD, 16, 16 ); + assign_itx_all_fn32!(c, BD, 16, 32, R); + assign_itx_all_fn64!(c, BD, 16, 64, R); + assign_itx_all_fn32!(c, BD, 32, 8, R); + assign_itx_all_fn32!(c, BD, 32, 16, R); + assign_itx_all_fn32!(c, BD, 32, 32 ); + assign_itx_all_fn64!(c, BD, 32, 64, R); + assign_itx_all_fn64!(c, BD, 64, 16, R); + assign_itx_all_fn64!(c, BD, 64, 32, R); + assign_itx_all_fn64!(c, BD, 64, 64 ); + + c + } + + assign::(c) + } + + #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] + #[inline(always)] + const fn init_x86(mut self, flags: CpuFlags, bpc: c_int) -> Self { + if !flags.contains(CpuFlags::SSE2) { + return self; + } + + assign_itx_fn!(self, BD, 4, 4, wht_wht, WHT_WHT, sse2); + + if !flags.contains(CpuFlags::SSSE3) { + return self; + } + + if BD::BITDEPTH == 8 { + assign_itx16_bpc_fn!(self, 4, 4, 8 bpc, ssse3); + assign_itx16_bpc_fn!(self, R, 4, 8, 8 bpc, ssse3); + assign_itx16_bpc_fn!(self, R, 8, 4, 8 bpc, ssse3); + assign_itx16_bpc_fn!(self, 8, 8, 8 bpc, ssse3); + assign_itx16_bpc_fn!(self, R, 4, 16, 8 bpc, ssse3); + assign_itx16_bpc_fn!(self, R, 16, 4, 8 bpc, ssse3); + assign_itx16_bpc_fn!(self, R, 8, 16, 8 bpc, ssse3); + assign_itx16_bpc_fn!(self, R, 16, 8, 8 bpc, ssse3); + assign_itx12_bpc_fn!(self, 16, 16, 8 bpc, ssse3); + assign_itx2_bpc_fn! (self, R, 8, 32, 8 bpc, ssse3); + assign_itx2_bpc_fn! (self, R, 32, 8, 8 bpc, ssse3); + assign_itx2_bpc_fn! (self, R, 16, 32, 8 bpc, ssse3); + assign_itx2_bpc_fn! (self, R, 32, 16, 8 bpc, ssse3); + assign_itx2_bpc_fn! (self, 32, 32, 8 bpc, ssse3); + assign_itx1_bpc_fn! (self, R, 16, 64, 8 bpc, ssse3); + assign_itx1_bpc_fn! (self, R, 32, 64, 8 bpc, ssse3); + assign_itx1_bpc_fn! (self, R, 64, 16, 8 bpc, ssse3); + assign_itx1_bpc_fn! (self, R, 64, 32, 8 bpc, ssse3); + assign_itx1_bpc_fn! (self, 64, 64, 8 bpc, ssse3); + } + + if !flags.contains(CpuFlags::SSE41) { + return self; + } + + if BD::BITDEPTH == 16 { + if bpc == 10 { + assign_itx16_bpc_fn!(self, 4, 4, 16 bpc, sse4); + assign_itx16_bpc_fn!(self, R, 4, 8, 16 bpc, sse4); + assign_itx16_bpc_fn!(self, R, 4, 16, 16 bpc, sse4); + assign_itx16_bpc_fn!(self, R, 8, 4, 16 bpc, sse4); + assign_itx16_bpc_fn!(self, 8, 8, 16 bpc, sse4); + assign_itx16_bpc_fn!(self, R, 8, 16, 16 bpc, sse4); + assign_itx16_bpc_fn!(self, R, 16, 4, 16 bpc, sse4); + assign_itx16_bpc_fn!(self, R, 16, 8, 16 bpc, sse4); + assign_itx12_bpc_fn!(self, 16, 16, 16 bpc, sse4); + assign_itx2_bpc_fn! (self, R, 8, 32, 16 bpc, sse4); + assign_itx2_bpc_fn! (self, R, 16, 32, 16 bpc, sse4); + assign_itx2_bpc_fn! (self, R, 32, 8, 16 bpc, sse4); + assign_itx2_bpc_fn! (self, R, 32, 16, 16 bpc, sse4); + assign_itx2_bpc_fn! (self, 32, 32, 16 bpc, sse4); + assign_itx1_bpc_fn! (self, R, 16, 64, 16 bpc, sse4); + assign_itx1_bpc_fn! (self, R, 32, 64, 16 bpc, sse4); + assign_itx1_bpc_fn! (self, R, 64, 16, 16 bpc, sse4); + assign_itx1_bpc_fn! (self, R, 64, 32, 16 bpc, sse4); + assign_itx1_bpc_fn! (self, 64, 64, 16 bpc, sse4); + } + } + + #[cfg(target_arch = "x86_64")] + { + if !flags.contains(CpuFlags::AVX2) { + return self; + } + + assign_itx_fn!(self, BD, 4, 4, wht_wht, WHT_WHT, avx2); + + if BD::BITDEPTH == 8 { + assign_itx16_bpc_fn!(self, 4, 4, 8 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 4, 8, 8 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 4, 16, 8 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 8, 4, 8 bpc, avx2); + assign_itx16_bpc_fn!(self, 8, 8, 8 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 8, 16, 8 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 16, 4, 8 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 16, 8, 8 bpc, avx2); + assign_itx12_bpc_fn!(self, 16, 16, 8 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 8, 32, 8 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 16, 32, 8 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 32, 8, 8 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 32, 16, 8 bpc, avx2); + assign_itx2_bpc_fn! (self, 32, 32, 8 bpc, avx2); + assign_itx1_bpc_fn! (self, R, 16, 64, 8 bpc, avx2); + assign_itx1_bpc_fn! (self, R, 32, 64, 8 bpc, avx2); + assign_itx1_bpc_fn! (self, R, 64, 16, 8 bpc, avx2); + assign_itx1_bpc_fn! (self, R, 64, 32, 8 bpc, avx2); + assign_itx1_bpc_fn! (self, 64, 64, 8 bpc, avx2); + } else { + if bpc == 10 { + assign_itx16_bpc_fn!(self, 4, 4, 10 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 4, 8, 10 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 4, 16, 10 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 8, 4, 10 bpc, avx2); + assign_itx16_bpc_fn!(self, 8, 8, 10 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 8, 16, 10 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 16, 4, 10 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 16, 8, 10 bpc, avx2); + assign_itx12_bpc_fn!(self, 16, 16, 10 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 8, 32, 10 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 16, 32, 10 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 32, 8, 10 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 32, 16, 10 bpc, avx2); + assign_itx2_bpc_fn! (self, 32, 32, 10 bpc, avx2); + assign_itx1_bpc_fn! (self, R, 16, 64, 10 bpc, avx2); + assign_itx1_bpc_fn! (self, R, 32, 64, 10 bpc, avx2); + assign_itx1_bpc_fn! (self, R, 64, 16, 10 bpc, avx2); + assign_itx1_bpc_fn! (self, R, 64, 32, 10 bpc, avx2); + assign_itx1_bpc_fn! (self, 64, 64, 10 bpc, avx2); + } else { + assign_itx16_bpc_fn!(self, 4, 4, 12 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 4, 8, 12 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 4, 16, 12 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 8, 4, 12 bpc, avx2); + assign_itx16_bpc_fn!(self, 8, 8, 12 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 8, 16, 12 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 16, 4, 12 bpc, avx2); + assign_itx16_bpc_fn!(self, R, 16, 8, 12 bpc, avx2); + assign_itx12_bpc_fn!(self, 16, 16, 12 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 8, 32, 12 bpc, avx2); + assign_itx2_bpc_fn! (self, R, 32, 8, 12 bpc, avx2); + assign_itx_bpc_fn! (self, R, 16, 32, identity_identity, IDTX, 12 bpc, avx2); + assign_itx_bpc_fn! (self, R, 32, 16, identity_identity, IDTX, 12 bpc, avx2); + assign_itx_bpc_fn! (self, 32, 32, identity_identity, IDTX, 12 bpc, avx2); + } + } + + if !flags.contains(CpuFlags::AVX512ICL) { + return self; + } + + if BD::BITDEPTH == 8 { + assign_itx16_bpc_fn!(self, 4, 4, 8 bpc, avx512icl); // no wht + assign_itx16_bpc_fn!(self, R, 4, 8, 8 bpc, avx512icl); + assign_itx16_bpc_fn!(self, R, 4, 16, 8 bpc, avx512icl); + assign_itx16_bpc_fn!(self, R, 8, 4, 8 bpc, avx512icl); + assign_itx16_bpc_fn!(self, 8, 8, 8 bpc, avx512icl); + assign_itx16_bpc_fn!(self, R, 8, 16, 8 bpc, avx512icl); + assign_itx16_bpc_fn!(self, R, 16, 4, 8 bpc, avx512icl); + assign_itx16_bpc_fn!(self, R, 16, 8, 8 bpc, avx512icl); + assign_itx12_bpc_fn!(self, 16, 16, 8 bpc, avx512icl); + assign_itx2_bpc_fn! (self, R, 8, 32, 8 bpc, avx512icl); + assign_itx2_bpc_fn! (self, R, 16, 32, 8 bpc, avx512icl); + assign_itx2_bpc_fn! (self, R, 32, 8, 8 bpc, avx512icl); + assign_itx2_bpc_fn! (self, R, 32, 16, 8 bpc, avx512icl); + assign_itx2_bpc_fn! (self, 32, 32, 8 bpc, avx512icl); + assign_itx1_bpc_fn! (self, R, 16, 64, 8 bpc, avx512icl); + assign_itx1_bpc_fn! (self, R, 32, 64, 8 bpc, avx512icl); + assign_itx1_bpc_fn! (self, R, 64, 16, 8 bpc, avx512icl); + assign_itx1_bpc_fn! (self, R, 64, 32, 8 bpc, avx512icl); + assign_itx1_bpc_fn! (self, 64, 64, 8 bpc, avx512icl); + } else { + if bpc == 10 { + assign_itx16_bpc_fn!(self, 8, 8, 10 bpc, avx512icl); + assign_itx16_bpc_fn!(self, R, 8, 16, 10 bpc, avx512icl); + assign_itx16_bpc_fn!(self, R, 16, 8, 10 bpc, avx512icl); + assign_itx12_bpc_fn!(self, 16, 16, 10 bpc, avx512icl); + assign_itx2_bpc_fn! (self, R, 8, 32, 10 bpc, avx512icl); + assign_itx2_bpc_fn! (self, R, 16, 32, 10 bpc, avx512icl); + assign_itx2_bpc_fn! (self, R, 32, 8, 10 bpc, avx512icl); + assign_itx2_bpc_fn! (self, R, 32, 16, 10 bpc, avx512icl); + assign_itx2_bpc_fn! (self, 32, 32, 10 bpc, avx512icl); + assign_itx1_bpc_fn! (self, R, 16, 64, 10 bpc, avx512icl); + assign_itx1_bpc_fn! (self, R, 32, 64, 10 bpc, avx512icl); + assign_itx1_bpc_fn! (self, R, 64, 16, 10 bpc, avx512icl); + assign_itx1_bpc_fn! (self, R, 64, 32, 10 bpc, avx512icl); + assign_itx1_bpc_fn! (self, 64, 64, 10 bpc, avx512icl); + } + } + } + + self + } + + #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] + #[inline(always)] + const fn init_arm(mut self, flags: CpuFlags, bpc: c_int) -> Self { + if !flags.contains(CpuFlags::NEON) { + return self; + } + + if BD::BITDEPTH == 16 && bpc != 10 { + return self; } + + assign_itx_fn!(self, BD, 4, 4, wht_wht, WHT_WHT, neon); + + #[rustfmt::skip] + const fn assign(mut c: Rav1dInvTxfmDSPContext) -> Rav1dInvTxfmDSPContext { + assign_itx16_fn!(c, BD, 4, 4, neon); + assign_itx16_fn!(c, BD, R, 4, 8, neon); + assign_itx16_fn!(c, BD, R, 4, 16, neon); + assign_itx16_fn!(c, BD, R, 8, 4, neon); + assign_itx16_fn!(c, BD, 8, 8, neon); + assign_itx16_fn!(c, BD, R, 8, 16, neon); + assign_itx16_fn!(c, BD, R, 16, 4, neon); + assign_itx16_fn!(c, BD, R, 16, 8, neon); + assign_itx12_fn!(c, BD, 16, 16, neon); + assign_itx2_fn! (c, BD, R, 8, 32, neon); + assign_itx2_fn! (c, BD, R, 16, 32, neon); + assign_itx2_fn! (c, BD, R, 32, 8, neon); + assign_itx2_fn! (c, BD, R, 32, 16, neon); + assign_itx2_fn! (c, BD, 32, 32, neon); + assign_itx1_fn! (c, BD, R, 16, 64, neon); + assign_itx1_fn! (c, BD, R, 32, 64, neon); + assign_itx1_fn! (c, BD, R, 64, 16, neon); + assign_itx1_fn! (c, BD, R, 64, 32, neon); + assign_itx1_fn! (c, BD, 64, 64, neon); + + c + } + + assign::(self) + } + + #[inline(always)] + const fn init(self, flags: CpuFlags, bpc: c_int) -> Self { + #[cfg(feature = "asm")] + { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + return self.init_x86::(flags, bpc); + } + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + { + return self.init_arm::(flags, bpc); + } + } + + #[allow(unreachable_code)] // Reachable on some #[cfg]s. + { + let _ = flags; + let _ = bpc; + self + } + } + + pub const fn new(flags: CpuFlags, bpc: c_int) -> Self { + Self::default::().init::(flags, bpc) } } diff --git a/src/levels.rs b/src/levels.rs index 3c23cc091..2591535a4 100644 --- a/src/levels.rs +++ b/src/levels.rs @@ -1,3 +1,4 @@ +use crate::src::enum_map::EnumKey; use std::ops::Neg; use strum::EnumCount; use strum::FromRepr; @@ -182,6 +183,25 @@ pub enum Filter2d { Bilinear = 9, } +impl EnumKey<{ Self::COUNT }> for Filter2d { + const VALUES: [Self; Self::COUNT] = [ + Self::Regular8Tap, + Self::RegularSmooth8Tap, + Self::RegularSharp8Tap, + Self::SharpRegular8Tap, + Self::SharpSmooth8Tap, + Self::Sharp8Tap, + Self::SmoothRegular8Tap, + Self::Smooth8Tap, + Self::SmoothSharp8Tap, + Self::Bilinear, + ]; + + fn as_usize(self) -> usize { + self as usize + } +} + #[derive(Clone, Copy, PartialEq, Eq, FromRepr, EnumCount)] pub enum MVJoint { Zero = 0, diff --git a/src/lf_apply.rs b/src/lf_apply.rs index d48e7c0ef..3963ead96 100644 --- a/src/lf_apply.rs +++ b/src/lf_apply.rs @@ -13,7 +13,6 @@ use libc::ptrdiff_t; use std::cmp; use std::ffi::c_int; use std::ffi::c_uint; -use std::slice; use std::sync::atomic::AtomicU16; use std::sync::atomic::Ordering; @@ -38,7 +37,7 @@ unsafe fn backup_lpf( ss_hor: c_int, lr_backup: c_int, frame_hdr: &Rav1dFrameHeader, - dsp: *const Rav1dDSPContext, + dsp: &Rav1dDSPContext, resize_step: [c_int; 2], resize_start: [c_int; 2], bitdepth_max: c_int, @@ -82,7 +81,7 @@ unsafe fn backup_lpf( if lr_backup != 0 && frame_hdr.size.width[0] != frame_hdr.size.width[1] { while row + stripe_h <= row_h { let n_lines = 4 - (row + stripe_h + 1 == h) as c_int; - ((*dsp).mc.resize)( + (dsp.mc.resize)( dst.as_mut_ptr().add(dst_offset).cast(), dst_stride, src.as_ptr().add(src_offset).cast(), @@ -156,6 +155,8 @@ pub(crate) unsafe fn rav1d_copy_lpf( let seq_hdr = &***f.seq_hdr.as_ref().unwrap(); let tt_off = have_tt * sby * ((4 as c_int) << seq_hdr.sb128); + let src_y_stride = BD::pxstride(src_stride[0]); + let src_uv_stride = BD::pxstride(src_stride[1]); let y_stride = BD::pxstride(lr_stride[0]); let uv_stride = BD::pxstride(lr_stride[1]); @@ -185,7 +186,7 @@ pub(crate) unsafe fn rav1d_copy_lpf( dst_offset[0], lr_stride[0], src[0], - (src_offset[0] as isize - offset as isize * BD::pxstride(src_stride[0])) as usize, + (src_offset[0] as isize - offset as isize * src_y_stride) as usize, src_stride[0], 0, seq_hdr.sb128, @@ -203,24 +204,18 @@ pub(crate) unsafe fn rav1d_copy_lpf( ); } if have_tt != 0 && resize != 0 { - let cdef_off_y: ptrdiff_t = (sby * 4) as isize * BD::pxstride(src_stride[0]); - let cdef_plane_y_sz = 4 * f.sbh as isize * y_stride; - let y_span = cdef_plane_y_sz - y_stride; + let cdef_off_y: ptrdiff_t = (sby * 4) as isize * src_y_stride; + let cdef_plane_y_sz = 4 * f.sbh as isize * src_y_stride; + let y_span = cdef_plane_y_sz - src_y_stride; + let cdef_line_start = (f.lf.cdef_lpf_line[0] as isize + cmp::min(y_span, 0)) as usize; backup_lpf::( c, - slice::from_raw_parts_mut( - cdef_line_buf - .as_mut_ptr() - .add(f.lf.cdef_lpf_line[0]) - .offset(cmp::min(y_span, 0)), - cdef_plane_y_sz.unsigned_abs(), - ), + &mut cdef_line_buf + [cdef_line_start..cdef_line_start + cdef_plane_y_sz.unsigned_abs()], (cdef_off_y - cmp::min(y_span, 0)) as usize, src_stride[0], src[0], - (src_offset[0] as isize - - offset as isize * BD::pxstride(src_stride[0] as usize) as isize) - as usize, + (src_offset[0] as isize - offset as isize * src_y_stride as isize) as usize, src_stride[0], 0, seq_hdr.sb128, @@ -248,7 +243,7 @@ pub(crate) unsafe fn rav1d_copy_lpf( let row_h_0 = cmp::min((sby + 1) << 6 - ss_ver + seq_hdr.sb128, h_0 - 1); let offset_uv = offset >> ss_ver; let y_stripe_0 = (sby << 6 - ss_ver + seq_hdr.sb128) - offset_uv; - let cdef_off_uv: ptrdiff_t = sby as isize * 4 * BD::pxstride(src_stride[1]); + let cdef_off_uv: ptrdiff_t = sby as isize * 4 * src_uv_stride; if seq_hdr.cdef != 0 || restore_planes & LR_RESTORE_U as c_int != 0 { if restore_planes & LR_RESTORE_U as c_int != 0 || resize == 0 { backup_lpf::( @@ -257,8 +252,7 @@ pub(crate) unsafe fn rav1d_copy_lpf( dst_offset[1], lr_stride[1], src[1], - (src_offset[1] as isize - offset_uv as isize * BD::pxstride(src_stride[1])) - as usize, + (src_offset[1] as isize - offset_uv as isize * src_uv_stride) as usize, src_stride[1], ss_ver, seq_hdr.sb128, @@ -276,22 +270,18 @@ pub(crate) unsafe fn rav1d_copy_lpf( ); } if have_tt != 0 && resize != 0 { - let cdef_plane_uv_sz = 4 * f.sbh as isize * uv_stride; - let uv_span = cdef_plane_uv_sz - uv_stride; + let cdef_plane_uv_sz = 4 * f.sbh as isize * src_uv_stride; + let uv_span = cdef_plane_uv_sz - src_uv_stride; + let cdef_line_start = + (f.lf.cdef_lpf_line[1] as isize + cmp::min(uv_span, 0)) as usize; backup_lpf::( c, - slice::from_raw_parts_mut( - cdef_line_buf - .as_mut_ptr() - .add(f.lf.cdef_lpf_line[1]) - .offset(cmp::min(uv_span, 0)), - cdef_plane_uv_sz.unsigned_abs(), - ), + &mut cdef_line_buf + [cdef_line_start..cdef_line_start + cdef_plane_uv_sz.unsigned_abs()], (cdef_off_uv - cmp::min(uv_span, 0)) as usize, src_stride[1], src[1], - (src_offset[1] as isize - offset_uv as isize * BD::pxstride(src_stride[1])) - as usize, + (src_offset[1] as isize - offset_uv as isize * src_uv_stride) as usize, src_stride[1], ss_ver, seq_hdr.sb128, @@ -317,8 +307,7 @@ pub(crate) unsafe fn rav1d_copy_lpf( dst_offset[2], lr_stride[1], src[2], - (src_offset[1] as isize - offset_uv as isize * BD::pxstride(src_stride[1])) - as usize, + (src_offset[1] as isize - offset_uv as isize * src_uv_stride) as usize, src_stride[1], ss_ver, seq_hdr.sb128, @@ -336,22 +325,18 @@ pub(crate) unsafe fn rav1d_copy_lpf( ); } if have_tt != 0 && resize != 0 { - let cdef_plane_uv_sz = 4 * f.sbh as isize * uv_stride; - let uv_span = cdef_plane_uv_sz - uv_stride; + let cdef_plane_uv_sz = 4 * f.sbh as isize * src_uv_stride; + let uv_span = cdef_plane_uv_sz - src_uv_stride; + let cdef_line_start = + (f.lf.cdef_lpf_line[2] as isize + cmp::min(uv_span, 0)) as usize; backup_lpf::( c, - slice::from_raw_parts_mut( - cdef_line_buf - .as_mut_ptr() - .add(f.lf.cdef_lpf_line[2]) - .offset(cmp::min(uv_span, 0)), - cdef_plane_uv_sz.unsigned_abs(), - ), + &mut cdef_line_buf + [cdef_line_start..cdef_line_start + cdef_plane_uv_sz.unsigned_abs()], (cdef_off_uv - cmp::min(uv_span, 0)) as usize, src_stride[1], src[2], - (src_offset[1] as isize - offset_uv as isize * BD::pxstride(src_stride[1])) - as usize, + (src_offset[1] as isize - offset_uv as isize * src_uv_stride) as usize, src_stride[1], ss_ver, seq_hdr.sb128, @@ -398,8 +383,6 @@ unsafe fn filter_plane_cols_y( starty4: c_int, endy4: c_int, ) { - let dsp: &Rav1dDSPContext = &*f.dsp; - // filter edges between columns (e.g. block1 | block2) for x in 0..w as usize { if !(!have_left && x == 0) { @@ -419,7 +402,7 @@ unsafe fn filter_plane_cols_y( hmask[2] = mask[x][2][1].load(Ordering::Relaxed) as u32; } // hmask[3] = 0; already initialized above - dsp.lf.loop_filter_sb[0][0]( + f.dsp.lf.loop_filter_sb[0][0]( dst.as_mut_ptr().add(dst_offset + x * 4).cast(), ls, hmask.as_mut_ptr(), @@ -447,8 +430,6 @@ unsafe fn filter_plane_rows_y( starty4: c_int, endy4: c_int, ) { - let dsp: &Rav1dDSPContext = &*f.dsp; - // block1 // filter edges between rows (e.g. ------) // block2 @@ -463,7 +444,7 @@ unsafe fn filter_plane_rows_y( | (mask[y as usize][2][1].load(Ordering::Relaxed) as u32) << 16, 0, ]; - dsp.lf.loop_filter_sb[0][1]( + f.dsp.lf.loop_filter_sb[0][1]( dst.as_mut_ptr().add(dst_offset).cast(), ls, vmask.as_ptr(), @@ -494,8 +475,6 @@ unsafe fn filter_plane_cols_uv( endy4: c_int, ss_ver: c_int, ) { - let dsp: &Rav1dDSPContext = &*f.dsp; - // filter edges between columns (e.g. block1 | block2) for x in 0..w as usize { if !(!have_left && x == 0) { @@ -514,7 +493,7 @@ unsafe fn filter_plane_cols_uv( hmask[1] = mask[x as usize][1][1].load(Ordering::Relaxed) as u32; } // hmask[2] = 0; Already initialized to 0 above - dsp.lf.loop_filter_sb[1][0]( + f.dsp.lf.loop_filter_sb[1][0]( u.as_mut_ptr().add(uv_offset + x * 4).cast(), ls, hmask.as_mut_ptr(), @@ -524,7 +503,7 @@ unsafe fn filter_plane_cols_uv( endy4 - starty4, f.bitdepth_max, ); - dsp.lf.loop_filter_sb[1][0]( + f.dsp.lf.loop_filter_sb[1][0]( v.as_mut_ptr().add(uv_offset + x * 4).cast(), ls, hmask.as_mut_ptr(), @@ -554,7 +533,6 @@ unsafe fn filter_plane_rows_uv( endy4: c_int, ss_hor: c_int, ) { - let dsp: &Rav1dDSPContext = &*f.dsp; let mut off_l = uv_offset as ptrdiff_t; // block1 @@ -569,7 +547,7 @@ unsafe fn filter_plane_rows_uv( | (mask[y as usize][1][1].load(Ordering::Relaxed) as u32) << (16 >> ss_hor), 0, ]; - dsp.lf.loop_filter_sb[1][1]( + f.dsp.lf.loop_filter_sb[1][1]( u.as_mut_ptr().offset(off_l).cast(), ls, vmask.as_ptr(), @@ -579,7 +557,7 @@ unsafe fn filter_plane_rows_uv( w, f.bitdepth_max, ); - dsp.lf.loop_filter_sb[1][1]( + f.dsp.lf.loop_filter_sb[1][1]( v.as_mut_ptr().offset(off_l).cast(), ls, vmask.as_ptr(), diff --git a/src/lf_mask.rs b/src/lf_mask.rs index 7ef1b149e..0128eb4e8 100644 --- a/src/lf_mask.rs +++ b/src/lf_mask.rs @@ -4,6 +4,7 @@ use crate::include::dav1d::headers::Rav1dLoopfilterModeRefDeltas; use crate::include::dav1d::headers::Rav1dPixelLayout; use crate::include::dav1d::headers::Rav1dRestorationType; use crate::src::align::Align16; +use crate::src::align::ArrayDefault; use crate::src::ctx::CaseSet; use crate::src::disjoint_mut::DisjointMut; use crate::src::internal::Bxy; @@ -27,6 +28,22 @@ pub struct Av1FilterLUT { pub sharp: [u64; 2], } +impl Default for Av1FilterLUT { + fn default() -> Self { + Self { + e: [0; 64], + i: [0; 64], + sharp: Default::default(), + } + } +} + +impl ArrayDefault for Av1FilterLUT { + fn default() -> Self { + Default::default() + } +} + #[derive(Clone, Copy, Default)] #[repr(C)] pub struct Av1RestorationUnit { diff --git a/src/lib.rs b/src/lib.rs index 8053ee430..b2819e0b2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,6 +28,7 @@ use crate::src::fg_apply; use crate::src::internal::Rav1dContext; use crate::src::internal::Rav1dContextTaskThread; use crate::src::internal::Rav1dContextTaskType; +use crate::src::internal::Rav1dDSPContext; use crate::src::internal::Rav1dFrameData; use crate::src::internal::Rav1dTaskContext; use crate::src::internal::Rav1dTaskContext_task_thread; @@ -51,7 +52,6 @@ use crate::src::picture::rav1d_thread_picture_ref; use crate::src::picture::rav1d_thread_picture_unref; use crate::src::picture::PictureFlags; use crate::src::picture::Rav1dThreadPicture; -use crate::src::r#ref::rav1d_ref_dec; use crate::src::refmvs::rav1d_refmvs_clear; use crate::src::refmvs::rav1d_refmvs_dsp_init; use crate::src::refmvs::rav1d_refmvs_init; @@ -218,11 +218,6 @@ pub(crate) unsafe fn rav1d_open(c_out: &mut *mut Rav1dContext, s: &Rav1dSettings (*c).inloop_filters = s.inloop_filters; (*c).decode_frame_type = s.decode_frame_type; (*c).cached_error_props = Default::default(); - if rav1d_mem_pool_init(&mut (*c).refmvs_pool).is_err() - || rav1d_mem_pool_init(&mut (*c).cdf_pool).is_err() - { - return error(c, c_out); - } if (*c).allocator.alloc_picture_callback == dav1d_default_picture_alloc && (*c).allocator.release_picture_callback == dav1d_default_picture_release { @@ -293,19 +288,14 @@ pub(crate) unsafe fn rav1d_open(c_out: &mut *mut Rav1dContext, s: &Rav1dSettings addr_of_mut!(f.task_thread.finished).write(AtomicBool::new(true)); addr_of_mut!(f.frame_thread).write(Default::default()); addr_of_mut!(f.frame_thread_progress).write(Default::default()); + addr_of_mut!(f.lowest_pixel_mem).write(Default::default()); + addr_of_mut!(f.lf).write(Default::default()); if n_tc > 1 { f.task_thread.lock = Mutex::new(()); f.task_thread.cond = Condvar::new(); f.task_thread.pending_tasks = Default::default(); } (&mut f.task_thread.ttd as *mut Arc).write(Arc::clone(&(*c).task_thread)); - addr_of_mut!(f.lf.level).write(Default::default()); - addr_of_mut!(f.lf.mask).write(Default::default()); - addr_of_mut!(f.lf.lr_mask).write(Default::default()); - addr_of_mut!(f.lf.tx_lpf_right_edge).write(Default::default()); - addr_of_mut!(f.lf.cdef_line_buf).write(Default::default()); - addr_of_mut!(f.lf.lr_line_buf).write(Default::default()); - addr_of_mut!(f.lf.start_of_tile_row).write(Default::default()); f.lf.last_sharpness = -(1 as c_int); rav1d_refmvs_init(&mut f.rf); } @@ -620,17 +610,17 @@ pub(crate) unsafe fn rav1d_apply_grain( } else { match out.p.bpc { #[cfg(feature = "bitdepth_8")] - 8 => { + bpc @ 8 => { fg_apply::rav1d_apply_grain::( - &mut (*(c.dsp).as_mut_ptr().offset(0)).fg, + &Rav1dDSPContext::get(bpc).as_ref().unwrap().fg, out, in_0, ); } #[cfg(feature = "bitdepth_16")] - 10 | 12 => { + bpc @ 10 | bpc @ 12 => { fg_apply::rav1d_apply_grain::( - &mut (*(c.dsp).as_mut_ptr().offset(((out.p.bpc >> 1) - 4) as isize)).fg, + &Rav1dDSPContext::get(bpc).as_ref().unwrap().fg, out, in_0, ); @@ -684,7 +674,7 @@ pub(crate) unsafe fn rav1d_flush(c: *mut Rav1dContext) { rav1d_thread_picture_unref(&mut (*((*c).refs).as_mut_ptr().offset(i as isize)).p); } let _ = mem::take(&mut (*c).refs[i as usize].segmap); - rav1d_ref_dec(&mut (*((*c).refs).as_mut_ptr().offset(i as isize)).refmvs); + let _ = mem::take(&mut (*c).refs[i as usize].refmvs); let _ = mem::take(&mut (*c).cdf[i]); i += 1; } @@ -855,7 +845,7 @@ impl Drop for Rav1dContext { &mut (*(self.refs).as_mut_ptr().offset(n_4 as isize)).p, ); } - rav1d_ref_dec(&mut (*(self.refs).as_mut_ptr().offset(n_4 as isize)).refmvs); + let _ = mem::take(&mut self.refs[n_4 as usize].refmvs); let _ = mem::take(&mut self.refs[n_4 as usize].segmap); n_4 += 1; } @@ -864,8 +854,6 @@ impl Drop for Rav1dContext { let _ = mem::take(&mut self.mastering_display); let _ = mem::take(&mut self.content_light); let _ = mem::take(&mut self.itut_t35); - rav1d_mem_pool_end(self.refmvs_pool); - rav1d_mem_pool_end(self.cdf_pool); rav1d_mem_pool_end(self.picture_pool); } } diff --git a/src/loopfilter.rs b/src/loopfilter.rs index b6bd74d27..14844f600 100644 --- a/src/loopfilter.rs +++ b/src/loopfilter.rs @@ -2,18 +2,13 @@ use crate::include::common::bitdepth::AsPrimitive; use crate::include::common::bitdepth::BitDepth; use crate::include::common::bitdepth::DynPixel; use crate::include::common::intops::iclip; +use crate::src::cpu::CpuFlags; use crate::src::lf_mask::Av1FilterLUT; use libc::ptrdiff_t; use std::cmp; use std::ffi::c_int; use std::ffi::c_uint; -#[cfg(feature = "asm")] -use crate::src::cpu::{rav1d_get_cpu_flags, CpuFlags}; - -#[cfg(feature = "asm")] -use cfg_if::cfg_if; - #[cfg(feature = "asm")] use crate::include::common::bitdepth::BPC; @@ -984,110 +979,136 @@ unsafe fn loop_filter_v_sb128uv_rust( } } -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] -#[inline(always)] -unsafe fn loop_filter_dsp_init_x86(c: *mut Rav1dLoopFilterDSPContext) { - let flags = rav1d_get_cpu_flags(); - - if !flags.contains(CpuFlags::SSSE3) { - return; +impl Rav1dLoopFilterDSPContext { + pub const fn default() -> Self { + Self { + loop_filter_sb: [ + [ + loop_filter_h_sb128y_c_erased::, + loop_filter_v_sb128y_c_erased::, + ], + [ + loop_filter_h_sb128uv_c_erased::, + loop_filter_v_sb128uv_c_erased::, + ], + ], + } } - match BD::BPC { - BPC::BPC8 => { - (*c).loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_8bpc_ssse3; - (*c).loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_8bpc_ssse3; - (*c).loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_8bpc_ssse3; - (*c).loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_8bpc_ssse3; - #[cfg(target_arch = "x86_64")] - { - if !flags.contains(CpuFlags::AVX2) { - return; - } + #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] + #[inline(always)] + const fn init_x86(mut self, flags: CpuFlags) -> Self { + if !flags.contains(CpuFlags::SSSE3) { + return self; + } - (*c).loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_8bpc_avx2; - (*c).loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_8bpc_avx2; - (*c).loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_8bpc_avx2; - (*c).loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_8bpc_avx2; + match BD::BPC { + BPC::BPC8 => { + self.loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_8bpc_ssse3; + self.loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_8bpc_ssse3; + self.loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_8bpc_ssse3; + self.loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_8bpc_ssse3; - if !flags.contains(CpuFlags::AVX512ICL) { - return; - } + #[cfg(target_arch = "x86_64")] + { + if !flags.contains(CpuFlags::AVX2) { + return self; + } - (*c).loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_8bpc_avx512icl; - (*c).loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_8bpc_avx512icl; - (*c).loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_8bpc_avx512icl; - (*c).loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_8bpc_avx512icl; - } - } - BPC::BPC16 => { - (*c).loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_ssse3; - (*c).loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_ssse3; - (*c).loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_ssse3; - (*c).loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_ssse3; + self.loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_8bpc_avx2; + self.loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_8bpc_avx2; + self.loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_8bpc_avx2; + self.loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_8bpc_avx2; - #[cfg(target_arch = "x86_64")] - { - if !flags.contains(CpuFlags::AVX2) { - return; + if !flags.contains(CpuFlags::AVX512ICL) { + return self; + } + + self.loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_8bpc_avx512icl; + self.loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_8bpc_avx512icl; + self.loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_8bpc_avx512icl; + self.loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_8bpc_avx512icl; } + } + BPC::BPC16 => { + self.loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_ssse3; + self.loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_ssse3; + self.loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_ssse3; + self.loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_ssse3; - (*c).loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_avx2; - (*c).loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_avx2; - (*c).loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_avx2; - (*c).loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_avx2; + #[cfg(target_arch = "x86_64")] + { + if !flags.contains(CpuFlags::AVX2) { + return self; + } - if !flags.contains(CpuFlags::AVX512ICL) { - return; - } + self.loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_avx2; + self.loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_avx2; + self.loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_avx2; + self.loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_avx2; + + if !flags.contains(CpuFlags::AVX512ICL) { + return self; + } - (*c).loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_avx512icl; - (*c).loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_avx512icl; - (*c).loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_avx512icl; - (*c).loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_avx512icl; + self.loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_avx512icl; + self.loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_avx512icl; + self.loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_avx512icl; + self.loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_avx512icl; + } } } - } -} - -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -#[inline(always)] -unsafe fn loop_filter_dsp_init_arm(c: *mut Rav1dLoopFilterDSPContext) { - let flags = rav1d_get_cpu_flags(); - if !flags.contains(CpuFlags::NEON) { - return; + self } - match BD::BPC { - BPC::BPC8 => { - (*c).loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_8bpc_neon; - (*c).loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_8bpc_neon; - (*c).loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_8bpc_neon; - (*c).loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_8bpc_neon; + #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] + #[inline(always)] + const fn init_arm(mut self, flags: CpuFlags) -> Self { + if !flags.contains(CpuFlags::NEON) { + return self; } - BPC::BPC16 => { - (*c).loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_neon; - (*c).loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_neon; - (*c).loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_neon; - (*c).loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_neon; + + match BD::BPC { + BPC::BPC8 => { + self.loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_8bpc_neon; + self.loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_8bpc_neon; + self.loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_8bpc_neon; + self.loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_8bpc_neon; + } + BPC::BPC16 => { + self.loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_neon; + self.loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_neon; + self.loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_neon; + self.loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_neon; + } } + + self } -} -#[cold] -pub unsafe fn rav1d_loop_filter_dsp_init(c: *mut Rav1dLoopFilterDSPContext) { - (*c).loop_filter_sb[0][0] = loop_filter_h_sb128y_c_erased::; - (*c).loop_filter_sb[0][1] = loop_filter_v_sb128y_c_erased::; - (*c).loop_filter_sb[1][0] = loop_filter_h_sb128uv_c_erased::; - (*c).loop_filter_sb[1][1] = loop_filter_v_sb128uv_c_erased::; + #[inline(always)] + const fn init(self, flags: CpuFlags) -> Self { + #[cfg(feature = "asm")] + { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + return self.init_x86::(flags); + } + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + { + return self.init_arm::(flags); + } + } - #[cfg(feature = "asm")] - cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - loop_filter_dsp_init_x86::(c); - } else if #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] { - loop_filter_dsp_init_arm::(c); + #[allow(unreachable_code)] // Reachable on some #[cfg]s. + { + let _ = flags; + self } } + + pub const fn new(flags: CpuFlags) -> Self { + Self::default::().init::(flags) + } } diff --git a/src/looprestoration.rs b/src/looprestoration.rs index 58bb6ddd8..ef00e340e 100644 --- a/src/looprestoration.rs +++ b/src/looprestoration.rs @@ -6,6 +6,7 @@ use crate::include::common::bitdepth::ToPrimitive; use crate::include::common::bitdepth::BPC; use crate::include::common::intops::iclip; use crate::src::align::Align16; +use crate::src::cpu::CpuFlags; use crate::src::cursor::CursorMut; use crate::src::tables::dav1d_sgr_x_by_x; use libc::ptrdiff_t; @@ -27,9 +28,6 @@ use libc::intptr_t; ))] use crate::include::common::bitdepth::bd_fn; -#[cfg(feature = "asm")] -use crate::src::cpu::{rav1d_get_cpu_flags, CpuFlags}; - #[cfg(all(feature = "asm", target_arch = "arm"))] extern "C" { fn dav1d_sgr_box3_v_neon( @@ -3454,110 +3452,132 @@ unsafe fn sgr_filter_mix_neon( ); } -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] -#[inline(always)] -fn loop_restoration_dsp_init_x86(c: &mut Rav1dLoopRestorationDSPContext, bpc: c_int) { - let flags = rav1d_get_cpu_flags(); - - if !flags.contains(CpuFlags::SSE2) { - return; +impl Rav1dLoopRestorationDSPContext { + pub const fn default() -> Self { + Self { + wiener: [wiener_c_erased::; 2], + sgr: [ + sgr_5x5_c_erased::, + sgr_3x3_c_erased::, + sgr_mix_c_erased::, + ], + } } - if BD::BPC == BPC::BPC8 { - c.wiener[0] = decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_8bpc_sse2); - c.wiener[1] = decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_8bpc_sse2); - } + #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] + #[inline(always)] + const fn init_x86(mut self, flags: CpuFlags, bpc: c_int) -> Self { + if !flags.contains(CpuFlags::SSE2) { + return self; + } - if !flags.contains(CpuFlags::SSSE3) { - return; - } + if let BPC::BPC8 = BD::BPC { + self.wiener[0] = decl_looprestorationfilter_fn!(fn dav1d_wiener_filter7_8bpc_sse2); + self.wiener[1] = decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_8bpc_sse2); + }; - c.wiener[0] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter7, ssse3); - c.wiener[1] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter5, ssse3); + if !flags.contains(CpuFlags::SSSE3) { + return self; + } - if BD::BPC == BPC::BPC8 || bpc == 10 { - c.sgr[0] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_5x5, ssse3); - c.sgr[1] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_3x3, ssse3); - c.sgr[2] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_mix, ssse3); - } + self.wiener[0] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter7, ssse3); + self.wiener[1] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter5, ssse3); - #[cfg(target_arch = "x86_64")] - { - if !flags.contains(CpuFlags::AVX2) { - return; + if matches!(BD::BPC, BPC::BPC8) || bpc == 10 { + self.sgr[0] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_5x5, ssse3); + self.sgr[1] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_3x3, ssse3); + self.sgr[2] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_mix, ssse3); } - c.wiener[0] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter7, avx2); - c.wiener[1] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter5, avx2); + #[cfg(target_arch = "x86_64")] + { + if !flags.contains(CpuFlags::AVX2) { + return self; + } + + self.wiener[0] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter7, avx2); + self.wiener[1] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter5, avx2); + + if matches!(BD::BPC, BPC::BPC8) || bpc == 10 { + self.sgr[0] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_5x5, avx2); + self.sgr[1] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_3x3, avx2); + self.sgr[2] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_mix, avx2); + } + + if !flags.contains(CpuFlags::AVX512ICL) { + return self; + } + + self.wiener[0] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter7, avx512icl); + self.wiener[1] = match BD::BPC { + // With VNNI we don't need a 5-tap version. + BPC::BPC8 => self.wiener[0], + BPC::BPC16 => { + decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_16bpc_avx512icl) + } + }; - if BD::BPC == BPC::BPC8 || bpc == 10 { - c.sgr[0] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_5x5, avx2); - c.sgr[1] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_3x3, avx2); - c.sgr[2] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_mix, avx2); + if matches!(BD::BPC, BPC::BPC8) || bpc == 10 { + self.sgr[0] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_5x5, avx512icl); + self.sgr[1] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_3x3, avx512icl); + self.sgr[2] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_mix, avx512icl); + } } - if !flags.contains(CpuFlags::AVX512ICL) { - return; + self + } + + #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] + #[inline(always)] + const fn init_arm(mut self, flags: CpuFlags, bpc: c_int) -> Self { + if !flags.contains(CpuFlags::NEON) { + return self; } - c.wiener[0] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter7, avx512icl); - c.wiener[1] = match BD::BPC { - // With VNNI we don't need a 5-tap version. - BPC::BPC8 => c.wiener[0], - BPC::BPC16 => decl_looprestorationfilter_fn!(fn dav1d_wiener_filter5_16bpc_avx512icl), - }; + #[cfg(target_arch = "aarch64")] + { + self.wiener[0] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter7, neon); + self.wiener[1] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter5, neon); + } - if BD::BPC == BPC::BPC8 || bpc == 10 { - c.sgr[0] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_5x5, avx512icl); - c.sgr[1] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_3x3, avx512icl); - c.sgr[2] = bd_fn!(decl_looprestorationfilter_fn, BD, sgr_filter_mix, avx512icl); + #[cfg(target_arch = "arm")] + { + self.wiener[0] = wiener_filter_neon_erased::; + self.wiener[1] = wiener_filter_neon_erased::; } - } -} -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -#[inline(always)] -fn loop_restoration_dsp_init_arm(c: &mut Rav1dLoopRestorationDSPContext, bpc: c_int) { - let flags = rav1d_get_cpu_flags(); + if matches!(BD::BPC, BPC::BPC8) || bpc == 10 { + self.sgr[0] = sgr_filter_5x5_neon_erased::; + self.sgr[1] = sgr_filter_3x3_neon_erased::; + self.sgr[2] = sgr_filter_mix_neon_erased::; + } - if !flags.contains(CpuFlags::NEON) { - return; + self } - cfg_if::cfg_if! { - if #[cfg(target_arch = "aarch64")] { - c.wiener[0] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter7, neon); - c.wiener[1] = bd_fn!(decl_looprestorationfilter_fn, BD, wiener_filter5, neon); - } else { - c.wiener[0] = wiener_filter_neon_erased::; - c.wiener[1] = wiener_filter_neon_erased::; + #[inline(always)] + const fn init(self, flags: CpuFlags, bpc: c_int) -> Self { + #[cfg(feature = "asm")] + { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + return self.init_x86::(flags, bpc); + } + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + { + return self.init_arm::(flags, bpc); + } } - } - if BD::BPC == BPC::BPC8 || bpc == 10 { - c.sgr[0] = sgr_filter_5x5_neon_erased::; - c.sgr[1] = sgr_filter_3x3_neon_erased::; - c.sgr[2] = sgr_filter_mix_neon_erased::; + #[allow(unreachable_code)] // Reachable on some #[cfg]s. + { + let _ = flags; + let _ = bpc; + self + } } -} -#[cold] -pub fn rav1d_loop_restoration_dsp_init( - c: &mut Rav1dLoopRestorationDSPContext, - _bpc: c_int, -) { - c.wiener[1] = wiener_c_erased::; - c.wiener[0] = c.wiener[1]; - c.sgr[0] = sgr_5x5_c_erased::; - c.sgr[1] = sgr_3x3_c_erased::; - c.sgr[2] = sgr_mix_c_erased::; - - #[cfg(feature = "asm")] - cfg_if::cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - loop_restoration_dsp_init_x86::(c, _bpc); - } else if #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]{ - loop_restoration_dsp_init_arm::(c, _bpc); - } + pub const fn new(flags: CpuFlags, bpc: c_int) -> Self { + Self::default::().init::(flags, bpc) } } diff --git a/src/lr_apply.rs b/src/lr_apply.rs index e230723bf..7785732e4 100644 --- a/src/lr_apply.rs +++ b/src/lr_apply.rs @@ -3,7 +3,6 @@ use crate::include::dav1d::headers::Rav1dPixelLayout; use crate::include::dav1d::headers::Rav1dRestorationType; use crate::src::align::Align16; use crate::src::internal::Rav1dContext; -use crate::src::internal::Rav1dDSPContext; use crate::src::internal::Rav1dFrameData; use crate::src::lf_mask::Av1RestorationUnit; use crate::src::looprestoration::looprestorationfilter_fn; @@ -40,7 +39,6 @@ unsafe fn lr_stripe( mut edges: LrEdgeFlags, ) { let seq_hdr = &***f.seq_hdr.as_ref().unwrap(); - let dsp: &Rav1dDSPContext = &*f.dsp; let chroma = (plane != 0) as c_int; let ss_ver = chroma & (f.sr_cur.p.p.layout == Rav1dPixelLayout::I420) as c_int; let stride: ptrdiff_t = f.sr_cur.p.stride[chroma as usize]; @@ -79,7 +77,7 @@ unsafe fn lr_stripe( filter[1][4] = lr.filter_v[2] as i16; filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2; - lr_fn = dsp.lr.wiener[((filter[0][0] | filter[1][0]) == 0) as usize]; + lr_fn = f.dsp.lr.wiener[((filter[0][0] | filter[1][0]) == 0) as usize]; } else { let sgr_idx = assert_matches!(lr.r#type, Rav1dRestorationType::SgrProj(idx) => idx); let sgr_params = &dav1d_sgr_params[sgr_idx as usize]; @@ -87,7 +85,7 @@ unsafe fn lr_stripe( params.sgr.s1 = sgr_params[1] as u32; params.sgr.w0 = lr.sgr_weights[0] as i16; params.sgr.w1 = 128 - (lr.sgr_weights[0] as i16 + lr.sgr_weights[1] as i16); - lr_fn = dsp.lr.sgr[(sgr_params[0] != 0) as usize + (sgr_params[1] != 0) as usize * 2 - 1]; + lr_fn = f.dsp.lr.sgr[(sgr_params[0] != 0) as usize + (sgr_params[1] != 0) as usize * 2 - 1]; } let mut left = &left[..]; while y + stripe_h <= row_h { diff --git a/src/mc.rs b/src/mc.rs index 552382135..3cfe4cf60 100644 --- a/src/mc.rs +++ b/src/mc.rs @@ -3,11 +3,17 @@ use crate::include::common::bitdepth::BitDepth; use crate::include::common::bitdepth::DynPixel; use crate::include::common::intops::iclip; use crate::include::dav1d::headers::Rav1dFilterMode; +use crate::include::dav1d::headers::Rav1dPixelLayoutSubSampled; +use crate::src::cpu::CpuFlags; +use crate::src::enum_map::enum_map; +use crate::src::enum_map::enum_map_ty; +use crate::src::enum_map::DefaultValue; use crate::src::levels::Filter2d; use crate::src::tables::dav1d_mc_subpel_filters; use crate::src::tables::dav1d_mc_warp_filter; use crate::src::tables::dav1d_obmc_masks; use crate::src::tables::dav1d_resize_filter; +use crate::src::wrap_fn_ptr::wrap_fn_ptr; use libc::intptr_t; use libc::ptrdiff_t; use std::cmp; @@ -19,10 +25,7 @@ use to_method::To; use crate::include::common::bitdepth::bd_fn; #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] -use crate::include::common::bitdepth::BPC; - -#[cfg(feature = "asm")] -use crate::src::cpu::{rav1d_get_cpu_flags, CpuFlags}; +use crate::include::common::bitdepth::{bpc_fn, BPC}; #[inline(never)] unsafe fn put_rust( @@ -1171,31 +1174,73 @@ unsafe fn resize_rust( } } -pub type mc_fn = unsafe extern "C" fn( - *mut DynPixel, - ptrdiff_t, - *const DynPixel, - ptrdiff_t, - c_int, - c_int, - c_int, - c_int, - c_int, -) -> (); +wrap_fn_ptr!(pub unsafe extern "C" fn mc( + dst: *mut DynPixel, + dst_stride: ptrdiff_t, + src: *const DynPixel, + src_stride: ptrdiff_t, + w: c_int, + h: c_int, + mx: c_int, + my: c_int, + bitdepth_max: c_int, +) -> ()); + +impl mc::Fn { + pub unsafe fn call( + &self, + dst: *mut BD::Pixel, + dst_stride: ptrdiff_t, + src: *const BD::Pixel, + src_stride: ptrdiff_t, + w: c_int, + h: c_int, + mx: c_int, + my: c_int, + bd: BD, + ) { + let dst = dst.cast(); + let src = src.cast(); + let bd = bd.into_c(); + self.get()(dst, dst_stride, src, src_stride, w, h, mx, my, bd) + } +} -pub type mc_scaled_fn = unsafe extern "C" fn( - *mut DynPixel, - ptrdiff_t, - *const DynPixel, - ptrdiff_t, - c_int, - c_int, - c_int, - c_int, - c_int, - c_int, - c_int, -) -> (); +wrap_fn_ptr!(pub unsafe extern "C" fn mc_scaled( + dst: *mut DynPixel, + dst_stride: ptrdiff_t, + src: *const DynPixel, + src_stride: ptrdiff_t, + w: c_int, + h: c_int, + mx: c_int, + my: c_int, + dx: c_int, + dy: c_int, + bitdepth_max: c_int, +) -> ()); + +impl mc_scaled::Fn { + pub unsafe fn call( + &self, + dst: *mut BD::Pixel, + dst_stride: ptrdiff_t, + src: *const BD::Pixel, + src_stride: ptrdiff_t, + w: c_int, + h: c_int, + mx: c_int, + my: c_int, + dx: c_int, + dy: c_int, + bd: BD, + ) { + let dst = dst.cast(); + let src = src.cast(); + let bd = bd.into_c(); + self.get()(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, bd) + } +} pub type warp8x8_fn = unsafe extern "C" fn( *mut DynPixel, @@ -1208,29 +1253,67 @@ pub type warp8x8_fn = unsafe extern "C" fn( c_int, ) -> (); -pub type mct_fn = unsafe extern "C" fn( - *mut i16, - *const DynPixel, - ptrdiff_t, - c_int, - c_int, - c_int, - c_int, - c_int, -) -> (); +wrap_fn_ptr!(pub unsafe extern "C" fn mct( + tmp: *mut i16, + src: *const DynPixel, + src_stride: ptrdiff_t, + w: c_int, + h: c_int, + mx: c_int, + my: c_int, + bitdepth_max: c_int, +) -> ()); + +impl mct::Fn { + pub unsafe fn call( + &self, + tmp: *mut i16, + src: *const BD::Pixel, + src_stride: ptrdiff_t, + w: c_int, + h: c_int, + mx: c_int, + my: c_int, + bd: BD, + ) { + let src = src.cast(); + let bd = bd.into_c(); + self.get()(tmp, src, src_stride, w, h, mx, my, bd) + } +} -pub type mct_scaled_fn = unsafe extern "C" fn( - *mut i16, - *const DynPixel, - ptrdiff_t, - c_int, - c_int, - c_int, - c_int, - c_int, - c_int, - c_int, -) -> (); +wrap_fn_ptr!(pub unsafe extern "C" fn mct_scaled( + tmp: *mut i16, + src: *const DynPixel, + src_stride: ptrdiff_t, + w: c_int, + h: c_int, + mx: c_int, + my: c_int, + dx: c_int, + dy: c_int, + bitdepth_max: c_int, +) -> ()); + +impl mct_scaled::Fn { + pub unsafe fn call( + &self, + tmp: *mut i16, + src: *const BD::Pixel, + src_stride: ptrdiff_t, + w: c_int, + h: c_int, + mx: c_int, + my: c_int, + dx: c_int, + dy: c_int, + bd: BD, + ) { + let src = src.cast(); + let bd = bd.into_c(); + self.get()(tmp, src, src_stride, w, h, mx, my, dx, dy, bd) + } +} pub type warp8x8t_fn = unsafe extern "C" fn( *mut i16, @@ -1275,17 +1358,36 @@ pub type mask_fn = unsafe extern "C" fn( c_int, ) -> (); -pub type w_mask_fn = unsafe extern "C" fn( - *mut DynPixel, - ptrdiff_t, - *const i16, - *const i16, - c_int, - c_int, - *mut u8, - c_int, - c_int, -) -> (); +wrap_fn_ptr!(pub unsafe extern "C" fn w_mask( + dst: *mut DynPixel, + dst_stride: ptrdiff_t, + tmp1: *const i16, + tmp2: *const i16, + w: c_int, + h: c_int, + mask: *mut u8, + sign: c_int, + bitdepth_max: c_int, +) -> ()); + +impl w_mask::Fn { + pub unsafe fn call( + &self, + dst: *mut BD::Pixel, + dst_stride: ptrdiff_t, + tmp1: *const i16, + tmp2: *const i16, + w: c_int, + h: c_int, + mask: *mut u8, + sign: c_int, + bd: BD, + ) { + let dst = dst.cast(); + let bd = bd.into_c(); + self.get()(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, bd) + } +} pub type blend_fn = unsafe extern "C" fn(*mut DynPixel, ptrdiff_t, *const DynPixel, c_int, c_int, *const u8) -> (); @@ -1320,14 +1422,14 @@ pub type resize_fn = unsafe extern "C" fn( ) -> (); #[repr(C)] pub struct Rav1dMCDSPContext { - pub mc: [mc_fn; 10], - pub mc_scaled: [mc_scaled_fn; 10], - pub mct: [mct_fn; 10], - pub mct_scaled: [mct_scaled_fn; 10], + pub mc: enum_map_ty!(Filter2d, mc::Fn), + pub mc_scaled: enum_map_ty!(Filter2d, mc_scaled::Fn), + pub mct: enum_map_ty!(Filter2d, mct::Fn), + pub mct_scaled: enum_map_ty!(Filter2d, mct_scaled::Fn), pub avg: avg_fn, pub w_avg: w_avg_fn, pub mask: mask_fn, - pub w_mask: [w_mask_fn; 3], + pub w_mask: enum_map_ty!(Rav1dPixelLayoutSubSampled, w_mask::Fn), pub blend: blend_fn, pub blend_v: blend_dir_fn, pub blend_h: blend_dir_fn, @@ -1908,64 +2010,6 @@ pub(crate) unsafe extern "C" fn resize_c_erased( // TODO(legare): Generated fns are temporarily pub until init fns are deduplicated. #[cfg(feature = "asm")] macro_rules! decl_fn { - (mc, $name:ident) => { - pub(crate) fn $name( - dst: *mut DynPixel, - dst_stride: ptrdiff_t, - src: *const DynPixel, - src_stride: ptrdiff_t, - w: c_int, - h: c_int, - mx: c_int, - my: c_int, - bitdepth_max: c_int, - ); - }; - - (mct, $name:ident) => { - pub(crate) fn $name( - tmp: *mut i16, - src: *const DynPixel, - src_stride: ptrdiff_t, - w: c_int, - h: c_int, - mx: c_int, - my: c_int, - bitdepth_max: c_int, - ); - }; - - (mc_scaled, $name:ident) => { - pub(crate) fn $name( - dst: *mut DynPixel, - dst_stride: ptrdiff_t, - src: *const DynPixel, - src_stride: ptrdiff_t, - w: c_int, - h: c_int, - mx: c_int, - my: c_int, - dx: c_int, - dy: c_int, - bitdepth_max: c_int, - ); - }; - - (mct_scaled, $name:ident) => { - pub(crate) fn $name( - tmp: *mut i16, - src: *const DynPixel, - src_stride: ptrdiff_t, - w: c_int, - h: c_int, - mx: c_int, - my: c_int, - dx: c_int, - dy: c_int, - bitdepth_max: c_int, - ); - }; - (avg, $name:ident) => { pub(crate) fn $name( dst: *mut DynPixel, @@ -2004,20 +2048,6 @@ macro_rules! decl_fn { ); }; - (w_mask, $name:ident) => { - pub(crate) fn $name( - dst: *mut DynPixel, - dst_stride: ptrdiff_t, - tmp1: *const i16, - tmp2: *const i16, - w: c_int, - h: c_int, - mask: *mut u8, - sign: c_int, - bitdepth_max: c_int, - ); - }; - (blend, $name:ident) => { pub(crate) fn $name( dst: *mut DynPixel, @@ -2122,56 +2152,9 @@ macro_rules! decl_fns { #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] #[allow(dead_code)] // Macro invocations generate more fn declarations than are actually used. extern "C" { - decl_fns!(mc, dav1d_put_8tap_regular); - decl_fns!(mc, dav1d_put_8tap_regular_smooth); - decl_fns!(mc, dav1d_put_8tap_regular_sharp); - decl_fns!(mc, dav1d_put_8tap_smooth); - decl_fns!(mc, dav1d_put_8tap_smooth_regular); - decl_fns!(mc, dav1d_put_8tap_smooth_sharp); - decl_fns!(mc, dav1d_put_8tap_sharp); - decl_fns!(mc, dav1d_put_8tap_sharp_regular); - decl_fns!(mc, dav1d_put_8tap_sharp_smooth); - decl_fns!(mc, dav1d_put_bilin); - - decl_fns!(mct, dav1d_prep_8tap_regular); - decl_fns!(mct, dav1d_prep_8tap_regular_smooth); - decl_fns!(mct, dav1d_prep_8tap_regular_sharp); - decl_fns!(mct, dav1d_prep_8tap_smooth); - decl_fns!(mct, dav1d_prep_8tap_smooth_regular); - decl_fns!(mct, dav1d_prep_8tap_smooth_sharp); - decl_fns!(mct, dav1d_prep_8tap_sharp); - decl_fns!(mct, dav1d_prep_8tap_sharp_regular); - decl_fns!(mct, dav1d_prep_8tap_sharp_smooth); - decl_fns!(mct, dav1d_prep_bilin); - - decl_fns!(mc_scaled, dav1d_put_8tap_scaled_regular); - decl_fns!(mc_scaled, dav1d_put_8tap_scaled_regular_smooth); - decl_fns!(mc_scaled, dav1d_put_8tap_scaled_regular_sharp); - decl_fns!(mc_scaled, dav1d_put_8tap_scaled_smooth); - decl_fns!(mc_scaled, dav1d_put_8tap_scaled_smooth_regular); - decl_fns!(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp); - decl_fns!(mc_scaled, dav1d_put_8tap_scaled_sharp); - decl_fns!(mc_scaled, dav1d_put_8tap_scaled_sharp_regular); - decl_fns!(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth); - decl_fns!(mc_scaled, dav1d_put_bilin_scaled); - - decl_fns!(mct_scaled, dav1d_prep_8tap_scaled_regular); - decl_fns!(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth); - decl_fns!(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp); - decl_fns!(mct_scaled, dav1d_prep_8tap_scaled_smooth); - decl_fns!(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular); - decl_fns!(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp); - decl_fns!(mct_scaled, dav1d_prep_8tap_scaled_sharp); - decl_fns!(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular); - decl_fns!(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth); - decl_fns!(mct_scaled, dav1d_prep_bilin_scaled); - decl_fns!(avg, dav1d_avg); decl_fns!(w_avg, dav1d_w_avg); decl_fns!(mask, dav1d_mask); - decl_fns!(w_mask, dav1d_w_mask_420); - decl_fns!(w_mask, dav1d_w_mask_422); - decl_fns!(w_mask, dav1d_w_mask_444); decl_fns!(blend, dav1d_blend); decl_fns!(blend_dir, dav1d_blend_v); decl_fns!(blend_dir, dav1d_blend_h); @@ -2187,34 +2170,9 @@ extern "C" { #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] extern "C" { - decl_fns!(mc, dav1d_put_8tap_regular, neon); - decl_fns!(mc, dav1d_put_8tap_regular_smooth, neon); - decl_fns!(mc, dav1d_put_8tap_regular_sharp, neon); - decl_fns!(mc, dav1d_put_8tap_smooth, neon); - decl_fns!(mc, dav1d_put_8tap_smooth_regular, neon); - decl_fns!(mc, dav1d_put_8tap_smooth_sharp, neon); - decl_fns!(mc, dav1d_put_8tap_sharp, neon); - decl_fns!(mc, dav1d_put_8tap_sharp_regular, neon); - decl_fns!(mc, dav1d_put_8tap_sharp_smooth, neon); - decl_fns!(mc, dav1d_put_bilin, neon); - - decl_fns!(mct, dav1d_prep_8tap_regular, neon); - decl_fns!(mct, dav1d_prep_8tap_regular_smooth, neon); - decl_fns!(mct, dav1d_prep_8tap_regular_sharp, neon); - decl_fns!(mct, dav1d_prep_8tap_smooth, neon); - decl_fns!(mct, dav1d_prep_8tap_smooth_regular, neon); - decl_fns!(mct, dav1d_prep_8tap_smooth_sharp, neon); - decl_fns!(mct, dav1d_prep_8tap_sharp, neon); - decl_fns!(mct, dav1d_prep_8tap_sharp_regular, neon); - decl_fns!(mct, dav1d_prep_8tap_sharp_smooth, neon); - decl_fns!(mct, dav1d_prep_bilin, neon); - decl_fns!(avg, dav1d_avg, neon); decl_fns!(w_avg, dav1d_w_avg, neon); decl_fns!(mask, dav1d_mask, neon); - decl_fns!(w_mask, dav1d_w_mask_420, neon); - decl_fns!(w_mask, dav1d_w_mask_422, neon); - decl_fns!(w_mask, dav1d_w_mask_444, neon); decl_fns!(blend, dav1d_blend, neon); decl_fns!(blend_dir, dav1d_blend_v, neon); decl_fns!(blend_dir, dav1d_blend_h, neon); @@ -2225,342 +2183,377 @@ extern "C" { decl_fns!(emu_edge, dav1d_emu_edge, neon); } -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] -#[inline(always)] -unsafe fn mc_dsp_init_x86(c: *mut Rav1dMCDSPContext) { - use Filter2d::*; - - let flags = rav1d_get_cpu_flags(); - - if !flags.contains(CpuFlags::SSE2) { - return; +impl Rav1dMCDSPContext { + pub const fn default() -> Self { + Self { + mc: enum_map!(Filter2d => mc::Fn; match key { + Regular8Tap => mc::Fn::new(put_8tap_regular_c_erased::), + RegularSmooth8Tap => mc::Fn::new(put_8tap_regular_smooth_c_erased::), + RegularSharp8Tap => mc::Fn::new(put_8tap_regular_sharp_c_erased::), + SharpRegular8Tap => mc::Fn::new(put_8tap_sharp_regular_c_erased::), + SharpSmooth8Tap => mc::Fn::new(put_8tap_sharp_smooth_c_erased::), + Sharp8Tap => mc::Fn::new(put_8tap_sharp_c_erased::), + SmoothRegular8Tap => mc::Fn::new(put_8tap_smooth_regular_c_erased::), + Smooth8Tap => mc::Fn::new(put_8tap_smooth_c_erased::), + SmoothSharp8Tap => mc::Fn::new(put_8tap_smooth_sharp_c_erased::), + Bilinear => mc::Fn::new(put_bilin_c_erased::), + }), + mct: enum_map!(Filter2d => mct::Fn; match key { + Regular8Tap => mct::Fn::new(prep_8tap_regular_c_erased::), + RegularSmooth8Tap => mct::Fn::new(prep_8tap_regular_smooth_c_erased::), + RegularSharp8Tap => mct::Fn::new(prep_8tap_regular_sharp_c_erased::), + SharpRegular8Tap => mct::Fn::new(prep_8tap_sharp_regular_c_erased::), + SharpSmooth8Tap => mct::Fn::new(prep_8tap_sharp_smooth_c_erased::), + Sharp8Tap => mct::Fn::new(prep_8tap_sharp_c_erased::), + SmoothRegular8Tap => mct::Fn::new(prep_8tap_smooth_regular_c_erased::), + Smooth8Tap => mct::Fn::new(prep_8tap_smooth_c_erased::), + SmoothSharp8Tap => mct::Fn::new(prep_8tap_smooth_sharp_c_erased::), + Bilinear => mct::Fn::new(prep_bilin_c_erased::), + }), + mc_scaled: enum_map!(Filter2d => mc_scaled::Fn; match key { + Regular8Tap => mc_scaled::Fn::new(put_8tap_regular_scaled_c_erased::), + RegularSmooth8Tap => mc_scaled::Fn::new(put_8tap_regular_smooth_scaled_c_erased::), + RegularSharp8Tap => mc_scaled::Fn::new(put_8tap_regular_sharp_scaled_c_erased::), + SharpRegular8Tap => mc_scaled::Fn::new(put_8tap_sharp_regular_scaled_c_erased::), + SharpSmooth8Tap => mc_scaled::Fn::new(put_8tap_sharp_smooth_scaled_c_erased::), + Sharp8Tap => mc_scaled::Fn::new(put_8tap_sharp_scaled_c_erased::), + SmoothRegular8Tap => mc_scaled::Fn::new(put_8tap_smooth_regular_scaled_c_erased::), + Smooth8Tap => mc_scaled::Fn::new(put_8tap_smooth_scaled_c_erased::), + SmoothSharp8Tap => mc_scaled::Fn::new(put_8tap_smooth_sharp_scaled_c_erased::), + Bilinear => mc_scaled::Fn::new(put_bilin_scaled_c_erased::), + }), + mct_scaled: enum_map!(Filter2d => mct_scaled::Fn; match key { + Regular8Tap => mct_scaled::Fn::new(prep_8tap_regular_scaled_c_erased::), + RegularSmooth8Tap => mct_scaled::Fn::new(prep_8tap_regular_smooth_scaled_c_erased::), + RegularSharp8Tap => mct_scaled::Fn::new(prep_8tap_regular_sharp_scaled_c_erased::), + SharpRegular8Tap => mct_scaled::Fn::new(prep_8tap_sharp_regular_scaled_c_erased::), + SharpSmooth8Tap => mct_scaled::Fn::new(prep_8tap_sharp_smooth_scaled_c_erased::), + Sharp8Tap => mct_scaled::Fn::new(prep_8tap_sharp_scaled_c_erased::), + SmoothRegular8Tap => mct_scaled::Fn::new(prep_8tap_smooth_regular_scaled_c_erased::), + Smooth8Tap => mct_scaled::Fn::new(prep_8tap_smooth_scaled_c_erased::), + SmoothSharp8Tap => mct_scaled::Fn::new(prep_8tap_smooth_sharp_scaled_c_erased::), + Bilinear => mct_scaled::Fn::new(prep_bilin_scaled_c_erased::), + }), + avg: avg_c_erased::, + w_avg: w_avg_c_erased::, + mask: mask_c_erased::, + w_mask: enum_map!(Rav1dPixelLayoutSubSampled => w_mask::Fn; match key { + I420 => w_mask::Fn::new(w_mask_420_c_erased::), + I422 => w_mask::Fn::new(w_mask_422_c_erased::), + I444 => w_mask::Fn::new(w_mask_444_c_erased::), + }), + blend: blend_c_erased::, + blend_v: blend_v_c_erased::, + blend_h: blend_h_c_erased::, + warp8x8: warp_affine_8x8_c_erased::, + warp8x8t: warp_affine_8x8t_c_erased::, + emu_edge: emu_edge_c_erased::, + resize: resize_c_erased::, + } } - if BD::BPC == BPC::BPC8 { - (*c).mct[Bilinear as usize] = dav1d_prep_bilin_8bpc_sse2; - (*c).mct[Regular8Tap as usize] = dav1d_prep_8tap_regular_8bpc_sse2; - (*c).mct[RegularSmooth8Tap as usize] = dav1d_prep_8tap_regular_smooth_8bpc_sse2; - (*c).mct[RegularSharp8Tap as usize] = dav1d_prep_8tap_regular_sharp_8bpc_sse2; - (*c).mct[SmoothRegular8Tap as usize] = dav1d_prep_8tap_smooth_regular_8bpc_sse2; - (*c).mct[Smooth8Tap as usize] = dav1d_prep_8tap_smooth_8bpc_sse2; - (*c).mct[SmoothSharp8Tap as usize] = dav1d_prep_8tap_smooth_sharp_8bpc_sse2; - (*c).mct[SharpRegular8Tap as usize] = dav1d_prep_8tap_sharp_regular_8bpc_sse2; - (*c).mct[SharpSmooth8Tap as usize] = dav1d_prep_8tap_sharp_smooth_8bpc_sse2; - (*c).mct[Sharp8Tap as usize] = dav1d_prep_8tap_sharp_8bpc_sse2; - - (*c).warp8x8 = dav1d_warp_affine_8x8_8bpc_sse2; - (*c).warp8x8t = dav1d_warp_affine_8x8t_8bpc_sse2; - } + #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] + #[inline(always)] + const fn init_x86(mut self, flags: CpuFlags) -> Self { + if !flags.contains(CpuFlags::SSE2) { + return self; + } - if !flags.contains(CpuFlags::SSSE3) { - return; - } + if let BPC::BPC8 = BD::BPC { + self.mct = enum_map!(Filter2d => mct::Fn; match key { + Bilinear => bpc_fn!(mct::decl_fn, 8 bpc, prep_bilin, sse2), + Regular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular, sse2), + RegularSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_smooth, sse2), + RegularSharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_sharp, sse2), + SmoothRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_regular, sse2), + Smooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth, sse2), + SmoothSharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_sharp, sse2), + SharpRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_regular, sse2), + SharpSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_smooth, sse2), + Sharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp, sse2), + }); + + self.warp8x8 = bpc_fn!(8 bpc, warp_affine_8x8, sse2); + self.warp8x8t = bpc_fn!(8 bpc, warp_affine_8x8t, sse2); + } - (*c).mc[Regular8Tap as usize] = bd_fn!(BD, put_8tap_regular, ssse3); - (*c).mc[RegularSmooth8Tap as usize] = bd_fn!(BD, put_8tap_regular_smooth, ssse3); - (*c).mc[RegularSharp8Tap as usize] = bd_fn!(BD, put_8tap_regular_sharp, ssse3); - (*c).mc[SmoothRegular8Tap as usize] = bd_fn!(BD, put_8tap_smooth_regular, ssse3); - (*c).mc[Smooth8Tap as usize] = bd_fn!(BD, put_8tap_smooth, ssse3); - (*c).mc[SmoothSharp8Tap as usize] = bd_fn!(BD, put_8tap_smooth_sharp, ssse3); - (*c).mc[SharpRegular8Tap as usize] = bd_fn!(BD, put_8tap_sharp_regular, ssse3); - (*c).mc[SharpSmooth8Tap as usize] = bd_fn!(BD, put_8tap_sharp_smooth, ssse3); - (*c).mc[Sharp8Tap as usize] = bd_fn!(BD, put_8tap_sharp, ssse3); - (*c).mc[Bilinear as usize] = bd_fn!(BD, put_bilin, ssse3); - - (*c).mct[Regular8Tap as usize] = bd_fn!(BD, prep_8tap_regular, ssse3); - (*c).mct[RegularSmooth8Tap as usize] = bd_fn!(BD, prep_8tap_regular_smooth, ssse3); - (*c).mct[RegularSharp8Tap as usize] = bd_fn!(BD, prep_8tap_regular_sharp, ssse3); - (*c).mct[SmoothRegular8Tap as usize] = bd_fn!(BD, prep_8tap_smooth_regular, ssse3); - (*c).mct[Smooth8Tap as usize] = bd_fn!(BD, prep_8tap_smooth, ssse3); - (*c).mct[SmoothSharp8Tap as usize] = bd_fn!(BD, prep_8tap_smooth_sharp, ssse3); - (*c).mct[SharpRegular8Tap as usize] = bd_fn!(BD, prep_8tap_sharp_regular, ssse3); - (*c).mct[SharpSmooth8Tap as usize] = bd_fn!(BD, prep_8tap_sharp_smooth, ssse3); - (*c).mct[Sharp8Tap as usize] = bd_fn!(BD, prep_8tap_sharp, ssse3); - (*c).mct[Bilinear as usize] = bd_fn!(BD, prep_bilin, ssse3); - - (*c).mc_scaled[Regular8Tap as usize] = bd_fn!(BD, put_8tap_scaled_regular, ssse3); - (*c).mc_scaled[RegularSmooth8Tap as usize] = bd_fn!(BD, put_8tap_scaled_regular_smooth, ssse3); - (*c).mc_scaled[RegularSharp8Tap as usize] = bd_fn!(BD, put_8tap_scaled_regular_sharp, ssse3); - (*c).mc_scaled[SmoothRegular8Tap as usize] = bd_fn!(BD, put_8tap_scaled_smooth_regular, ssse3); - (*c).mc_scaled[Smooth8Tap as usize] = bd_fn!(BD, put_8tap_scaled_smooth, ssse3); - (*c).mc_scaled[SmoothSharp8Tap as usize] = bd_fn!(BD, put_8tap_scaled_smooth_sharp, ssse3); - (*c).mc_scaled[SharpRegular8Tap as usize] = bd_fn!(BD, put_8tap_scaled_sharp_regular, ssse3); - (*c).mc_scaled[SharpSmooth8Tap as usize] = bd_fn!(BD, put_8tap_scaled_sharp_smooth, ssse3); - (*c).mc_scaled[Sharp8Tap as usize] = bd_fn!(BD, put_8tap_scaled_sharp, ssse3); - (*c).mc_scaled[Bilinear as usize] = bd_fn!(BD, put_bilin_scaled, ssse3); - - (*c).mct_scaled[Regular8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_regular, ssse3); - (*c).mct_scaled[RegularSmooth8Tap as usize] = - bd_fn!(BD, prep_8tap_scaled_regular_smooth, ssse3); - (*c).mct_scaled[RegularSharp8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_regular_sharp, ssse3); - (*c).mct_scaled[SmoothRegular8Tap as usize] = - bd_fn!(BD, prep_8tap_scaled_smooth_regular, ssse3); - (*c).mct_scaled[Smooth8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_smooth, ssse3); - (*c).mct_scaled[SmoothSharp8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_smooth_sharp, ssse3); - (*c).mct_scaled[SharpRegular8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_sharp_regular, ssse3); - (*c).mct_scaled[SharpSmooth8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_sharp_smooth, ssse3); - (*c).mct_scaled[Sharp8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_sharp, ssse3); - (*c).mct_scaled[Bilinear as usize] = bd_fn!(BD, prep_bilin_scaled, ssse3); - - (*c).avg = bd_fn!(BD, avg, ssse3); - (*c).w_avg = bd_fn!(BD, w_avg, ssse3); - (*c).mask = bd_fn!(BD, mask, ssse3); - - (*c).w_mask[0] = bd_fn!(BD, w_mask_444, ssse3); - (*c).w_mask[1] = bd_fn!(BD, w_mask_422, ssse3); - (*c).w_mask[2] = bd_fn!(BD, w_mask_420, ssse3); - - (*c).blend = bd_fn!(BD, blend, ssse3); - (*c).blend_v = bd_fn!(BD, blend_v, ssse3); - (*c).blend_h = bd_fn!(BD, blend_h, ssse3); - (*c).warp8x8 = bd_fn!(BD, warp_affine_8x8, ssse3); - (*c).warp8x8t = bd_fn!(BD, warp_affine_8x8t, ssse3); - (*c).emu_edge = bd_fn!(BD, emu_edge, ssse3); - (*c).resize = bd_fn!(BD, resize, ssse3); - - if !flags.contains(CpuFlags::SSE41) { - return; - } + if !flags.contains(CpuFlags::SSSE3) { + return self; + } - if BD::BPC == BPC::BPC8 { - (*c).warp8x8 = dav1d_warp_affine_8x8_8bpc_sse4; - (*c).warp8x8t = dav1d_warp_affine_8x8t_8bpc_sse4; - } + self.mc = enum_map!(Filter2d => mc::Fn; match key { + Regular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular, ssse3), + RegularSmooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular_smooth, ssse3), + RegularSharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular_sharp, ssse3), + SmoothRegular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth_regular, ssse3), + Smooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth, ssse3), + SmoothSharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth_sharp, ssse3), + SharpRegular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp_regular, ssse3), + SharpSmooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp_smooth, ssse3), + Sharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp, ssse3), + Bilinear => bd_fn!(mc::decl_fn, BD, put_bilin, ssse3), + }); + self.mct = enum_map!(Filter2d => mct::Fn; match key { + Regular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular, ssse3), + RegularSmooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular_smooth, ssse3), + RegularSharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular_sharp, ssse3), + SmoothRegular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth_regular, ssse3), + Smooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth, ssse3), + SmoothSharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth_sharp, ssse3), + SharpRegular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp_regular, ssse3), + SharpSmooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp_smooth, ssse3), + Sharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp, ssse3), + Bilinear => bd_fn!(mct::decl_fn, BD, prep_bilin, ssse3), + }); + self.mc_scaled = enum_map!(Filter2d => mc_scaled::Fn; match key { + Regular8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_regular, ssse3), + RegularSmooth8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_regular_smooth, ssse3), + RegularSharp8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_regular_sharp, ssse3), + SmoothRegular8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_smooth_regular, ssse3), + Smooth8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_smooth, ssse3), + SmoothSharp8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_smooth_sharp, ssse3), + SharpRegular8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_sharp_regular, ssse3), + SharpSmooth8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_sharp_smooth, ssse3), + Sharp8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_sharp, ssse3), + Bilinear => bd_fn!(mc_scaled::decl_fn, BD, put_bilin_scaled, ssse3), + }); + self.mct_scaled = enum_map!(Filter2d => mct_scaled::Fn; match key { + Regular8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_regular, ssse3), + RegularSmooth8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_regular_smooth, ssse3), + RegularSharp8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_regular_sharp, ssse3), + SmoothRegular8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_smooth_regular, ssse3), + Smooth8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_smooth, ssse3), + SmoothSharp8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_smooth_sharp, ssse3), + SharpRegular8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_sharp_regular, ssse3), + SharpSmooth8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_sharp_smooth, ssse3), + Sharp8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_sharp, ssse3), + Bilinear => bd_fn!(mct_scaled::decl_fn, BD, prep_bilin_scaled, ssse3), + }); + + self.avg = bd_fn!(BD, avg, ssse3); + self.w_avg = bd_fn!(BD, w_avg, ssse3); + self.mask = bd_fn!(BD, mask, ssse3); + + self.w_mask = enum_map!(Rav1dPixelLayoutSubSampled => w_mask::Fn; match key { + I420 => bd_fn!(w_mask::decl_fn, BD, w_mask_420, ssse3), + I422 => bd_fn!(w_mask::decl_fn, BD, w_mask_422, ssse3), + I444 => bd_fn!(w_mask::decl_fn, BD, w_mask_444, ssse3), + }); + + self.blend = bd_fn!(BD, blend, ssse3); + self.blend_v = bd_fn!(BD, blend_v, ssse3); + self.blend_h = bd_fn!(BD, blend_h, ssse3); + self.warp8x8 = bd_fn!(BD, warp_affine_8x8, ssse3); + self.warp8x8t = bd_fn!(BD, warp_affine_8x8t, ssse3); + self.emu_edge = bd_fn!(BD, emu_edge, ssse3); + self.resize = bd_fn!(BD, resize, ssse3); + + if !flags.contains(CpuFlags::SSE41) { + return self; + } - #[cfg(target_arch = "x86_64")] - { - if !flags.contains(CpuFlags::AVX2) { - return; + if let BPC::BPC8 = BD::BPC { + self.warp8x8 = bpc_fn!(8 bpc, warp_affine_8x8, sse4); + self.warp8x8t = bpc_fn!(8 bpc, warp_affine_8x8t, sse4); } - (*c).mc[Regular8Tap as usize] = bd_fn!(BD, put_8tap_regular, avx2); - (*c).mc[RegularSmooth8Tap as usize] = bd_fn!(BD, put_8tap_regular_smooth, avx2); - (*c).mc[RegularSharp8Tap as usize] = bd_fn!(BD, put_8tap_regular_sharp, avx2); - (*c).mc[SmoothRegular8Tap as usize] = bd_fn!(BD, put_8tap_smooth_regular, avx2); - (*c).mc[Smooth8Tap as usize] = bd_fn!(BD, put_8tap_smooth, avx2); - (*c).mc[SmoothSharp8Tap as usize] = bd_fn!(BD, put_8tap_smooth_sharp, avx2); - (*c).mc[SharpRegular8Tap as usize] = bd_fn!(BD, put_8tap_sharp_regular, avx2); - (*c).mc[SharpSmooth8Tap as usize] = bd_fn!(BD, put_8tap_sharp_smooth, avx2); - (*c).mc[Sharp8Tap as usize] = bd_fn!(BD, put_8tap_sharp, avx2); - (*c).mc[Bilinear as usize] = bd_fn!(BD, put_bilin, avx2); - - (*c).mct[Regular8Tap as usize] = bd_fn!(BD, prep_8tap_regular, avx2); - (*c).mct[RegularSmooth8Tap as usize] = bd_fn!(BD, prep_8tap_regular_smooth, avx2); - (*c).mct[RegularSharp8Tap as usize] = bd_fn!(BD, prep_8tap_regular_sharp, avx2); - (*c).mct[SmoothRegular8Tap as usize] = bd_fn!(BD, prep_8tap_smooth_regular, avx2); - (*c).mct[Smooth8Tap as usize] = bd_fn!(BD, prep_8tap_smooth, avx2); - (*c).mct[SmoothSharp8Tap as usize] = bd_fn!(BD, prep_8tap_smooth_sharp, avx2); - (*c).mct[SharpRegular8Tap as usize] = bd_fn!(BD, prep_8tap_sharp_regular, avx2); - (*c).mct[SharpSmooth8Tap as usize] = bd_fn!(BD, prep_8tap_sharp_smooth, avx2); - (*c).mct[Sharp8Tap as usize] = bd_fn!(BD, prep_8tap_sharp, avx2); - (*c).mct[Bilinear as usize] = bd_fn!(BD, prep_bilin, avx2); - - (*c).mc_scaled[Regular8Tap as usize] = bd_fn!(BD, put_8tap_scaled_regular, avx2); - (*c).mc_scaled[RegularSmooth8Tap as usize] = - bd_fn!(BD, put_8tap_scaled_regular_smooth, avx2); - (*c).mc_scaled[RegularSharp8Tap as usize] = bd_fn!(BD, put_8tap_scaled_regular_sharp, avx2); - (*c).mc_scaled[SmoothRegular8Tap as usize] = - bd_fn!(BD, put_8tap_scaled_smooth_regular, avx2); - (*c).mc_scaled[Smooth8Tap as usize] = bd_fn!(BD, put_8tap_scaled_smooth, avx2); - (*c).mc_scaled[SmoothSharp8Tap as usize] = bd_fn!(BD, put_8tap_scaled_smooth_sharp, avx2); - (*c).mc_scaled[SharpRegular8Tap as usize] = bd_fn!(BD, put_8tap_scaled_sharp_regular, avx2); - (*c).mc_scaled[SharpSmooth8Tap as usize] = bd_fn!(BD, put_8tap_scaled_sharp_smooth, avx2); - (*c).mc_scaled[Sharp8Tap as usize] = bd_fn!(BD, put_8tap_scaled_sharp, avx2); - (*c).mc_scaled[Bilinear as usize] = bd_fn!(BD, put_bilin_scaled, avx2); - - (*c).mct_scaled[Regular8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_regular, avx2); - (*c).mct_scaled[RegularSmooth8Tap as usize] = - bd_fn!(BD, prep_8tap_scaled_regular_smooth, avx2); - (*c).mct_scaled[RegularSharp8Tap as usize] = - bd_fn!(BD, prep_8tap_scaled_regular_sharp, avx2); - (*c).mct_scaled[SmoothRegular8Tap as usize] = - bd_fn!(BD, prep_8tap_scaled_smooth_regular, avx2); - (*c).mct_scaled[Smooth8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_smooth, avx2); - (*c).mct_scaled[SmoothSharp8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_smooth_sharp, avx2); - (*c).mct_scaled[SharpRegular8Tap as usize] = - bd_fn!(BD, prep_8tap_scaled_sharp_regular, avx2); - (*c).mct_scaled[SharpSmooth8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_sharp_smooth, avx2); - (*c).mct_scaled[Sharp8Tap as usize] = bd_fn!(BD, prep_8tap_scaled_sharp, avx2); - (*c).mct_scaled[Bilinear as usize] = bd_fn!(BD, prep_bilin_scaled, avx2); - - (*c).avg = bd_fn!(BD, avg, avx2); - (*c).w_avg = bd_fn!(BD, w_avg, avx2); - (*c).mask = bd_fn!(BD, mask, avx2); - - (*c).w_mask[0] = bd_fn!(BD, w_mask_444, avx2); - (*c).w_mask[1] = bd_fn!(BD, w_mask_422, avx2); - (*c).w_mask[2] = bd_fn!(BD, w_mask_420, avx2); - - (*c).blend = bd_fn!(BD, blend, avx2); - (*c).blend_v = bd_fn!(BD, blend_v, avx2); - (*c).blend_h = bd_fn!(BD, blend_h, avx2); - (*c).warp8x8 = bd_fn!(BD, warp_affine_8x8, avx2); - (*c).warp8x8t = bd_fn!(BD, warp_affine_8x8t, avx2); - (*c).emu_edge = bd_fn!(BD, emu_edge, avx2); - (*c).resize = bd_fn!(BD, resize, avx2); - - if !flags.contains(CpuFlags::AVX512ICL) { - return; + #[cfg(target_arch = "x86_64")] + { + if !flags.contains(CpuFlags::AVX2) { + return self; + } + + self.mc = enum_map!(Filter2d => mc::Fn; match key { + Regular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular, avx2), + RegularSmooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular_smooth, avx2), + RegularSharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular_sharp, avx2), + SmoothRegular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth_regular, avx2), + Smooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth, avx2), + SmoothSharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth_sharp, avx2), + SharpRegular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp_regular, avx2), + SharpSmooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp_smooth, avx2), + Sharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp, avx2), + Bilinear => bd_fn!(mc::decl_fn, BD, put_bilin, avx2), + }); + self.mct = enum_map!(Filter2d => mct::Fn; match key { + Regular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular, avx2), + RegularSmooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular_smooth, avx2), + RegularSharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular_sharp, avx2), + SmoothRegular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth_regular, avx2), + Smooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth, avx2), + SmoothSharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth_sharp, avx2), + SharpRegular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp_regular, avx2), + SharpSmooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp_smooth, avx2), + Sharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp, avx2), + Bilinear => bd_fn!(mct::decl_fn, BD, prep_bilin, avx2), + }); + self.mc_scaled = enum_map!(Filter2d => mc_scaled::Fn; match key { + Regular8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_regular, avx2), + RegularSmooth8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_regular_smooth, avx2), + RegularSharp8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_regular_sharp, avx2), + SmoothRegular8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_smooth_regular, avx2), + Smooth8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_smooth, avx2), + SmoothSharp8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_smooth_sharp, avx2), + SharpRegular8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_sharp_regular, avx2), + SharpSmooth8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_sharp_smooth, avx2), + Sharp8Tap => bd_fn!(mc_scaled::decl_fn, BD, put_8tap_scaled_sharp, avx2), + Bilinear => bd_fn!(mc_scaled::decl_fn, BD, put_bilin_scaled, avx2), + }); + self.mct_scaled = enum_map!(Filter2d => mct_scaled::Fn; match key { + Regular8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_regular, avx2), + RegularSmooth8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_regular_smooth, avx2), + RegularSharp8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_regular_sharp, avx2), + SmoothRegular8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_smooth_regular, avx2), + Smooth8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_smooth, avx2), + SmoothSharp8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_smooth_sharp, avx2), + SharpRegular8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_sharp_regular, avx2), + SharpSmooth8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_sharp_smooth, avx2), + Sharp8Tap => bd_fn!(mct_scaled::decl_fn, BD, prep_8tap_scaled_sharp, avx2), + Bilinear => bd_fn!(mct_scaled::decl_fn, BD, prep_bilin_scaled, avx2), + }); + + self.avg = bd_fn!(BD, avg, avx2); + self.w_avg = bd_fn!(BD, w_avg, avx2); + self.mask = bd_fn!(BD, mask, avx2); + + self.w_mask = enum_map!(Rav1dPixelLayoutSubSampled => w_mask::Fn; match key { + I420 => bd_fn!(w_mask::decl_fn, BD, w_mask_420, avx2), + I422 => bd_fn!(w_mask::decl_fn, BD, w_mask_422, avx2), + I444 => bd_fn!(w_mask::decl_fn, BD, w_mask_444, avx2), + }); + + self.blend = bd_fn!(BD, blend, avx2); + self.blend_v = bd_fn!(BD, blend_v, avx2); + self.blend_h = bd_fn!(BD, blend_h, avx2); + self.warp8x8 = bd_fn!(BD, warp_affine_8x8, avx2); + self.warp8x8t = bd_fn!(BD, warp_affine_8x8t, avx2); + self.emu_edge = bd_fn!(BD, emu_edge, avx2); + self.resize = bd_fn!(BD, resize, avx2); + + if !flags.contains(CpuFlags::AVX512ICL) { + return self; + } + + self.mc = enum_map!(Filter2d => mc::Fn; match key { + Regular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular, avx512icl), + RegularSmooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular_smooth, avx512icl), + RegularSharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular_sharp, avx512icl), + SmoothRegular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth_regular, avx512icl), + Smooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth, avx512icl), + SmoothSharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth_sharp, avx512icl), + SharpRegular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp_regular, avx512icl), + SharpSmooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp_smooth, avx512icl), + Sharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp, avx512icl), + Bilinear => bd_fn!(mc::decl_fn, BD, put_bilin, avx512icl), + }); + self.mct = enum_map!(Filter2d => mct::Fn; match key { + Regular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular, avx512icl), + RegularSmooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular_smooth, avx512icl), + RegularSharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular_sharp, avx512icl), + SmoothRegular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth_regular, avx512icl), + Smooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth, avx512icl), + SmoothSharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth_sharp, avx512icl), + SharpRegular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp_regular, avx512icl), + SharpSmooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp_smooth, avx512icl), + Sharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp, avx512icl), + Bilinear => bd_fn!(mct::decl_fn, BD, prep_bilin, avx512icl), + }); + + self.avg = bd_fn!(BD, avg, avx512icl); + self.w_avg = bd_fn!(BD, w_avg, avx512icl); + self.mask = bd_fn!(BD, mask, avx512icl); + + self.w_mask = enum_map!(Rav1dPixelLayoutSubSampled => w_mask::Fn; match key { + I420 => bd_fn!(w_mask::decl_fn, BD, w_mask_420, avx512icl), + I422 => bd_fn!(w_mask::decl_fn, BD, w_mask_422, avx512icl), + I444 => bd_fn!(w_mask::decl_fn, BD, w_mask_444, avx512icl), + }); + + self.blend = bd_fn!(BD, blend, avx512icl); + self.blend_v = bd_fn!(BD, blend_v, avx512icl); + self.blend_h = bd_fn!(BD, blend_h, avx512icl); + self.warp8x8 = bd_fn!(BD, warp_affine_8x8, avx512icl); + self.warp8x8t = bd_fn!(BD, warp_affine_8x8t, avx512icl); + self.resize = bd_fn!(BD, resize, avx512icl); } - (*c).mc[Regular8Tap as usize] = bd_fn!(BD, put_8tap_regular, avx512icl); - (*c).mc[RegularSmooth8Tap as usize] = bd_fn!(BD, put_8tap_regular_smooth, avx512icl); - (*c).mc[RegularSharp8Tap as usize] = bd_fn!(BD, put_8tap_regular_sharp, avx512icl); - (*c).mc[SmoothRegular8Tap as usize] = bd_fn!(BD, put_8tap_smooth_regular, avx512icl); - (*c).mc[Smooth8Tap as usize] = bd_fn!(BD, put_8tap_smooth, avx512icl); - (*c).mc[SmoothSharp8Tap as usize] = bd_fn!(BD, put_8tap_smooth_sharp, avx512icl); - (*c).mc[SharpRegular8Tap as usize] = bd_fn!(BD, put_8tap_sharp_regular, avx512icl); - (*c).mc[SharpSmooth8Tap as usize] = bd_fn!(BD, put_8tap_sharp_smooth, avx512icl); - (*c).mc[Sharp8Tap as usize] = bd_fn!(BD, put_8tap_sharp, avx512icl); - (*c).mc[Bilinear as usize] = bd_fn!(BD, put_bilin, avx512icl); - - (*c).mct[Regular8Tap as usize] = bd_fn!(BD, prep_8tap_regular, avx512icl); - (*c).mct[RegularSmooth8Tap as usize] = bd_fn!(BD, prep_8tap_regular_smooth, avx512icl); - (*c).mct[RegularSharp8Tap as usize] = bd_fn!(BD, prep_8tap_regular_sharp, avx512icl); - (*c).mct[SmoothRegular8Tap as usize] = bd_fn!(BD, prep_8tap_smooth_regular, avx512icl); - (*c).mct[Smooth8Tap as usize] = bd_fn!(BD, prep_8tap_smooth, avx512icl); - (*c).mct[SmoothSharp8Tap as usize] = bd_fn!(BD, prep_8tap_smooth_sharp, avx512icl); - (*c).mct[SharpRegular8Tap as usize] = bd_fn!(BD, prep_8tap_sharp_regular, avx512icl); - (*c).mct[SharpSmooth8Tap as usize] = bd_fn!(BD, prep_8tap_sharp_smooth, avx512icl); - (*c).mct[Sharp8Tap as usize] = bd_fn!(BD, prep_8tap_sharp, avx512icl); - (*c).mct[Bilinear as usize] = bd_fn!(BD, prep_bilin, avx512icl); - - (*c).avg = bd_fn!(BD, avg, avx512icl); - (*c).w_avg = bd_fn!(BD, w_avg, avx512icl); - (*c).mask = bd_fn!(BD, mask, avx512icl); - - (*c).w_mask[0] = bd_fn!(BD, w_mask_444, avx512icl); - (*c).w_mask[1] = bd_fn!(BD, w_mask_422, avx512icl); - (*c).w_mask[2] = bd_fn!(BD, w_mask_420, avx512icl); - - (*c).blend = bd_fn!(BD, blend, avx512icl); - (*c).blend_v = bd_fn!(BD, blend_v, avx512icl); - (*c).blend_h = bd_fn!(BD, blend_h, avx512icl); - (*c).warp8x8 = bd_fn!(BD, warp_affine_8x8, avx512icl); - (*c).warp8x8t = bd_fn!(BD, warp_affine_8x8t, avx512icl); - (*c).resize = bd_fn!(BD, resize, avx512icl); + self } -} -#[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] -#[inline(always)] -unsafe fn mc_dsp_init_arm(c: *mut Rav1dMCDSPContext) { - use Filter2d::*; - - let flags = rav1d_get_cpu_flags(); + #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64")))] + #[inline(always)] + const fn init_arm(mut self, flags: CpuFlags) -> Self { + if !flags.contains(CpuFlags::NEON) { + return self; + } - if !flags.contains(CpuFlags::NEON) { - return; + self.mc = enum_map!(Filter2d => mc::Fn; match key { + Regular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular, neon), + RegularSmooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular_smooth, neon), + RegularSharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_regular_sharp, neon), + SmoothRegular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth_regular, neon), + Smooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth, neon), + SmoothSharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_smooth_sharp, neon), + SharpRegular8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp_regular, neon), + SharpSmooth8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp_smooth, neon), + Sharp8Tap => bd_fn!(mc::decl_fn, BD, put_8tap_sharp, neon), + Bilinear => bd_fn!(mc::decl_fn, BD, put_bilin, neon), + }); + self.mct = enum_map!(Filter2d => mct::Fn; match key { + Regular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular, neon), + RegularSmooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular_smooth, neon), + RegularSharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_regular_sharp, neon), + SmoothRegular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth_regular, neon), + Smooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth, neon), + SmoothSharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_smooth_sharp, neon), + SharpRegular8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp_regular, neon), + SharpSmooth8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp_smooth, neon), + Sharp8Tap => bd_fn!(mct::decl_fn, BD, prep_8tap_sharp, neon), + Bilinear => bd_fn!(mct::decl_fn, BD, prep_bilin, neon), + }); + + self.avg = bd_fn!(BP, avg, neon); + self.w_avg = bd_fn!(BP, w_avg, neon); + self.mask = bd_fn!(BP, mask, neon); + self.blend = bd_fn!(BP, blend, neon); + self.blend_h = bd_fn!(BP, blend_h, neon); + self.blend_v = bd_fn!(BP, blend_v, neon); + + self.w_mask = enum_map!(Rav1dPixelLayoutSubSampled => w_mask::Fn; match key { + I420 => bd_fn!(w_mask::decl_fn, BD, w_mask_420, neon), + I422 => bd_fn!(w_mask::decl_fn, BD, w_mask_422, neon), + I444 => bd_fn!(w_mask::decl_fn, BD, w_mask_444, neon), + }); + + self.warp8x8 = bd_fn!(BP, warp_affine_8x8, neon); + self.warp8x8t = bd_fn!(BP, warp_affine_8x8t, neon); + self.emu_edge = bd_fn!(BP, emu_edge, neon); + + self } - (*c).mc[Regular8Tap as usize] = bd_fn!(BP, put_8tap_regular, neon); - (*c).mc[RegularSmooth8Tap as usize] = bd_fn!(BP, put_8tap_regular_smooth, neon); - (*c).mc[RegularSharp8Tap as usize] = bd_fn!(BP, put_8tap_regular_sharp, neon); - (*c).mc[SmoothRegular8Tap as usize] = bd_fn!(BP, put_8tap_smooth_regular, neon); - (*c).mc[Smooth8Tap as usize] = bd_fn!(BP, put_8tap_smooth, neon); - (*c).mc[SmoothSharp8Tap as usize] = bd_fn!(BP, put_8tap_smooth_sharp, neon); - (*c).mc[SharpRegular8Tap as usize] = bd_fn!(BP, put_8tap_sharp_regular, neon); - (*c).mc[SharpSmooth8Tap as usize] = bd_fn!(BP, put_8tap_sharp_smooth, neon); - (*c).mc[Sharp8Tap as usize] = bd_fn!(BP, put_8tap_sharp, neon); - (*c).mc[Bilinear as usize] = bd_fn!(BP, put_bilin, neon); - - (*c).mct[Regular8Tap as usize] = bd_fn!(BP, prep_8tap_regular, neon); - (*c).mct[RegularSmooth8Tap as usize] = bd_fn!(BP, prep_8tap_regular_smooth, neon); - (*c).mct[RegularSharp8Tap as usize] = bd_fn!(BP, prep_8tap_regular_sharp, neon); - (*c).mct[SmoothRegular8Tap as usize] = bd_fn!(BP, prep_8tap_smooth_regular, neon); - (*c).mct[Smooth8Tap as usize] = bd_fn!(BP, prep_8tap_smooth, neon); - (*c).mct[SmoothSharp8Tap as usize] = bd_fn!(BP, prep_8tap_smooth_sharp, neon); - (*c).mct[SharpRegular8Tap as usize] = bd_fn!(BP, prep_8tap_sharp_regular, neon); - (*c).mct[SharpSmooth8Tap as usize] = bd_fn!(BP, prep_8tap_sharp_smooth, neon); - (*c).mct[Sharp8Tap as usize] = bd_fn!(BP, prep_8tap_sharp, neon); - (*c).mct[Bilinear as usize] = bd_fn!(BP, prep_bilin, neon); - - (*c).avg = bd_fn!(BP, avg, neon); - (*c).w_avg = bd_fn!(BP, w_avg, neon); - (*c).mask = bd_fn!(BP, mask, neon); - (*c).blend = bd_fn!(BP, blend, neon); - (*c).blend_h = bd_fn!(BP, blend_h, neon); - (*c).blend_v = bd_fn!(BP, blend_v, neon); - - (*c).w_mask[0] = bd_fn!(BP, w_mask_444, neon); - (*c).w_mask[1] = bd_fn!(BP, w_mask_422, neon); - (*c).w_mask[2] = bd_fn!(BP, w_mask_420, neon); - - (*c).warp8x8 = bd_fn!(BP, warp_affine_8x8, neon); - (*c).warp8x8t = bd_fn!(BP, warp_affine_8x8t, neon); - (*c).emu_edge = bd_fn!(BP, emu_edge, neon); -} + #[inline(always)] + const fn init(self, flags: CpuFlags) -> Self { + #[cfg(feature = "asm")] + { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + return self.init_x86::(flags); + } + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + { + return self.init_arm::(flags); + } + } -#[cold] -pub unsafe fn rav1d_mc_dsp_init(c: *mut Rav1dMCDSPContext) { - use Filter2d::*; - - (*c).mc[Regular8Tap as usize] = put_8tap_regular_c_erased::; - (*c).mc[RegularSmooth8Tap as usize] = put_8tap_regular_smooth_c_erased::; - (*c).mc[RegularSharp8Tap as usize] = put_8tap_regular_sharp_c_erased::; - (*c).mc[SharpRegular8Tap as usize] = put_8tap_sharp_regular_c_erased::; - (*c).mc[SharpSmooth8Tap as usize] = put_8tap_sharp_smooth_c_erased::; - (*c).mc[Sharp8Tap as usize] = put_8tap_sharp_c_erased::; - (*c).mc[SmoothRegular8Tap as usize] = put_8tap_smooth_regular_c_erased::; - (*c).mc[Smooth8Tap as usize] = put_8tap_smooth_c_erased::; - (*c).mc[SmoothSharp8Tap as usize] = put_8tap_smooth_sharp_c_erased::; - (*c).mc[Bilinear as usize] = put_bilin_c_erased::; - - (*c).mct[Regular8Tap as usize] = prep_8tap_regular_c_erased::; - (*c).mct[RegularSmooth8Tap as usize] = prep_8tap_regular_smooth_c_erased::; - (*c).mct[RegularSharp8Tap as usize] = prep_8tap_regular_sharp_c_erased::; - (*c).mct[SharpRegular8Tap as usize] = prep_8tap_sharp_regular_c_erased::; - (*c).mct[SharpSmooth8Tap as usize] = prep_8tap_sharp_smooth_c_erased::; - (*c).mct[Sharp8Tap as usize] = prep_8tap_sharp_c_erased::; - (*c).mct[SmoothRegular8Tap as usize] = prep_8tap_smooth_regular_c_erased::; - (*c).mct[Smooth8Tap as usize] = prep_8tap_smooth_c_erased::; - (*c).mct[SmoothSharp8Tap as usize] = prep_8tap_smooth_sharp_c_erased::; - (*c).mct[Bilinear as usize] = prep_bilin_c_erased::; - - (*c).mc_scaled[Regular8Tap as usize] = put_8tap_regular_scaled_c_erased::; - (*c).mc_scaled[RegularSmooth8Tap as usize] = put_8tap_regular_smooth_scaled_c_erased::; - (*c).mc_scaled[RegularSharp8Tap as usize] = put_8tap_regular_sharp_scaled_c_erased::; - (*c).mc_scaled[SharpRegular8Tap as usize] = put_8tap_sharp_regular_scaled_c_erased::; - (*c).mc_scaled[SharpSmooth8Tap as usize] = put_8tap_sharp_smooth_scaled_c_erased::; - (*c).mc_scaled[Sharp8Tap as usize] = put_8tap_sharp_scaled_c_erased::; - (*c).mc_scaled[SmoothRegular8Tap as usize] = put_8tap_smooth_regular_scaled_c_erased::; - (*c).mc_scaled[Smooth8Tap as usize] = put_8tap_smooth_scaled_c_erased::; - (*c).mc_scaled[SmoothSharp8Tap as usize] = put_8tap_smooth_sharp_scaled_c_erased::; - (*c).mc_scaled[Bilinear as usize] = put_bilin_scaled_c_erased::; - - (*c).mct_scaled[Regular8Tap as usize] = prep_8tap_regular_scaled_c_erased::; - (*c).mct_scaled[RegularSmooth8Tap as usize] = prep_8tap_regular_smooth_scaled_c_erased::; - (*c).mct_scaled[RegularSharp8Tap as usize] = prep_8tap_regular_sharp_scaled_c_erased::; - (*c).mct_scaled[SharpRegular8Tap as usize] = prep_8tap_sharp_regular_scaled_c_erased::; - (*c).mct_scaled[SharpSmooth8Tap as usize] = prep_8tap_sharp_smooth_scaled_c_erased::; - (*c).mct_scaled[Sharp8Tap as usize] = prep_8tap_sharp_scaled_c_erased::; - (*c).mct_scaled[SmoothRegular8Tap as usize] = prep_8tap_smooth_regular_scaled_c_erased::; - (*c).mct_scaled[Smooth8Tap as usize] = prep_8tap_smooth_scaled_c_erased::; - (*c).mct_scaled[SmoothSharp8Tap as usize] = prep_8tap_smooth_sharp_scaled_c_erased::; - (*c).mct_scaled[Bilinear as usize] = prep_bilin_scaled_c_erased::; - - (*c).avg = avg_c_erased::; - (*c).w_avg = w_avg_c_erased::; - (*c).mask = mask_c_erased::; - - (*c).w_mask[0] = w_mask_444_c_erased::; - (*c).w_mask[1] = w_mask_422_c_erased::; - (*c).w_mask[2] = w_mask_420_c_erased::; - - (*c).blend = blend_c_erased::; - (*c).blend_v = blend_v_c_erased::; - (*c).blend_h = blend_h_c_erased::; - (*c).warp8x8 = warp_affine_8x8_c_erased::; - (*c).warp8x8t = warp_affine_8x8t_c_erased::; - (*c).emu_edge = emu_edge_c_erased::; - (*c).resize = resize_c_erased::; - - #[cfg(feature = "asm")] - cfg_if::cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - mc_dsp_init_x86::(c); - } else if #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] { - mc_dsp_init_arm::(c); + #[allow(unreachable_code)] // Reachable on some #[cfg]s. + { + let _ = flags; + self } } + + pub const fn new(flags: CpuFlags) -> Self { + Self::default::().init::(flags) + } } diff --git a/src/obu.rs b/src/obu.rs index 307f17fef..18def8e5b 100644 --- a/src/obu.rs +++ b/src/obu.rs @@ -67,7 +67,6 @@ use crate::src::picture::rav1d_picture_copy_props; use crate::src::picture::rav1d_thread_picture_ref; use crate::src::picture::rav1d_thread_picture_unref; use crate::src::picture::PictureFlags; -use crate::src::r#ref::rav1d_ref_dec; use crate::src::thread_task::FRAME_ERROR; use std::array; use std::cmp; @@ -2296,7 +2295,7 @@ unsafe fn parse_obus( rav1d_thread_picture_unref(&mut c.refs[i as usize].p); } let _ = mem::take(&mut c.refs[i as usize].segmap); - rav1d_ref_dec(&mut c.refs[i as usize].refmvs); + let _ = mem::take(&mut c.refs[i as usize].refmvs); let _ = mem::take(&mut c.cdf[i]); } c.frame_flags @@ -2622,7 +2621,7 @@ unsafe fn parse_obus( c.cdf[i as usize] = c.cdf[r as usize].clone(); c.refs[i as usize].segmap = c.refs[r as usize].segmap.clone(); - rav1d_ref_dec(&mut c.refs[i as usize].refmvs); + let _ = mem::take(&mut c.refs[i as usize].refmvs); } } c.frame_hdr = None; diff --git a/src/recon.rs b/src/recon.rs index 1aaa571ec..848751c4c 100644 --- a/src/recon.rs +++ b/src/recon.rs @@ -10,6 +10,7 @@ use crate::include::common::intops::clip; use crate::include::common::intops::ulog2; use crate::include::dav1d::dav1d::Rav1dInloopFilterType; use crate::include::dav1d::headers::Rav1dPixelLayout; +use crate::include::dav1d::headers::Rav1dPixelLayoutSubSampled; use crate::include::dav1d::headers::Rav1dWarpedMotionParams; use crate::include::dav1d::headers::Rav1dWarpedMotionType; use crate::include::dav1d::picture::RAV1D_PICTURE_ALIGNMENT; @@ -1607,7 +1608,6 @@ unsafe fn read_coef_tree( mut dst: *mut BD::Pixel, ) { let ts = &mut *f.ts.offset((*t).ts as isize); - let dsp: *const Rav1dDSPContext = f.dsp; let t_dim: *const TxfmInfo = &*dav1d_txfm_dimensions.as_ptr().offset(ytx as isize) as *const TxfmInfo; let txw = (*t_dim).w as c_int; @@ -1778,7 +1778,7 @@ unsafe fn read_coef_tree( "dq", ); } - ((*dsp).itx.itxfm_add[ytx as usize][txtp as usize]) + (f.dsp.itx.itxfm_add[ytx as usize][txtp as usize]) .expect("non-null function pointer")( dst.cast(), f.cur.stride[0], @@ -2068,7 +2068,7 @@ unsafe fn mc( || dy + bh4 * v_mul + (my != 0) as c_int * 4 > h { let emu_edge_buf = BD::select_mut(emu_edge); - ((*f.dsp).mc.emu_edge)( + (f.dsp.mc.emu_edge)( (bw4 * h_mul + (mx != 0) as c_int * 7) as intptr_t, (bh4 * v_mul + (my != 0) as c_int * 7) as intptr_t, w as intptr_t, @@ -2090,27 +2090,27 @@ unsafe fn mc( .offset(dx as isize); } if !dst8.is_null() { - (*f.dsp).mc.mc[filter_2d as usize]( - dst8.cast(), + f.dsp.mc.mc[filter_2d].call::( + dst8, dst_stride, - r#ref.cast(), + r#ref, ref_stride, bw4 * h_mul, bh4 * v_mul, mx << (ss_hor == 0) as c_int, my << (ss_ver == 0) as c_int, - f.bitdepth_max, + BitDepth::from_c(f.bitdepth_max), ); } else { - (*f.dsp).mc.mct[filter_2d as usize]( + f.dsp.mc.mct[filter_2d].call::( dst16, - r#ref.cast(), + r#ref, ref_stride, bw4 * h_mul, bh4 * v_mul, mx << (ss_hor == 0) as c_int, my << (ss_ver == 0) as c_int, - f.bitdepth_max, + BD::from_c(f.bitdepth_max), ); } } else { @@ -2147,7 +2147,7 @@ unsafe fn mc( let h = refp.p.p.h + ss_ver >> ss_ver; if left < 3 || top < 3 || right + 4 > w || bottom + 4 > h { let emu_edge_buf = BD::select_mut(emu_edge); - ((*f.dsp).mc.emu_edge)( + (f.dsp.mc.emu_edge)( (right - left + 7) as intptr_t, (bottom - top + 7) as intptr_t, w as intptr_t, @@ -2170,10 +2170,10 @@ unsafe fn mc( .offset(left as isize); } if !dst8.is_null() { - (*f.dsp).mc.mc_scaled[filter_2d as usize]( - dst8.cast(), + f.dsp.mc.mc_scaled[filter_2d].call::( + dst8, dst_stride, - r#ref.cast(), + r#ref, ref_stride, bw4 * h_mul, bh4 * v_mul, @@ -2181,12 +2181,12 @@ unsafe fn mc( pos_y & 0x3ff, f.svc[refidx][0].step, f.svc[refidx][1].step, - f.bitdepth_max, + BD::from_c(f.bitdepth_max), ); } else { - (*f.dsp).mc.mct_scaled[filter_2d as usize]( + f.dsp.mc.mct_scaled[filter_2d].call::( dst16, - r#ref.cast(), + r#ref, ref_stride, bw4 * h_mul, bh4 * v_mul, @@ -2194,7 +2194,7 @@ unsafe fn mc( pos_y & 0x3ff, f.svc[refidx][0].step, f.svc[refidx][1].step, - f.bitdepth_max, + BD::from_c(f.bitdepth_max), ); } } @@ -2252,7 +2252,7 @@ unsafe fn obmc( dav1d_filter_2d[(*t.a).filter[1][(bx4 + x + 1) as usize] as usize] [(*t.a).filter[0][(bx4 + x + 1) as usize] as usize], )?; - ((*f.dsp).mc.blend_h)( + (f.dsp.mc.blend_h)( dst.offset((x * h_mul) as isize).cast(), dst_stride, lap.cast(), @@ -2292,7 +2292,7 @@ unsafe fn obmc( dav1d_filter_2d[t.l.filter[1][(by4 + y + 1) as usize] as usize] [t.l.filter[0][(by4 + y + 1) as usize] as usize], )?; - ((*f.dsp).mc.blend_v)( + (f.dsp.mc.blend_v)( dst.offset((y * v_mul) as isize * BD::pxstride(dst_stride)) .cast(), dst_stride, @@ -2321,7 +2321,6 @@ unsafe fn warp_affine( wmp: &Rav1dWarpedMotionParams, ) -> Result<(), ()> { assert!(dst8.is_null() ^ dst16.is_null()); - let dsp = &*f.dsp; let ss_ver = (pl != 0 && f.cur.p.layout == Rav1dPixelLayout::I420) as c_int; let ss_hor = (pl != 0 && f.cur.p.layout != Rav1dPixelLayout::I444) as c_int; let h_mul = 4 >> ss_hor; @@ -2347,7 +2346,7 @@ unsafe fn warp_affine( let mut ref_stride = refp.p.stride[(pl != 0) as usize]; if dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height { let emu_edge_buf = BD::select_mut(emu_edge); - ((*f.dsp).mc.emu_edge)( + (f.dsp.mc.emu_edge)( 15, 15, width as intptr_t, @@ -2367,7 +2366,7 @@ unsafe fn warp_affine( .offset(dx as isize); } if !dst16.is_null() { - (dsp.mc.warp8x8t)( + (f.dsp.mc.warp8x8t)( &mut *dst16.offset(x as isize), dstride, ref_ptr.cast(), @@ -2378,7 +2377,7 @@ unsafe fn warp_affine( f.bitdepth_max, ); } else { - (dsp.mc.warp8x8)( + (f.dsp.mc.warp8x8)( dst8.offset(x as isize).cast(), dstride, ref_ptr.cast(), @@ -2467,7 +2466,7 @@ pub(crate) unsafe fn rav1d_recon_b_intra( BD::select(&t.scratch.c2rust_unnamed_0.interintra_edge_pal); interintra_edge_pal.pal[0].as_ptr() }; - (*f.dsp).ipred.pal_pred.call::( + f.dsp.ipred.pal_pred.call::( dst, f.cur.stride[0], pal, @@ -2848,7 +2847,7 @@ pub(crate) unsafe fn rav1d_recon_b_intra( &t.scratch.c2rust_unnamed_0.pal_idx[(bw4 * bh4 * 16) as usize..], ) }; - (*f.dsp).ipred.pal_pred.call::( + f.dsp.ipred.pal_pred.call::( (f.cur.data.data[1] as *mut BD::Pixel).offset(uv_dstoff as isize), f.cur.stride[1], pal[1].as_ptr(), @@ -2856,7 +2855,7 @@ pub(crate) unsafe fn rav1d_recon_b_intra( cbw4 * 4, cbh4 * 4, ); - (*f.dsp).ipred.pal_pred.call::( + f.dsp.ipred.pal_pred.call::( (f.cur.data.data[2] as *mut BD::Pixel).offset(uv_dstoff as isize), f.cur.stride[1], pal[2].as_ptr(), @@ -3165,8 +3164,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( bs: BlockSize, b: &Av1Block, ) -> Result<(), ()> { - let ts = &mut *f.ts.offset(t.ts as isize); - let dsp = &*f.dsp; + let ts: &mut super::internal::Rav1dTileState = &mut *f.ts.offset(t.ts as isize); let bx4 = t.b.x & 31; let by4 = t.b.y & 31; let ss_ver = (f.cur.p.layout == Rav1dPixelLayout::I420) as c_int; @@ -3186,6 +3184,12 @@ pub(crate) unsafe fn rav1d_recon_b_inter( } else { Rav1dPixelLayout::I444 - f.cur.p.layout } as usize; + let chr_layout_idx_w_mask = f + .cur + .p + .layout + .try_into() + .unwrap_or(Rav1dPixelLayoutSubSampled::I444); let cbh4 = bh4 + ss_ver >> ss_ver; let cbw4 = bw4 + ss_hor >> ss_hor; let mut dst = (f.cur.data.data[0] as *mut BD::Pixel) @@ -3289,7 +3293,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( let mut mask = &[][..]; match comp_inter_type { CompInterType::Avg => { - (dsp.mc.avg)( + (f.dsp.mc.avg)( dst.cast(), f.cur.stride[0], tmp[0].as_mut_ptr(), @@ -3301,7 +3305,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( } CompInterType::WeightedAvg => { jnt_weight = f.jnt_weights[b.r#ref()[0] as usize][b.r#ref()[1] as usize] as c_int; - (dsp.mc.w_avg)( + (f.dsp.mc.w_avg)( dst.cast(), f.cur.stride[0], tmp[0].as_mut_ptr(), @@ -3313,8 +3317,8 @@ pub(crate) unsafe fn rav1d_recon_b_inter( ); } CompInterType::Seg => { - dsp.mc.w_mask[chr_layout_idx]( - dst.cast(), + f.dsp.mc.w_mask[chr_layout_idx_w_mask].call( + dst, f.cur.stride[0], tmp[b.mask_sign() as usize].as_mut_ptr(), tmp[(b.mask_sign() == 0) as usize].as_mut_ptr(), @@ -3322,13 +3326,13 @@ pub(crate) unsafe fn rav1d_recon_b_inter( bh4 * 4, seg_mask.as_mut_ptr(), b.mask_sign() as c_int, - f.bitdepth_max, + BD::from_c(f.bitdepth_max), ); mask = &seg_mask[..]; } CompInterType::Wedge => { mask = dav1d_wedge_masks[bs as usize][0][0][b.wedge_idx() as usize]; - (dsp.mc.mask)( + (f.dsp.mc.mask)( dst.cast(), f.cur.stride[0], tmp[b.mask_sign() as usize].as_mut_ptr(), @@ -3389,7 +3393,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( .offset(uvdstoff as isize); match comp_inter_type { CompInterType::Avg => { - (dsp.mc.avg)( + (f.dsp.mc.avg)( uvdst.cast(), f.cur.stride[1], tmp[0].as_mut_ptr(), @@ -3400,7 +3404,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( ); } CompInterType::WeightedAvg => { - (dsp.mc.w_avg)( + (f.dsp.mc.w_avg)( uvdst.cast(), f.cur.stride[1], tmp[0].as_mut_ptr(), @@ -3412,7 +3416,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( ); } CompInterType::Seg | CompInterType::Wedge => { - (dsp.mc.mask)( + (f.dsp.mc.mask)( uvdst.cast(), f.cur.stride[1], tmp[b.mask_sign() as usize].as_mut_ptr(), @@ -3521,7 +3525,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( ); let tl_edge = &tl_edge_array[tl_edge_offset..]; let tmp = &mut interintra_edge_pal.interintra; - (*dsp).ipred.intra_pred[m as usize].call( + f.dsp.ipred.intra_pred[m as usize].call( tmp.as_mut_ptr(), 4 * bw4 as isize * ::core::mem::size_of::() as isize, tl_edge.as_ptr(), @@ -3540,7 +3544,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( dav1d_wedge_masks[bs as usize][0][0][b.wedge_idx() as usize] } }; - (dsp.mc.blend)( + (f.dsp.mc.blend)( dst.cast(), f.cur.stride[0], tmp.as_mut_ptr().cast(), @@ -3819,7 +3823,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( ); let tl_edge = &tl_edge_array[tl_edge_offset..]; let tmp = &mut interintra_edge_pal.interintra; - (*dsp).ipred.intra_pred[m as usize].call( + f.dsp.ipred.intra_pred[m as usize].call( tmp.as_mut_ptr(), cbw4 as isize * 4 * ::core::mem::size_of::() as isize, tl_edge.as_ptr(), @@ -3830,7 +3834,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( 0, BD::from_c(f.bitdepth_max), ); - (dsp.mc.blend)( + (f.dsp.mc.blend)( uvdst.cast(), f.cur.stride[1], tmp.as_mut_ptr().cast(), @@ -4010,7 +4014,7 @@ pub(crate) unsafe fn rav1d_recon_b_inter( "dq", ); } - (dsp.itx.itxfm_add[b.uvtx as usize][txtp as usize]) + (f.dsp.itx.itxfm_add[b.uvtx as usize][txtp as usize]) .expect("non-null function pointer")( uvdst.offset((4 * x) as isize).cast(), f.cur.stride[1], @@ -4264,7 +4268,7 @@ pub(crate) unsafe fn rav1d_filter_sbrow_resize( let dst_w = f.sr_cur.p.p.w + ss_hor >> ss_hor; let src_w = 4 * f.bw + ss_hor >> ss_hor; let img_h = f.cur.p.h - sbsz * 4 * sby + ss_ver >> ss_ver; - ((*f.dsp).mc.resize)( + (f.dsp.mc.resize)( dst.cast(), dst_stride, src.cast(), diff --git a/src/ref.rs b/src/ref.rs index c0087ae3c..df0bb9816 100644 --- a/src/ref.rs +++ b/src/ref.rs @@ -1,7 +1,3 @@ -use crate::src::mem::rav1d_mem_pool_pop; -use crate::src::mem::rav1d_mem_pool_push; -use crate::src::mem::Rav1dMemPool; -use crate::src::mem::Rav1dMemPoolBuffer; use libc::free; use libc::malloc; use std::ffi::c_int; @@ -24,37 +20,6 @@ pub unsafe fn rav1d_ref_inc(r#ref: *mut Rav1dRef) { (*r#ref).ref_cnt.fetch_add(1, Ordering::Relaxed); } -unsafe extern "C" fn pool_free_callback(data: *const u8, user_data: *mut c_void) { - rav1d_mem_pool_push( - data as *mut Rav1dMemPool, - user_data as *mut Rav1dMemPoolBuffer, - ); -} - -pub unsafe fn rav1d_ref_create_using_pool( - pool: *mut Rav1dMemPool, - mut size: usize, -) -> *mut Rav1dRef { - size = size - .wrapping_add(::core::mem::size_of::<*mut c_void>()) - .wrapping_sub(1) - & !(::core::mem::size_of::<*mut c_void>()).wrapping_sub(1); - let buf: *mut Rav1dMemPoolBuffer = - rav1d_mem_pool_pop(pool, size.wrapping_add(::core::mem::size_of::())); - if buf.is_null() { - return 0 as *mut Rav1dRef; - } - let res: *mut Rav1dRef = - &mut *(buf as *mut Rav1dRef).offset(-(1 as c_int) as isize) as *mut Rav1dRef; - (*res).data = (*buf).data; - (*res).const_data = pool as *const c_void; - (*res).ref_cnt = AtomicI32::new(1); - (*res).free_ref = 0 as c_int; - (*res).free_callback = Some(pool_free_callback); - (*res).user_data = buf as *mut c_void; - return res; -} - pub unsafe fn rav1d_ref_wrap( ptr: *const u8, free_callback: Option ()>, diff --git a/src/refmvs.rs b/src/refmvs.rs index 82a52da68..a53b48b4d 100644 --- a/src/refmvs.rs +++ b/src/refmvs.rs @@ -6,6 +6,7 @@ use crate::include::dav1d::headers::Rav1dWarpedMotionType; use crate::src::align::Align16; use crate::src::align::AlignedVec64; use crate::src::disjoint_mut::DisjointMut; +use crate::src::disjoint_mut::DisjointMutArcSlice; use crate::src::env::fix_mv_precision; use crate::src::env::get_gmv_2d; use crate::src::env::get_poc_diff; @@ -25,7 +26,6 @@ use std::ffi::c_uint; use std::marker::PhantomData; use std::mem; use std::ptr; -use std::slice; use zerocopy::FromZeroes; #[cfg(feature = "asm")] @@ -52,6 +52,7 @@ extern "C" { row_start8: c_int, _r: *const FFISafe>>, _ri: &[usize; 31], + _rp: *const FFISafe>, ); } @@ -64,6 +65,8 @@ extern "C" { col_end8: c_int, row_start8: c_int, row_end8: c_int, + _rp_proj: *const FFISafe>>, + _rp_ref: *const FFISafe<[Option>; 7]>, ); } @@ -96,6 +99,7 @@ extern "C" { row_start8: c_int, _r: *const FFISafe>>, _ri: &[usize; 31], + _rp: *const FFISafe>, ); fn dav1d_save_tmvs_avx512icl( rp: *mut refmvs_temporal_block, @@ -108,6 +112,7 @@ extern "C" { row_start8: c_int, _r: *const FFISafe>>, _ri: &[usize; 31], + _rp: *const FFISafe>, ); } @@ -123,7 +128,7 @@ extern "C" { ); } -#[derive(Clone, Copy, Default)] +#[derive(Clone, Copy, Default, PartialEq, Eq)] #[repr(C, packed)] pub struct refmvs_temporal_block { pub mv: mv, @@ -210,8 +215,6 @@ pub(crate) struct RefMvsFrame { pub mfmv_ref2cur: [c_int; 3], pub mfmv_ref2ref: [[c_int; 7]; 3], pub n_mfmvs: c_int, - pub rp: *mut refmvs_temporal_block, - pub rp_ref: *const *mut refmvs_temporal_block, pub rp_proj: DisjointMut>, pub rp_stride: u32, pub r: DisjointMut>, @@ -221,61 +224,6 @@ pub(crate) struct RefMvsFrame { pub n_frame_threads: u32, } -impl RefMvsFrame { - pub fn as_mut_dav1d<'a>(&'a self) -> refmvs_frame<'a> { - let Self { - iw4, - ih4, - iw8, - ih8, - sbsz, - use_ref_frame_mvs, - sign_bias, - mfmv_sign, - pocdiff, - mfmv_ref, - mfmv_ref2cur, - mfmv_ref2ref, - n_mfmvs, - rp, - rp_ref, - ref rp_proj, - rp_stride, - ref r, - r_stride, - n_tile_rows, - n_tile_threads, - n_frame_threads, - } = *self; - refmvs_frame { - _lifetime: PhantomData, - _frm_hdr: ptr::null(), // never used - iw4, - ih4, - iw8, - ih8, - sbsz, - use_ref_frame_mvs, - sign_bias, - mfmv_sign, - pocdiff, - mfmv_ref, - mfmv_ref2cur, - mfmv_ref2ref, - n_mfmvs, - rp, - rp_ref, - rp_proj: rp_proj.as_mut_ptr(), - rp_stride: rp_stride as _, - r: r.as_mut_ptr(), - r_stride: r_stride as _, - n_tile_rows: n_tile_rows as _, - n_tile_threads: n_tile_threads as _, - n_frame_threads: n_frame_threads as _, - } - } -} - #[repr(C)] pub struct refmvs_tile_range { pub start: c_int, @@ -312,6 +260,8 @@ pub(crate) type load_tmvs_fn = unsafe extern "C" fn( col_end8: c_int, row_start8: c_int, row_end8: c_int, + rp_proj: *const FFISafe>>, + rp_ref: *const FFISafe<[Option>; 7]>, ) -> (); pub type save_tmvs_fn = unsafe extern "C" fn( @@ -325,6 +275,7 @@ pub type save_tmvs_fn = unsafe extern "C" fn( row_start8: c_int, r: *const FFISafe>>, ri: &[usize; 31], + rp: *const FFISafe>, ) -> (); #[cfg(all(feature = "asm", any(target_arch = "arm", target_arch = "aarch64"),))] @@ -340,6 +291,7 @@ extern "C" { row_start8: c_int, _r: *const FFISafe>>, _ri: &[usize; 31], + _rp: *const FFISafe>, ); } @@ -355,12 +307,147 @@ pub type splat_mv_fn = unsafe extern "C" fn( #[repr(C)] pub(crate) struct Rav1dRefmvsDSPContext { - pub load_tmvs: load_tmvs_fn, - pub save_tmvs: save_tmvs_fn, - pub splat_mv: splat_mv_fn, + load_tmvs: load_tmvs_fn, + save_tmvs: save_tmvs_fn, + splat_mv: splat_mv_fn, } impl Rav1dRefmvsDSPContext { + pub unsafe fn load_tmvs( + &self, + rf: &RefMvsFrame, + rp: &Option>, + rp_ref: &[Option>; 7], + tile_row_idx: c_int, + col_start8: c_int, + col_end8: c_int, + row_start8: c_int, + row_end8: c_int, + ) { + let RefMvsFrame { + iw4, + ih4, + iw8, + ih8, + sbsz, + use_ref_frame_mvs, + sign_bias, + mfmv_sign, + pocdiff, + mfmv_ref, + mfmv_ref2cur, + mfmv_ref2ref, + n_mfmvs, + ref rp_proj, + rp_stride, + ref r, + r_stride, + n_tile_rows, + n_tile_threads, + n_frame_threads, + } = *rf; + fn mvs_to_dav1d( + mvs: &Option>, + ) -> *mut refmvs_temporal_block { + mvs.as_ref() + .map(|rp| rp.inner.as_mut_ptr()) + .unwrap_or_else(ptr::null_mut) + } + let rp_ref_dav1d = rp_ref.each_ref().map(mvs_to_dav1d); + let rf_dav1d = refmvs_frame { + _lifetime: PhantomData, + _frm_hdr: ptr::null(), // never used + iw4, + ih4, + iw8, + ih8, + sbsz, + use_ref_frame_mvs, + sign_bias, + mfmv_sign, + pocdiff, + mfmv_ref, + mfmv_ref2cur, + mfmv_ref2ref, + n_mfmvs, + rp: mvs_to_dav1d(rp), + rp_ref: rp_ref_dav1d.as_ptr(), + rp_proj: rp_proj.as_mut_ptr(), + rp_stride: rp_stride as _, + r: r.as_mut_ptr(), + r_stride: r_stride as _, + n_tile_rows: n_tile_rows as _, + n_tile_threads: n_tile_threads as _, + n_frame_threads: n_frame_threads as _, + }; + (self.load_tmvs)( + &rf_dav1d, + tile_row_idx, + col_start8, + col_end8, + row_start8, + row_end8, + FFISafe::new(&rf.rp_proj), + FFISafe::new(rp_ref), + ); + } + + // cache the current tile/sbrow (or frame/sbrow)'s projectable motion vectors + // into buffers for use in future frame's temporal MV prediction + pub unsafe fn save_tmvs( + &self, + rt: &refmvs_tile, + rf: &RefMvsFrame, + rp: &Option>, + col_start8: c_int, + col_end8: c_int, + row_start8: c_int, + row_end8: c_int, + ) { + assert!(row_start8 >= 0); + assert!((row_end8 - row_start8) as c_uint <= 16); + + let rp = &*rp.as_ref().unwrap(); + + let row_end8 = cmp::min(row_end8, rf.ih8); + let col_end8 = cmp::min(col_end8, rf.iw8); + let stride = rf.rp_stride as usize; + let ref_sign = &rf.mfmv_sign; + let ri = <&[_; 31]>::try_from(&rt.r[6..]).unwrap(); + + // SAFETY: Note that for asm calls, disjointedness is unchecked here, + // even with `#[cfg(debug_assertions)]`. This is because the disjointedness + // is more fine-grained than the pointers passed to asm. + // For the Rust fallback fn, the extra args `&rf.r` and `ri` + // are passed to allow for disjointedness checking. + let rr = &ri.map(|ri| { + if ri > rf.r.len() { + return ptr::null(); + } + // SAFETY: `.add` is in-bounds; checked above. + unsafe { rf.r.as_mut_ptr().cast_const().add(ri) } + }); + + (self.save_tmvs)( + // SAFETY: Note that for asm calls, disjointedness is unchecked here, + // even with `#[cfg(debug_assertions)]`. This is because the disjointedness + // is more fine-grained than the pointers passed to asm. + // For the Rust fallback fn, the extra arg `rp` + // is passed to allow for disjointedness checking. + rp.inner.as_mut_ptr().add(row_start8 as usize * stride), + stride as isize, + rr, + ref_sign, + col_end8, + row_end8, + col_start8, + row_start8, + FFISafe::new(&rf.r), + ri, + FFISafe::new(rp), + ); + } + pub unsafe fn splat_mv( &self, rf: &RefMvsFrame, @@ -1269,53 +1356,6 @@ pub(crate) fn rav1d_refmvs_find( *ctx = refmv_ctx << 4 | globalmv_ctx << 3 | newmv_ctx; } -// cache the current tile/sbrow (or frame/sbrow)'s projectable motion vectors -// into buffers for use in future frame's temporal MV prediction -pub(crate) unsafe fn rav1d_refmvs_save_tmvs( - dsp: &Rav1dRefmvsDSPContext, - rt: &refmvs_tile, - rf: &RefMvsFrame, - col_start8: c_int, - col_end8: c_int, - row_start8: c_int, - row_end8: c_int, -) { - assert!(row_start8 >= 0); - assert!((row_end8 - row_start8) as c_uint <= 16); - let row_end8 = cmp::min(row_end8, rf.ih8); - let col_end8 = cmp::min(col_end8, rf.iw8); - let stride = rf.rp_stride as isize; - let ref_sign = &rf.mfmv_sign; - let rp = rf.rp.offset(row_start8 as isize * stride); - let ri = <&[_; 31]>::try_from(&rt.r[6..]).unwrap(); - - // SAFETY: Note that for asm calls, disjointedness is unchecked here, - // even with `#[cfg(debug_assertions)]`. This is because the disjointedness - // is more fine-grained than the pointers passed to asm. - // For the Rust fallback fn, the extra args `&rf.r` and `ri` - // are passed to do allow for disjointedness checking. - let rr = &ri.map(|ri| { - if ri > rf.r.len() { - return ptr::null(); - } - // SAFETY: `.add` is in-bounds; checked above. - unsafe { rf.r.as_mut_ptr().cast_const().add(ri) } - }); - - (dsp.save_tmvs)( - rp, - stride, - rr, - ref_sign, - col_end8, - row_end8, - col_start8, - row_start8, - FFISafe::new(&rf.r), - ri, - ); -} - pub(crate) fn rav1d_refmvs_tile_sbrow_init( rf: &RefMvsFrame, tile_col_start4: c_int, @@ -1383,7 +1423,11 @@ unsafe extern "C" fn load_tmvs_c( col_end8: c_int, row_start8: c_int, mut row_end8: c_int, + rp_proj: *const FFISafe>>, + rp_ref: *const FFISafe<[Option>; 7]>, ) { + let rp_proj = FFISafe::get(rp_proj); + let rp_ref = FFISafe::get(rp_ref); let rf = &*rf; if rf.n_tile_threads == 1 { @@ -1394,67 +1438,68 @@ unsafe extern "C" fn load_tmvs_c( row_end8 = cmp::min(row_end8, rf.ih8); let col_start8i = cmp::max(col_start8 - 8, 0); let col_end8i = cmp::min(col_end8 + 8, rf.iw8); - let stride = rf.rp_stride; - let mut rp_proj = rf - .rp_proj - .offset(16 * stride * tile_row_idx as isize + (row_start8 & 15) as isize * stride); - for _ in row_start8..row_end8 { - for x in col_start8..col_end8 { - (*rp_proj.offset(x as isize)).mv = mv::INVALID; + let stride = rf.rp_stride as usize; + let rp_proj_offset = 16 * stride * tile_row_idx as usize; + for y in row_start8..row_end8 { + let offset = rp_proj_offset + (y & 15) as usize * stride; + for rp_proj in + &mut *rp_proj.index_mut(offset + col_start8 as usize..offset + col_end8 as usize) + { + rp_proj.mv = mv::INVALID; } - rp_proj = rp_proj.offset(stride as isize); } - rp_proj = rf.rp_proj.offset(16 * stride * tile_row_idx as isize); for n in 0..rf.n_mfmvs { let ref2cur = rf.mfmv_ref2cur[n as usize]; if ref2cur == i32::MIN { continue; } - let r#ref = rf.mfmv_ref[n as usize] as c_int; - let ref_sign = r#ref - 4; - let mut r = (*rf.rp_ref.offset(r#ref as isize)) - .offset(row_start8 as isize * stride) - .cast_const(); + let r#ref = rf.mfmv_ref[n as usize]; + let ref_sign = r#ref as i32 - 4; + let r = &*rp_ref[r#ref as usize].as_ref().unwrap().inner; for y in row_start8..row_end8 { let y_sb_align = y & !7; let y_proj_start = cmp::max(y_sb_align, row_start8); let y_proj_end = cmp::min(y_sb_align + 8, row_end8); let mut x = col_start8i; while x < col_end8i { - let mut rb = r.offset(x as isize); - let b_ref = (*rb).r#ref; - if b_ref == 0 { + let mut rbi = y as usize * stride + x as usize; + let mut rb = *r.index(rbi); + if rb.r#ref == 0 { x += 1; continue; } - let ref2ref = rf.mfmv_ref2ref[n as usize][(b_ref - 1) as usize]; + let ref2ref = rf.mfmv_ref2ref[n as usize][(rb.r#ref - 1) as usize]; if ref2ref == 0 { x += 1; continue; } - let b_mv = (*rb).mv; - let offset = mv_projection(b_mv, ref2cur, ref2ref); + let offset = mv_projection(rb.mv, ref2cur, ref2ref); let mut pos_x = x + apply_sign((offset.x as c_int).abs() >> 6, offset.x as c_int ^ ref_sign); let pos_y = y + apply_sign((offset.y as c_int).abs() >> 6, offset.y as c_int ^ ref_sign); if pos_y >= y_proj_start && pos_y < y_proj_end { - let pos = (pos_y & 15) as isize * stride; + let pos = (pos_y & 15) as usize * stride; loop { let x_sb_align = x & !7; if pos_x >= cmp::max(x_sb_align - 8, col_start8) && pos_x < cmp::min(x_sb_align + 16, col_end8) { - (*rp_proj.offset(pos + pos_x as isize)).mv = (*rb).mv; - (*rp_proj.offset(pos + pos_x as isize)).r#ref = ref2ref as i8; + *rp_proj.index_mut( + rp_proj_offset + (pos as isize + pos_x as isize) as usize, + ) = refmvs_temporal_block { + mv: rb.mv, + r#ref: ref2ref as i8, + }; } x += 1; if x >= col_end8i { break; } - rb = rb.offset(1); - let rb_mv = (*rb).mv; - if (*rb).r#ref != b_ref || rb_mv != b_mv { + let prev_rb = rb; + rbi += 1; + rb = *r.index(rbi); + if rb != prev_rb { break; } pos_x += 1; @@ -1465,21 +1510,21 @@ unsafe extern "C" fn load_tmvs_c( if x >= col_end8i { break; } - rb = rb.offset(1); - let rb_mv = (*rb).mv; - if (*rb).r#ref != b_ref || rb_mv != b_mv { + let prev_rb = rb; + rbi += 1; + rb = *r.index(rbi); + if rb != prev_rb { break; } } } } - r = r.offset(stride as isize); } } } unsafe extern "C" fn save_tmvs_c( - mut rp: *mut refmvs_temporal_block, + _rp: *mut refmvs_temporal_block, stride: ptrdiff_t, _rr: *const [*const refmvs_block; 31], ref_sign: *const [u8; 7], @@ -1489,11 +1534,17 @@ unsafe extern "C" fn save_tmvs_c( row_start8: c_int, r: *const FFISafe>>, ri: &[usize; 31], + rp: *const FFISafe>, ) { let r = FFISafe::get(r); + let rp = FFISafe::get(rp); + let rp = &*rp.inner; let ref_sign = &*ref_sign; + + let stride = stride as usize; let [col_end8, row_end8, col_start8, row_start8] = [col_end8, row_end8, col_start8, row_start8].map(|it| it as usize); + for y in row_start8..row_end8 { let b = ri[(y & 15) * 2]; let mut x = col_start8; @@ -1511,10 +1562,10 @@ unsafe extern "C" fn save_tmvs_c( } }; let block = block(1).or_else(|| block(0)).unwrap_or_default(); - slice::from_raw_parts_mut(rp.add(x), bw8 as usize).fill(block); + let offset = y * stride + x; + rp.index_mut(offset..offset + bw8 as usize).fill(block); x += bw8 as usize; } - rp = rp.offset(stride as isize); } } @@ -1523,9 +1574,8 @@ pub(crate) fn rav1d_refmvs_init_frame( seq_hdr: &Rav1dSequenceHeader, frm_hdr: &Rav1dFrameHeader, ref_poc: &[c_uint; 7], - rp: *mut refmvs_temporal_block, ref_ref_poc: &[[c_uint; 7]; 7], - rp_ref: &[*mut refmvs_temporal_block; 7], + rp_ref: &[Option>; 7], n_tile_threads: u32, n_frame_threads: u32, ) -> Rav1dResult { @@ -1560,8 +1610,6 @@ pub(crate) fn rav1d_refmvs_init_frame( rf.n_tile_rows = n_tile_rows; rf.n_tile_threads = n_tile_threads; rf.n_frame_threads = n_frame_threads; - rf.rp = rp; - rf.rp_ref = rp_ref.as_ptr(); let poc = frm_hdr.frame_offset as c_uint; for i in 0..7 { let poc_diff = get_poc_diff(seq_hdr.order_hint_n_bits, ref_poc[i] as c_int, poc as c_int); @@ -1578,12 +1626,12 @@ pub(crate) fn rav1d_refmvs_init_frame( rf.n_mfmvs = 0; if frm_hdr.use_ref_frame_mvs != 0 && seq_hdr.order_hint_n_bits != 0 { let mut total = 2; - if !rp_ref[0].is_null() && ref_ref_poc[0][6] != ref_poc[3] { + if rp_ref[0].is_some() && ref_ref_poc[0][6] != ref_poc[3] { rf.mfmv_ref[rf.n_mfmvs as usize] = 0; // last rf.n_mfmvs += 1; total = 3; } - if !rp_ref[4].is_null() + if rp_ref[4].is_some() && get_poc_diff( seq_hdr.order_hint_n_bits, ref_poc[4] as c_int, @@ -1593,7 +1641,7 @@ pub(crate) fn rav1d_refmvs_init_frame( rf.mfmv_ref[rf.n_mfmvs as usize] = 4; // bwd rf.n_mfmvs += 1; } - if !rp_ref[5].is_null() + if rp_ref[5].is_some() && get_poc_diff( seq_hdr.order_hint_n_bits, ref_poc[5] as c_int, @@ -1604,7 +1652,7 @@ pub(crate) fn rav1d_refmvs_init_frame( rf.n_mfmvs += 1; } if rf.n_mfmvs < total - && !rp_ref[6].is_null() + && rp_ref[6].is_some() && get_poc_diff( seq_hdr.order_hint_n_bits, ref_poc[6] as c_int, @@ -1614,7 +1662,7 @@ pub(crate) fn rav1d_refmvs_init_frame( rf.mfmv_ref[rf.n_mfmvs as usize] = 6; // altref rf.n_mfmvs += 1; } - if rf.n_mfmvs < total && !rp_ref[1].is_null() { + if rf.n_mfmvs < total && rp_ref[1].is_some() { rf.mfmv_ref[rf.n_mfmvs as usize] = 1; // last2 rf.n_mfmvs += 1; } diff --git a/src/thread_task.rs b/src/thread_task.rs index 526e1ce21..fd3720daf 100644 --- a/src/thread_task.rs +++ b/src/thread_task.rs @@ -16,6 +16,7 @@ use crate::src::error::Rav1dResult; use crate::src::fg_apply::rav1d_apply_grain_row; use crate::src::fg_apply::rav1d_prep_grain; use crate::src::internal::Rav1dContext; +use crate::src::internal::Rav1dDSPContext; use crate::src::internal::Rav1dFrameData; use crate::src::internal::Rav1dTaskContext; use crate::src::internal::Rav1dTaskContext_task_thread; @@ -595,17 +596,12 @@ unsafe fn abort_frame(c: &Rav1dContext, f: &mut Rav1dFrameData, error: Rav1dResu #[inline] unsafe fn delayed_fg_task<'l, 'ttd: 'l>( - c: &Rav1dContext, ttd: &'ttd TaskThreadData, task_thread_lock: &'l mut Option>, ) { let delayed_fg = &mut task_thread_lock.as_mut().unwrap(); let in_0 = delayed_fg.in_0; let out = delayed_fg.out; - let mut off = 0; - if (*out).p.bpc != 8 as c_int { - off = ((*out).p.bpc >> 1) - 4; - } let mut row; let mut progmax; let mut done; @@ -618,18 +614,18 @@ unsafe fn delayed_fg_task<'l, 'ttd: 'l>( // TODO(SJC): the thread lock was dropped here, but we need the grain out of it... match (*out).p.bpc { #[cfg(feature = "bitdepth_8")] - 8 => { + bpc @ 8 => { rav1d_prep_grain::( - &(*(c.dsp).as_ptr().offset(0)).fg, + &Rav1dDSPContext::get(bpc).as_ref().unwrap().fg, &mut *out, &*in_0, BitDepth8::select_mut(&mut delayed_fg.grain), ); } #[cfg(feature = "bitdepth_16")] - 10 | 12 => { + bpc @ 10 | bpc @ 12 => { rav1d_prep_grain::( - &(*(c.dsp).as_ptr().offset(off as isize)).fg, + &Rav1dDSPContext::get(bpc).as_ref().unwrap().fg, &mut *out, &*in_0, BitDepth16::select_mut(&mut delayed_fg.grain), @@ -666,9 +662,9 @@ unsafe fn delayed_fg_task<'l, 'ttd: 'l>( let delayed_fg = ttd.delayed_fg.lock().unwrap(); match (*out).p.bpc { #[cfg(feature = "bitdepth_8")] - 8 => { + bpc @ 8 => { rav1d_apply_grain_row::( - &(*((*c).dsp).as_ptr().offset(0)).fg, + &Rav1dDSPContext::get(bpc).as_ref().unwrap().fg, &mut *out, &*in_0, BitDepth8::select(&delayed_fg.grain), @@ -676,9 +672,9 @@ unsafe fn delayed_fg_task<'l, 'ttd: 'l>( ); } #[cfg(feature = "bitdepth_16")] - 10 | 12 => { + bpc @ 10 | bpc @ 12 => { rav1d_apply_grain_row::( - &(*((*c).dsp).as_ptr().offset(off as isize)).fg, + &Rav1dDSPContext::get(bpc).as_ref().unwrap().fg, &mut *out, &*in_0, BitDepth16::select(&delayed_fg.grain), @@ -745,7 +741,7 @@ pub unsafe fn rav1d_worker_task(c: &Rav1dContext, task_thread: Arc