diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst index 70289d6815d2a..03f7510384d26 100644 --- a/Documentation/userspace-api/iommufd.rst +++ b/Documentation/userspace-api/iommufd.rst @@ -63,6 +63,13 @@ Following IOMMUFD objects are exposed to userspace: space usually has mappings from guest-level I/O virtual addresses to guest- level physical addresses. +- IOMMUFD_FAULT, representing a software queue for an HWPT reporting IO page + faults using the IOMMU HW's PRI (Page Request Interface). This queue object + provides user space an FD to poll the page fault events and also to respond + to those events. A FAULT object must be created first to get a fault_id that + could be then used to allocate a fault-enabled HWPT via the IOMMU_HWPT_ALLOC + command by setting the IOMMU_HWPT_FAULT_ID_VALID bit in its flags field. + - IOMMUFD_OBJ_VIOMMU, representing a slice of the physical IOMMU instance, passed to or shared with a VM. It may be some HW-accelerated virtualization features and some SW resources used by the VM. For examples: @@ -109,6 +116,25 @@ Following IOMMUFD objects are exposed to userspace: vIOMMU, which is a separate ioctl call from attaching the same device to an HWPT_PAGING that the vIOMMU holds. +- IOMMUFD_OBJ_VEVENTQ, representing a software queue for a vIOMMU to report its + events such as translation faults occurred to a nested stage-1 (excluding I/O + page faults that should go through IOMMUFD_OBJ_FAULT) and HW-specific events. + This queue object provides user space an FD to poll/read the vIOMMU events. A + vIOMMU object must be created first to get its viommu_id, which could be then + used to allocate a vEVENTQ. Each vIOMMU can support multiple types of vEVENTS, + but is confined to one vEVENTQ per vEVENTQ type. + +- IOMMUFD_OBJ_HW_QUEUE, representing a hardware accelerated queue, as a subset + of IOMMU's virtualization features, for the IOMMU HW to directly read or write + the virtual queue memory owned by a guest OS. This HW-acceleration feature can + allow VM to work with the IOMMU HW directly without a VM Exit, so as to reduce + overhead from the hypercalls. Along with the HW QUEUE object, iommufd provides + user space an mmap interface for VMM to mmap a physical MMIO region from the + host physical address space to the guest physical address space, allowing the + guest OS to directly control the allocated HW QUEUE. Thus, when allocating a + HW QUEUE, the VMM must request a pair of mmap info (offset/length) and pass in + exactly to an mmap syscall via its offset and length arguments. + All user-visible objects are destroyed via the IOMMU_DESTROY uAPI. The diagrams below show relationships between user-visible objects and kernel @@ -251,8 +277,11 @@ User visible objects are backed by following datastructures: - iommufd_device for IOMMUFD_OBJ_DEVICE. - iommufd_hwpt_paging for IOMMUFD_OBJ_HWPT_PAGING. - iommufd_hwpt_nested for IOMMUFD_OBJ_HWPT_NESTED. +- iommufd_fault for IOMMUFD_OBJ_FAULT. - iommufd_viommu for IOMMUFD_OBJ_VIOMMU. - iommufd_vdevice for IOMMUFD_OBJ_VDEVICE. +- iommufd_veventq for IOMMUFD_OBJ_VEVENTQ. +- iommufd_hw_queue for IOMMUFD_OBJ_HW_QUEUE. Several terminologies when looking at these datastructures: diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 1f25423de3833..5461f7d9806a9 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1762,3 +1762,12 @@ CONFIG_CORESIGHT_STM=m CONFIG_CORESIGHT_CPU_DEBUG=m CONFIG_CORESIGHT_CTI=m CONFIG_MEMTEST=y +CONFIG_NVGRACE_GPU_VFIO_PCI=m +CONFIG_NVGRACE_EGM=m +CONFIG_VFIO_DEVICE_CDEV=y +# CONFIG_VFIO_CONTAINER is not set +CONFIG_FAULT_INJECTION=y +CONFIG_IOMMUFD_DRIVER=y +CONFIG_IOMMUFD=y +CONFIG_IOMMUFD_TEST=y +CONFIG_IOMMUFD_VFIO_CONTAINER=y diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a582d25eb1c8b..7402419ec0163 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1472,6 +1472,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool s2_force_noncacheable = false, vfio_allow_any_uc = false; unsigned long mmu_seq; phys_addr_t ipa = fault_ipa; + unsigned long mt; struct kvm *kvm = vcpu->kvm; struct vm_area_struct *vma; short vma_shift; @@ -1593,6 +1594,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = min(vma_pagesize, (long)max_map_size); } + mt = FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot)); + /* * Both the canonical IPA and fault IPA must be hugepage-aligned to * ensure we find the right PFN and lay down the mapping in the right @@ -1676,7 +1679,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, writable = false; } - if (exec_fault && s2_force_noncacheable) + if (exec_fault && s2_force_noncacheable && mt != MT_NORMAL) return -ENOEXEC; /* diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 79ea97d4797ec..8be91974e786d 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -440,25 +440,24 @@ void __init arch_cpu_finalize_init(void) os_check_bugs(); } -void apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void apply_seal_endbr(s32 *start, s32 *end) { } -void apply_retpolines(s32 *start, s32 *end, struct module *mod) +void apply_retpolines(s32 *start, s32 *end) { } -void apply_returns(s32 *start, s32 *end, struct module *mod) +void apply_returns(s32 *start, s32 *end) { } void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { } -void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod) +void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index aa62949335ece..77777fc78d0b4 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -48,8 +48,7 @@ int __init init_vdso_image(const struct vdso_image *image) apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + - image->alt_len), - NULL); + image->alt_len)); return 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a46b792a171cb..099b63eb178d7 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2845,7 +2845,7 @@ static bool is_uprobe_at_func_entry(struct pt_regs *regs) return true; /* endbr64 (64-bit only) */ - if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn)) + if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) return true; return false; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e3903b731305c..a2141665239b5 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -87,16 +87,16 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; * instructions were patched in already: */ extern int alternatives_patched; -struct module; extern void alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod); -extern void apply_retpolines(s32 *start, s32 *end, struct module *mod); -extern void apply_returns(s32 *start, s32 *end, struct module *mod); -extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_retpolines(s32 *start, s32 *end); +extern void apply_returns(s32 *start, s32 *end); +extern void apply_seal_endbr(s32 *start, s32 *end); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, - s32 *start_cfi, s32 *end_cfi, struct module *mod); + s32 *start_cfi, s32 *end_cfi); + +struct module; struct callthunk_sites { s32 *call_start, *call_end; diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f9cb4d07df58f..f2265246249ad 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -36,21 +36,9 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) static inline unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) { -#ifdef CONFIG_X86_KERNEL_IBT - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. - */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) + if (is_endbr((void*)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; -#endif + return fentry_ip; } #define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip) diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 1e59581d500ca..d955e0d1cbf15 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -65,7 +65,7 @@ static inline __attribute_const__ u32 gen_endbr_poison(void) return 0x001f0f66; /* osp nopl (%rax) */ } -static inline bool is_endbr(u32 val) +static inline bool __is_endbr(u32 val) { if (val == gen_endbr_poison()) return true; @@ -74,6 +74,7 @@ static inline bool is_endbr(u32 val) return val == gen_endbr(); } +extern __noendbr bool is_endbr(u32 *val); extern __noendbr u64 ibt_save(bool disable); extern __noendbr void ibt_restore(u64 save); @@ -98,7 +99,7 @@ extern __noendbr void ibt_restore(u64 save); #define __noendbr -static inline bool is_endbr(u32 val) { return false; } +static inline bool is_endbr(u32 *val) { return false; } static inline u64 ibt_save(bool disable) { return 0; } static inline void ibt_restore(u64 save) { } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c71b575bf2292..9a252bb0d34b8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -392,10 +392,8 @@ EXPORT_SYMBOL(BUG_func); * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, - struct module *mod) +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { - u8 *wr_instr = module_writable_address(mod, instr); void *target, *bug = &BUG_func; s32 disp; @@ -405,14 +403,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, } if (a->instrlen != 6 || - wr_instr[0] != CALL_RIP_REL_OPCODE || - wr_instr[1] != CALL_RIP_REL_MODRM) { + instr[0] != CALL_RIP_REL_OPCODE || + instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ - disp = *(s32 *)(wr_instr + 2); + disp = *(s32 *)(instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ @@ -450,8 +448,7 @@ static inline u8 * instr_va(struct alt_instr *i) * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, - struct alt_instr *end, - struct module *mod) + struct alt_instr *end) { u8 insn_buff[MAX_PATCH_LEN]; u8 *instr, *replacement; @@ -480,7 +477,6 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; - u8 *wr_instr, *wr_replacement; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -494,11 +490,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, } instr = instr_va(a); - wr_instr = module_writable_address(mod, instr); - replacement = (u8 *)&a->repl_offset + a->repl_offset; - wr_replacement = module_writable_address(mod, replacement); - BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -509,9 +501,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - memcpy(insn_buff, wr_instr, a->instrlen); + memcpy(insn_buff, instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); - text_poke_early(wr_instr, insn_buff, a->instrlen); + text_poke_early(instr, insn_buff, a->instrlen); continue; } @@ -521,12 +513,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); - memcpy(insn_buff, wr_replacement, a->replacementlen); + memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { - insn_buff_sz = alt_replace_call(instr, insn_buff, a, - mod); + insn_buff_sz = alt_replace_call(instr, insn_buff, a); if (insn_buff_sz < 0) continue; } @@ -536,11 +527,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(wr_instr, insn_buff, insn_buff_sz); + text_poke_early(instr, insn_buff, insn_buff_sz); } kasan_enable_current(); @@ -731,20 +722,18 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * Generated by 'objtool --retpoline'. */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -772,9 +761,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); - DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } @@ -810,8 +799,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) return i; } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; @@ -820,13 +808,12 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -846,38 +833,43 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } #else -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) { } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr); - -static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) +__noendbr bool is_endbr(u32 *val) { - u32 endbr, poison = gen_endbr_poison(); + u32 endbr; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) - return; + __get_kernel_nofault(&endbr, val, u32, Efault); + return __is_endbr(endbr); - if (!is_endbr(endbr)) { +Efault: + return false; +} + +static void poison_cfi(void *addr); + +static void __init_or_module poison_endbr(void *addr, bool warn) +{ + u32 poison = gen_endbr_poison(); + + if (!is_endbr(addr)) { WARN_ON_ONCE(warn); return; } @@ -889,7 +881,7 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(wr_addr, &poison, 4); + text_poke_early(addr, &poison, 4); } /* @@ -898,23 +890,22 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ -void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr, wr_addr, true); + poison_endbr(addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16, wr_addr - 16); + poison_cfi(addr - 16); } } #else -void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ @@ -1005,7 +996,7 @@ static u32 cfi_seed __ro_after_init; static u32 cfi_rehash(u32 hash) { hash ^= cfi_seed; - while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) { bool lsb = hash & 1; hash >>= 1; if (lsb) @@ -1136,7 +1127,7 @@ static u32 decode_caller_hash(void *addr) } /* .retpoline_sites */ -static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_disable_callers(s32 *start, s32 *end) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate @@ -1148,23 +1139,20 @@ static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); - + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, jmp, 2); + text_poke_early(addr, jmp, 2); } return 0; } -static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_enable_callers(s32 *start, s32 *end) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. @@ -1174,115 +1162,106 @@ static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, mov, 2); + text_poke_early(addr, mov, 2); } return 0; } /* .cfi_sites */ -static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(wr_addr); + hash = decode_preamble_hash(addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(wr_addr + 1, &hash, 4); + text_poke_early(addr + 1, &hash, 4); } return 0; } -static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) +static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(wr_addr); + hash = decode_preamble_hash(addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; - text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); } return 0; } -static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) +static void cfi_rewrite_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr + 16, wr_addr + 16, false); + poison_endbr(addr+16, false); } } /* .retpoline_sites */ -static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(wr_addr + 2, &hash, 4); + text_poke_early(addr + 2, &hash, 4); } } return 0; } -static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (hash) { - text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } @@ -1291,9 +1270,8 @@ static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { - bool builtin = mod ? false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, @@ -1311,7 +1289,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ - ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_disable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1322,11 +1300,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } - ret = cfi_rand_preamble(start_cfi, end_cfi, mod); + ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; - ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rand_callers(start_retpoline, end_retpoline); if (ret) goto err; } @@ -1338,7 +1316,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_KCFI: - ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_enable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1348,17 +1326,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ - ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); + ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; /* rewrite the callers to target func()-16 */ - ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ - cfi_rewrite_endbr(start_cfi, end_cfi, mod); + cfi_rewrite_endbr(start_cfi, end_cfi); if (builtin) pr_info("Using FineIBT CFI\n"); @@ -1377,7 +1355,7 @@ static inline void poison_hash(void *addr) *(u32 *)addr = 0; } -static void poison_cfi(void *addr, void *wr_addr) +static void poison_cfi(void *addr) { switch (cfi_mode) { case CFI_FINEIBT: @@ -1389,8 +1367,8 @@ static void poison_cfi(void *addr, void *wr_addr) * ud2 * 1: nop */ - poison_endbr(addr, wr_addr, false); - poison_hash(wr_addr + fineibt_preamble_hash); + poison_endbr(addr, false); + poison_hash(addr + fineibt_preamble_hash); break; case CFI_KCFI: @@ -1399,7 +1377,7 @@ static void poison_cfi(void *addr, void *wr_addr) * movl $0, %eax * .skip 11, 0x90 */ - poison_hash(wr_addr + 1); + poison_hash(addr + 1); break; default: @@ -1410,21 +1388,22 @@ static void poison_cfi(void *addr, void *wr_addr) #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr) { } +static void poison_cfi(void *addr) { } #endif #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { return __apply_fineibt(start_retpoline, end_retpoline, - start_cfi, end_cfi, mod); + start_cfi, end_cfi, + /* .builtin = */ false); } #ifdef CONFIG_SMP @@ -1721,16 +1700,16 @@ void __init alternative_instructions(void) paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, - __cfi_sites, __cfi_sites_end, NULL); + __cfi_sites, __cfi_sites_end, true); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ - apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); - apply_returns(__return_sites, __return_sites_end, NULL); + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); - apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); + apply_alternatives(__alt_instructions, __alt_instructions_end); /* * Now all calls are established. Apply the call thunks if @@ -1741,7 +1720,7 @@ void __init alternative_instructions(void) /* * Seal all functions that do not have their address taken. */ - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 166bc0ea3bdff..cace6e8d7cc77 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -118,13 +118,10 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) { + if (ftrace_poke_late) text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - } else { - mutex_lock(&text_mutex); - text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); - mutex_unlock(&text_mutex); - } + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; } @@ -321,7 +318,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - void *ret; + int ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -352,15 +349,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = text_poke_copy(trampoline, (void *)start_offset, size); - if (WARN_ON(!ret)) + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - text_poke_copy(ip, retq, sizeof(retq)); + memcpy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -368,7 +365,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - if (!text_poke_copy(ip, x86_nops[2], 2)) + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); + if (ret < 0) goto fail; } @@ -381,7 +379,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - text_poke_copy(ptr, &ops, sizeof(unsigned long)); + *ptr = (unsigned long)ops; op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -397,7 +395,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_ptr.offset = offset; /* put in the new offset to the ftrace_ops */ - text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to the function */ mutex_lock(&text_mutex); @@ -407,9 +405,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. */ dest = ftrace_ops_get_func(ops); - text_poke_copy_locked(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE, false); + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 72e6a45e7ec24..09608fd936876 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -373,16 +373,7 @@ static bool can_probe(unsigned long paddr) kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - u32 insn; - - /* - * Since 'addr' is not guaranteed to be safe to access, use - * copy_from_kernel_nofault() to read the instruction: - */ - if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32))) - return NULL; - - if (is_endbr(insn)) { + if (is_endbr((u32 *)addr)) { *on_func_entry = !offset || offset == 4; if (*on_func_entry) offset = 4; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8984abd91c001..837450b6e882f 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -146,21 +146,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, } if (apply) { - void *wr_loc = module_writable_address(me, loc); - - if (memcmp(wr_loc, &zero, size)) { + if (memcmp(loc, &zero, size)) { pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(wr_loc, &val, size); + write(loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - /* FIXME: needs care for ROX module allocations */ write(loc, &zero, size); } } @@ -227,7 +224,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *alt = NULL, + const Elf_Shdr *s, *alt = NULL, *locks = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; @@ -236,6 +233,8 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -266,20 +265,20 @@ int module_finalize(const Elf_Ehdr *hdr, csize = cfi->sh_size; } - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; - apply_retpolines(rseg, rseg + retpolines->sh_size, me); + apply_retpolines(rseg, rseg + retpolines->sh_size); } if (returns) { void *rseg = (void *)returns->sh_addr; - apply_returns(rseg, rseg + returns->sh_size, me); + apply_returns(rseg, rseg + returns->sh_size); } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size, me); + apply_alternatives(aseg, aseg + alt->sh_size); } if (calls || alt) { struct callthunk_sites cs = {}; @@ -298,28 +297,8 @@ int module_finalize(const Elf_Ehdr *hdr, } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); } - - if (orc && orc_ip) - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, - (void *)orc->sh_addr, orc->sh_size); - - return 0; -} - -int module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *locks = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - } - if (locks) { void *lseg = (void *)locks->sh_addr; void *text = me->mem[MOD_TEXT].base; @@ -329,6 +308,10 @@ int module_post_finalize(const Elf_Ehdr *hdr, text, text_end); } + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a43fc5af973d2..f36508b67278a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -641,7 +641,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, * See emit_prologue(), for IBT builds the trampoline hook is preceded * with an ENDBR instruction. */ - if (is_endbr(*(u32 *)ip)) + if (is_endbr(ip)) ip += ENDBR_INSN_SIZE; return __bpf_arch_text_poke(ip, t, old_addr, new_addr); @@ -3036,7 +3036,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* skip patched call instruction and point orig_call to actual * body of the kernel function. */ - if (is_endbr(*(u32 *)orig_call)) + if (is_endbr(orig_call)) orig_call += ENDBR_INSN_SIZE; orig_call += X86_PATCH_SIZE; } diff --git a/debian.nvidia-6.14/config/annotations b/debian.nvidia-6.14/config/annotations index 739111f9002e4..b7df95c86bb93 100644 --- a/debian.nvidia-6.14/config/annotations +++ b/debian.nvidia-6.14/config/annotations @@ -196,8 +196,10 @@ CONFIG_DRM_PANIC_SCREEN_QR_CODE policy<{'amd64': '-', 'arm64': ' CONFIG_GCC_VERSION policy<{'amd64': '130300', 'arm64': '130300'}> CONFIG_HAVE_RUST policy<{'amd64': 'y', 'arm64': '-'}> CONFIG_IOMMUFD_VFIO_CONTAINER policy<{'arm64': 'y'}> +CONFIG_IRQ_MSI_IOMMU policy<{'amd64': '-', 'arm64': 'y'}> CONFIG_LD_VERSION policy<{'amd64': '24200', 'arm64': '24200'}> CONFIG_MTD_NAND_CORE policy<{'amd64': 'm', 'arm64': 'y'}> +CONFIG_NVGRACE_EGM policy<{'arm64': 'm'}> CONFIG_NVIDIA_FFA_EC policy<{'arm64': 'y'}> CONFIG_PAHOLE_VERSION policy<{'amd64': '125', 'arm64': '125'}> CONFIG_PINCTRL_MT8901 policy<{'arm64': 'y'}> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index b727724946556..3a0c68fdc05e3 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -486,20 +486,10 @@ static void ghes_kick_task_work(struct callback_head *head) static bool ghes_do_memory_failure(u64 physical_addr, int flags) { - unsigned long pfn; - if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE)) return false; - pfn = PHYS_PFN(physical_addr); - if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) { - pr_warn_ratelimited(FW_WARN GHES_PFX - "Invalid address in generic error data: %#llx\n", - physical_addr); - return false; - } - - memory_failure_queue(pfn, flags); + memory_failure_queue(PHYS_PFN(physical_addr), flags); return true; } diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index b1726a3515f6f..5c78fa6ae7725 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -1080,6 +1080,7 @@ void software_node_notify(struct device *dev) if (!swnode) return; + kobject_get(&swnode->kobj); ret = sysfs_create_link(&dev->kobj, &swnode->kobj, "software_node"); if (ret) return; @@ -1089,8 +1090,6 @@ void software_node_notify(struct device *dev) sysfs_remove_link(&dev->kobj, "software_node"); return; } - - kobject_get(&swnode->kobj); } void software_node_notify_remove(struct device *dev) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index ec1b5e32b9725..5124e7431fe31 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -154,7 +154,6 @@ config IOMMU_DMA select DMA_OPS_HELPERS select IOMMU_API select IOMMU_IOVA - select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH select NEED_SG_DMA_FLAGS if SWIOTLB diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index b425baef968b9..d9bea8f1f636d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -7,13 +7,22 @@ #include "arm-smmu-v3.h" -void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) +void *arm_smmu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); + const struct arm_smmu_impl_ops *impl_ops = master->smmu->impl_ops; struct iommu_hw_info_arm_smmuv3 *info; u32 __iomem *base_idr; unsigned int i; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) { + if (!impl_ops || !impl_ops->hw_info) + return ERR_PTR(-EOPNOTSUPP); + return impl_ops->hw_info(master->smmu, length, type); + } + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); @@ -30,15 +39,6 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) return info; } -static struct iommu_domain * -arm_smmu_get_msi_mapping_domain(struct iommu_domain *domain) -{ - struct arm_smmu_nested_domain *nested_domain = - container_of(domain, struct arm_smmu_nested_domain, domain); - - return &nested_domain->vsmmu->s2_parent->domain; -} - static void arm_smmu_make_nested_cd_table_ste( struct arm_smmu_ste *target, struct arm_smmu_master *master, struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) @@ -96,6 +96,47 @@ static void arm_smmu_make_nested_domain_ste( } } +int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain) +{ + struct arm_smmu_vmaster *vmaster; + unsigned long vsid; + int ret; + + iommu_group_mutex_assert(state->master->dev); + + ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core, + state->master->dev, &vsid); + if (ret) + return ret; + + vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL); + if (!vmaster) + return -ENOMEM; + vmaster->vsmmu = nested_domain->vsmmu; + vmaster->vsid = vsid; + state->vmaster = vmaster; + + return 0; +} + +void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state) +{ + struct arm_smmu_master *master = state->master; + + mutex_lock(&master->smmu->streams_mutex); + kfree(master->vmaster); + master->vmaster = state->vmaster; + mutex_unlock(&master->smmu->streams_mutex); +} + +void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) +{ + struct arm_smmu_attach_state state = { .master = master }; + + arm_smmu_attach_commit_vmaster(&state); +} + static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, struct device *dev) { @@ -145,7 +186,6 @@ static void arm_smmu_domain_nested_free(struct iommu_domain *domain) } static const struct iommu_domain_ops arm_smmu_nested_ops = { - .get_msi_mapping_domain = arm_smmu_get_msi_mapping_domain, .attach_dev = arm_smmu_attach_dev_nested, .free = arm_smmu_domain_nested_free, }; @@ -185,7 +225,7 @@ static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg, return 0; } -static struct iommu_domain * +struct iommu_domain * arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, const struct iommu_user_data *user_data) { @@ -296,8 +336,8 @@ static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu, return 0; } -static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, - struct iommu_user_data_array *array) +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) { struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); struct arm_smmu_device *smmu = vsmmu->smmu; @@ -351,25 +391,14 @@ static const struct iommufd_viommu_ops arm_vsmmu_ops = { .cache_invalidate = arm_vsmmu_cache_invalidate, }; -struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, - struct iommu_domain *parent, - struct iommufd_ctx *ictx, - unsigned int viommu_type) +size_t arm_smmu_get_viommu_size(struct device *dev, + enum iommu_viommu_type viommu_type) { - struct arm_smmu_device *smmu = - iommu_get_iommu_dev(dev, struct arm_smmu_device, iommu); struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct arm_smmu_domain *s2_parent = to_smmu_domain(parent); - struct arm_vsmmu *vsmmu; - - if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3) - return ERR_PTR(-EOPNOTSUPP); + struct arm_smmu_device *smmu = master->smmu; if (!(smmu->features & ARM_SMMU_FEAT_NESTING)) - return ERR_PTR(-EOPNOTSUPP); - - if (s2_parent->smmu != master->smmu) - return ERR_PTR(-EINVAL); + return 0; /* * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW @@ -377,7 +406,7 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, * any change to remove this. */ if (WARN_ON(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) - return ERR_PTR(-EOPNOTSUPP); + return 0; /* * Must support some way to prevent the VM from bypassing the cache @@ -389,19 +418,58 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, */ if (!arm_smmu_master_canwbs(master) && !(smmu->features & ARM_SMMU_FEAT_S2FWB)) - return ERR_PTR(-EOPNOTSUPP); + return 0; + + if (smmu->impl_ops && smmu->impl_ops->vsmmu_size && + viommu_type == smmu->impl_ops->vsmmu_type) + return smmu->impl_ops->vsmmu_size; + + if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3) + return 0; + + return VIOMMU_STRUCT_SIZE(struct arm_vsmmu, core); +} - vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core, - &arm_vsmmu_ops); - if (IS_ERR(vsmmu)) - return ERR_CAST(vsmmu); +int arm_vsmmu_init(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data) +{ + struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); + struct arm_smmu_device *smmu = + container_of(viommu->iommu_dev, struct arm_smmu_device, iommu); + struct arm_smmu_domain *s2_parent = to_smmu_domain(parent_domain); + + if (s2_parent->smmu != smmu) + return -EINVAL; vsmmu->smmu = smmu; vsmmu->s2_parent = s2_parent; /* FIXME Move VMID allocation from the S2 domain allocation to here */ vsmmu->vmid = s2_parent->s2_cfg.vmid; - return &vsmmu->core; + if (smmu->impl_ops && smmu->impl_ops->vsmmu_init && + viommu->type == smmu->impl_ops->vsmmu_type) + return smmu->impl_ops->vsmmu_init(vsmmu, user_data); + + viommu->ops = &arm_vsmmu_ops; + return 0; +} + +int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt) +{ + struct iommu_vevent_arm_smmuv3 vevt; + int i; + + lockdep_assert_held(&vmaster->vsmmu->smmu->streams_mutex); + + vevt.evt[0] = cpu_to_le64((evt[0] & ~EVTQ_0_SID) | + FIELD_PREP(EVTQ_0_SID, vmaster->vsid)); + for (i = 1; i < EVTQ_ENT_DWORDS; i++) + vevt.evt[i] = cpu_to_le64(evt[i]); + + return iommufd_viommu_report_event(&vmaster->vsmmu->core, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3, &vevt, + sizeof(vevt)); } MODULE_IMPORT_NS("IOMMUFD"); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e495334d1c43a..b8a2f17712671 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1813,8 +1813,8 @@ static void arm_smmu_decode_event(struct arm_smmu_device *smmu, u64 *raw, mutex_unlock(&smmu->streams_mutex); } -static int arm_smmu_handle_event(struct arm_smmu_device *smmu, - struct arm_smmu_event *event) +static int arm_smmu_handle_event(struct arm_smmu_device *smmu, u64 *evt, + struct arm_smmu_event *event) { int ret = 0; u32 perm = 0; @@ -1823,6 +1823,10 @@ static int arm_smmu_handle_event(struct arm_smmu_device *smmu, struct iommu_fault *flt = &fault_evt.fault; switch (event->id) { + case EVT_ID_BAD_STE_CONFIG: + case EVT_ID_STREAM_DISABLED_FAULT: + case EVT_ID_BAD_SUBSTREAMID_CONFIG: + case EVT_ID_BAD_CD_CONFIG: case EVT_ID_TRANSLATION_FAULT: case EVT_ID_ADDR_SIZE_FAULT: case EVT_ID_ACCESS_FAULT: @@ -1832,31 +1836,30 @@ static int arm_smmu_handle_event(struct arm_smmu_device *smmu, return -EOPNOTSUPP; } - if (!event->stall) - return -EOPNOTSUPP; - - if (event->read) - perm |= IOMMU_FAULT_PERM_READ; - else - perm |= IOMMU_FAULT_PERM_WRITE; + if (event->stall) { + if (event->read) + perm |= IOMMU_FAULT_PERM_READ; + else + perm |= IOMMU_FAULT_PERM_WRITE; - if (event->instruction) - perm |= IOMMU_FAULT_PERM_EXEC; + if (event->instruction) + perm |= IOMMU_FAULT_PERM_EXEC; - if (event->privileged) - perm |= IOMMU_FAULT_PERM_PRIV; + if (event->privileged) + perm |= IOMMU_FAULT_PERM_PRIV; - flt->type = IOMMU_FAULT_PAGE_REQ; - flt->prm = (struct iommu_fault_page_request) { - .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, - .grpid = event->stag, - .perm = perm, - .addr = event->iova, - }; + flt->type = IOMMU_FAULT_PAGE_REQ; + flt->prm = (struct iommu_fault_page_request){ + .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, + .grpid = event->stag, + .perm = perm, + .addr = event->iova, + }; - if (event->ssv) { - flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - flt->prm.pasid = event->ssid; + if (event->ssv) { + flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + flt->prm.pasid = event->ssid; + } } mutex_lock(&smmu->streams_mutex); @@ -1866,7 +1869,12 @@ static int arm_smmu_handle_event(struct arm_smmu_device *smmu, goto out_unlock; } - ret = iommu_report_device_fault(master->dev, &fault_evt); + if (event->stall) + ret = iommu_report_device_fault(master->dev, &fault_evt); + else if (master->vmaster && !event->s2) + ret = arm_vmaster_report_event(master->vmaster, evt); + else + ret = -EOPNOTSUPP; /* Unhandled events should be pinned */ out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; @@ -1944,7 +1952,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) do { while (!queue_remove_raw(q, evt)) { arm_smmu_decode_event(smmu, evt, &event); - if (arm_smmu_handle_event(smmu, &event)) + if (arm_smmu_handle_event(smmu, evt, &event)) arm_smmu_dump_event(smmu, evt, &event, &rs); put_device(event.dev); @@ -2803,6 +2811,7 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(new_domain); unsigned long flags; + int ret; /* * arm_smmu_share_asid() must not see two domains pointing to the same @@ -2832,9 +2841,18 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, } if (smmu_domain) { + if (new_domain->type == IOMMU_DOMAIN_NESTED) { + ret = arm_smmu_attach_prepare_vmaster( + state, to_smmu_nested_domain(new_domain)); + if (ret) + return ret; + } + master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL); - if (!master_domain) + if (!master_domain) { + kfree(state->vmaster); return -ENOMEM; + } master_domain->master = master; master_domain->ssid = state->ssid; if (new_domain->type == IOMMU_DOMAIN_NESTED) @@ -2861,6 +2879,7 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); kfree(master_domain); + kfree(state->vmaster); return -EINVAL; } @@ -2893,6 +2912,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) lockdep_assert_held(&arm_smmu_asid_lock); + arm_smmu_attach_commit_vmaster(state); + if (state->ats_enabled && !master->ats_enabled) { arm_smmu_enable_ats(master); } else if (state->ats_enabled && master->ats_enabled) { @@ -3162,6 +3183,7 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); + arm_smmu_master_clear_vmaster(master); arm_smmu_make_bypass_ste(master->smmu, &ste); arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); return 0; @@ -3180,7 +3202,9 @@ static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, struct device *dev) { struct arm_smmu_ste ste; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + arm_smmu_master_clear_vmaster(master); arm_smmu_make_abort_ste(&ste); arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_TERMINATE); @@ -3650,7 +3674,8 @@ static struct iommu_ops arm_smmu_ops = { .dev_disable_feat = arm_smmu_dev_disable_feature, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, - .viommu_alloc = arm_vsmmu_alloc, + .get_viommu_size = arm_smmu_get_viommu_size, + .viommu_init = arm_vsmmu_init, .user_pasid_table = 1, .pgsize_bitmap = -1UL, /* Restricted during device attach */ .owner = THIS_MODULE, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 7290bd4c2bb0a..b5841e4f54385 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -16,6 +16,7 @@ #include struct arm_smmu_device; +struct arm_vsmmu; /* MMIO registers */ #define ARM_SMMU_IDR0 0x0 @@ -720,6 +721,17 @@ struct arm_smmu_impl_ops { int (*init_structures)(struct arm_smmu_device *smmu); struct arm_smmu_cmdq *(*get_secondary_cmdq)( struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent); + /* + * An implementation should define its own type other than the default + * IOMMU_HW_INFO_TYPE_ARM_SMMUV3. And it must validate the input @type + * to return its own structure. + */ + void *(*hw_info)(struct arm_smmu_device *smmu, u32 *length, + enum iommu_hw_info_type *type); + const size_t vsmmu_size; + const enum iommu_viommu_type vsmmu_type; + int (*vsmmu_init)(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data); }; /* An SMMUv3 instance */ @@ -800,6 +812,11 @@ struct arm_smmu_stream { struct rb_node node; }; +struct arm_smmu_vmaster { + struct arm_vsmmu *vsmmu; + unsigned long vsid; +}; + struct arm_smmu_event { u8 stall : 1, ssv : 1, @@ -825,6 +842,7 @@ struct arm_smmu_master { struct arm_smmu_device *smmu; struct device *dev; struct arm_smmu_stream *streams; + struct arm_smmu_vmaster *vmaster; /* use smmu->streams_mutex */ /* Locked by the iommu core using the group mutex */ struct arm_smmu_ctx_desc_cfg cd_table; unsigned int num_streams; @@ -973,6 +991,7 @@ struct arm_smmu_attach_state { bool disable_ats; ioasid_t ssid; /* Resulting state */ + struct arm_smmu_vmaster *vmaster; bool ats_enabled; }; @@ -1051,14 +1070,52 @@ struct arm_vsmmu { }; #if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD) -void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type); -struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, - struct iommu_domain *parent, - struct iommufd_ctx *ictx, - unsigned int viommu_type); +void *arm_smmu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type); +size_t arm_smmu_get_viommu_size(struct device *dev, + enum iommu_viommu_type viommu_type); +int arm_vsmmu_init(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data); +int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain); +void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); +void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master); +int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt); +struct iommu_domain * +arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data); +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array); #else +#define arm_smmu_get_viommu_size NULL #define arm_smmu_hw_info NULL -#define arm_vsmmu_alloc NULL +#define arm_vsmmu_init NULL +#define arm_vsmmu_alloc_domain_nested NULL +#define arm_vsmmu_cache_invalidate NULL + +static inline int +arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain) +{ + return 0; +} + +static inline void +arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state) +{ +} + +static inline void +arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) +{ +} + +static inline int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, + u64 *evt) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ #endif /* _ARM_SMMU_V3_H */ diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index dd7d030d2e890..eb90af5093d89 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include @@ -26,8 +28,10 @@ #define CMDQV_EN BIT(0) #define TEGRA241_CMDQV_PARAM 0x0004 +#define CMDQV_NUM_SID_PER_VM_LOG2 GENMASK(15, 12) #define CMDQV_NUM_VINTF_LOG2 GENMASK(11, 8) #define CMDQV_NUM_VCMDQ_LOG2 GENMASK(7, 4) +#define CMDQV_VER GENMASK(3, 0) #define TEGRA241_CMDQV_STATUS 0x0008 #define CMDQV_ENABLED BIT(0) @@ -53,6 +57,9 @@ #define VINTF_STATUS GENMASK(3, 1) #define VINTF_ENABLED BIT(0) +#define TEGRA241_VINTF_SID_MATCH(s) (0x0040 + 0x4*(s)) +#define TEGRA241_VINTF_SID_REPLACE(s) (0x0080 + 0x4*(s)) + #define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \ (0x00C0 + 0x8*(m)) #define LVCMDQ_ERR_MAP_NUM_64 2 @@ -114,16 +121,20 @@ MODULE_PARM_DESC(bypass_vcmdq, /** * struct tegra241_vcmdq - Virtual Command Queue + * @core: Embedded iommufd_hw_queue structure * @idx: Global index in the CMDQV * @lidx: Local index in the VINTF * @enabled: Enable status * @cmdqv: Parent CMDQV pointer * @vintf: Parent VINTF pointer + * @prev: Previous LVCMDQ to depend on * @cmdq: Command Queue struct * @page0: MMIO Page0 base address * @page1: MMIO Page1 base address */ struct tegra241_vcmdq { + struct iommufd_hw_queue core; + u16 idx; u16 lidx; @@ -131,22 +142,30 @@ struct tegra241_vcmdq { struct tegra241_cmdqv *cmdqv; struct tegra241_vintf *vintf; + struct tegra241_vcmdq *prev; struct arm_smmu_cmdq cmdq; void __iomem *page0; void __iomem *page1; }; +#define hw_queue_to_vcmdq(v) container_of(v, struct tegra241_vcmdq, core) /** * struct tegra241_vintf - Virtual Interface + * @vsmmu: Embedded arm_vsmmu structure * @idx: Global index in the CMDQV * @enabled: Enable status * @hyp_own: Owned by hypervisor (in-kernel) * @cmdqv: Parent CMDQV pointer * @lvcmdqs: List of logical VCMDQ pointers + * @lvcmdq_mutex: Lock to serialize user-allocated lvcmdqs * @base: MMIO base address + * @mmap_offset: Offset argument for mmap() syscall + * @sids: Stream ID mapping resources */ struct tegra241_vintf { + struct arm_vsmmu vsmmu; + u16 idx; bool enabled; @@ -154,19 +173,41 @@ struct tegra241_vintf { struct tegra241_cmdqv *cmdqv; struct tegra241_vcmdq **lvcmdqs; + struct mutex lvcmdq_mutex; /* user space race */ void __iomem *base; + unsigned long mmap_offset; + + struct ida sids; }; +#define viommu_to_vintf(v) container_of(v, struct tegra241_vintf, vsmmu.core) + +/** + * struct tegra241_vintf_sid - Virtual Interface Stream ID Mapping + * @core: Embedded iommufd_vdevice structure, holding virtual Stream ID + * @vintf: Parent VINTF pointer + * @sid: Physical Stream ID + * @idx: Mapping index in the VINTF + */ +struct tegra241_vintf_sid { + struct iommufd_vdevice core; + struct tegra241_vintf *vintf; + u32 sid; + u8 idx; +}; +#define vdev_to_vsid(v) container_of(v, struct tegra241_vintf_sid, core) /** * struct tegra241_cmdqv - CMDQ-V for SMMUv3 * @smmu: SMMUv3 device * @dev: CMDQV device * @base: MMIO base address + * @base_phys: MMIO physical base address, for mmap * @irq: IRQ number * @num_vintfs: Total number of VINTFs * @num_vcmdqs: Total number of VCMDQs * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF + * @num_sids_per_vintf: Total number of SID mappings per VINTF * @vintf_ids: VINTF id allocator * @vintfs: List of VINTFs */ @@ -175,12 +216,14 @@ struct tegra241_cmdqv { struct device *dev; void __iomem *base; + phys_addr_t base_phys; int irq; /* CMDQV Hardware Params */ u16 num_vintfs; u16 num_vcmdqs; u16 num_lvcmdqs_per_vintf; + u16 num_sids_per_vintf; struct ida vintf_ids; @@ -252,6 +295,20 @@ static inline int vcmdq_write_config(struct tegra241_vcmdq *vcmdq, u32 regval) /* ISR Functions */ +static void tegra241_vintf_user_handle_error(struct tegra241_vintf *vintf) +{ + struct iommufd_viommu *viommu = &vintf->vsmmu.core; + struct iommu_vevent_tegra241_cmdqv vevent_data; + int i; + + for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++) + vevent_data.lvcmdq_err_map[i] = + readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i))); + + iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV, + &vevent_data, sizeof(vevent_data)); +} + static void tegra241_vintf0_handle_error(struct tegra241_vintf *vintf) { int i; @@ -297,6 +354,14 @@ static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid) vintf_map &= ~BIT_ULL(0); } + /* Handle other user VINTFs and their LVCMDQs */ + while (vintf_map) { + unsigned long idx = __ffs64(vintf_map); + + tegra241_vintf_user_handle_error(cmdqv->vintfs[idx]); + vintf_map &= ~BIT_ULL(idx); + } + return IRQ_HANDLED; } @@ -351,6 +416,30 @@ tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, /* HW Reset Functions */ +/* + * When a guest-owned VCMDQ is disabled, if the guest did not enqueue a CMD_SYNC + * following an ATC_INV command at the end of the guest queue while this ATC_INV + * is timed out, the TIMEOUT will not be reported until this VCMDQ gets assigned + * to the next VM, which will be a false alarm potentially causing some unwanted + * behavior in the new VM. Thus, a guest-owned VCMDQ must flush the TIMEOUT when + * it gets disabled. This can be done by just issuing a CMD_SYNC to SMMU CMDQ. + */ +static void tegra241_vcmdq_hw_flush_timeout(struct tegra241_vcmdq *vcmdq) +{ + struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu; + u64 cmd_sync[CMDQ_ENT_DWORDS] = {}; + + cmd_sync[0] = FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) | + FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_NONE); + + /* + * It does not hurt to insert another CMD_SYNC, taking advantage of the + * arm_smmu_cmdq_issue_cmdlist() that waits for the CMD_SYNC completion. + */ + arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, cmd_sync, 1, true); +} + +/* This function is for LVCMDQ, so @vcmdq must not be unmapped yet */ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) { char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); @@ -363,6 +452,8 @@ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)), readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS))); } + tegra241_vcmdq_hw_flush_timeout(vcmdq); + writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, PROD)); writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, CONS)); writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, BASE)); @@ -379,6 +470,7 @@ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) dev_dbg(vcmdq->cmdqv->dev, "%sdeinited\n", h); } +/* This function is for LVCMDQ, so @vcmdq must be mapped prior */ static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq) { char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); @@ -404,14 +496,45 @@ static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq) return 0; } +/* Unmap a global VCMDQ from the pre-assigned LVCMDQ */ +static void tegra241_vcmdq_unmap_lvcmdq(struct tegra241_vcmdq *vcmdq) +{ + u32 regval = readl(REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx))); + char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); + + writel(regval & ~CMDQV_CMDQ_ALLOCATED, + REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx))); + dev_dbg(vcmdq->cmdqv->dev, "%sunmapped\n", h); +} + static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf) { - u16 lidx; + u16 lidx = vintf->cmdqv->num_lvcmdqs_per_vintf; + int sidx; - for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) - if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) + /* HW requires to unmap LVCMDQs in descending order */ + while (lidx--) { + if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) { tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]); + tegra241_vcmdq_unmap_lvcmdq(vintf->lvcmdqs[lidx]); + } + } vintf_write_config(vintf, 0); + for (sidx = 0; sidx < vintf->cmdqv->num_sids_per_vintf; sidx++) { + writel(0, REG_VINTF(vintf, SID_MATCH(sidx))); + writel(0, REG_VINTF(vintf, SID_REPLACE(sidx))); + } +} + +/* Map a global VCMDQ to the pre-assigned LVCMDQ */ +static void tegra241_vcmdq_map_lvcmdq(struct tegra241_vcmdq *vcmdq) +{ + u32 regval = readl(REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx))); + char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); + + writel(regval | CMDQV_CMDQ_ALLOCATED, + REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx))); + dev_dbg(vcmdq->cmdqv->dev, "%smapped\n", h); } static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) @@ -429,7 +552,8 @@ static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) * whether enabling it here or not, as !HYP_OWN cmdq HWs only support a * restricted set of supported commands. */ - regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own); + regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own) | + FIELD_PREP(VINTF_VMID, vintf->vsmmu.vmid); writel(regval, REG_VINTF(vintf, CONFIG)); ret = vintf_write_config(vintf, regval | VINTF_EN); @@ -441,8 +565,10 @@ static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) */ vintf->hyp_own = !!(VINTF_HYP_OWN & readl(REG_VINTF(vintf, CONFIG))); + /* HW requires to map LVCMDQs in ascending order */ for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) { if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) { + tegra241_vcmdq_map_lvcmdq(vintf->lvcmdqs[lidx]); ret = tegra241_vcmdq_hw_init(vintf->lvcmdqs[lidx]); if (ret) { tegra241_vintf_hw_deinit(vintf); @@ -476,7 +602,6 @@ static int tegra241_cmdqv_hw_reset(struct arm_smmu_device *smmu) for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) { regval = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, idx); regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, lidx); - regval |= CMDQV_CMDQ_ALLOCATED; writel_relaxed(regval, REG_CMDQV(cmdqv, CMDQ_ALLOC(qidx++))); } @@ -555,7 +680,9 @@ static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) dev_dbg(vintf->cmdqv->dev, "%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64)); - kfree(vcmdq); + /* Guest-owned VCMDQ is free-ed with hw_queue by iommufd core */ + if (vcmdq->vintf->hyp_own) + kfree(vcmdq); } static struct tegra241_vcmdq * @@ -628,28 +755,27 @@ static int tegra241_cmdqv_init_vintf(struct tegra241_cmdqv *cmdqv, u16 max_idx, /* Remove Helpers */ -static void tegra241_vintf_remove_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) -{ - tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]); - tegra241_vintf_free_lvcmdq(vintf, lidx); -} - static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx) { struct tegra241_vintf *vintf = cmdqv->vintfs[idx]; u16 lidx; + tegra241_vintf_hw_deinit(vintf); + /* Remove LVCMDQ resources */ for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) if (vintf->lvcmdqs[lidx]) - tegra241_vintf_remove_lvcmdq(vintf, lidx); - - /* Remove VINTF resources */ - tegra241_vintf_hw_deinit(vintf); + tegra241_vintf_free_lvcmdq(vintf, lidx); dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx); tegra241_cmdqv_deinit_vintf(cmdqv, idx); - kfree(vintf); + if (!vintf->hyp_own) { + mutex_destroy(&vintf->lvcmdq_mutex); + ida_destroy(&vintf->sids); + /* Guest-owned VINTF is free-ed with viommu by iommufd core */ + } else { + kfree(vintf); + } } static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) @@ -677,10 +803,45 @@ static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) put_device(cmdqv->dev); /* smmu->impl_dev */ } +static int +tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data); + +static void *tegra241_cmdqv_hw_info(struct arm_smmu_device *smmu, u32 *length, + enum iommu_hw_info_type *type) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + struct iommu_hw_info_tegra241_cmdqv *info; + u32 regval; + + if (*type != IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV) + return ERR_PTR(-EOPNOTSUPP); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM)); + info->log2vcmdqs = ilog2(cmdqv->num_lvcmdqs_per_vintf); + info->log2vsids = ilog2(cmdqv->num_sids_per_vintf); + info->version = FIELD_GET(CMDQV_VER, regval); + + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV; + return info; +} + static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = { + /* For in-kernel use */ .get_secondary_cmdq = tegra241_cmdqv_get_cmdq, .device_reset = tegra241_cmdqv_hw_reset, .device_remove = tegra241_cmdqv_remove, + /* For user-space use */ + .hw_info = tegra241_cmdqv_hw_info, + .vsmmu_size = VIOMMU_STRUCT_SIZE(struct tegra241_vintf, vsmmu.core), + .vsmmu_type = IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV, + .vsmmu_init = tegra241_cmdqv_init_vintf_user, }; /* Probe Functions */ @@ -822,10 +983,12 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, cmdqv->irq = irq; cmdqv->base = base; cmdqv->dev = smmu->impl_dev; + cmdqv->base_phys = res->start; if (cmdqv->irq > 0) { - ret = request_irq(irq, tegra241_cmdqv_isr, 0, "tegra241-cmdqv", - cmdqv); + ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr, + IRQF_ONESHOT, "tegra241-cmdqv", + cmdqv); if (ret) { dev_err(cmdqv->dev, "failed to request irq (%d): %d\n", cmdqv->irq, ret); @@ -837,6 +1000,8 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval); cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval); cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs; + cmdqv->num_sids_per_vintf = + 1 << FIELD_GET(CMDQV_NUM_SID_PER_VM_LOG2, regval); cmdqv->vintfs = kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL); @@ -890,3 +1055,279 @@ struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu) put_device(smmu->impl_dev); return ERR_PTR(-ENODEV); } + +/* User space VINTF and VCMDQ Functions */ + +static size_t tegra241_vintf_get_vcmdq_size(struct iommufd_viommu *viommu, + enum iommu_hw_queue_type queue_type) +{ + if (queue_type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV) + return 0; + return HW_QUEUE_STRUCT_SIZE(struct tegra241_vcmdq, core); +} + +static int tegra241_vcmdq_hw_init_user(struct tegra241_vcmdq *vcmdq) +{ + char header[64]; + + /* Configure the vcmdq only; User space does the enabling */ + writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE)); + + dev_dbg(vcmdq->cmdqv->dev, "%sinited at host PA 0x%llx size 0x%lx\n", + lvcmdq_error_header(vcmdq, header, 64), + vcmdq->cmdq.q.q_base & VCMDQ_ADDR, + 1UL << (vcmdq->cmdq.q.q_base & VCMDQ_LOG2SIZE)); + return 0; +} + +static void +tegra241_vintf_destroy_lvcmdq_user(struct iommufd_hw_queue *hw_queue) +{ + struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue); + + mutex_lock(&vcmdq->vintf->lvcmdq_mutex); + tegra241_vcmdq_hw_deinit(vcmdq); + tegra241_vcmdq_unmap_lvcmdq(vcmdq); + tegra241_vintf_free_lvcmdq(vcmdq->vintf, vcmdq->lidx); + if (vcmdq->prev) + iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core); + mutex_unlock(&vcmdq->vintf->lvcmdq_mutex); +} + +static int tegra241_vintf_alloc_lvcmdq_user(struct iommufd_hw_queue *hw_queue, + u32 lidx, phys_addr_t base_addr_pa) +{ + struct tegra241_vintf *vintf = viommu_to_vintf(hw_queue->viommu); + struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue); + struct tegra241_cmdqv *cmdqv = vintf->cmdqv; + struct arm_smmu_device *smmu = &cmdqv->smmu; + struct tegra241_vcmdq *prev = NULL; + u32 log2size, max_n_shift; + char header[64]; + int ret; + + if (hw_queue->type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV) + return -EOPNOTSUPP; + if (lidx >= cmdqv->num_lvcmdqs_per_vintf) + return -EINVAL; + + mutex_lock(&vintf->lvcmdq_mutex); + + if (vintf->lvcmdqs[lidx]) { + ret = -EEXIST; + goto unlock; + } + + /* + * HW requires to map LVCMDQs in ascending order, so reject if the + * previous lvcmdqs is not allocated yet. + */ + if (lidx) { + prev = vintf->lvcmdqs[lidx - 1]; + if (!prev) { + ret = -EIO; + goto unlock; + } + } + + /* + * hw_queue->length must be a power of 2, in range of + * [ 32, 2 ^ (idr[1].CMDQS + CMDQ_ENT_SZ_SHIFT) ] + */ + max_n_shift = FIELD_GET(IDR1_CMDQS, + readl_relaxed(smmu->base + ARM_SMMU_IDR1)); + if (!is_power_of_2(hw_queue->length) || hw_queue->length < 32 || + hw_queue->length > (1 << (max_n_shift + CMDQ_ENT_SZ_SHIFT))) { + ret = -EINVAL; + goto unlock; + } + log2size = ilog2(hw_queue->length) - CMDQ_ENT_SZ_SHIFT; + + /* base_addr_pa must be aligned to hw_queue->length */ + if (base_addr_pa & ~VCMDQ_ADDR || + base_addr_pa & (hw_queue->length - 1)) { + ret = -EINVAL; + goto unlock; + } + + /* + * HW requires to unmap LVCMDQs in descending order, so destroy() must + * follow this rule. Set a dependency on its previous LVCMDQ so iommufd + * core will help enforce it. + */ + if (prev) { + ret = iommufd_hw_queue_depend(vcmdq, prev, core); + if (ret) + goto unlock; + } + vcmdq->prev = prev; + + ret = tegra241_vintf_init_lvcmdq(vintf, lidx, vcmdq); + if (ret) + goto undepend_vcmdq; + + dev_dbg(cmdqv->dev, "%sallocated\n", + lvcmdq_error_header(vcmdq, header, 64)); + + tegra241_vcmdq_map_lvcmdq(vcmdq); + + vcmdq->cmdq.q.q_base = base_addr_pa & VCMDQ_ADDR; + vcmdq->cmdq.q.q_base |= log2size; + + ret = tegra241_vcmdq_hw_init_user(vcmdq); + if (ret) + goto unmap_lvcmdq; + + hw_queue->destroy = &tegra241_vintf_destroy_lvcmdq_user; + mutex_unlock(&vintf->lvcmdq_mutex); + return 0; + +unmap_lvcmdq: + tegra241_vcmdq_unmap_lvcmdq(vcmdq); + tegra241_vintf_deinit_lvcmdq(vintf, lidx); +undepend_vcmdq: + if (vcmdq->prev) + iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core); +unlock: + mutex_unlock(&vintf->lvcmdq_mutex); + return ret; +} + +static void tegra241_cmdqv_destroy_vintf_user(struct iommufd_viommu *viommu) +{ + struct tegra241_vintf *vintf = viommu_to_vintf(viommu); + + if (vintf->mmap_offset) + iommufd_viommu_destroy_mmap(&vintf->vsmmu.core, + vintf->mmap_offset); + tegra241_cmdqv_remove_vintf(vintf->cmdqv, vintf->idx); +} + +static void tegra241_vintf_destroy_vsid(struct iommufd_vdevice *vdev) +{ + struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev); + struct tegra241_vintf *vintf = vsid->vintf; + + writel(0, REG_VINTF(vintf, SID_MATCH(vsid->idx))); + writel(0, REG_VINTF(vintf, SID_REPLACE(vsid->idx))); + ida_free(&vintf->sids, vsid->idx); + dev_dbg(vintf->cmdqv->dev, + "VINTF%u: deallocated SID_REPLACE%d for pSID=%x\n", vintf->idx, + vsid->idx, vsid->sid); +} + +static int tegra241_vintf_init_vsid(struct iommufd_vdevice *vdev) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(vdev->dev); + struct tegra241_vintf *vintf = viommu_to_vintf(vdev->viommu); + struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev); + struct arm_smmu_stream *stream = &master->streams[0]; + u64 virt_sid = vdev->virt_id; + int sidx; + + if (virt_sid > UINT_MAX) + return -EINVAL; + + WARN_ON_ONCE(master->num_streams != 1); + + /* Find an empty pair of SID_REPLACE and SID_MATCH */ + sidx = ida_alloc_max(&vintf->sids, vintf->cmdqv->num_sids_per_vintf - 1, + GFP_KERNEL); + if (sidx < 0) + return sidx; + + writel(stream->id, REG_VINTF(vintf, SID_REPLACE(sidx))); + writel(virt_sid << 1 | 0x1, REG_VINTF(vintf, SID_MATCH(sidx))); + dev_dbg(vintf->cmdqv->dev, + "VINTF%u: allocated SID_REPLACE%d for pSID=%x, vSID=%x\n", + vintf->idx, sidx, stream->id, (u32)virt_sid); + + vsid->idx = sidx; + vsid->vintf = vintf; + vsid->sid = stream->id; + + vdev->destroy = &tegra241_vintf_destroy_vsid; + return 0; +} + +static struct iommufd_viommu_ops tegra241_cmdqv_viommu_ops = { + .destroy = tegra241_cmdqv_destroy_vintf_user, + .alloc_domain_nested = arm_vsmmu_alloc_domain_nested, + /* Non-accelerated commands will be still handled by the kernel */ + .cache_invalidate = arm_vsmmu_cache_invalidate, + .vdevice_size = VDEVICE_STRUCT_SIZE(struct tegra241_vintf_sid, core), + .vdevice_init = tegra241_vintf_init_vsid, + .get_hw_queue_size = tegra241_vintf_get_vcmdq_size, + .hw_queue_init_phys = tegra241_vintf_alloc_lvcmdq_user, +}; + +static int +tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data) +{ + struct tegra241_cmdqv *cmdqv = + container_of(vsmmu->smmu, struct tegra241_cmdqv, smmu); + struct tegra241_vintf *vintf = viommu_to_vintf(&vsmmu->core); + struct iommu_viommu_tegra241_cmdqv data; + phys_addr_t page0_base; + int ret; + + if (!user_data) + return -EINVAL; + + ret = iommu_copy_struct_from_user(&data, user_data, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV, + out_vintf_mmap_length); + if (ret) + return ret; + + ret = tegra241_cmdqv_init_vintf(cmdqv, cmdqv->num_vintfs - 1, vintf); + if (ret < 0) { + dev_err(cmdqv->dev, "no more available vintf\n"); + return ret; + } + + /* + * Initialize the user-owned VINTF without a LVCMDQ, as it cannot pre- + * allocate a LVCMDQ until user space wants one, for security reasons. + * It is different than the kernel-owned VINTF0, which had pre-assigned + * and pre-allocated global VCMDQs that would be mapped to the LVCMDQs + * by the tegra241_vintf_hw_init() call. + */ + ret = tegra241_vintf_hw_init(vintf, false); + if (ret) + goto deinit_vintf; + + page0_base = cmdqv->base_phys + TEGRA241_VINTFi_PAGE0(vintf->idx); + ret = iommufd_viommu_alloc_mmap(&vintf->vsmmu.core, page0_base, SZ_64K, + &vintf->mmap_offset); + if (ret) + goto hw_deinit_vintf; + + data.out_vintf_mmap_length = SZ_64K; + data.out_vintf_mmap_offset = vintf->mmap_offset; + ret = iommu_copy_struct_to_user(user_data, &data, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV, + out_vintf_mmap_length); + if (ret) + goto free_mmap; + + ida_init(&vintf->sids); + mutex_init(&vintf->lvcmdq_mutex); + + dev_dbg(cmdqv->dev, "VINTF%u: allocated with vmid (%d)\n", vintf->idx, + vintf->vsmmu.vmid); + + vsmmu->core.ops = &tegra241_cmdqv_viommu_ops; + return 0; + +free_mmap: + iommufd_viommu_destroy_mmap(&vintf->vsmmu.core, vintf->mmap_offset); +hw_deinit_vintf: + tegra241_vintf_hw_deinit(vintf); +deinit_vintf: + tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx); + return ret; +} + +MODULE_IMPORT_NS("IOMMUFD"); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index de205a34ffc6d..ea9f8e484e354 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1486,7 +1486,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) out_cfg_free: kfree(cfg); out_free: - iommu_fwspec_free(dev); return ERR_PTR(ret); } diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f1580bd8a42a2..8cc5397d7dfc1 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -41,11 +42,6 @@ struct iommu_dma_msi_page { phys_addr_t phys; }; -enum iommu_dma_cookie_type { - IOMMU_DMA_IOVA_COOKIE, - IOMMU_DMA_MSI_COOKIE, -}; - enum iommu_dma_queue_type { IOMMU_DMA_OPTS_PER_CPU_QUEUE, IOMMU_DMA_OPTS_SINGLE_QUEUE, @@ -58,35 +54,31 @@ struct iommu_dma_options { }; struct iommu_dma_cookie { - enum iommu_dma_cookie_type type; + struct iova_domain iovad; + struct list_head msi_page_list; + /* Flush queue */ union { - /* Full allocator for IOMMU_DMA_IOVA_COOKIE */ - struct { - struct iova_domain iovad; - /* Flush queue */ - union { - struct iova_fq *single_fq; - struct iova_fq __percpu *percpu_fq; - }; - /* Number of TLB flushes that have been started */ - atomic64_t fq_flush_start_cnt; - /* Number of TLB flushes that have been finished */ - atomic64_t fq_flush_finish_cnt; - /* Timer to regularily empty the flush queues */ - struct timer_list fq_timer; - /* 1 when timer is active, 0 when not */ - atomic_t fq_timer_on; - }; - /* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */ - dma_addr_t msi_iova; + struct iova_fq *single_fq; + struct iova_fq __percpu *percpu_fq; }; - struct list_head msi_page_list; - + /* Number of TLB flushes that have been started */ + atomic64_t fq_flush_start_cnt; + /* Number of TLB flushes that have been finished */ + atomic64_t fq_flush_finish_cnt; + /* Timer to regularily empty the flush queues */ + struct timer_list fq_timer; + /* 1 when timer is active, 0 when not */ + atomic_t fq_timer_on; /* Domain for flush queue callback; NULL if flush queue not in use */ - struct iommu_domain *fq_domain; + struct iommu_domain *fq_domain; /* Options for dma-iommu use */ - struct iommu_dma_options options; - struct mutex mutex; + struct iommu_dma_options options; + struct mutex mutex; +}; + +struct iommu_dma_msi_cookie { + dma_addr_t msi_iova; + struct list_head msi_page_list; }; static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled); @@ -365,39 +357,25 @@ int iommu_dma_init_fq(struct iommu_domain *domain) return 0; } -static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie) -{ - if (cookie->type == IOMMU_DMA_IOVA_COOKIE) - return cookie->iovad.granule; - return PAGE_SIZE; -} - -static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type) -{ - struct iommu_dma_cookie *cookie; - - cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); - if (cookie) { - INIT_LIST_HEAD(&cookie->msi_page_list); - cookie->type = type; - } - return cookie; -} - /** * iommu_get_dma_cookie - Acquire DMA-API resources for a domain * @domain: IOMMU domain to prepare for DMA-API usage */ int iommu_get_dma_cookie(struct iommu_domain *domain) { - if (domain->iova_cookie) + struct iommu_dma_cookie *cookie; + + if (domain->cookie_type != IOMMU_COOKIE_NONE) return -EEXIST; - domain->iova_cookie = cookie_alloc(IOMMU_DMA_IOVA_COOKIE); - if (!domain->iova_cookie) + cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); + if (!cookie) return -ENOMEM; - mutex_init(&domain->iova_cookie->mutex); + mutex_init(&cookie->mutex); + INIT_LIST_HEAD(&cookie->msi_page_list); + domain->cookie_type = IOMMU_COOKIE_DMA_IOVA; + domain->iova_cookie = cookie; return 0; } @@ -415,48 +393,56 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) */ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) { - struct iommu_dma_cookie *cookie; + struct iommu_dma_msi_cookie *cookie; if (domain->type != IOMMU_DOMAIN_UNMANAGED) return -EINVAL; - if (domain->iova_cookie) + if (domain->cookie_type != IOMMU_COOKIE_NONE) return -EEXIST; - cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE); + cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); if (!cookie) return -ENOMEM; cookie->msi_iova = base; - domain->iova_cookie = cookie; + INIT_LIST_HEAD(&cookie->msi_page_list); + domain->cookie_type = IOMMU_COOKIE_DMA_MSI; + domain->msi_cookie = cookie; return 0; } EXPORT_SYMBOL(iommu_get_msi_cookie); /** * iommu_put_dma_cookie - Release a domain's DMA mapping resources - * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() or - * iommu_get_msi_cookie() + * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() */ void iommu_put_dma_cookie(struct iommu_domain *domain) { struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; - if (!cookie) - return; - - if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule) { + if (cookie->iovad.granule) { iommu_dma_free_fq(cookie); put_iova_domain(&cookie->iovad); } + list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) + kfree(msi); + kfree(cookie); +} - list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) { - list_del(&msi->list); +/** + * iommu_put_msi_cookie - Release a domain's MSI mapping resources + * @domain: IOMMU domain previously prepared by iommu_get_msi_cookie() + */ +void iommu_put_msi_cookie(struct iommu_domain *domain) +{ + struct iommu_dma_msi_cookie *cookie = domain->msi_cookie; + struct iommu_dma_msi_page *msi, *tmp; + + list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) kfree(msi); - } kfree(cookie); - domain->iova_cookie = NULL; } /** @@ -676,7 +662,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev struct iova_domain *iovad; int ret; - if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE) + if (!cookie || domain->cookie_type != IOMMU_COOKIE_DMA_IOVA) return -EINVAL; iovad = &cookie->iovad; @@ -766,9 +752,9 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, struct iova_domain *iovad = &cookie->iovad; unsigned long shift, iova_len, iova; - if (cookie->type == IOMMU_DMA_MSI_COOKIE) { - cookie->msi_iova += size; - return cookie->msi_iova - size; + if (domain->cookie_type == IOMMU_COOKIE_DMA_MSI) { + domain->msi_cookie->msi_iova += size; + return domain->msi_cookie->msi_iova - size; } shift = iova_shift(iovad); @@ -805,16 +791,16 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, return (dma_addr_t)iova << shift; } -static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, - dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather) +static void iommu_dma_free_iova(struct iommu_domain *domain, dma_addr_t iova, + size_t size, struct iommu_iotlb_gather *gather) { - struct iova_domain *iovad = &cookie->iovad; + struct iova_domain *iovad = &domain->iova_cookie->iovad; /* The MSI case is only ever cleaning up its most recent allocation */ - if (cookie->type == IOMMU_DMA_MSI_COOKIE) - cookie->msi_iova -= size; + if (domain->cookie_type == IOMMU_COOKIE_DMA_MSI) + domain->msi_cookie->msi_iova -= size; else if (gather && gather->queued) - queue_iova(cookie, iova_pfn(iovad, iova), + queue_iova(domain->iova_cookie, iova_pfn(iovad, iova), size >> iova_shift(iovad), &gather->freelist); else @@ -842,7 +828,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, if (!iotlb_gather.queued) iommu_iotlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); + iommu_dma_free_iova(domain, dma_addr, size, &iotlb_gather); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, @@ -870,7 +856,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, return DMA_MAPPING_ERROR; if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) { - iommu_dma_free_iova(cookie, iova, size, NULL); + iommu_dma_free_iova(domain, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; @@ -1007,7 +993,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, out_free_sg: sg_free_table(sgt); out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + iommu_dma_free_iova(domain, iova, size, NULL); out_free_pages: __iommu_dma_free_pages(pages, count); return NULL; @@ -1484,7 +1470,7 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return __finalise_sg(dev, sg, nents, iova); out_free_iova: - iommu_dma_free_iova(cookie, iova, iova_len, NULL); + iommu_dma_free_iova(domain, iova, iova_len, NULL); out_restore_sg: __invalidate_sg(sg, nents); out: @@ -1762,17 +1748,47 @@ void iommu_setup_dma_ops(struct device *dev) dev->dma_iommu = false; } +static bool has_msi_cookie(const struct iommu_domain *domain) +{ + return domain && (domain->cookie_type == IOMMU_COOKIE_DMA_IOVA || + domain->cookie_type == IOMMU_COOKIE_DMA_MSI); +} + +static size_t cookie_msi_granule(const struct iommu_domain *domain) +{ + switch (domain->cookie_type) { + case IOMMU_COOKIE_DMA_IOVA: + return domain->iova_cookie->iovad.granule; + case IOMMU_COOKIE_DMA_MSI: + return PAGE_SIZE; + default: + BUG(); + }; +} + +static struct list_head *cookie_msi_pages(const struct iommu_domain *domain) +{ + switch (domain->cookie_type) { + case IOMMU_COOKIE_DMA_IOVA: + return &domain->iova_cookie->msi_page_list; + case IOMMU_COOKIE_DMA_MSI: + return &domain->msi_cookie->msi_page_list; + default: + BUG(); + }; +} + static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, phys_addr_t msi_addr, struct iommu_domain *domain) { - struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct list_head *msi_page_list = cookie_msi_pages(domain); struct iommu_dma_msi_page *msi_page; dma_addr_t iova; int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; - size_t size = cookie_msi_granule(cookie); + size_t size = cookie_msi_granule(domain); msi_addr &= ~(phys_addr_t)(size - 1); - list_for_each_entry(msi_page, &cookie->msi_page_list, list) + list_for_each_entry(msi_page, msi_page_list, list) if (msi_page->phys == msi_addr) return msi_page; @@ -1790,84 +1806,35 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, INIT_LIST_HEAD(&msi_page->list); msi_page->phys = msi_addr; msi_page->iova = iova; - list_add(&msi_page->list, &cookie->msi_page_list); + list_add(&msi_page->list, msi_page_list); return msi_page; out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + iommu_dma_free_iova(domain, iova, size, NULL); out_free_page: kfree(msi_page); return NULL; } -/* - * Nested domains may not have an MSI cookie or accept mappings, but they may - * be related to a domain which does, so we let them tell us what they need. - */ -static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev) -{ - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - - if (domain && domain->type == IOMMU_DOMAIN_NESTED && - domain->ops && domain->ops->get_msi_mapping_domain) - domain = domain->ops->get_msi_mapping_domain(domain); - return domain; -} - -/** - * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain - * @desc: MSI descriptor, will store the MSI page - * @msi_addr: MSI target address to be mapped - * - * Return: 0 on success or negative error code if the mapping failed. - */ -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); - struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev); - struct iommu_dma_msi_page *msi_page; - static DEFINE_MUTEX(msi_prepare_lock); /* see below */ + const struct iommu_dma_msi_page *msi_page; - if (!domain || !domain->iova_cookie) { - desc->iommu_cookie = NULL; + if (!has_msi_cookie(domain)) { + msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } - /* - * In fact the whole prepare operation should already be serialised by - * irq_domain_mutex further up the callchain, but that's pretty subtle - * on its own, so consider this locking as failsafe documentation... - */ - mutex_lock(&msi_prepare_lock); + iommu_group_mutex_assert(dev); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); - mutex_unlock(&msi_prepare_lock); - - msi_desc_set_iommu_cookie(desc, msi_page); - if (!msi_page) return -ENOMEM; - return 0; -} -/** - * iommu_dma_compose_msi_msg() - Apply translation to an MSI message - * @desc: MSI descriptor prepared by iommu_dma_prepare_msi() - * @msg: MSI message containing target physical address - */ -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ - struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev); - const struct iommu_dma_msi_page *msi_page; - - msi_page = msi_desc_get_iommu_cookie(desc); - - if (!domain || !domain->iova_cookie || WARN_ON(!msi_page)) - return; - - msg->address_hi = upper_32_bits(msi_page->iova); - msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1; - msg->address_lo += lower_32_bits(msi_page->iova); + msi_desc_set_iommu_msi_iova(desc, msi_page->iova, + ilog2(cookie_msi_granule(domain))); + return 0; } static int iommu_dma_init(void) diff --git a/drivers/iommu/dma-iommu.h b/drivers/iommu/dma-iommu.h index c12d63457c764..eca201c1f9639 100644 --- a/drivers/iommu/dma-iommu.h +++ b/drivers/iommu/dma-iommu.h @@ -13,11 +13,15 @@ void iommu_setup_dma_ops(struct device *dev); int iommu_get_dma_cookie(struct iommu_domain *domain); void iommu_put_dma_cookie(struct iommu_domain *domain); +void iommu_put_msi_cookie(struct iommu_domain *domain); int iommu_dma_init_fq(struct iommu_domain *domain); void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); +int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + extern bool iommu_dma_forcedac; #else /* CONFIG_IOMMU_DMA */ @@ -40,9 +44,19 @@ static inline void iommu_put_dma_cookie(struct iommu_domain *domain) { } +static inline void iommu_put_msi_cookie(struct iommu_domain *domain) +{ +} + static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { } +static inline int iommu_dma_sw_msi(struct iommu_domain *domain, + struct msi_desc *desc, phys_addr_t msi_addr) +{ + return -ENODEV; +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */ diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6ceed4b3091e7..16cd6dff4c513 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3374,7 +3374,8 @@ intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, bool first_stage; if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID))) return ERR_PTR(-EOPNOTSUPP); if (nested_parent && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); @@ -4230,12 +4231,17 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, return ret; } -static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) +static void *intel_iommu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct iommu_hw_info_vtd *vtd; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_INTEL_VTD) + return ERR_PTR(-EOPNOTSUPP); + vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); if (!vtd) return ERR_PTR(-ENOMEM); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index e80e32a0e973f..1e149169ee77b 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -198,7 +198,7 @@ intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, struct dmar_domain *domain; int ret; - if (!nested_supported(iommu) || flags) + if (!nested_supported(iommu) || flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); /* Must be nested domain */ diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index de5b54eaa8bf1..e236b932e7668 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -5,6 +5,7 @@ #define __LINUX_IOMMU_PRIV_H #include +#include static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) { @@ -17,6 +18,8 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } +void dev_iommu_free(struct device *dev); + const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwspec) @@ -24,8 +27,7 @@ static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp return iommu_ops_from_fwnode(fwspec ? fwspec->iommu_fwnode : NULL); } -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain); +void iommu_fwspec_free(struct device *dev); int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, @@ -46,4 +48,19 @@ void iommu_detach_group_handle(struct iommu_domain *domain, int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, struct iommu_attach_handle *handle); + +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE) && IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); +#else /* !CONFIG_IOMMUFD_DRIVER_CORE || !CONFIG_IRQ_MSI_IOMMU */ +static inline int iommufd_sw_msi(struct iommu_domain *domain, + struct msi_desc *desc, phys_addr_t msi_addr) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_IOMMUFD_DRIVER_CORE && CONFIG_IRQ_MSI_IOMMU */ + +int iommu_replace_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle); #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 503c5d23c1ea2..9dcf2014da4d9 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -299,17 +299,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; - if (ops->domain_alloc_sva) { - domain = ops->domain_alloc_sva(dev, mm); - if (IS_ERR(domain)) - return domain; - } else { - domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); - if (!domain) - return ERR_PTR(-ENOMEM); - } + if (!ops->domain_alloc_sva) + return ERR_PTR(-EOPNOTSUPP); + + domain = ops->domain_alloc_sva(dev, mm); + if (IS_ERR(domain)) + return domain; domain->type = IOMMU_DOMAIN_SVA; + domain->cookie_type = IOMMU_COOKIE_SVA; mmgrab(mm); domain->mm = mm; domain->owner = ops; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b3235cb3220ce..c0430bb1c73ab 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,9 @@ static unsigned int iommu_def_domain_type __read_mostly; static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; +/* Tags used with xa_tag_pointer() in group->pasid_array */ +enum { IOMMU_PASID_ARRAY_DOMAIN = 0, IOMMU_PASID_ARRAY_HANDLE = 1 }; + struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; @@ -360,7 +364,7 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) return param; } -static void dev_iommu_free(struct device *dev) +void dev_iommu_free(struct device *dev) { struct dev_iommu *param = dev->iommu; @@ -521,6 +525,13 @@ static void iommu_deinit_device(struct device *dev) #endif } +static struct iommu_domain *pasid_array_entry_to_domain(void *entry) +{ + if (xa_pointer_tag(entry) == IOMMU_PASID_ARRAY_DOMAIN) + return xa_untag_pointer(entry); + return ((struct iommu_attach_handle *)xa_untag_pointer(entry))->domain; +} + DEFINE_MUTEX(iommu_probe_device_lock); static int __iommu_probe_device(struct device *dev, struct list_head *group_list) @@ -1772,15 +1783,13 @@ static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev) if (ops->identity_domain) return ops->identity_domain; - /* Older drivers create the identity domain via ops->domain_alloc() */ - if (!ops->domain_alloc) + if (ops->domain_alloc_identity) { + domain = ops->domain_alloc_identity(dev); + if (IS_ERR(domain)) + return domain; + } else { return ERR_PTR(-EOPNOTSUPP); - - domain = ops->domain_alloc(IOMMU_DOMAIN_IDENTITY); - if (IS_ERR(domain)) - return domain; - if (!domain) - return ERR_PTR(-ENOMEM); + } iommu_domain_init(domain, IOMMU_DOMAIN_IDENTITY, ops); return domain; @@ -2127,8 +2136,10 @@ void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token) { - BUG_ON(!domain); + if (WARN_ON(!domain || domain->cookie_type != IOMMU_COOKIE_NONE)) + return; + domain->cookie_type = IOMMU_COOKIE_FAULT_HANDLER; domain->handler = handler; domain->handler_token = token; } @@ -2166,8 +2177,10 @@ __iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, domain = ops->domain_alloc_paging(dev); else if (ops->domain_alloc_paging_flags) domain = ops->domain_alloc_paging_flags(dev, flags, NULL); +#if IS_ENABLED(CONFIG_FSL_PAMU) else if (ops->domain_alloc && !flags) domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED); +#endif else return ERR_PTR(-EOPNOTSUPP); @@ -2198,9 +2211,19 @@ EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags); void iommu_domain_free(struct iommu_domain *domain) { - if (domain->type == IOMMU_DOMAIN_SVA) + switch (domain->cookie_type) { + case IOMMU_COOKIE_DMA_IOVA: + iommu_put_dma_cookie(domain); + break; + case IOMMU_COOKIE_DMA_MSI: + iommu_put_msi_cookie(domain); + break; + case IOMMU_COOKIE_SVA: mmdrop(domain->mm); - iommu_put_dma_cookie(domain); + break; + default: + break; + } if (domain->ops->free) domain->ops->free(domain); } @@ -2324,6 +2347,17 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev) return dev->iommu_group->default_domain; } +static void *iommu_make_pasid_array_entry(struct iommu_domain *domain, + struct iommu_attach_handle *handle) +{ + if (handle) { + handle->domain = domain; + return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE); + } + + return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN); +} + static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { @@ -2364,32 +2398,6 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_attach_group); -/** - * iommu_group_replace_domain - replace the domain that a group is attached to - * @group: IOMMU group that will be attached to the new domain - * @new_domain: new IOMMU domain to replace with - * - * This API allows the group to switch domains without being forced to go to - * the blocking domain in-between. - * - * If the currently attached domain is a core domain (e.g. a default_domain), - * it will act just like the iommu_attach_group(). - */ -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain) -{ - int ret; - - if (!new_domain) - return -EINVAL; - - mutex_lock(&group->mutex); - ret = __iommu_group_set_domain(group, new_domain); - mutex_unlock(&group->mutex); - return ret; -} -EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, "IOMMUFD_INTERNAL"); - static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, @@ -2866,7 +2874,8 @@ int report_iommu_fault(struct iommu_domain *domain, struct device *dev, * if upper layers showed interest and installed a fault handler, * invoke it. */ - if (domain->handler) + if (domain->cookie_type == IOMMU_COOKIE_FAULT_HANDLER && + domain->handler) ret = domain->handler(domain, dev, iova, flags, domain->handler_token); @@ -3026,7 +3035,6 @@ void iommu_fwspec_free(struct device *dev) dev_iommu_fwspec_set(dev, NULL); } } -EXPORT_SYMBOL_GPL(iommu_fwspec_free); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) { @@ -3505,7 +3513,8 @@ static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, } static int __iommu_set_group_pasid(struct iommu_domain *domain, - struct iommu_group *group, ioasid_t pasid) + struct iommu_group *group, ioasid_t pasid, + struct iommu_domain *old) { struct group_device *device, *last_gdev; int ret; @@ -3513,7 +3522,7 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, for_each_group_device(group, device) { if (device->dev->iommu->max_pasids > 0) { ret = domain->ops->set_dev_pasid(domain, device->dev, - pasid, NULL); + pasid, old); if (ret) goto err_revert; } @@ -3526,8 +3535,18 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, for_each_group_device(group, device) { if (device == last_gdev) break; - if (device->dev->iommu->max_pasids > 0) - iommu_remove_dev_pasid(device->dev, pasid, domain); + if (device->dev->iommu->max_pasids > 0) { + /* + * If no old domain, undo the succeeded devices/pasid. + * Otherwise, rollback the succeeded devices/pasid to + * the old domain. And it is a driver bug to fail + * attaching with a previously good domain. + */ + if (!old || + WARN_ON(old->ops->set_dev_pasid(old, device->dev, + pasid, domain))) + iommu_remove_dev_pasid(device->dev, pasid, domain); + } } return ret; } @@ -3551,6 +3570,9 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, * @pasid: the pasid of the device. * @handle: the attach handle. * + * Caller should always provide a new handle to avoid race with the paths + * that have lockless reference to handle if it intends to pass a valid handle. + * * Return: 0 on success, or an error. */ int iommu_attach_device_pasid(struct iommu_domain *domain, @@ -3561,6 +3583,7 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, struct iommu_group *group = dev->iommu_group; struct group_device *device; const struct iommu_ops *ops; + void *entry; int ret; if (!group) @@ -3590,22 +3613,128 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, } } - if (handle) - handle->domain = domain; + entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, pasid, handle, GFP_KERNEL); + /* + * Entry present is a failure case. Use xa_insert() instead of + * xa_reserve(). + */ + ret = xa_insert(&group->pasid_array, pasid, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) goto out_unlock; - ret = __iommu_set_group_pasid(domain, group, pasid); - if (ret) - xa_erase(&group->pasid_array, pasid); + ret = __iommu_set_group_pasid(domain, group, pasid, NULL); + if (ret) { + xa_release(&group->pasid_array, pasid); + goto out_unlock; + } + + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail. The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + pasid, entry, GFP_KERNEL))); + out_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); +/** + * iommu_replace_device_pasid - Replace the domain that a specific pasid + * of the device is attached to + * @domain: the new iommu domain + * @dev: the attached device. + * @pasid: the pasid of the device. + * @handle: the attach handle. + * + * This API allows the pasid to switch domains. The @pasid should have been + * attached. Otherwise, this fails. The pasid will keep the old configuration + * if replacement failed. + * + * Caller should always provide a new handle to avoid race with the paths + * that have lockless reference to handle if it intends to pass a valid handle. + * + * Return 0 on success, or an error. + */ +int iommu_replace_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle) +{ + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; + struct iommu_attach_handle *entry; + struct iommu_domain *curr_domain; + void *curr; + int ret; + + if (!group) + return -ENODEV; + + if (!domain->ops->set_dev_pasid) + return -EOPNOTSUPP; + + if (dev_iommu_ops(dev) != domain->owner || + pasid == IOMMU_NO_PASID || !handle) + return -EINVAL; + + mutex_lock(&group->mutex); + entry = iommu_make_pasid_array_entry(domain, handle); + curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, + XA_ZERO_ENTRY, GFP_KERNEL); + if (xa_is_err(curr)) { + ret = xa_err(curr); + goto out_unlock; + } + + /* + * No domain (with or without handle) attached, hence not + * a replace case. + */ + if (!curr) { + xa_release(&group->pasid_array, pasid); + ret = -EINVAL; + goto out_unlock; + } + + /* + * Reusing handle is problematic as there are paths that refers + * the handle without lock. To avoid race, reject the callers that + * attempt it. + */ + if (curr == entry) { + WARN_ON(1); + ret = -EINVAL; + goto out_unlock; + } + + curr_domain = pasid_array_entry_to_domain(curr); + ret = 0; + + if (curr_domain != domain) { + ret = __iommu_set_group_pasid(domain, group, + pasid, curr_domain); + if (ret) + goto out_unlock; + } + + /* + * The above xa_cmpxchg() reserved the memory, and the + * group->mutex is held, this cannot fail. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + pasid, entry, GFP_KERNEL))); + +out_unlock: + mutex_unlock(&group->mutex); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, "IOMMUFD_INTERNAL"); + /* * iommu_detach_device_pasid() - Detach the domain from pasid of device * @domain: the iommu domain. @@ -3673,13 +3802,17 @@ struct iommu_attach_handle * iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type) { struct iommu_attach_handle *handle; + void *entry; xa_lock(&group->pasid_array); - handle = xa_load(&group->pasid_array, pasid); - if (!handle) + entry = xa_load(&group->pasid_array, pasid); + if (!entry || xa_pointer_tag(entry) != IOMMU_PASID_ARRAY_HANDLE) { handle = ERR_PTR(-ENOENT); - else if (type && handle->domain->type != type) - handle = ERR_PTR(-EBUSY); + } else { + handle = xa_untag_pointer(entry); + if (type && handle->domain->type != type) + handle = ERR_PTR(-EBUSY); + } xa_unlock(&group->pasid_array); return handle; @@ -3697,30 +3830,43 @@ EXPORT_SYMBOL_NS_GPL(iommu_attach_handle_get, "IOMMUFD_INTERNAL"); * This is a variant of iommu_attach_group(). It allows the caller to provide * an attach handle and use it when the domain is attached. This is currently * used by IOMMUFD to deliver the I/O page faults. + * + * Caller should always provide a new handle to avoid race with the paths + * that have lockless reference to handle. */ int iommu_attach_group_handle(struct iommu_domain *domain, struct iommu_group *group, struct iommu_attach_handle *handle) { + void *entry; int ret; - if (handle) - handle->domain = domain; + if (!handle) + return -EINVAL; mutex_lock(&group->mutex); - ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + entry = iommu_make_pasid_array_entry(domain, handle); + ret = xa_insert(&group->pasid_array, + IOMMU_NO_PASID, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) - goto err_unlock; + goto out_unlock; ret = __iommu_attach_group(domain, group); - if (ret) - goto err_erase; - mutex_unlock(&group->mutex); + if (ret) { + xa_release(&group->pasid_array, IOMMU_NO_PASID); + goto out_unlock; + } - return 0; -err_erase: - xa_erase(&group->pasid_array, IOMMU_NO_PASID); -err_unlock: + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail. The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + IOMMU_NO_PASID, entry, GFP_KERNEL))); + +out_unlock: mutex_unlock(&group->mutex); return ret; } @@ -3750,33 +3896,37 @@ EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, "IOMMUFD_INTERNAL"); * @new_domain: new IOMMU domain to replace with * @handle: attach handle * - * This is a variant of iommu_group_replace_domain(). It allows the caller to - * provide an attach handle for the new domain and use it when the domain is - * attached. + * This API allows the group to switch domains without being forced to go to + * the blocking domain in-between. It allows the caller to provide an attach + * handle for the new domain and use it when the domain is attached. + * + * If the currently attached domain is a core domain (e.g. a default_domain), + * it will act just like the iommu_attach_group_handle(). + * + * Caller should always provide a new handle to avoid race with the paths + * that have lockless reference to handle. */ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, struct iommu_attach_handle *handle) { - void *curr; + void *curr, *entry; int ret; - if (!new_domain) + if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); - if (handle) { - ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); - if (ret) - goto err_unlock; - handle->domain = new_domain; - } + entry = iommu_make_pasid_array_entry(new_domain, handle); + ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); + if (ret) + goto err_unlock; ret = __iommu_group_set_domain(group, new_domain); if (ret) goto err_release; - curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); WARN_ON(xa_is_err(curr)); mutex_unlock(&group->mutex); @@ -3789,3 +3939,45 @@ int iommu_replace_group_handle(struct iommu_group *group, return ret; } EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) +/** + * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain + * @desc: MSI descriptor, will store the MSI page + * @msi_addr: MSI target address to be mapped + * + * The implementation of sw_msi() should take msi_addr and map it to + * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the + * mapping information. + * + * Return: 0 on success or negative error code if the mapping failed. + */ +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommu_group *group = dev->iommu_group; + int ret = 0; + + if (!group) + return 0; + + mutex_lock(&group->mutex); + /* An IDENTITY domain must pass through */ + if (group->domain && group->domain->type != IOMMU_DOMAIN_IDENTITY) { + switch (group->domain->cookie_type) { + case IOMMU_COOKIE_DMA_MSI: + case IOMMU_COOKIE_DMA_IOVA: + ret = iommu_dma_sw_msi(group->domain, desc, msi_addr); + break; + case IOMMU_COOKIE_IOMMUFD: + ret = iommufd_sw_msi(group->domain, desc, msi_addr); + break; + default: + ret = -EOPNOTSUPP; + break; + } + } + mutex_unlock(&group->mutex); + return ret; +} +#endif /* CONFIG_IRQ_MSI_IOMMU */ diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 0a07f9449fd9c..2beeb4f60ee53 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config IOMMUFD_DRIVER_CORE - tristate + bool default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n config IOMMUFD diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index cb784da6cddca..71d692c9a8f49 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ device.o \ - fault.o \ + eventq.o \ hw_pagetable.o \ io_pagetable.o \ ioas.o \ diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 3c7800d4ab622..72916f2fd38e6 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include @@ -17,12 +18,17 @@ MODULE_PARM_DESC( "Allow IOMMUFD to bind to devices even if the platform cannot isolate " "the MSI interrupt window. Enabling this is a security weakness."); +struct iommufd_attach { + struct iommufd_hw_pagetable *hwpt; + struct xarray device_array; +}; + static void iommufd_group_release(struct kref *kref) { struct iommufd_group *igroup = container_of(kref, struct iommufd_group, ref); - WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list)); + WARN_ON(!xa_empty(&igroup->pasid_attach)); xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup, NULL, GFP_KERNEL); @@ -89,7 +95,7 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, kref_init(&new_igroup->ref); mutex_init(&new_igroup->lock); - INIT_LIST_HEAD(&new_igroup->device_list); + xa_init(&new_igroup->pasid_attach); new_igroup->sw_msi_start = PHYS_ADDR_MAX; /* group reference moves into new_igroup */ new_igroup->group = group; @@ -293,56 +299,83 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD"); +static unsigned int iommufd_group_device_num(struct iommufd_group *igroup, + ioasid_t pasid) +{ + struct iommufd_attach *attach; + struct iommufd_device *idev; + unsigned int count = 0; + unsigned long index; + + lockdep_assert_held(&igroup->lock); + + attach = xa_load(&igroup->pasid_attach, pasid); + if (attach) + xa_for_each(&attach->device_array, index, idev) + count++; + return count; +} + +#ifdef CONFIG_IRQ_MSI_IOMMU static int iommufd_group_setup_msi(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { - phys_addr_t sw_msi_start = igroup->sw_msi_start; - int rc; + struct iommufd_ctx *ictx = igroup->ictx; + struct iommufd_sw_msi_map *cur; + + if (igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; /* - * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to - * call iommu_get_msi_cookie() on its behalf. This is necessary to setup - * the MSI window so iommu_dma_prepare_msi() can install pages into our - * domain after request_irq(). If it is not done interrupts will not - * work on this domain. - * - * FIXME: This is conceptually broken for iommufd since we want to allow - * userspace to change the domains, eg switch from an identity IOAS to a - * DMA IOAS. There is currently no way to create a MSI window that - * matches what the IRQ layer actually expects in a newly created - * domain. + * Install all the MSI pages the device has been using into the domain */ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt_paging->common.domain, - sw_msi_start); + guard(mutex)(&ictx->sw_msi_lock); + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + int rc; + + if (cur->sw_msi_start != igroup->sw_msi_start || + !test_bit(cur->id, igroup->required_sw_msi.bitmap)) + continue; + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur); if (rc) return rc; - - /* - * iommu_get_msi_cookie() can only be called once per domain, - * it returns -EBUSY on later calls. - */ - hwpt_paging->msi_cookie = true; } return 0; } +#else +static inline int +iommufd_group_setup_msi(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) +{ + return 0; +} +#endif + +static bool +iommufd_group_first_attach(struct iommufd_group *igroup, ioasid_t pasid) +{ + lockdep_assert_held(&igroup->lock); + return !xa_load(&igroup->pasid_attach, pasid); +} static int iommufd_device_attach_reserved_iova(struct iommufd_device *idev, struct iommufd_hwpt_paging *hwpt_paging) { + struct iommufd_group *igroup = idev->igroup; int rc; - lockdep_assert_held(&idev->igroup->lock); + lockdep_assert_held(&igroup->lock); rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt, idev->dev, - &idev->igroup->sw_msi_start); + &igroup->sw_msi_start); if (rc) return rc; - if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging); + if (iommufd_group_first_attach(igroup, IOMMU_NO_PASID)) { + rc = iommufd_group_setup_msi(igroup, hwpt_paging); if (rc) { iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); @@ -354,24 +387,54 @@ iommufd_device_attach_reserved_iova(struct iommufd_device *idev, /* The device attach/detach/replace helpers for attach_handle */ -/* Check if idev is attached to igroup->hwpt */ -static bool iommufd_device_is_attached(struct iommufd_device *idev) +static bool iommufd_device_is_attached(struct iommufd_device *idev, + ioasid_t pasid) { - struct iommufd_device *cur; + struct iommufd_attach *attach; - list_for_each_entry(cur, &idev->igroup->device_list, group_item) - if (cur == idev) - return true; - return false; + attach = xa_load(&idev->igroup->pasid_attach, pasid); + return xa_load(&attach->device_array, idev->obj.id); +} + +static int iommufd_hwpt_pasid_compat(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev, + ioasid_t pasid) +{ + struct iommufd_group *igroup = idev->igroup; + + lockdep_assert_held(&igroup->lock); + + if (pasid == IOMMU_NO_PASID) { + unsigned long start = IOMMU_NO_PASID; + + if (!hwpt->pasid_compat && + xa_find_after(&igroup->pasid_attach, + &start, UINT_MAX, XA_PRESENT)) + return -EINVAL; + } else { + struct iommufd_attach *attach; + + if (!hwpt->pasid_compat) + return -EINVAL; + + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + if (attach && attach->hwpt && !attach->hwpt->pasid_compat) + return -EINVAL; + } + + return 0; } static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) + struct iommufd_device *idev, + ioasid_t pasid) { struct iommufd_attach_handle *handle; int rc; - lockdep_assert_held(&idev->igroup->lock); + rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid); + if (rc) + return rc; handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) @@ -384,8 +447,12 @@ static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, } handle->idev = idev; - rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, - &handle->handle); + if (pasid == IOMMU_NO_PASID) + rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, + &handle->handle); + else + rc = iommu_attach_device_pasid(hwpt->domain, idev->dev, pasid, + &handle->handle); if (rc) goto out_disable_iopf; @@ -400,26 +467,30 @@ static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, } static struct iommufd_attach_handle * -iommufd_device_get_attach_handle(struct iommufd_device *idev) +iommufd_device_get_attach_handle(struct iommufd_device *idev, ioasid_t pasid) { struct iommu_attach_handle *handle; lockdep_assert_held(&idev->igroup->lock); - handle = - iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); + handle = iommu_attach_handle_get(idev->igroup->group, pasid, 0); if (IS_ERR(handle)) return NULL; return to_iommufd_handle(handle); } static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) + struct iommufd_device *idev, + ioasid_t pasid) { struct iommufd_attach_handle *handle; - handle = iommufd_device_get_attach_handle(idev); - iommu_detach_group_handle(hwpt->domain, idev->igroup->group); + handle = iommufd_device_get_attach_handle(idev, pasid); + if (pasid == IOMMU_NO_PASID) + iommu_detach_group_handle(hwpt->domain, idev->igroup->group); + else + iommu_detach_device_pasid(hwpt->domain, idev->dev, pasid); + if (hwpt->fault) { iommufd_auto_response_faults(hwpt, handle); iommufd_fault_iopf_disable(idev); @@ -428,13 +499,19 @@ static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, } static int iommufd_hwpt_replace_device(struct iommufd_device *idev, + ioasid_t pasid, struct iommufd_hw_pagetable *hwpt, struct iommufd_hw_pagetable *old) { - struct iommufd_attach_handle *handle, *old_handle = - iommufd_device_get_attach_handle(idev); + struct iommufd_attach_handle *handle, *old_handle; int rc; + rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid); + if (rc) + return rc; + + old_handle = iommufd_device_get_attach_handle(idev, pasid); + handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) return -ENOMEM; @@ -446,8 +523,12 @@ static int iommufd_hwpt_replace_device(struct iommufd_device *idev, } handle->idev = idev; - rc = iommu_replace_group_handle(idev->igroup->group, hwpt->domain, - &handle->handle); + if (pasid == IOMMU_NO_PASID) + rc = iommu_replace_group_handle(idev->igroup->group, + hwpt->domain, &handle->handle); + else + rc = iommu_replace_device_pasid(hwpt->domain, idev->dev, + pasid, &handle->handle); if (rc) goto out_disable_iopf; @@ -469,22 +550,51 @@ static int iommufd_hwpt_replace_device(struct iommufd_device *idev, } int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) + struct iommufd_device *idev, ioasid_t pasid) { struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); + bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID; + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hw_pagetable *old_hwpt; + struct iommufd_attach *attach; int rc; - mutex_lock(&idev->igroup->lock); + mutex_lock(&igroup->lock); - if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) { - rc = -EINVAL; + attach = xa_cmpxchg(&igroup->pasid_attach, pasid, NULL, + XA_ZERO_ENTRY, GFP_KERNEL); + if (xa_is_err(attach)) { + rc = xa_err(attach); goto err_unlock; } - if (hwpt_paging) { + if (!attach) { + attach = kzalloc(sizeof(*attach), GFP_KERNEL); + if (!attach) { + rc = -ENOMEM; + goto err_release_pasid; + } + xa_init(&attach->device_array); + } + + old_hwpt = attach->hwpt; + + rc = xa_insert(&attach->device_array, idev->obj.id, XA_ZERO_ENTRY, + GFP_KERNEL); + if (rc) { + WARN_ON(rc == -EBUSY && !old_hwpt); + goto err_free_attach; + } + + if (old_hwpt && old_hwpt != hwpt) { + rc = -EINVAL; + goto err_release_devid; + } + + if (attach_resv) { rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging); if (rc) - goto err_unlock; + goto err_release_devid; } /* @@ -494,51 +604,74 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * reserved regions are only updated during individual device * attachment. */ - if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_hwpt_attach_device(hwpt, idev); + if (iommufd_group_first_attach(igroup, pasid)) { + rc = iommufd_hwpt_attach_device(hwpt, idev, pasid); if (rc) goto err_unresv; - idev->igroup->hwpt = hwpt; + attach->hwpt = hwpt; + WARN_ON(xa_is_err(xa_store(&igroup->pasid_attach, pasid, attach, + GFP_KERNEL))); } refcount_inc(&hwpt->obj.users); - list_add_tail(&idev->group_item, &idev->igroup->device_list); - mutex_unlock(&idev->igroup->lock); + WARN_ON(xa_is_err(xa_store(&attach->device_array, idev->obj.id, + idev, GFP_KERNEL))); + mutex_unlock(&igroup->lock); return 0; err_unresv: - if (hwpt_paging) + if (attach_resv) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); +err_release_devid: + xa_release(&attach->device_array, idev->obj.id); +err_free_attach: + if (iommufd_group_first_attach(igroup, pasid)) + kfree(attach); +err_release_pasid: + if (iommufd_group_first_attach(igroup, pasid)) + xa_release(&igroup->pasid_attach, pasid); err_unlock: - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); return rc; } struct iommufd_hw_pagetable * -iommufd_hw_pagetable_detach(struct iommufd_device *idev) +iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) { - struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; - struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_attach *attach; - mutex_lock(&idev->igroup->lock); - list_del(&idev->group_item); - if (list_empty(&idev->igroup->device_list)) { - iommufd_hwpt_detach_device(hwpt, idev); - idev->igroup->hwpt = NULL; + mutex_lock(&igroup->lock); + attach = xa_load(&igroup->pasid_attach, pasid); + if (!attach) { + mutex_unlock(&igroup->lock); + return NULL; } - if (hwpt_paging) + + hwpt = attach->hwpt; + hwpt_paging = find_hwpt_paging(hwpt); + + xa_erase(&attach->device_array, idev->obj.id); + if (xa_empty(&attach->device_array)) { + iommufd_hwpt_detach_device(hwpt, idev, pasid); + xa_erase(&igroup->pasid_attach, pasid); + kfree(attach); + } + if (hwpt_paging && pasid == IOMMU_NO_PASID) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); /* Caller must destroy hwpt */ return hwpt; } static struct iommufd_hw_pagetable * -iommufd_device_do_attach(struct iommufd_device *idev, +iommufd_device_do_attach(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_hw_pagetable *hwpt) { int rc; - rc = iommufd_hw_pagetable_attach(hwpt, idev); + rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid); if (rc) return ERR_PTR(rc); return NULL; @@ -548,11 +681,14 @@ static void iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { + struct iommufd_attach *attach; struct iommufd_device *cur; + unsigned long index; lockdep_assert_held(&igroup->lock); - list_for_each_entry(cur, &igroup->device_list, group_item) + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + xa_for_each(&attach->device_array, index, cur) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); } @@ -561,14 +697,17 @@ iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { struct iommufd_hwpt_paging *old_hwpt_paging; + struct iommufd_attach *attach; struct iommufd_device *cur; + unsigned long index; int rc; lockdep_assert_held(&igroup->lock); - old_hwpt_paging = find_hwpt_paging(igroup->hwpt); + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + old_hwpt_paging = find_hwpt_paging(attach->hwpt); if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) { + xa_for_each(&attach->device_array, index, cur) { rc = iopt_table_enforce_dev_resv_regions( &hwpt_paging->ioas->iopt, cur->dev, NULL); if (rc) @@ -587,74 +726,81 @@ iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup, } static struct iommufd_hw_pagetable * -iommufd_device_do_replace(struct iommufd_device *idev, +iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_hw_pagetable *hwpt) { struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); + bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID; struct iommufd_hwpt_paging *old_hwpt_paging; struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; + struct iommufd_attach *attach; unsigned int num_devices; int rc; - mutex_lock(&idev->igroup->lock); + mutex_lock(&igroup->lock); - if (igroup->hwpt == NULL) { + attach = xa_load(&igroup->pasid_attach, pasid); + if (!attach) { rc = -EINVAL; goto err_unlock; } - if (!iommufd_device_is_attached(idev)) { + old_hwpt = attach->hwpt; + + WARN_ON(!old_hwpt || xa_empty(&attach->device_array)); + + if (!iommufd_device_is_attached(idev, pasid)) { rc = -EINVAL; goto err_unlock; } - if (hwpt == igroup->hwpt) { - mutex_unlock(&idev->igroup->lock); + if (hwpt == old_hwpt) { + mutex_unlock(&igroup->lock); return NULL; } - old_hwpt = igroup->hwpt; - if (hwpt_paging) { + if (attach_resv) { rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging); if (rc) goto err_unlock; } - rc = iommufd_hwpt_replace_device(idev, hwpt, old_hwpt); + rc = iommufd_hwpt_replace_device(idev, pasid, hwpt, old_hwpt); if (rc) goto err_unresv; old_hwpt_paging = find_hwpt_paging(old_hwpt); - if (old_hwpt_paging && + if (old_hwpt_paging && pasid == IOMMU_NO_PASID && (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas)) iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging); - igroup->hwpt = hwpt; + attach->hwpt = hwpt; - num_devices = list_count_nodes(&igroup->device_list); + num_devices = iommufd_group_device_num(igroup, pasid); /* - * Move the refcounts held by the device_list to the new hwpt. Retain a + * Move the refcounts held by the device_array to the new hwpt. Retain a * refcount for this thread as the caller will free it. */ refcount_add(num_devices, &hwpt->obj.users); if (num_devices > 1) WARN_ON(refcount_sub_and_test(num_devices - 1, &old_hwpt->obj.users)); - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); /* Caller must destroy old_hwpt */ return old_hwpt; err_unresv: - if (hwpt_paging) + if (attach_resv) iommufd_group_remove_reserved_iova(igroup, hwpt_paging); err_unlock: - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); return ERR_PTR(rc); } typedef struct iommufd_hw_pagetable *(*attach_fn)( - struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt); + struct iommufd_device *idev, ioasid_t pasid, + struct iommufd_hw_pagetable *hwpt); /* * When automatically managing the domains we search for a compatible domain in @@ -662,7 +808,7 @@ typedef struct iommufd_hw_pagetable *(*attach_fn)( * Automatic domain selection will never pick a manually created domain. */ static struct iommufd_hw_pagetable * -iommufd_device_auto_get_domain(struct iommufd_device *idev, +iommufd_device_auto_get_domain(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_ioas *ioas, u32 *pt_id, attach_fn do_attach) { @@ -691,7 +837,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, hwpt = &hwpt_paging->common; if (!iommufd_lock_obj(&hwpt->obj)) continue; - destroy_hwpt = (*do_attach)(idev, hwpt); + destroy_hwpt = (*do_attach)(idev, pasid, hwpt); if (IS_ERR(destroy_hwpt)) { iommufd_put_object(idev->ictx, &hwpt->obj); /* @@ -709,8 +855,8 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } - hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, - immediate_attach, NULL); + hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, pasid, + 0, immediate_attach, NULL); if (IS_ERR(hwpt_paging)) { destroy_hwpt = ERR_CAST(hwpt_paging); goto out_unlock; @@ -718,7 +864,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, hwpt = &hwpt_paging->common; if (!immediate_attach) { - destroy_hwpt = (*do_attach)(idev, hwpt); + destroy_hwpt = (*do_attach)(idev, pasid, hwpt); if (IS_ERR(destroy_hwpt)) goto out_abort; } else { @@ -739,8 +885,9 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, return destroy_hwpt; } -static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, - attach_fn do_attach) +static int iommufd_device_change_pt(struct iommufd_device *idev, + ioasid_t pasid, + u32 *pt_id, attach_fn do_attach) { struct iommufd_hw_pagetable *destroy_hwpt; struct iommufd_object *pt_obj; @@ -755,7 +902,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); - destroy_hwpt = (*do_attach)(idev, hwpt); + destroy_hwpt = (*do_attach)(idev, pasid, hwpt); if (IS_ERR(destroy_hwpt)) goto out_put_pt_obj; break; @@ -764,8 +911,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, struct iommufd_ioas *ioas = container_of(pt_obj, struct iommufd_ioas, obj); - destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id, - do_attach); + destroy_hwpt = iommufd_device_auto_get_domain(idev, pasid, ioas, + pt_id, do_attach); if (IS_ERR(destroy_hwpt)) goto out_put_pt_obj; break; @@ -787,22 +934,26 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, } /** - * iommufd_device_attach - Connect a device to an iommu_domain + * iommufd_device_attach - Connect a device/pasid to an iommu_domain * @idev: device to attach + * @pasid: pasid to attach * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING * Output the IOMMUFD_OBJ_HWPT_PAGING ID * - * This connects the device to an iommu_domain, either automatically or manually - * selected. Once this completes the device could do DMA. + * This connects the device/pasid to an iommu_domain, either automatically + * or manually selected. Once this completes the device could do DMA with + * @pasid. @pasid is IOMMU_NO_PASID if this attach is for no pasid usage. * * The caller should return the resulting pt_id back to userspace. * This function is undone by calling iommufd_device_detach(). */ -int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) +int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id) { int rc; - rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach); + rc = iommufd_device_change_pt(idev, pasid, pt_id, + &iommufd_device_do_attach); if (rc) return rc; @@ -816,8 +967,9 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD"); /** - * iommufd_device_replace - Change the device's iommu_domain + * iommufd_device_replace - Change the device/pasid's iommu_domain * @idev: device to change + * @pasid: pasid to change * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING * Output the IOMMUFD_OBJ_HWPT_PAGING ID * @@ -828,27 +980,33 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD"); * * If it fails then no change is made to the attachment. The iommu driver may * implement this so there is no disruption in translation. This can only be - * called if iommufd_device_attach() has already succeeded. + * called if iommufd_device_attach() has already succeeded. @pasid is + * IOMMU_NO_PASID for no pasid usage. */ -int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id) +int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id) { - return iommufd_device_change_pt(idev, pt_id, + return iommufd_device_change_pt(idev, pasid, pt_id, &iommufd_device_do_replace); } EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, "IOMMUFD"); /** - * iommufd_device_detach - Disconnect a device to an iommu_domain + * iommufd_device_detach - Disconnect a device/device to an iommu_domain * @idev: device to detach + * @pasid: pasid to detach * * Undo iommufd_device_attach(). This disconnects the idev from the previously * attached pt_id. The device returns back to a blocked DMA translation. + * @pasid is IOMMU_NO_PASID for no pasid usage. */ -void iommufd_device_detach(struct iommufd_device *idev) +void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid) { struct iommufd_hw_pagetable *hwpt; - hwpt = iommufd_hw_pagetable_detach(idev); + hwpt = iommufd_hw_pagetable_detach(idev, pasid); + if (!hwpt) + return; iommufd_hw_pagetable_put(idev->ictx, hwpt); refcount_dec(&idev->obj.users); } @@ -891,7 +1049,7 @@ static int iommufd_access_change_ioas(struct iommufd_access *access, } if (cur_ioas) { - if (access->ops->unmap) { + if (!iommufd_access_is_internal(access) && access->ops->unmap) { mutex_unlock(&access->ioas_lock); access->ops->unmap(access->data, 0, ULONG_MAX); mutex_lock(&access->ioas_lock); @@ -927,7 +1085,39 @@ void iommufd_access_destroy_object(struct iommufd_object *obj) if (access->ioas) WARN_ON(iommufd_access_change_ioas(access, NULL)); mutex_unlock(&access->ioas_lock); - iommufd_ctx_put(access->ictx); + if (!iommufd_access_is_internal(access)) + iommufd_ctx_put(access->ictx); +} + +static struct iommufd_access *__iommufd_access_create(struct iommufd_ctx *ictx) +{ + struct iommufd_access *access; + + /* + * There is no uAPI for the access object, but to keep things symmetric + * use the object infrastructure anyhow. + */ + access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + if (IS_ERR(access)) + return access; + + /* The calling driver is a user until iommufd_access_destroy() */ + refcount_inc(&access->obj.users); + mutex_init(&access->ioas_lock); + return access; +} + +struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx) +{ + struct iommufd_access *access; + + access = __iommufd_access_create(ictx); + if (IS_ERR(access)) + return access; + access->iova_alignment = PAGE_SIZE; + + iommufd_object_finalize(ictx, &access->obj); + return access; } /** @@ -949,11 +1139,7 @@ iommufd_access_create(struct iommufd_ctx *ictx, { struct iommufd_access *access; - /* - * There is no uAPI for the access object, but to keep things symmetric - * use the object infrastructure anyhow. - */ - access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + access = __iommufd_access_create(ictx); if (IS_ERR(access)) return access; @@ -965,13 +1151,10 @@ iommufd_access_create(struct iommufd_ctx *ictx, else access->iova_alignment = 1; - /* The calling driver is a user until iommufd_access_destroy() */ - refcount_inc(&access->obj.users); access->ictx = ictx; iommufd_ctx_get(ictx); iommufd_object_finalize(ictx, &access->obj); *id = access->obj.id; - mutex_init(&access->ioas_lock); return access; } EXPORT_SYMBOL_NS_GPL(iommufd_access_create, "IOMMUFD"); @@ -1016,6 +1199,22 @@ int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) } EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, "IOMMUFD"); +int iommufd_access_attach_internal(struct iommufd_access *access, + struct iommufd_ioas *ioas) +{ + int rc; + + mutex_lock(&access->ioas_lock); + if (WARN_ON(access->ioas)) { + mutex_unlock(&access->ioas_lock); + return -EINVAL; + } + + rc = iommufd_access_change_ioas(access, ioas); + mutex_unlock(&access->ioas_lock); + return rc; +} + int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) { int rc; @@ -1057,7 +1256,8 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, xa_lock(&ioas->iopt.access_list); xa_for_each(&ioas->iopt.access_list, index, access) { - if (!iommufd_lock_obj(&access->obj)) + if (!iommufd_lock_obj(&access->obj) || + iommufd_access_is_internal(access)) continue; xa_unlock(&ioas->iopt.access_list); @@ -1081,6 +1281,7 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, void iommufd_access_unpin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length) { + bool internal = iommufd_access_is_internal(access); struct iopt_area_contig_iter iter; struct io_pagetable *iopt; unsigned long last_iova; @@ -1107,7 +1308,8 @@ void iommufd_access_unpin_pages(struct iommufd_access *access, area, iopt_area_iova_to_index(area, iter.cur_iova), iopt_area_iova_to_index( area, - min(last_iova, iopt_area_last_iova(area)))); + min(last_iova, iopt_area_last_iova(area))), + internal); WARN_ON(!iopt_area_contig_done(&iter)); up_read(&iopt->iova_rwsem); mutex_unlock(&access->ioas_lock); @@ -1156,6 +1358,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length, struct page **out_pages, unsigned int flags) { + bool internal = iommufd_access_is_internal(access); struct iopt_area_contig_iter iter; struct io_pagetable *iopt; unsigned long last_iova; @@ -1164,7 +1367,8 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, /* Driver's ops don't support pin_pages */ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && - WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap)) + WARN_ON(access->iova_alignment != PAGE_SIZE || + (!internal && !access->ops->unmap))) return -EINVAL; if (!length) @@ -1198,7 +1402,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, } rc = iopt_area_add_access(area, index, last_index, out_pages, - flags); + flags, internal); if (rc) goto err_remove; out_pages += last_index - index + 1; @@ -1221,7 +1425,8 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, iopt_area_iova_to_index(area, iter.cur_iova), iopt_area_iova_to_index( area, min(last_iova, - iopt_area_last_iova(area)))); + iopt_area_last_iova(area))), + internal); } up_read(&iopt->iova_rwsem); mutex_unlock(&access->ioas_lock); @@ -1295,6 +1500,7 @@ EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, "IOMMUFD"); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) { + const u32 SUPPORTED_FLAGS = IOMMU_HW_INFO_FLAG_INPUT_TYPE; struct iommu_hw_info *cmd = ucmd->cmd; void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr); const struct iommu_ops *ops; @@ -1304,9 +1510,15 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) void *data; int rc; - if (cmd->flags || cmd->__reserved) + if (cmd->flags & ~SUPPORTED_FLAGS) + return -EOPNOTSUPP; + if (cmd->__reserved[0] || cmd->__reserved[1] || cmd->__reserved[2]) return -EOPNOTSUPP; + /* Clear the type field since drivers don't support a random input */ + if (!(cmd->flags & IOMMU_HW_INFO_FLAG_INPUT_TYPE)) + cmd->in_data_type = IOMMU_HW_INFO_TYPE_DEFAULT; + idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); @@ -1325,7 +1537,7 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) */ if (WARN_ON_ONCE(cmd->out_data_type == IOMMU_HW_INFO_TYPE_NONE)) { - rc = -ENODEV; + rc = -EOPNOTSUPP; goto out_free; } } else { @@ -1361,6 +1573,36 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + cmd->out_max_pasid_log2 = 0; + /* + * Currently, all iommu drivers enable PASID in the probe_device() + * op if iommu and device supports it. So the max_pasids stored in + * dev->iommu indicates both PASID support and enable status. A + * non-zero dev->iommu->max_pasids means PASID is supported and + * enabled. The iommufd only reports PASID capability to userspace + * if it's enabled. + */ + if (idev->dev->iommu->max_pasids) { + cmd->out_max_pasid_log2 = ilog2(idev->dev->iommu->max_pasids); + + if (dev_is_pci(idev->dev)) { + struct pci_dev *pdev = to_pci_dev(idev->dev); + int ctrl; + + ctrl = pci_pasid_status(pdev); + + WARN_ON_ONCE(ctrl < 0 || + !(ctrl & PCI_PASID_CTRL_ENABLE)); + + if (ctrl & PCI_PASID_CTRL_EXEC) + cmd->out_capabilities |= + IOMMU_HW_CAP_PCI_PASID_EXEC; + if (ctrl & PCI_PASID_CTRL_PRIV) + cmd->out_capabilities |= + IOMMU_HW_CAP_PCI_PASID_PRIV; + } + } + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_free: kfree(data); diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 2d98b04ff1cb7..e4eae20bcd4e8 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -3,38 +3,85 @@ */ #include "iommufd_private.h" -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type) +/* Driver should use a per-structure helper in include/linux/iommufd.h */ +int _iommufd_object_depend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended) { - struct iommufd_object *obj; + /* Reject self dependency that dead locks */ + if (obj_dependent == obj_depended) + return -EINVAL; + /* Only support dependency between two objects of the same type */ + if (obj_dependent->type != obj_depended->type) + return -EINVAL; + + refcount_inc(&obj_depended->users); + return 0; +} +EXPORT_SYMBOL_NS_GPL(_iommufd_object_depend, "IOMMUFD"); + +/* Driver should use a per-structure helper in include/linux/iommufd.h */ +void _iommufd_object_undepend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended) +{ + if (WARN_ON_ONCE(obj_dependent == obj_depended || + obj_dependent->type != obj_depended->type)) + return; + + refcount_dec(&obj_depended->users); +} +EXPORT_SYMBOL_NS_GPL(_iommufd_object_undepend, "IOMMUFD"); + +/* + * Allocate an @offset to return to user space to use for an mmap() syscall + * + * Driver should use a per-structure helper in include/linux/iommufd.h + */ +int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, struct iommufd_object *owner, + phys_addr_t mmio_addr, size_t length, + unsigned long *offset) +{ + struct iommufd_mmap *immap; + unsigned long startp; int rc; - obj = kzalloc(size, GFP_KERNEL_ACCOUNT); - if (!obj) - return ERR_PTR(-ENOMEM); - obj->type = type; - /* Starts out bias'd by 1 until it is removed from the xarray */ - refcount_set(&obj->shortterm_users, 1); - refcount_set(&obj->users, 1); + if (!PAGE_ALIGNED(mmio_addr)) + return -EINVAL; + if (!length || !PAGE_ALIGNED(length)) + return -EINVAL; - /* - * Reserve an ID in the xarray but do not publish the pointer yet since - * the caller hasn't initialized it yet. Once the pointer is published - * in the xarray and visible to other threads we can't reliably destroy - * it anymore, so the caller must complete all errorable operations - * before calling iommufd_object_finalize(). - */ - rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b, - GFP_KERNEL_ACCOUNT); - if (rc) - goto out_free; - return obj; -out_free: - kfree(obj); - return ERR_PTR(rc); + immap = kzalloc(sizeof(*immap), GFP_KERNEL); + if (!immap) + return -ENOMEM; + immap->owner = owner; + immap->length = length; + immap->mmio_addr = mmio_addr; + + /* Skip the first page to ease caller identifying the returned offset */ + rc = mtree_alloc_range(&ictx->mt_mmap, &startp, immap, immap->length, + PAGE_SIZE, ULONG_MAX, GFP_KERNEL); + if (rc < 0) { + kfree(immap); + return rc; + } + + /* mmap() syscall will right-shift the offset in vma->vm_pgoff too */ + immap->vm_pgoff = startp >> PAGE_SHIFT; + *offset = startp; + return 0; +} +EXPORT_SYMBOL_NS_GPL(_iommufd_alloc_mmap, "IOMMUFD"); + +/* Driver should use a per-structure helper in include/linux/iommufd.h */ +void _iommufd_destroy_mmap(struct iommufd_ctx *ictx, + struct iommufd_object *owner, unsigned long offset) +{ + struct iommufd_mmap *immap; + + immap = mtree_erase(&ictx->mt_mmap, offset); + WARN_ON_ONCE(!immap || immap->owner != owner); + kfree(immap); } -EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, "IOMMUFD"); +EXPORT_SYMBOL_NS_GPL(_iommufd_destroy_mmap, "IOMMUFD"); /* Caller should xa_lock(&viommu->vdevs) to protect the return value */ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, @@ -49,5 +96,203 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, } EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, "IOMMUFD"); +/* Return -ENOENT if device is not associated to the vIOMMU */ +int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, unsigned long *vdev_id) +{ + struct iommufd_vdevice *vdev; + unsigned long index; + int rc = -ENOENT; + + if (WARN_ON_ONCE(!vdev_id)) + return -EINVAL; + + xa_lock(&viommu->vdevs); + xa_for_each(&viommu->vdevs, index, vdev) { + if (vdev->dev == dev) { + *vdev_id = vdev->virt_id; + rc = 0; + break; + } + } + xa_unlock(&viommu->vdevs); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_get_vdev_id, "IOMMUFD"); + +/* + * Typically called in driver's threaded IRQ handler. + * The @type and @event_data must be defined in include/uapi/linux/iommufd.h + */ +int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, void *event_data, + size_t data_len) +{ + struct iommufd_veventq *veventq; + struct iommufd_vevent *vevent; + int rc = 0; + + if (WARN_ON_ONCE(!data_len || !event_data)) + return -EINVAL; + + down_read(&viommu->veventqs_rwsem); + + veventq = iommufd_viommu_find_veventq(viommu, type); + if (!veventq) { + rc = -EOPNOTSUPP; + goto out_unlock_veventqs; + } + + spin_lock(&veventq->common.lock); + if (veventq->num_events == veventq->depth) { + vevent = &veventq->lost_events_header; + goto out_set_header; + } + + vevent = kzalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC); + if (!vevent) { + rc = -ENOMEM; + vevent = &veventq->lost_events_header; + goto out_set_header; + } + memcpy(vevent->event_data, event_data, data_len); + vevent->data_len = data_len; + veventq->num_events++; + +out_set_header: + iommufd_vevent_handler(veventq, vevent); + spin_unlock(&veventq->common.lock); +out_unlock_veventqs: + up_read(&viommu->veventqs_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_report_event, "IOMMUFD"); + +#ifdef CONFIG_IRQ_MSI_IOMMU +/* + * Get a iommufd_sw_msi_map for the msi physical address requested by the irq + * layer. The mapping to IOVA is global to the iommufd file descriptor, every + * domain that is attached to a device using the same MSI parameters will use + * the same IOVA. + */ +static struct iommufd_sw_msi_map * +iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr, + phys_addr_t sw_msi_start) +{ + struct iommufd_sw_msi_map *cur; + unsigned int max_pgoff = 0; + + lockdep_assert_held(&ictx->sw_msi_lock); + + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + if (cur->sw_msi_start != sw_msi_start) + continue; + max_pgoff = max(max_pgoff, cur->pgoff + 1); + if (cur->msi_addr == msi_addr) + return cur; + } + + if (ictx->sw_msi_id >= + BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap)) + return ERR_PTR(-EOVERFLOW); + + cur = kzalloc(sizeof(*cur), GFP_KERNEL); + if (!cur) + return ERR_PTR(-ENOMEM); + + cur->sw_msi_start = sw_msi_start; + cur->msi_addr = msi_addr; + cur->pgoff = max_pgoff; + cur->id = ictx->sw_msi_id++; + list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list); + return cur; +} + +int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map) +{ + unsigned long iova; + + lockdep_assert_held(&ictx->sw_msi_lock); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) { + int rc; + + rc = iommu_map(hwpt_paging->common.domain, iova, + msi_map->msi_addr, PAGE_SIZE, + IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO, + GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap); + } + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_sw_msi_install, "IOMMUFD_INTERNAL"); + +/* + * Called by the irq code if the platform translates the MSI address through the + * IOMMU. msi_addr is the physical address of the MSI page. iommufd will + * allocate a fd global iova for the physical page that is the same on all + * domains and devices. + */ +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommufd_hwpt_paging *hwpt_paging; + struct iommu_attach_handle *raw_handle; + struct iommufd_attach_handle *handle; + struct iommufd_sw_msi_map *msi_map; + struct iommufd_ctx *ictx; + unsigned long iova; + int rc; + + /* + * It is safe to call iommu_attach_handle_get() here because the iommu + * core code invokes this under the group mutex which also prevents any + * change of the attach handle for the duration of this function. + */ + iommu_group_mutex_assert(dev); + + raw_handle = + iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); + if (IS_ERR(raw_handle)) + return 0; + hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt); + + handle = to_iommufd_handle(raw_handle); + /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */ + if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; + + ictx = handle->idev->ictx; + guard(mutex)(&ictx->sw_msi_lock); + /* + * The input msi_addr is the exact byte offset of the MSI doorbell, we + * assume the caller has checked that it is contained with a MMIO region + * that is secure to map at PAGE_SIZE. + */ + msi_map = iommufd_sw_msi_get_map(handle->idev->ictx, + msi_addr & PAGE_MASK, + handle->idev->igroup->sw_msi_start); + if (IS_ERR(msi_map)) + return PTR_ERR(msi_map); + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map); + if (rc) + return rc; + __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT); + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_sw_msi, "IOMMUFD"); +#endif + MODULE_DESCRIPTION("iommufd code shared with builtin modules"); +MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c new file mode 100644 index 0000000000000..269b667152b78 --- /dev/null +++ b/drivers/iommu/iommufd/eventq.c @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Intel Corporation + */ +#define pr_fmt(fmt) "iommufd: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../iommu-priv.h" +#include "iommufd_private.h" + +/* IOMMUFD_OBJ_FAULT Functions */ + +int iommufd_fault_iopf_enable(struct iommufd_device *idev) +{ + struct device *dev = idev->dev; + int ret; + + /* + * Once we turn on PCI/PRI support for VF, the response failure code + * should not be forwarded to the hardware due to PRI being a shared + * resource between PF and VFs. There is no coordination for this + * shared capability. This waits for a vPRI reset to recover. + */ + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->is_virtfn && pci_pri_supported(pdev)) + return -EINVAL; + } + + mutex_lock(&idev->iopf_lock); + /* Device iopf has already been on. */ + if (++idev->iopf_enabled > 1) { + mutex_unlock(&idev->iopf_lock); + return 0; + } + + ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF); + if (ret) + --idev->iopf_enabled; + mutex_unlock(&idev->iopf_lock); + + return ret; +} + +void iommufd_fault_iopf_disable(struct iommufd_device *idev) +{ + mutex_lock(&idev->iopf_lock); + if (!WARN_ON(idev->iopf_enabled == 0)) { + if (--idev->iopf_enabled == 0) + iommu_dev_disable_feature(idev->dev, IOMMU_DEV_FEAT_IOPF); + } + mutex_unlock(&idev->iopf_lock); +} + +void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle) +{ + struct iommufd_fault *fault = hwpt->fault; + struct iopf_group *group, *next; + struct list_head free_list; + unsigned long index; + + if (!fault) + return; + INIT_LIST_HEAD(&free_list); + + mutex_lock(&fault->mutex); + spin_lock(&fault->common.lock); + list_for_each_entry_safe(group, next, &fault->common.deliver, node) { + if (group->attach_handle != &handle->handle) + continue; + list_move(&group->node, &free_list); + } + spin_unlock(&fault->common.lock); + + list_for_each_entry_safe(group, next, &free_list, node) { + list_del(&group->node); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + + xa_for_each(&fault->response, index, group) { + if (group->attach_handle != &handle->handle) + continue; + xa_erase(&fault->response, index); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + mutex_unlock(&fault->mutex); +} + +void iommufd_fault_destroy(struct iommufd_object *obj) +{ + struct iommufd_eventq *eventq = + container_of(obj, struct iommufd_eventq, obj); + struct iommufd_fault *fault = eventq_to_fault(eventq); + struct iopf_group *group, *next; + unsigned long index; + + /* + * The iommufd object's reference count is zero at this point. + * We can be confident that no other threads are currently + * accessing this pointer. Therefore, acquiring the mutex here + * is unnecessary. + */ + list_for_each_entry_safe(group, next, &fault->common.deliver, node) { + list_del(&group->node); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + xa_for_each(&fault->response, index, group) { + xa_erase(&fault->response, index); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + xa_destroy(&fault->response); + mutex_destroy(&fault->mutex); +} + +static void iommufd_compose_fault_message(struct iommu_fault *fault, + struct iommu_hwpt_pgfault *hwpt_fault, + struct iommufd_device *idev, + u32 cookie) +{ + hwpt_fault->flags = fault->prm.flags; + hwpt_fault->dev_id = idev->obj.id; + hwpt_fault->pasid = fault->prm.pasid; + hwpt_fault->grpid = fault->prm.grpid; + hwpt_fault->perm = fault->prm.perm; + hwpt_fault->addr = fault->prm.addr; + hwpt_fault->length = 0; + hwpt_fault->cookie = cookie; +} + +/* Fetch the first node out of the fault->deliver list */ +static struct iopf_group * +iommufd_fault_deliver_fetch(struct iommufd_fault *fault) +{ + struct list_head *list = &fault->common.deliver; + struct iopf_group *group = NULL; + + spin_lock(&fault->common.lock); + if (!list_empty(list)) { + group = list_first_entry(list, struct iopf_group, node); + list_del(&group->node); + } + spin_unlock(&fault->common.lock); + return group; +} + +/* Restore a node back to the head of the fault->deliver list */ +static void iommufd_fault_deliver_restore(struct iommufd_fault *fault, + struct iopf_group *group) +{ + spin_lock(&fault->common.lock); + list_add(&group->node, &fault->common.deliver); + spin_unlock(&fault->common.lock); +} + +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + size_t fault_size = sizeof(struct iommu_hwpt_pgfault); + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_fault *fault = eventq_to_fault(eventq); + struct iommu_hwpt_pgfault data = {}; + struct iommufd_device *idev; + struct iopf_group *group; + struct iopf_fault *iopf; + size_t done = 0; + int rc = 0; + + if (*ppos || count % fault_size) + return -ESPIPE; + + mutex_lock(&fault->mutex); + while ((group = iommufd_fault_deliver_fetch(fault))) { + if (done >= count || + group->fault_count * fault_size > count - done) { + iommufd_fault_deliver_restore(fault, group); + break; + } + + rc = xa_alloc(&fault->response, &group->cookie, group, + xa_limit_32b, GFP_KERNEL); + if (rc) { + iommufd_fault_deliver_restore(fault, group); + break; + } + + idev = to_iommufd_handle(group->attach_handle)->idev; + list_for_each_entry(iopf, &group->faults, list) { + iommufd_compose_fault_message(&iopf->fault, + &data, idev, + group->cookie); + if (copy_to_user(buf + done, &data, fault_size)) { + xa_erase(&fault->response, group->cookie); + iommufd_fault_deliver_restore(fault, group); + rc = -EFAULT; + break; + } + done += fault_size; + } + } + mutex_unlock(&fault->mutex); + + return done == 0 ? rc : done; +} + +static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf, + size_t count, loff_t *ppos) +{ + size_t response_size = sizeof(struct iommu_hwpt_page_response); + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_fault *fault = eventq_to_fault(eventq); + struct iommu_hwpt_page_response response; + struct iopf_group *group; + size_t done = 0; + int rc = 0; + + if (*ppos || count % response_size) + return -ESPIPE; + + mutex_lock(&fault->mutex); + while (count > done) { + rc = copy_from_user(&response, buf + done, response_size); + if (rc) + break; + + static_assert((int)IOMMUFD_PAGE_RESP_SUCCESS == + (int)IOMMU_PAGE_RESP_SUCCESS); + static_assert((int)IOMMUFD_PAGE_RESP_INVALID == + (int)IOMMU_PAGE_RESP_INVALID); + if (response.code != IOMMUFD_PAGE_RESP_SUCCESS && + response.code != IOMMUFD_PAGE_RESP_INVALID) { + rc = -EINVAL; + break; + } + + group = xa_erase(&fault->response, response.cookie); + if (!group) { + rc = -EINVAL; + break; + } + + iopf_group_response(group, response.code); + iopf_free_group(group); + done += response_size; + } + mutex_unlock(&fault->mutex); + + return done == 0 ? rc : done; +} + +/* IOMMUFD_OBJ_VEVENTQ Functions */ + +void iommufd_veventq_abort(struct iommufd_object *obj) +{ + struct iommufd_eventq *eventq = + container_of(obj, struct iommufd_eventq, obj); + struct iommufd_veventq *veventq = eventq_to_veventq(eventq); + struct iommufd_viommu *viommu = veventq->viommu; + struct iommufd_vevent *cur, *next; + + lockdep_assert_held_write(&viommu->veventqs_rwsem); + + list_for_each_entry_safe(cur, next, &eventq->deliver, node) { + list_del(&cur->node); + if (cur != &veventq->lost_events_header) + kfree(cur); + } + + refcount_dec(&viommu->obj.users); + list_del(&veventq->node); +} + +void iommufd_veventq_destroy(struct iommufd_object *obj) +{ + struct iommufd_veventq *veventq = eventq_to_veventq( + container_of(obj, struct iommufd_eventq, obj)); + + down_write(&veventq->viommu->veventqs_rwsem); + iommufd_veventq_abort(obj); + up_write(&veventq->viommu->veventqs_rwsem); +} + +static struct iommufd_vevent * +iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq) +{ + struct iommufd_eventq *eventq = &veventq->common; + struct list_head *list = &eventq->deliver; + struct iommufd_vevent *vevent = NULL; + + spin_lock(&eventq->lock); + if (!list_empty(list)) { + struct iommufd_vevent *next; + + next = list_first_entry(list, struct iommufd_vevent, node); + /* Make a copy of the lost_events_header for copy_to_user */ + if (next == &veventq->lost_events_header) { + vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC); + if (!vevent) + goto out_unlock; + } + list_del(&next->node); + if (vevent) + memcpy(vevent, next, sizeof(*vevent)); + else + vevent = next; + } +out_unlock: + spin_unlock(&eventq->lock); + return vevent; +} + +static void iommufd_veventq_deliver_restore(struct iommufd_veventq *veventq, + struct iommufd_vevent *vevent) +{ + struct iommufd_eventq *eventq = &veventq->common; + struct list_head *list = &eventq->deliver; + + spin_lock(&eventq->lock); + if (vevent_for_lost_events_header(vevent)) { + /* Remove the copy of the lost_events_header */ + kfree(vevent); + vevent = NULL; + /* An empty list needs the lost_events_header back */ + if (list_empty(list)) + vevent = &veventq->lost_events_header; + } + if (vevent) + list_add(&vevent->node, list); + spin_unlock(&eventq->lock); +} + +static ssize_t iommufd_veventq_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_veventq *veventq = eventq_to_veventq(eventq); + struct iommufd_vevent_header *hdr; + struct iommufd_vevent *cur; + size_t done = 0; + int rc = 0; + + if (*ppos) + return -ESPIPE; + + while ((cur = iommufd_veventq_deliver_fetch(veventq))) { + /* Validate the remaining bytes against the header size */ + if (done >= count || sizeof(*hdr) > count - done) { + iommufd_veventq_deliver_restore(veventq, cur); + break; + } + hdr = &cur->header; + + /* If being a normal vEVENT, validate against the full size */ + if (!vevent_for_lost_events_header(cur) && + sizeof(hdr) + cur->data_len > count - done) { + iommufd_veventq_deliver_restore(veventq, cur); + break; + } + + if (copy_to_user(buf + done, hdr, sizeof(*hdr))) { + iommufd_veventq_deliver_restore(veventq, cur); + rc = -EFAULT; + break; + } + done += sizeof(*hdr); + + if (cur->data_len && + copy_to_user(buf + done, cur->event_data, cur->data_len)) { + iommufd_veventq_deliver_restore(veventq, cur); + rc = -EFAULT; + break; + } + spin_lock(&eventq->lock); + if (!vevent_for_lost_events_header(cur)) + veventq->num_events--; + spin_unlock(&eventq->lock); + done += cur->data_len; + kfree(cur); + } + + return done == 0 ? rc : done; +} + +/* Common Event Queue Functions */ + +static __poll_t iommufd_eventq_fops_poll(struct file *filep, + struct poll_table_struct *wait) +{ + struct iommufd_eventq *eventq = filep->private_data; + __poll_t pollflags = 0; + + if (eventq->obj.type == IOMMUFD_OBJ_FAULT) + pollflags |= EPOLLOUT; + + poll_wait(filep, &eventq->wait_queue, wait); + spin_lock(&eventq->lock); + if (!list_empty(&eventq->deliver)) + pollflags |= EPOLLIN | EPOLLRDNORM; + spin_unlock(&eventq->lock); + + return pollflags; +} + +static int iommufd_eventq_fops_release(struct inode *inode, struct file *filep) +{ + struct iommufd_eventq *eventq = filep->private_data; + + refcount_dec(&eventq->obj.users); + iommufd_ctx_put(eventq->ictx); + return 0; +} + +#define INIT_EVENTQ_FOPS(read_op, write_op) \ + ((const struct file_operations){ \ + .owner = THIS_MODULE, \ + .open = nonseekable_open, \ + .read = read_op, \ + .write = write_op, \ + .poll = iommufd_eventq_fops_poll, \ + .release = iommufd_eventq_fops_release, \ + }) + +static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name, + struct iommufd_ctx *ictx, + const struct file_operations *fops) +{ + struct file *filep; + int fdno; + + spin_lock_init(&eventq->lock); + INIT_LIST_HEAD(&eventq->deliver); + init_waitqueue_head(&eventq->wait_queue); + + filep = anon_inode_getfile(name, fops, eventq, O_RDWR); + if (IS_ERR(filep)) + return PTR_ERR(filep); + + eventq->ictx = ictx; + iommufd_ctx_get(eventq->ictx); + eventq->filep = filep; + refcount_inc(&eventq->obj.users); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) + fput(filep); + return fdno; +} + +static const struct file_operations iommufd_fault_fops = + INIT_EVENTQ_FOPS(iommufd_fault_fops_read, iommufd_fault_fops_write); + +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_fault_alloc *cmd = ucmd->cmd; + struct iommufd_fault *fault; + int fdno; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + fault = __iommufd_object_alloc_ucmd(ucmd, fault, IOMMUFD_OBJ_FAULT, + common.obj); + if (IS_ERR(fault)) + return PTR_ERR(fault); + + xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); + mutex_init(&fault->mutex); + + fdno = iommufd_eventq_init(&fault->common, "[iommufd-pgfault]", + ucmd->ictx, &iommufd_fault_fops); + if (fdno < 0) + return fdno; + + cmd->out_fault_id = fault->common.obj.id; + cmd->out_fault_fd = fdno; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put_fdno; + + fd_install(fdno, fault->common.filep); + + return 0; +out_put_fdno: + put_unused_fd(fdno); + fput(fault->common.filep); + return rc; +} + +int iommufd_fault_iopf_handler(struct iopf_group *group) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_fault *fault; + + hwpt = group->attach_handle->domain->iommufd_hwpt; + fault = hwpt->fault; + + spin_lock(&fault->common.lock); + list_add_tail(&group->node, &fault->common.deliver); + spin_unlock(&fault->common.lock); + + wake_up_interruptible(&fault->common.wait_queue); + + return 0; +} + +static const struct file_operations iommufd_veventq_fops = + INIT_EVENTQ_FOPS(iommufd_veventq_fops_read, NULL); + +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_veventq_alloc *cmd = ucmd->cmd; + struct iommufd_veventq *veventq; + struct iommufd_viommu *viommu; + int fdno; + int rc; + + if (cmd->flags || cmd->__reserved || + cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT) + return -EOPNOTSUPP; + if (!cmd->veventq_depth) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + down_write(&viommu->veventqs_rwsem); + + if (iommufd_viommu_find_veventq(viommu, cmd->type)) { + rc = -EEXIST; + goto out_unlock_veventqs; + } + + veventq = __iommufd_object_alloc(ucmd->ictx, veventq, + IOMMUFD_OBJ_VEVENTQ, common.obj); + if (IS_ERR(veventq)) { + rc = PTR_ERR(veventq); + goto out_unlock_veventqs; + } + + veventq->type = cmd->type; + veventq->viommu = viommu; + refcount_inc(&viommu->obj.users); + veventq->depth = cmd->veventq_depth; + list_add_tail(&veventq->node, &viommu->veventqs); + veventq->lost_events_header.header.flags = + IOMMU_VEVENTQ_FLAG_LOST_EVENTS; + + fdno = iommufd_eventq_init(&veventq->common, "[iommufd-viommu-event]", + ucmd->ictx, &iommufd_veventq_fops); + if (fdno < 0) { + rc = fdno; + goto out_abort; + } + + cmd->out_veventq_id = veventq->common.obj.id; + cmd->out_veventq_fd = fdno; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put_fdno; + + iommufd_object_finalize(ucmd->ictx, &veventq->common.obj); + fd_install(fdno, veventq->common.filep); + goto out_unlock_veventqs; + +out_put_fdno: + put_unused_fd(fdno); + fput(veventq->common.filep); +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj); +out_unlock_veventqs: + up_write(&viommu->veventqs_rwsem); + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c deleted file mode 100644 index cb844e6799d4f..0000000000000 --- a/drivers/iommu/iommufd/fault.c +++ /dev/null @@ -1,462 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Copyright (C) 2024 Intel Corporation - */ -#define pr_fmt(fmt) "iommufd: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../iommu-priv.h" -#include "iommufd_private.h" - -int iommufd_fault_iopf_enable(struct iommufd_device *idev) -{ - struct device *dev = idev->dev; - int ret; - - /* - * Once we turn on PCI/PRI support for VF, the response failure code - * should not be forwarded to the hardware due to PRI being a shared - * resource between PF and VFs. There is no coordination for this - * shared capability. This waits for a vPRI reset to recover. - */ - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - - if (pdev->is_virtfn && pci_pri_supported(pdev)) - return -EINVAL; - } - - mutex_lock(&idev->iopf_lock); - /* Device iopf has already been on. */ - if (++idev->iopf_enabled > 1) { - mutex_unlock(&idev->iopf_lock); - return 0; - } - - ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF); - if (ret) - --idev->iopf_enabled; - mutex_unlock(&idev->iopf_lock); - - return ret; -} - -void iommufd_fault_iopf_disable(struct iommufd_device *idev) -{ - mutex_lock(&idev->iopf_lock); - if (!WARN_ON(idev->iopf_enabled == 0)) { - if (--idev->iopf_enabled == 0) - iommu_dev_disable_feature(idev->dev, IOMMU_DEV_FEAT_IOPF); - } - mutex_unlock(&idev->iopf_lock); -} - -static int __fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - struct iommufd_attach_handle *handle; - int ret; - - handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->idev = idev; - ret = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, - &handle->handle); - if (ret) - kfree(handle); - - return ret; -} - -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - int ret; - - if (!hwpt->fault) - return -EINVAL; - - ret = iommufd_fault_iopf_enable(idev); - if (ret) - return ret; - - ret = __fault_domain_attach_dev(hwpt, idev); - if (ret) - iommufd_fault_iopf_disable(idev); - - return ret; -} - -void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, - struct iommufd_attach_handle *handle) -{ - struct iommufd_fault *fault = hwpt->fault; - struct iopf_group *group, *next; - struct list_head free_list; - unsigned long index; - - if (!fault) - return; - INIT_LIST_HEAD(&free_list); - - mutex_lock(&fault->mutex); - spin_lock(&fault->lock); - list_for_each_entry_safe(group, next, &fault->deliver, node) { - if (group->attach_handle != &handle->handle) - continue; - list_move(&group->node, &free_list); - } - spin_unlock(&fault->lock); - - list_for_each_entry_safe(group, next, &free_list, node) { - list_del(&group->node); - iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); - iopf_free_group(group); - } - - xa_for_each(&fault->response, index, group) { - if (group->attach_handle != &handle->handle) - continue; - xa_erase(&fault->response, index); - iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); - iopf_free_group(group); - } - mutex_unlock(&fault->mutex); -} - -static struct iommufd_attach_handle * -iommufd_device_get_attach_handle(struct iommufd_device *idev) -{ - struct iommu_attach_handle *handle; - - handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); - if (IS_ERR(handle)) - return NULL; - - return to_iommufd_handle(handle); -} - -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - struct iommufd_attach_handle *handle; - - handle = iommufd_device_get_attach_handle(idev); - iommu_detach_group_handle(hwpt->domain, idev->igroup->group); - iommufd_auto_response_faults(hwpt, handle); - iommufd_fault_iopf_disable(idev); - kfree(handle); -} - -static int __fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) -{ - struct iommufd_attach_handle *handle, *curr = NULL; - int ret; - - if (old->fault) - curr = iommufd_device_get_attach_handle(idev); - - if (hwpt->fault) { - handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->idev = idev; - ret = iommu_replace_group_handle(idev->igroup->group, - hwpt->domain, &handle->handle); - } else { - ret = iommu_replace_group_handle(idev->igroup->group, - hwpt->domain, NULL); - } - - if (!ret && curr) { - iommufd_auto_response_faults(old, curr); - kfree(curr); - } - - return ret; -} - -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) -{ - bool iopf_off = !hwpt->fault && old->fault; - bool iopf_on = hwpt->fault && !old->fault; - int ret; - - if (iopf_on) { - ret = iommufd_fault_iopf_enable(idev); - if (ret) - return ret; - } - - ret = __fault_domain_replace_dev(idev, hwpt, old); - if (ret) { - if (iopf_on) - iommufd_fault_iopf_disable(idev); - return ret; - } - - if (iopf_off) - iommufd_fault_iopf_disable(idev); - - return 0; -} - -void iommufd_fault_destroy(struct iommufd_object *obj) -{ - struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); - struct iopf_group *group, *next; - unsigned long index; - - /* - * The iommufd object's reference count is zero at this point. - * We can be confident that no other threads are currently - * accessing this pointer. Therefore, acquiring the mutex here - * is unnecessary. - */ - list_for_each_entry_safe(group, next, &fault->deliver, node) { - list_del(&group->node); - iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); - iopf_free_group(group); - } - xa_for_each(&fault->response, index, group) { - xa_erase(&fault->response, index); - iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); - iopf_free_group(group); - } - xa_destroy(&fault->response); - mutex_destroy(&fault->mutex); -} - -static void iommufd_compose_fault_message(struct iommu_fault *fault, - struct iommu_hwpt_pgfault *hwpt_fault, - struct iommufd_device *idev, - u32 cookie) -{ - hwpt_fault->flags = fault->prm.flags; - hwpt_fault->dev_id = idev->obj.id; - hwpt_fault->pasid = fault->prm.pasid; - hwpt_fault->grpid = fault->prm.grpid; - hwpt_fault->perm = fault->prm.perm; - hwpt_fault->addr = fault->prm.addr; - hwpt_fault->length = 0; - hwpt_fault->cookie = cookie; -} - -static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, - size_t count, loff_t *ppos) -{ - size_t fault_size = sizeof(struct iommu_hwpt_pgfault); - struct iommufd_fault *fault = filep->private_data; - struct iommu_hwpt_pgfault data = {}; - struct iommufd_device *idev; - struct iopf_group *group; - struct iopf_fault *iopf; - size_t done = 0; - int rc = 0; - - if (*ppos || count % fault_size) - return -ESPIPE; - - mutex_lock(&fault->mutex); - while ((group = iommufd_fault_deliver_fetch(fault))) { - if (done >= count || - group->fault_count * fault_size > count - done) { - iommufd_fault_deliver_restore(fault, group); - break; - } - - rc = xa_alloc(&fault->response, &group->cookie, group, - xa_limit_32b, GFP_KERNEL); - if (rc) { - iommufd_fault_deliver_restore(fault, group); - break; - } - - idev = to_iommufd_handle(group->attach_handle)->idev; - list_for_each_entry(iopf, &group->faults, list) { - iommufd_compose_fault_message(&iopf->fault, - &data, idev, - group->cookie); - if (copy_to_user(buf + done, &data, fault_size)) { - xa_erase(&fault->response, group->cookie); - iommufd_fault_deliver_restore(fault, group); - rc = -EFAULT; - break; - } - done += fault_size; - } - } - mutex_unlock(&fault->mutex); - - return done == 0 ? rc : done; -} - -static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf, - size_t count, loff_t *ppos) -{ - size_t response_size = sizeof(struct iommu_hwpt_page_response); - struct iommufd_fault *fault = filep->private_data; - struct iommu_hwpt_page_response response; - struct iopf_group *group; - size_t done = 0; - int rc = 0; - - if (*ppos || count % response_size) - return -ESPIPE; - - mutex_lock(&fault->mutex); - while (count > done) { - rc = copy_from_user(&response, buf + done, response_size); - if (rc) - break; - - static_assert((int)IOMMUFD_PAGE_RESP_SUCCESS == - (int)IOMMU_PAGE_RESP_SUCCESS); - static_assert((int)IOMMUFD_PAGE_RESP_INVALID == - (int)IOMMU_PAGE_RESP_INVALID); - if (response.code != IOMMUFD_PAGE_RESP_SUCCESS && - response.code != IOMMUFD_PAGE_RESP_INVALID) { - rc = -EINVAL; - break; - } - - group = xa_erase(&fault->response, response.cookie); - if (!group) { - rc = -EINVAL; - break; - } - - iopf_group_response(group, response.code); - iopf_free_group(group); - done += response_size; - } - mutex_unlock(&fault->mutex); - - return done == 0 ? rc : done; -} - -static __poll_t iommufd_fault_fops_poll(struct file *filep, - struct poll_table_struct *wait) -{ - struct iommufd_fault *fault = filep->private_data; - __poll_t pollflags = EPOLLOUT; - - poll_wait(filep, &fault->wait_queue, wait); - spin_lock(&fault->lock); - if (!list_empty(&fault->deliver)) - pollflags |= EPOLLIN | EPOLLRDNORM; - spin_unlock(&fault->lock); - - return pollflags; -} - -static int iommufd_fault_fops_release(struct inode *inode, struct file *filep) -{ - struct iommufd_fault *fault = filep->private_data; - - refcount_dec(&fault->obj.users); - iommufd_ctx_put(fault->ictx); - return 0; -} - -static const struct file_operations iommufd_fault_fops = { - .owner = THIS_MODULE, - .open = nonseekable_open, - .read = iommufd_fault_fops_read, - .write = iommufd_fault_fops_write, - .poll = iommufd_fault_fops_poll, - .release = iommufd_fault_fops_release, -}; - -int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) -{ - struct iommu_fault_alloc *cmd = ucmd->cmd; - struct iommufd_fault *fault; - struct file *filep; - int fdno; - int rc; - - if (cmd->flags) - return -EOPNOTSUPP; - - fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT); - if (IS_ERR(fault)) - return PTR_ERR(fault); - - fault->ictx = ucmd->ictx; - INIT_LIST_HEAD(&fault->deliver); - xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); - mutex_init(&fault->mutex); - spin_lock_init(&fault->lock); - init_waitqueue_head(&fault->wait_queue); - - filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops, - fault, O_RDWR); - if (IS_ERR(filep)) { - rc = PTR_ERR(filep); - goto out_abort; - } - - refcount_inc(&fault->obj.users); - iommufd_ctx_get(fault->ictx); - fault->filep = filep; - - fdno = get_unused_fd_flags(O_CLOEXEC); - if (fdno < 0) { - rc = fdno; - goto out_fput; - } - - cmd->out_fault_id = fault->obj.id; - cmd->out_fault_fd = fdno; - - rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); - if (rc) - goto out_put_fdno; - iommufd_object_finalize(ucmd->ictx, &fault->obj); - - fd_install(fdno, fault->filep); - - return 0; -out_put_fdno: - put_unused_fd(fdno); -out_fput: - fput(filep); -out_abort: - iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj); - - return rc; -} - -int iommufd_fault_iopf_handler(struct iopf_group *group) -{ - struct iommufd_hw_pagetable *hwpt; - struct iommufd_fault *fault; - - hwpt = group->attach_handle->domain->fault_data; - fault = hwpt->fault; - - spin_lock(&fault->lock); - list_add_tail(&group->node, &fault->deliver); - spin_unlock(&fault->lock); - - wake_up_interruptible(&fault->wait_queue); - - return 0; -} diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 598be26a14e28..fe789c2dc0c97 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -14,7 +14,7 @@ static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt) iommu_domain_free(hwpt->domain); if (hwpt->fault) - refcount_dec(&hwpt->fault->obj.users); + refcount_dec(&hwpt->fault->common.obj.users); } void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) @@ -90,6 +90,7 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) * @ictx: iommufd context * @ioas: IOAS to associate the domain with * @idev: Device to get an iommu_domain for + * @pasid: PASID to get an iommu_domain for * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt * @user_data: The user provided driver specific data describing the domain to @@ -105,13 +106,14 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) */ struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach, + struct iommufd_device *idev, ioasid_t pasid, + u32 flags, bool immediate_attach, const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_FAULT_ID_VALID; + IOMMU_HWPT_FAULT_ID_VALID | + IOMMU_HWPT_ALLOC_PASID; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; @@ -126,12 +128,16 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && !device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) return ERR_PTR(-EOPNOTSUPP); + if ((flags & IOMMU_HWPT_FAULT_ID_VALID) && + (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)) + return ERR_PTR(-EOPNOTSUPP); hwpt_paging = __iommufd_object_alloc( ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); if (IS_ERR(hwpt_paging)) return ERR_CAST(hwpt_paging); hwpt = &hwpt_paging->common; + hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; INIT_LIST_HEAD(&hwpt_paging->hwpt_item); /* Pairs with iommufd_hw_pagetable_destroy() */ @@ -156,6 +162,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } } + hwpt->domain->iommufd_hwpt = hwpt; + hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; /* * Set the coherency mode before we do iopt_table_add_domain() as some @@ -184,7 +192,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, * sequence. Once those drivers are fixed this should be removed. */ if (immediate_attach) { - rc = iommufd_hw_pagetable_attach(hwpt, idev); + rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid); if (rc) goto out_abort; } @@ -197,7 +205,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, out_detach: if (immediate_attach) - iommufd_hw_pagetable_detach(idev); + iommufd_hw_pagetable_detach(idev, pasid); out_abort: iommufd_object_abort_and_destroy(ictx, &hwpt->obj); return ERR_PTR(rc); @@ -226,7 +234,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt; int rc; - if ((flags & ~IOMMU_HWPT_FAULT_ID_VALID) || + if ((flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID)) || !user_data->len || !ops->domain_alloc_nested) return ERR_PTR(-EOPNOTSUPP); if (parent->auto_domain || !parent->nest_parent || @@ -238,6 +246,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, if (IS_ERR(hwpt_nested)) return ERR_CAST(hwpt_nested); hwpt = &hwpt_nested->common; + hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; refcount_inc(&parent->common.obj.users); hwpt_nested->parent = parent; @@ -251,9 +260,11 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, goto out_abort; } hwpt->domain->owner = ops; + hwpt->domain->iommufd_hwpt = hwpt; + hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { - rc = -EINVAL; + rc = -EOPNOTSUPP; goto out_abort; } return hwpt_nested; @@ -280,7 +291,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, struct iommufd_hw_pagetable *hwpt; int rc; - if (flags & ~IOMMU_HWPT_FAULT_ID_VALID) + if (flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID)) return ERR_PTR(-EOPNOTSUPP); if (!user_data->len) return ERR_PTR(-EOPNOTSUPP); @@ -292,24 +303,25 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, if (IS_ERR(hwpt_nested)) return ERR_CAST(hwpt_nested); hwpt = &hwpt_nested->common; + hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; hwpt_nested->viommu = viommu; refcount_inc(&viommu->obj.users); hwpt_nested->parent = viommu->hwpt; - hwpt->domain = - viommu->ops->alloc_domain_nested(viommu, - flags & ~IOMMU_HWPT_FAULT_ID_VALID, - user_data); + hwpt->domain = viommu->ops->alloc_domain_nested( + viommu, flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; goto out_abort; } + hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->owner = viommu->iommu_dev->ops; + hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { - rc = -EINVAL; + rc = -EOPNOTSUPP; goto out_abort; } return hwpt_nested; @@ -355,8 +367,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) ioas = container_of(pt_obj, struct iommufd_ioas, obj); mutex_lock(&ioas->mutex); hwpt_paging = iommufd_hwpt_paging_alloc( - ucmd->ictx, ioas, idev, cmd->flags, false, - user_data.len ? &user_data : NULL); + ucmd->ictx, ioas, idev, IOMMU_NO_PASID, cmd->flags, + false, user_data.len ? &user_data : NULL); if (IS_ERR(hwpt_paging)) { rc = PTR_ERR(hwpt_paging); goto out_unlock; @@ -406,9 +418,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } hwpt->fault = fault; hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; - hwpt->domain->fault_data = hwpt; - refcount_inc(&fault->obj.users); - iommufd_put_object(ucmd->ictx, &fault->obj); + refcount_inc(&fault->common.obj.users); + iommufd_put_object(ucmd->ictx, &fault->common.obj); } cmd->out_hwpt_id = hwpt->obj.id; diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 8a790e597e125..abf4aadca96c0 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -719,6 +719,12 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, goto out_unlock_iova; } + /* The area is locked by an object that has not been destroyed */ + if (area->num_locks) { + rc = -EBUSY; + goto out_unlock_iova; + } + if (area_first < start || area_last > last) { rc = -ENOENT; goto out_unlock_iova; @@ -743,8 +749,10 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, iommufd_access_notify_unmap(iopt, area_first, length); /* Something is not responding to unmap requests. */ tries++; - if (WARN_ON(tries > 100)) - return -EDEADLOCK; + if (WARN_ON(tries > 100)) { + rc = -EDEADLOCK; + goto out_unmapped; + } goto again; } @@ -766,6 +774,7 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, out_unlock_iova: up_write(&iopt->iova_rwsem); up_read(&iopt->domains_rwsem); +out_unmapped: if (unmapped) *unmapped = unmapped_bytes; return rc; @@ -1410,8 +1419,7 @@ int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) } void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access, - u32 iopt_access_list_id) + struct iommufd_access *access, u32 iopt_access_list_id) { down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 10c928a9a4633..b6064f4ce4af9 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -48,6 +48,7 @@ struct iopt_area { int iommu_prot; bool prevent_access : 1; unsigned int num_accesses; + unsigned int num_locks; }; struct iopt_allowed { @@ -238,9 +239,9 @@ void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start, int iopt_area_add_access(struct iopt_area *area, unsigned long start, unsigned long last, struct page **out_pages, - unsigned int flags); + unsigned int flags, bool lock_area); void iopt_area_remove_access(struct iopt_area *area, unsigned long start, - unsigned long last); + unsigned long last, bool unlock_area); int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, void *data, unsigned long length, unsigned int flags); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 02fe1ada97cc7..ee026fe6f382a 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -19,12 +20,36 @@ struct iommu_group; struct iommu_option; struct iommufd_device; +struct iommufd_sw_msi_map { + struct list_head sw_msi_item; + phys_addr_t sw_msi_start; + phys_addr_t msi_addr; + unsigned int pgoff; + unsigned int id; +}; + +/* Bitmap of struct iommufd_sw_msi_map::id */ +struct iommufd_sw_msi_maps { + DECLARE_BITMAP(bitmap, 64); +}; + +#ifdef CONFIG_IRQ_MSI_IOMMU +int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map); +#endif + struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; wait_queue_head_t destroy_wait; struct rw_semaphore ioas_creation_lock; + struct maple_tree mt_mmap; + + struct mutex sw_msi_lock; + struct list_head sw_msi_list; + unsigned int sw_msi_id; u8 account_mode; /* Compatibility with VFIO no iommu */ @@ -32,6 +57,18 @@ struct iommufd_ctx { struct iommufd_ioas *vfio_ioas; }; +/* Entry for iommufd_ctx::mt_mmap */ +struct iommufd_mmap { + struct iommufd_object *owner; + + /* Page-shifted start position in mt_mmap to validate vma->vm_pgoff */ + unsigned long vm_pgoff; + + /* Physical range for io_remap_pfn_range() */ + phys_addr_t mmio_addr; + size_t length; +}; + /* * The IOVA to PFN map. The map automatically copies the PFNs into multiple * domains and permits sharing of PFNs between io_pagetable instances. This @@ -112,6 +149,7 @@ struct iommufd_ucmd { void __user *ubuffer; u32 user_size; void *cmd; + struct iommufd_object *new_obj; }; int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, @@ -207,6 +245,15 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, iommufd_object_remove(ictx, obj, obj->id, 0); } +/* + * Callers of these normal object allocators must call iommufd_object_finalize() + * to finalize the object, or call iommufd_object_abort_and_destroy() to revert + * the allocation. + */ +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type); + #define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ @@ -219,6 +266,26 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, #define iommufd_object_alloc(ictx, ptr, type) \ __iommufd_object_alloc(ictx, ptr, type, obj) +/* + * Callers of these _ucmd allocators should not call iommufd_object_finalize() + * or iommufd_object_abort_and_destroy(), as the core automatically does that. + */ +struct iommufd_object * +_iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd, size_t size, + enum iommufd_object_type type); + +#define __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj) \ + container_of(_iommufd_object_alloc_ucmd( \ + ucmd, \ + sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ + offsetof(typeof(*(ptr)), \ + obj) != 0), \ + type), \ + typeof(*(ptr)), obj) + +#define iommufd_object_alloc_ucmd(ucmd, ptr, type) \ + __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj) + /* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The @@ -243,8 +310,7 @@ struct iommufd_ioas { static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx, u32 id) { - return container_of(iommufd_get_object(ictx, id, - IOMMUFD_OBJ_IOAS), + return container_of(iommufd_get_object(ictx, id, IOMMUFD_OBJ_IOAS), struct iommufd_ioas, obj); } @@ -276,6 +342,7 @@ struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; struct iommufd_fault *fault; + bool pasid_compat : 1; }; struct iommufd_hwpt_paging { @@ -283,10 +350,10 @@ struct iommufd_hwpt_paging { struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; - bool msi_cookie : 1; bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; + struct iommufd_sw_msi_maps present_sw_msi; }; struct iommufd_hwpt_nested { @@ -346,13 +413,13 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach, + struct iommufd_device *idev, ioasid_t pasid, + u32 flags, bool immediate_attach, const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); + struct iommufd_device *idev, ioasid_t pasid); struct iommufd_hw_pagetable * -iommufd_hw_pagetable_detach(struct iommufd_device *idev); +iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid); void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); void iommufd_hwpt_paging_abort(struct iommufd_object *obj); void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); @@ -376,13 +443,15 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, refcount_dec(&hwpt->obj.users); } +struct iommufd_attach; + struct iommufd_group { struct kref ref; struct mutex lock; struct iommufd_ctx *ictx; struct iommu_group *group; - struct iommufd_hw_pagetable *hwpt; - struct list_head device_list; + struct xarray pasid_attach; + struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; @@ -429,53 +498,43 @@ struct iommufd_access { int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access, - u32 iopt_access_list_id); + struct iommufd_access *access, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); -/* - * An iommufd_fault object represents an interface to deliver I/O page faults - * to the user space. These objects are created/destroyed by the user space and - * associated with hardware page table objects during page-table allocation. - */ -struct iommufd_fault { +/* iommufd_access for internal use */ +static inline bool iommufd_access_is_internal(struct iommufd_access *access) +{ + return !access->ictx; +} + +struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx); + +static inline void +iommufd_access_destroy_internal(struct iommufd_ctx *ictx, + struct iommufd_access *access) +{ + iommufd_object_destroy_user(ictx, &access->obj); +} + +int iommufd_access_attach_internal(struct iommufd_access *access, + struct iommufd_ioas *ioas); + +static inline void iommufd_access_detach_internal(struct iommufd_access *access) +{ + iommufd_access_detach(access); +} + +struct iommufd_eventq { struct iommufd_object obj; struct iommufd_ctx *ictx; struct file *filep; spinlock_t lock; /* protects the deliver list */ struct list_head deliver; - struct mutex mutex; /* serializes response flows */ - struct xarray response; struct wait_queue_head wait_queue; }; -/* Fetch the first node out of the fault->deliver list */ -static inline struct iopf_group * -iommufd_fault_deliver_fetch(struct iommufd_fault *fault) -{ - struct list_head *list = &fault->deliver; - struct iopf_group *group = NULL; - - spin_lock(&fault->lock); - if (!list_empty(list)) { - group = list_first_entry(list, struct iopf_group, node); - list_del(&group->node); - } - spin_unlock(&fault->lock); - return group; -} - -/* Restore a node back to the head of the fault->deliver list */ -static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault, - struct iopf_group *group) -{ - spin_lock(&fault->lock); - list_add(&group->node, &fault->deliver); - spin_unlock(&fault->lock); -} - struct iommufd_attach_handle { struct iommu_attach_handle handle; struct iommufd_device *idev; @@ -484,31 +543,108 @@ struct iommufd_attach_handle { /* Convert an iommu attach handle to iommufd handle. */ #define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) +/* + * An iommufd_fault object represents an interface to deliver I/O page faults + * to the user space. These objects are created/destroyed by the user space and + * associated with hardware page table objects during page-table allocation. + */ +struct iommufd_fault { + struct iommufd_eventq common; + struct mutex mutex; /* serializes response flows */ + struct xarray response; +}; + +static inline struct iommufd_fault * +eventq_to_fault(struct iommufd_eventq *eventq) +{ + return container_of(eventq, struct iommufd_fault, common); +} + static inline struct iommufd_fault * iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_FAULT), - struct iommufd_fault, obj); + struct iommufd_fault, common.obj); } int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); void iommufd_fault_destroy(struct iommufd_object *obj); int iommufd_fault_iopf_handler(struct iopf_group *group); -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old); - int iommufd_fault_iopf_enable(struct iommufd_device *idev); void iommufd_fault_iopf_disable(struct iommufd_device *idev); void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, struct iommufd_attach_handle *handle); +/* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */ +struct iommufd_vevent { + struct iommufd_vevent_header header; + struct list_head node; /* for iommufd_eventq::deliver */ + ssize_t data_len; + u64 event_data[] __counted_by(data_len); +}; + +#define vevent_for_lost_events_header(vevent) \ + (vevent->header.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS) + +/* + * An iommufd_veventq object represents an interface to deliver vIOMMU events to + * the user space. It is created/destroyed by the user space and associated with + * a vIOMMU object during the allocations. + */ +struct iommufd_veventq { + struct iommufd_eventq common; + struct iommufd_viommu *viommu; + struct list_head node; /* for iommufd_viommu::veventqs */ + struct iommufd_vevent lost_events_header; + + enum iommu_veventq_type type; + unsigned int depth; + + /* Use common.lock for protection */ + u32 num_events; + u32 sequence; +}; + +static inline struct iommufd_veventq * +eventq_to_veventq(struct iommufd_eventq *eventq) +{ + return container_of(eventq, struct iommufd_veventq, common); +} + +static inline struct iommufd_veventq * +iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_VEVENTQ), + struct iommufd_veventq, common.obj); +} + +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd); +void iommufd_veventq_destroy(struct iommufd_object *obj); +void iommufd_veventq_abort(struct iommufd_object *obj); + +static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq, + struct iommufd_vevent *vevent) +{ + struct iommufd_eventq *eventq = &veventq->common; + + lockdep_assert_held(&eventq->lock); + + /* + * Remove the lost_events_header and add the new node at the same time. + * Note the new node can be lost_events_header, for a sequence update. + */ + if (list_is_last(&veventq->lost_events_header.node, &eventq->deliver)) + list_del(&veventq->lost_events_header.node); + list_add_tail(&vevent->node, &eventq->deliver); + vevent->header.sequence = veventq->sequence; + veventq->sequence = (veventq->sequence + 1) & INT_MAX; + + wake_up_interruptible(&eventq->wait_queue); +} + static inline struct iommufd_viommu * iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) { @@ -517,18 +653,27 @@ iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) struct iommufd_viommu, obj); } +static inline struct iommufd_veventq * +iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, + enum iommu_veventq_type type) +{ + struct iommufd_veventq *veventq, *next; + + lockdep_assert_held(&viommu->veventqs_rwsem); + + list_for_each_entry_safe(veventq, next, &viommu->veventqs, node) { + if (veventq->type == type) + return veventq; + } + return NULL; +} + int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_vdevice_destroy(struct iommufd_object *obj); - -struct iommufd_vdevice { - struct iommufd_object obj; - struct iommufd_ctx *ictx; - struct iommufd_viommu *viommu; - struct device *dev; - u64 id; /* per-vIOMMU virtual ID */ -}; +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_hw_queue_destroy(struct iommufd_object *obj); #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index a6b7a163f6364..8fc618b2bcf96 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -24,6 +24,11 @@ enum { IOMMU_TEST_OP_MD_CHECK_IOTLB, IOMMU_TEST_OP_TRIGGER_IOPF, IOMMU_TEST_OP_DEV_CHECK_CACHE, + IOMMU_TEST_OP_TRIGGER_VEVENT, + IOMMU_TEST_OP_PASID_ATTACH, + IOMMU_TEST_OP_PASID_REPLACE, + IOMMU_TEST_OP_PASID_DETACH, + IOMMU_TEST_OP_PASID_CHECK_HWPT, }; enum { @@ -48,6 +53,7 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, + MOCK_FLAGS_DEVICE_PASID = 1 << 2, }; enum { @@ -60,6 +66,9 @@ enum { MOCK_DEV_CACHE_NUM = 4, }; +/* Reserved for special pasid replace test */ +#define IOMMU_TEST_PASID_RESERVED 1024 + struct iommu_test_cmd { __u32 size; __u32 op; @@ -145,11 +154,36 @@ struct iommu_test_cmd { __u32 id; __u32 cache; } check_dev_cache; + struct { + __u32 dev_id; + } trigger_vevent; + struct { + __u32 pasid; + __u32 pt_id; + /* @id is stdev_id */ + } pasid_attach; + struct { + __u32 pasid; + __u32 pt_id; + /* @id is stdev_id */ + } pasid_replace; + struct { + __u32 pasid; + /* @id is stdev_id */ + } pasid_detach; + struct { + __u32 pasid; + __u32 hwpt_id; + /* @id is stdev_id */ + } pasid_check; }; __u32 last; }; #define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32) +/* Mock device/iommu PASID width */ +#define MOCK_PASID_WIDTH 20 + /* Mock structs for IOMMU_DEVICE_GET_HW_INFO ioctl */ #define IOMMU_HW_INFO_TYPE_SELFTEST 0xfeedbeef #define IOMMU_HW_INFO_SELFTEST_REGVAL 0xdeadbeef @@ -193,6 +227,23 @@ struct iommu_hwpt_invalidate_selftest { #define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef +/** + * struct iommu_viommu_selftest - vIOMMU data for Mock driver + * (IOMMU_VIOMMU_TYPE_SELFTEST) + * @in_data: Input random data from user space + * @out_data: Output data (matching @in_data) to user space + * @out_mmap_offset: The offset argument for mmap syscall + * @out_mmap_length: The length argument for mmap syscall + * + * Simply set @out_data=@in_data for a loopback test + */ +struct iommu_viommu_selftest { + __u32 in_data; + __u32 out_data; + __aligned_u64 out_mmap_offset; + __aligned_u64 out_mmap_length; +}; + /* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */ #define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef #define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef @@ -212,4 +263,13 @@ struct iommu_viommu_invalidate_selftest { __u32 cache_id; }; +#define IOMMU_VEVENTQ_TYPE_SELFTEST 0xbeefbeef + +struct iommu_viommu_event_selftest { + __u32 virt_id; +}; + +#define IOMMU_HW_QUEUE_TYPE_SELFTEST 0xdeadbeef +#define IOMMU_TEST_HW_QUEUE_MAX 2 + #endif diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 39a86a4a1d3af..4514575818fc0 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -407,7 +407,6 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, update_indexes: if (unlikely(!iova_bitmap_mapped_range(mapped, iova, length))) { - /* * The attempt to advance the base index to @iova * may fail if it's out of bounds, or pinning the pages diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ccf616462a1cb..69c2195e77cad 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -29,6 +29,65 @@ struct iommufd_object_ops { static const struct iommufd_object_ops iommufd_object_ops[]; static struct miscdevice vfio_misc_dev; +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + /* Starts out bias'd by 1 until it is removed from the xarray */ + refcount_set(&obj->shortterm_users, 1); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). + */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b, + GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} + +struct iommufd_object *_iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *new_obj; + + /* Something is coded wrong if this is hit */ + if (WARN_ON(ucmd->new_obj)) + return ERR_PTR(-EBUSY); + + /* + * An abort op means that its caller needs to invoke it within a lock in + * the caller. So it doesn't work with _iommufd_object_alloc_ucmd() that + * will invoke the abort op in iommufd_object_abort_and_destroy(), which + * must be outside the caller's lock. + */ + if (WARN_ON(iommufd_object_ops[type].abort)) + return ERR_PTR(-EOPNOTSUPP); + + new_obj = _iommufd_object_alloc(ucmd->ictx, size, type); + if (IS_ERR(new_obj)) + return new_obj; + + ucmd->new_obj = new_obj; + return new_obj; +} + /* * Allow concurrent access to the object. * @@ -102,9 +161,8 @@ static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, return 0; if (wait_event_timeout(ictx->destroy_wait, - refcount_read(&to_destroy->shortterm_users) == - 0, - msecs_to_jiffies(60000))) + refcount_read(&to_destroy->shortterm_users) == 0, + msecs_to_jiffies(60000))) return 0; pr_crit("Time out waiting for iommufd object to become free\n"); @@ -226,7 +284,10 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; + mt_init_flags(&ictx->mt_mmap, MT_FLAGS_ALLOC_RANGE); init_waitqueue_head(&ictx->destroy_wait); + mutex_init(&ictx->sw_msi_lock); + INIT_LIST_HEAD(&ictx->sw_msi_list); filp->private_data = ictx; return 0; } @@ -234,6 +295,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) static int iommufd_fops_release(struct inode *inode, struct file *filp) { struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_sw_msi_map *next; + struct iommufd_sw_msi_map *cur; struct iommufd_object *obj; /* @@ -262,6 +325,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) break; } WARN_ON(!xa_empty(&ictx->groups)); + + mutex_destroy(&ictx->sw_msi_lock); + list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item) + kfree(cur); + kfree(ictx); return 0; } @@ -296,6 +364,7 @@ union ucmd_buffer { struct iommu_destroy destroy; struct iommu_fault_alloc fault; struct iommu_hw_info info; + struct iommu_hw_queue_alloc hw_queue; struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; struct iommu_hwpt_invalidate cache; @@ -308,6 +377,7 @@ union ucmd_buffer { struct iommu_ioas_unmap unmap; struct iommu_option option; struct iommu_vdevice_alloc vdev; + struct iommu_veventq_alloc veventq; struct iommu_vfio_ioas vfio_ioas; struct iommu_viommu_alloc viommu; #ifdef CONFIG_IOMMUFD_TEST @@ -337,6 +407,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { struct iommu_fault_alloc, out_fault_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, __reserved), + IOCTL_OP(IOMMU_HW_QUEUE_ALLOC, iommufd_hw_queue_alloc_ioctl, + struct iommu_hw_queue_alloc, length), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, @@ -363,6 +435,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64), IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, struct iommu_vdevice_alloc, virt_id), + IOCTL_OP(IOMMU_VEVENTQ_ALLOC, iommufd_veventq_alloc, + struct iommu_veventq_alloc, out_veventq_fd), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, @@ -405,14 +479,83 @@ static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, if (ret) return ret; ret = op->execute(&ucmd); + + if (ucmd.new_obj) { + if (ret) + iommufd_object_abort_and_destroy(ictx, ucmd.new_obj); + else + iommufd_object_finalize(ictx, ucmd.new_obj); + } return ret; } +static void iommufd_fops_vma_open(struct vm_area_struct *vma) +{ + struct iommufd_mmap *immap = vma->vm_private_data; + + refcount_inc(&immap->owner->users); +} + +static void iommufd_fops_vma_close(struct vm_area_struct *vma) +{ + struct iommufd_mmap *immap = vma->vm_private_data; + + refcount_dec(&immap->owner->users); +} + +static const struct vm_operations_struct iommufd_vma_ops = { + .open = iommufd_fops_vma_open, + .close = iommufd_fops_vma_close, +}; + +/* The vm_pgoff must be pre-allocated from mt_mmap, and given to user space */ +static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct iommufd_ctx *ictx = filp->private_data; + size_t length = vma->vm_end - vma->vm_start; + struct iommufd_mmap *immap; + int rc; + + if (!PAGE_ALIGNED(length)) + return -EINVAL; + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + if (vma->vm_flags & VM_EXEC) + return -EPERM; + + /* vma->vm_pgoff carries a page-shifted start position to an immap */ + immap = mtree_load(&ictx->mt_mmap, vma->vm_pgoff << PAGE_SHIFT); + if (!immap) + return -ENXIO; + /* + * mtree_load() returns the immap for any contained mmio_addr, so only + * allow the exact immap thing to be mapped + */ + if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length) + return -ENXIO; + + vma->vm_pgoff = 0; + vma->vm_private_data = immap; + vma->vm_ops = &iommufd_vma_ops; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + rc = io_remap_pfn_range(vma, vma->vm_start, + immap->mmio_addr >> PAGE_SHIFT, length, + vma->vm_page_prot); + if (rc) + return rc; + + /* vm_ops.open won't be called for mmap itself. */ + refcount_inc(&immap->owner->users); + return rc; +} + static const struct file_operations iommufd_fops = { .owner = THIS_MODULE, .open = iommufd_fops_open, .release = iommufd_fops_release, .unlocked_ioctl = iommufd_fops_ioctl, + .mmap = iommufd_fops_mmap, }; /** @@ -491,6 +634,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_FAULT] = { .destroy = iommufd_fault_destroy, }, + [IOMMUFD_OBJ_HW_QUEUE] = { + .destroy = iommufd_hw_queue_destroy, + }, [IOMMUFD_OBJ_HWPT_PAGING] = { .destroy = iommufd_hwpt_paging_destroy, .abort = iommufd_hwpt_paging_abort, @@ -505,6 +651,10 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_VDEVICE] = { .destroy = iommufd_vdevice_destroy, }, + [IOMMUFD_OBJ_VEVENTQ] = { + .destroy = iommufd_veventq_destroy, + .abort = iommufd_veventq_abort, + }, [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, }, @@ -523,7 +673,6 @@ static struct miscdevice iommu_misc_dev = { .mode = 0660, }; - static struct miscdevice vfio_misc_dev = { .minor = VFIO_MINOR, .name = "vfio", diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index cdc115864f383..939be83e3b3f7 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1360,8 +1360,7 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, } static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte, - unsigned long length, - bool writable) + unsigned long length, bool writable) { struct iopt_pages *pages; @@ -1401,7 +1400,7 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, struct iopt_pages *pages; unsigned long end; void __user *uptr_down = - (void __user *) ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); if (check_add_overflow((unsigned long)uptr, length, &end)) return ERR_PTR(-EOVERFLOW); @@ -2177,6 +2176,7 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, * @last_index: Inclusive last page index * @out_pages: Output list of struct page's representing the PFNs * @flags: IOMMUFD_ACCESS_RW_* flags + * @lock_area: Fail userspace munmap on this area * * Record that an in-kernel access will be accessing the pages, ensure they are * pinned, and return the PFNs as a simple list of 'struct page *'. @@ -2184,8 +2184,8 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, * This should be undone through a matching call to iopt_area_remove_access() */ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, - unsigned long last_index, struct page **out_pages, - unsigned int flags) + unsigned long last_index, struct page **out_pages, + unsigned int flags, bool lock_area) { struct iopt_pages *pages = area->pages; struct iopt_pages_access *access; @@ -2198,6 +2198,8 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, access = iopt_pages_get_exact_access(pages, start_index, last_index); if (access) { area->num_accesses++; + if (lock_area) + area->num_locks++; access->users++; iopt_pages_fill_from_xarray(pages, start_index, last_index, out_pages); @@ -2219,6 +2221,8 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, access->node.last = last_index; access->users = 1; area->num_accesses++; + if (lock_area) + area->num_locks++; interval_tree_insert(&access->node, &pages->access_itree); mutex_unlock(&pages->mutex); return 0; @@ -2235,12 +2239,13 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, * @area: The source of PFNs * @start_index: First page index * @last_index: Inclusive last page index + * @unlock_area: Must match the matching iopt_area_add_access()'s lock_area * * Undo iopt_area_add_access() and unpin the pages if necessary. The caller * must stop using the PFNs before calling this. */ void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, - unsigned long last_index) + unsigned long last_index, bool unlock_area) { struct iopt_pages *pages = area->pages; struct iopt_pages_access *access; @@ -2251,6 +2256,10 @@ void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, goto out_unlock; WARN_ON(area->num_accesses == 0 || access->users == 0); + if (unlock_area) { + WARN_ON(area->num_locks == 0); + area->num_locks--; + } area->num_accesses--; access->users--; if (access->users) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d40deb0a4f062..6213f43f22580 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -135,7 +135,6 @@ to_mock_domain(struct iommu_domain *domain) struct mock_iommu_domain_nested { struct iommu_domain domain; struct mock_viommu *mock_viommu; - struct mock_iommu_domain *parent; u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; }; @@ -148,6 +147,11 @@ to_mock_nested(struct iommu_domain *domain) struct mock_viommu { struct iommufd_viommu core; struct mock_iommu_domain *s2_parent; + struct mock_hw_queue *hw_queue[IOMMU_TEST_HW_QUEUE_MAX]; + struct mutex queue_mutex; + + unsigned long mmap_offset; + u32 *page; /* Mmap page to test u32 type of in_data */ }; static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) @@ -155,15 +159,32 @@ static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) return container_of(viommu, struct mock_viommu, core); } +struct mock_hw_queue { + struct iommufd_hw_queue core; + struct mock_viommu *mock_viommu; + struct mock_hw_queue *prev; + u16 index; +}; + +static inline struct mock_hw_queue * +to_mock_hw_queue(struct iommufd_hw_queue *hw_queue) +{ + return container_of(hw_queue, struct mock_hw_queue, core); +} + enum selftest_obj_type { TYPE_IDEV, }; struct mock_dev { struct device dev; + struct mock_viommu *viommu; + struct rw_semaphore viommu_rwsem; unsigned long flags; + unsigned long vdev_id; int id; u32 cache[MOCK_DEV_CACHE_NUM]; + atomic_t pasid_1024_fake_error; }; static inline struct mock_dev *to_mock_dev(struct device *dev) @@ -193,15 +214,71 @@ static int mock_domain_nop_attach(struct iommu_domain *domain, struct device *dev) { struct mock_dev *mdev = to_mock_dev(dev); + struct mock_viommu *new_viommu = NULL; + unsigned long vdev_id = 0; + int rc; if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) return -EINVAL; + iommu_group_mutex_assert(dev); + if (domain->type == IOMMU_DOMAIN_NESTED) { + new_viommu = to_mock_nested(domain)->mock_viommu; + if (new_viommu) { + rc = iommufd_viommu_get_vdev_id(&new_viommu->core, dev, + &vdev_id); + if (rc) + return rc; + } + } + if (new_viommu != mdev->viommu) { + down_write(&mdev->viommu_rwsem); + mdev->viommu = new_viommu; + mdev->vdev_id = vdev_id; + up_write(&mdev->viommu_rwsem); + } + + return 0; +} + +static int mock_domain_set_dev_pasid_nop(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct mock_dev *mdev = to_mock_dev(dev); + + /* + * Per the first attach with pasid 1024, set the + * mdev->pasid_1024_fake_error. Hence the second call of this op + * can fake an error to validate the error path of the core. This + * is helpful to test the case in which the iommu core needs to + * rollback to the old domain due to driver failure. e.g. replace. + * User should be careful about the third call of this op, it shall + * succeed since the mdev->pasid_1024_fake_error is cleared in the + * second call. + */ + if (pasid == 1024) { + if (domain->type == IOMMU_DOMAIN_BLOCKED) { + atomic_set(&mdev->pasid_1024_fake_error, 0); + } else if (atomic_read(&mdev->pasid_1024_fake_error)) { + /* + * Clear the flag, and fake an error to fail the + * replacement. + */ + atomic_set(&mdev->pasid_1024_fake_error, 0); + return -ENOMEM; + } else { + /* Set the flag to fake an error in next call */ + atomic_set(&mdev->pasid_1024_fake_error, 1); + } + } + return 0; } static const struct iommu_domain_ops mock_blocking_ops = { .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop }; static struct iommu_domain mock_blocking_domain = { @@ -209,10 +286,15 @@ static struct iommu_domain mock_blocking_domain = { .ops = &mock_blocking_ops, }; -static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) +static void *mock_domain_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) { struct iommu_test_hw_info *info; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); @@ -343,7 +425,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, struct mock_iommu_domain_nested *mock_nested; struct mock_iommu_domain *mock_parent; - if (flags) + if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); if (!parent || parent->ops != mock_ops.default_domain_ops) return ERR_PTR(-EINVAL); @@ -355,7 +437,6 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, mock_nested = __mock_domain_alloc_nested(user_data); if (IS_ERR(mock_nested)) return ERR_CAST(mock_nested); - mock_nested->parent = mock_parent; return &mock_nested->domain; } @@ -365,7 +446,8 @@ mock_domain_alloc_paging_flags(struct device *dev, u32 flags, { bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_ALLOC_NEST_PARENT; + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_PASID; struct mock_dev *mdev = to_mock_dev(dev); bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; struct mock_iommu_domain *mock; @@ -571,9 +653,15 @@ static void mock_viommu_destroy(struct iommufd_viommu *viommu) { struct mock_iommu_device *mock_iommu = container_of( viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); if (refcount_dec_and_test(&mock_iommu->users)) complete(&mock_iommu->complete); + if (mock_viommu->mmap_offset) + iommufd_viommu_destroy_mmap(&mock_viommu->core, + mock_viommu->mmap_offset); + free_page((unsigned long)mock_viommu->page); + mutex_destroy(&mock_viommu->queue_mutex); /* iommufd core frees mock_viommu and viommu */ } @@ -585,14 +673,13 @@ mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, struct mock_viommu *mock_viommu = to_mock_viommu(viommu); struct mock_iommu_domain_nested *mock_nested; - if (flags) + if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); mock_nested = __mock_domain_alloc_nested(user_data); if (IS_ERR(mock_nested)) return ERR_CAST(mock_nested); mock_nested->mock_viommu = mock_viommu; - mock_nested->parent = mock_viommu->s2_parent; return &mock_nested->domain; } @@ -666,31 +753,149 @@ static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu, return rc; } +static size_t mock_viommu_get_hw_queue_size(struct iommufd_viommu *viommu, + enum iommu_hw_queue_type queue_type) +{ + if (queue_type != IOMMU_HW_QUEUE_TYPE_SELFTEST) + return 0; + return HW_QUEUE_STRUCT_SIZE(struct mock_hw_queue, core); +} + +static void mock_hw_queue_destroy(struct iommufd_hw_queue *hw_queue) +{ + struct mock_hw_queue *mock_hw_queue = to_mock_hw_queue(hw_queue); + struct mock_viommu *mock_viommu = mock_hw_queue->mock_viommu; + + mutex_lock(&mock_viommu->queue_mutex); + mock_viommu->hw_queue[mock_hw_queue->index] = NULL; + if (mock_hw_queue->prev) + iommufd_hw_queue_undepend(mock_hw_queue, mock_hw_queue->prev, + core); + mutex_unlock(&mock_viommu->queue_mutex); +} + +/* Test iommufd_hw_queue_depend/undepend() */ +static int mock_hw_queue_init_phys(struct iommufd_hw_queue *hw_queue, u32 index, + phys_addr_t base_addr_pa) +{ + struct mock_viommu *mock_viommu = to_mock_viommu(hw_queue->viommu); + struct mock_hw_queue *mock_hw_queue = to_mock_hw_queue(hw_queue); + struct mock_hw_queue *prev = NULL; + int rc = 0; + + if (index >= IOMMU_TEST_HW_QUEUE_MAX) + return -EINVAL; + + mutex_lock(&mock_viommu->queue_mutex); + + if (mock_viommu->hw_queue[index]) { + rc = -EEXIST; + goto unlock; + } + + if (index) { + prev = mock_viommu->hw_queue[index - 1]; + if (!prev) { + rc = -EIO; + goto unlock; + } + } + + /* + * Test to catch a kernel bug if the core converted the physical address + * incorrectly. Let mock_domain_iova_to_phys() WARN_ON if it fails. + */ + if (base_addr_pa != iommu_iova_to_phys(&mock_viommu->s2_parent->domain, + hw_queue->base_addr)) { + rc = -EFAULT; + goto unlock; + } + + if (prev) { + rc = iommufd_hw_queue_depend(mock_hw_queue, prev, core); + if (rc) + goto unlock; + } + + mock_hw_queue->prev = prev; + mock_hw_queue->mock_viommu = mock_viommu; + mock_viommu->hw_queue[index] = mock_hw_queue; + + hw_queue->destroy = &mock_hw_queue_destroy; +unlock: + mutex_unlock(&mock_viommu->queue_mutex); + return rc; +} + static struct iommufd_viommu_ops mock_viommu_ops = { .destroy = mock_viommu_destroy, .alloc_domain_nested = mock_viommu_alloc_domain_nested, .cache_invalidate = mock_viommu_cache_invalidate, + .get_hw_queue_size = mock_viommu_get_hw_queue_size, + .hw_queue_init_phys = mock_hw_queue_init_phys, }; -static struct iommufd_viommu *mock_viommu_alloc(struct device *dev, - struct iommu_domain *domain, - struct iommufd_ctx *ictx, - unsigned int viommu_type) +static size_t mock_get_viommu_size(struct device *dev, + enum iommu_viommu_type viommu_type) { - struct mock_iommu_device *mock_iommu = - iommu_get_iommu_dev(dev, struct mock_iommu_device, iommu_dev); - struct mock_viommu *mock_viommu; - if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST) - return ERR_PTR(-EOPNOTSUPP); + return 0; + return VIOMMU_STRUCT_SIZE(struct mock_viommu, core); +} + +static int mock_viommu_init(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data) +{ + struct mock_iommu_device *mock_iommu = container_of( + viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); + struct iommu_viommu_selftest data; + int rc; + + if (user_data) { + rc = iommu_copy_struct_from_user( + &data, user_data, IOMMU_VIOMMU_TYPE_SELFTEST, out_data); + if (rc) + return rc; + + /* Allocate two pages */ + mock_viommu->page = + (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (!mock_viommu->page) + return -ENOMEM; - mock_viommu = iommufd_viommu_alloc(ictx, struct mock_viommu, core, - &mock_viommu_ops); - if (IS_ERR(mock_viommu)) - return ERR_CAST(mock_viommu); + rc = iommufd_viommu_alloc_mmap(&mock_viommu->core, + __pa(mock_viommu->page), + PAGE_SIZE * 2, + &mock_viommu->mmap_offset); + if (rc) + goto err_free_page; + + /* For loopback tests on both the page and out_data */ + *mock_viommu->page = data.in_data; + data.out_data = data.in_data; + data.out_mmap_length = PAGE_SIZE * 2; + data.out_mmap_offset = mock_viommu->mmap_offset; + rc = iommu_copy_struct_to_user( + user_data, &data, IOMMU_VIOMMU_TYPE_SELFTEST, out_data); + if (rc) + goto err_destroy_mmap; + } refcount_inc(&mock_iommu->users); - return &mock_viommu->core; + mutex_init(&mock_viommu->queue_mutex); + mock_viommu->s2_parent = to_mock_domain(parent_domain); + + viommu->ops = &mock_viommu_ops; + return 0; + +err_destroy_mmap: + iommufd_viommu_destroy_mmap(&mock_viommu->core, + mock_viommu->mmap_offset); +err_free_page: + free_page((unsigned long)mock_viommu->page); + return rc; } static const struct iommu_ops mock_ops = { @@ -712,7 +917,8 @@ static const struct iommu_ops mock_ops = { .dev_enable_feat = mock_dev_enable_feat, .dev_disable_feat = mock_dev_disable_feat, .user_pasid_table = true, - .viommu_alloc = mock_viommu_alloc, + .get_viommu_size = mock_get_viommu_size, + .viommu_init = mock_viommu_init, .default_domain_ops = &(struct iommu_domain_ops){ .free = mock_domain_free, @@ -720,6 +926,7 @@ static const struct iommu_ops mock_ops = { .map_pages = mock_domain_map_pages, .unmap_pages = mock_domain_unmap_pages, .iova_to_phys = mock_domain_iova_to_phys, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, }, }; @@ -780,6 +987,7 @@ static struct iommu_domain_ops domain_nested_ops = { .free = mock_domain_free_nested, .attach_dev = mock_domain_nop_attach, .cache_invalidate_user = mock_domain_cache_invalidate_user, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, }; static inline struct iommufd_hw_pagetable * @@ -839,17 +1047,24 @@ static void mock_dev_release(struct device *dev) static struct mock_dev *mock_dev_create(unsigned long dev_flags) { + struct property_entry prop[] = { + PROPERTY_ENTRY_U32("pasid-num-bits", 0), + {}, + }; + const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY | + MOCK_FLAGS_DEVICE_HUGE_IOVA | + MOCK_FLAGS_DEVICE_PASID; struct mock_dev *mdev; int rc, i; - if (dev_flags & - ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) + if (dev_flags & ~valid_flags) return ERR_PTR(-EINVAL); mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return ERR_PTR(-ENOMEM); + init_rwsem(&mdev->viommu_rwsem); device_initialize(&mdev->dev); mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; @@ -866,6 +1081,15 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) if (rc) goto err_put; + if (dev_flags & MOCK_FLAGS_DEVICE_PASID) + prop[0] = PROPERTY_ENTRY_U32("pasid-num-bits", MOCK_PASID_WIDTH); + + rc = device_create_managed_software_node(&mdev->dev, prop, NULL); + if (rc) { + dev_err(&mdev->dev, "add pasid-num-bits property failed, rc: %d", rc); + goto err_put; + } + rc = device_add(&mdev->dev); if (rc) goto err_put; @@ -921,7 +1145,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, } sobj->idev.idev = idev; - rc = iommufd_device_attach(idev, &pt_id); + rc = iommufd_device_attach(idev, IOMMU_NO_PASID, &pt_id); if (rc) goto out_unbind; @@ -936,7 +1160,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, return 0; out_detach: - iommufd_device_detach(idev); + iommufd_device_detach(idev, IOMMU_NO_PASID); out_unbind: iommufd_device_unbind(idev); out_mdev: @@ -946,39 +1170,49 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, return rc; } -/* Replace the mock domain with a manually allocated hw_pagetable */ -static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, - unsigned int device_id, u32 pt_id, - struct iommu_test_cmd *cmd) +static struct selftest_obj * +iommufd_test_get_selftest_obj(struct iommufd_ctx *ictx, u32 id) { struct iommufd_object *dev_obj; struct selftest_obj *sobj; - int rc; /* * Prefer to use the OBJ_SELFTEST because the destroy_rwsem will ensure * it doesn't race with detach, which is not allowed. */ - dev_obj = - iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST); + dev_obj = iommufd_get_object(ictx, id, IOMMUFD_OBJ_SELFTEST); if (IS_ERR(dev_obj)) - return PTR_ERR(dev_obj); + return ERR_CAST(dev_obj); sobj = to_selftest_obj(dev_obj); if (sobj->type != TYPE_IDEV) { - rc = -EINVAL; - goto out_dev_obj; + iommufd_put_object(ictx, dev_obj); + return ERR_PTR(-EINVAL); } + return sobj; +} + +/* Replace the mock domain with a manually allocated hw_pagetable */ +static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, + unsigned int device_id, u32 pt_id, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + int rc; - rc = iommufd_device_replace(sobj->idev.idev, &pt_id); + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, device_id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + rc = iommufd_device_replace(sobj->idev.idev, IOMMU_NO_PASID, &pt_id); if (rc) - goto out_dev_obj; + goto out_sobj; cmd->mock_domain_replace.pt_id = pt_id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); -out_dev_obj: - iommufd_put_object(ucmd->ictx, dev_obj); +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); return rc; } @@ -1090,9 +1324,8 @@ static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, return 0; } -static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, - u32 mockpt_id, unsigned int iotlb_id, - u32 iotlb) +static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, u32 mockpt_id, + unsigned int iotlb_id, u32 iotlb) { struct mock_iommu_domain_nested *mock_nested; struct iommufd_hw_pagetable *hwpt; @@ -1365,7 +1598,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, int rc; /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ - if (length > 16*1024*1024) + if (length > 16 * 1024 * 1024) return -ENOMEM; if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ)) @@ -1382,7 +1615,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, if (flags & MOCK_FLAGS_ACCESS_SYZ) iova = iommufd_test_syz_conv_iova(staccess->access, - &cmd->access_pages.iova); + &cmd->access_pages.iova); npages = (ALIGN(iova + length, PAGE_SIZE) - ALIGN_DOWN(iova, PAGE_SIZE)) / @@ -1458,7 +1691,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, int rc; /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ - if (length > 16*1024*1024) + if (length > 16 * 1024 * 1024) return -ENOMEM; if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH | @@ -1484,7 +1717,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, if (flags & MOCK_FLAGS_ACCESS_SYZ) iova = iommufd_test_syz_conv_iova(staccess->access, - &cmd->access_rw.iova); + &cmd->access_rw.iova); rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags); if (rc) @@ -1539,7 +1772,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, goto out_put; } - if (copy_from_user(tmp, uptr,DIV_ROUND_UP(max, BITS_PER_BYTE))) { + if (copy_from_user(tmp, uptr, DIV_ROUND_UP(max, BITS_PER_BYTE))) { rc = -EFAULT; goto out_free; } @@ -1575,7 +1808,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, struct iommu_test_cmd *cmd) { - struct iopf_fault event = { }; + struct iopf_fault event = {}; struct iommufd_device *idev; idev = iommufd_get_device(ucmd, cmd->trigger_iopf.dev_id); @@ -1597,13 +1830,165 @@ static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, return 0; } +static int iommufd_test_trigger_vevent(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct iommu_viommu_event_selftest test = {}; + struct iommufd_device *idev; + struct mock_dev *mdev; + int rc = -ENOENT; + + idev = iommufd_get_device(ucmd, cmd->trigger_vevent.dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + mdev = to_mock_dev(idev->dev); + + down_read(&mdev->viommu_rwsem); + if (!mdev->viommu || !mdev->vdev_id) + goto out_unlock; + + test.virt_id = mdev->vdev_id; + rc = iommufd_viommu_report_event(&mdev->viommu->core, + IOMMU_VEVENTQ_TYPE_SELFTEST, &test, + sizeof(test)); +out_unlock: + up_read(&mdev->viommu_rwsem); + iommufd_put_object(ucmd->ictx, &idev->obj); + + return rc; +} + +static inline struct iommufd_hw_pagetable * +iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) +{ + struct iommufd_object *pt_obj; + + pt_obj = iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) + return ERR_CAST(pt_obj); + + if (pt_obj->type != IOMMUFD_OBJ_HWPT_NESTED && + pt_obj->type != IOMMUFD_OBJ_HWPT_PAGING) { + iommufd_put_object(ucmd->ictx, pt_obj); + return ERR_PTR(-EINVAL); + } + + return container_of(pt_obj, struct iommufd_hw_pagetable, obj); +} + +static int iommufd_test_pasid_check_hwpt(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + u32 hwpt_id = cmd->pasid_check.hwpt_id; + struct iommu_domain *attached_domain; + struct iommu_attach_handle *handle; + struct iommufd_hw_pagetable *hwpt; + struct selftest_obj *sobj; + struct mock_dev *mdev; + int rc = 0; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + mdev = sobj->idev.mock_dev; + + handle = iommu_attach_handle_get(mdev->dev.iommu_group, + cmd->pasid_check.pasid, 0); + if (IS_ERR(handle)) + attached_domain = NULL; + else + attached_domain = handle->domain; + + /* hwpt_id == 0 means to check if pasid is detached */ + if (!hwpt_id) { + if (attached_domain) + rc = -EINVAL; + goto out_sobj; + } + + hwpt = iommufd_get_hwpt(ucmd, hwpt_id); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_sobj; + } + + if (attached_domain != hwpt->domain) + rc = -EINVAL; + + iommufd_put_object(ucmd->ictx, &hwpt->obj); +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); + return rc; +} + +static int iommufd_test_pasid_attach(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + int rc; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + rc = iommufd_device_attach(sobj->idev.idev, cmd->pasid_attach.pasid, + &cmd->pasid_attach.pt_id); + if (rc) + goto out_sobj; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + iommufd_device_detach(sobj->idev.idev, cmd->pasid_attach.pasid); + +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); + return rc; +} + +static int iommufd_test_pasid_replace(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + int rc; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + rc = iommufd_device_replace(sobj->idev.idev, cmd->pasid_attach.pasid, + &cmd->pasid_attach.pt_id); + if (rc) + goto out_sobj; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); + return rc; +} + +static int iommufd_test_pasid_detach(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + iommufd_device_detach(sobj->idev.idev, cmd->pasid_detach.pasid); + iommufd_put_object(ucmd->ictx, &sobj->obj); + return 0; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { struct selftest_obj *sobj = to_selftest_obj(obj); switch (sobj->type) { case TYPE_IDEV: - iommufd_device_detach(sobj->idev.idev); + iommufd_device_detach(sobj->idev.idev, IOMMU_NO_PASID); iommufd_device_unbind(sobj->idev.idev); mock_dev_destroy(sobj->idev.mock_dev); break; @@ -1678,6 +2063,16 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->dirty.flags); case IOMMU_TEST_OP_TRIGGER_IOPF: return iommufd_test_trigger_iopf(ucmd, cmd); + case IOMMU_TEST_OP_TRIGGER_VEVENT: + return iommufd_test_trigger_vevent(ucmd, cmd); + case IOMMU_TEST_OP_PASID_ATTACH: + return iommufd_test_pasid_attach(ucmd, cmd); + case IOMMU_TEST_OP_PASID_REPLACE: + return iommufd_test_pasid_replace(ucmd, cmd); + case IOMMU_TEST_OP_PASID_DETACH: + return iommufd_test_pasid_detach(ucmd, cmd); + case IOMMU_TEST_OP_PASID_CHECK_HWPT: + return iommufd_test_pasid_check_hwpt(ucmd, cmd); default: return -EOPNOTSUPP; } @@ -1715,8 +2110,8 @@ int __init iommufd_test_init(void) goto err_bus; rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops, - &iommufd_mock_bus_type.bus, - &iommufd_mock_bus_type.nb); + &iommufd_mock_bus_type.bus, + &iommufd_mock_bus_type.nb); if (rc) goto err_sysfs; @@ -1724,6 +2119,7 @@ int __init iommufd_test_init(void) init_completion(&mock_iommu.complete); mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq"); + mock_iommu.iommu_dev.max_pasids = (1 << MOCK_PASID_WIDTH); return 0; diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 69b88e8c7c265..91339f7999161 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -17,10 +17,16 @@ void iommufd_viommu_destroy(struct iommufd_object *obj) int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) { struct iommu_viommu_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_viommu *viommu; struct iommufd_device *idev; const struct iommu_ops *ops; + size_t viommu_size; int rc; if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) @@ -31,7 +37,22 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) return PTR_ERR(idev); ops = dev_iommu_ops(idev->dev); - if (!ops->viommu_alloc) { + if (!ops->get_viommu_size || !ops->viommu_init) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + viommu_size = ops->get_viommu_size(idev->dev, cmd->type); + if (!viommu_size) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + /* + * It is a driver bug for providing a viommu_size smaller than the core + * vIOMMU structure size + */ + if (WARN_ON_ONCE(viommu_size < sizeof(*viommu))) { rc = -EOPNOTSUPP; goto out_put_idev; } @@ -47,8 +68,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_hwpt; } - viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain, - ucmd->ictx, cmd->type); + viommu = (struct iommufd_viommu *)_iommufd_object_alloc_ucmd( + ucmd, viommu_size, IOMMUFD_OBJ_VIOMMU); if (IS_ERR(viommu)) { rc = PTR_ERR(viommu); goto out_put_hwpt; @@ -59,6 +80,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) viommu->ictx = ucmd->ictx; viommu->hwpt = hwpt_paging; refcount_inc(&viommu->hwpt->common.obj.users); + INIT_LIST_HEAD(&viommu->veventqs); + init_rwsem(&viommu->veventqs_rwsem); /* * It is the most likely case that a physical IOMMU is unpluggable. A * pluggable IOMMU instance (if exists) is responsible for refcounting @@ -66,15 +89,20 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) */ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); + rc = ops->viommu_init(viommu, hwpt_paging->common.domain, + user_data.len ? &user_data : NULL); + if (rc) + goto out_put_hwpt; + + /* It is a driver bug that viommu->ops isn't filled */ + if (WARN_ON_ONCE(!viommu->ops)) { + rc = -EOPNOTSUPP; + goto out_put_hwpt; + } + cmd->out_viommu_id = viommu->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); - if (rc) - goto out_abort; - iommufd_object_finalize(ucmd->ictx, &viommu->obj); - goto out_put_hwpt; -out_abort: - iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj); out_put_hwpt: iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); out_put_idev: @@ -88,8 +116,10 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj) container_of(obj, struct iommufd_vdevice, obj); struct iommufd_viommu *viommu = vdev->viommu; + if (vdev->destroy) + vdev->destroy(vdev); /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */ - xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL); + xa_cmpxchg(&viommu->vdevs, vdev->virt_id, vdev, NULL, GFP_KERNEL); refcount_dec(&viommu->obj.users); put_device(vdev->dev); } @@ -98,6 +128,7 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) { struct iommu_vdevice_alloc *cmd = ucmd->cmd; struct iommufd_vdevice *vdev, *curr; + size_t vdev_size = sizeof(*vdev); struct iommufd_viommu *viommu; struct iommufd_device *idev; u64 virt_id = cmd->virt_id; @@ -122,13 +153,28 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_idev; } - vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE); + if (viommu->ops && viommu->ops->vdevice_size) { + /* + * It is a driver bug for: + * - ops->vdevice_size smaller than the core structure size + * - not implementing a pairing ops->vdevice_init op + */ + if (WARN_ON_ONCE(viommu->ops->vdevice_size < vdev_size || + !viommu->ops->vdevice_init)) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + vdev_size = viommu->ops->vdevice_size; + } + + vdev = (struct iommufd_vdevice *)_iommufd_object_alloc_ucmd( + ucmd, vdev_size, IOMMUFD_OBJ_VDEVICE); if (IS_ERR(vdev)) { rc = PTR_ERR(vdev); goto out_put_idev; } - vdev->id = virt_id; + vdev->virt_id = virt_id; vdev->dev = idev->dev; get_device(idev->dev); vdev->viommu = viommu; @@ -137,21 +183,201 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL); if (curr) { rc = xa_err(curr) ?: -EEXIST; - goto out_abort; + goto out_put_idev; + } + + if (viommu->ops && viommu->ops->vdevice_init) { + rc = viommu->ops->vdevice_init(vdev); + if (rc) + goto out_put_idev; } cmd->out_vdevice_id = vdev->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); - if (rc) - goto out_abort; - iommufd_object_finalize(ucmd->ictx, &vdev->obj); - goto out_put_idev; -out_abort: - iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj); out_put_idev: iommufd_put_object(ucmd->ictx, &idev->obj); out_put_viommu: iommufd_put_object(ucmd->ictx, &viommu->obj); return rc; } + +static void iommufd_hw_queue_destroy_access(struct iommufd_ctx *ictx, + struct iommufd_access *access, + u64 base_iova, size_t length) +{ + u64 aligned_iova = PAGE_ALIGN_DOWN(base_iova); + u64 offset = base_iova - aligned_iova; + + iommufd_access_unpin_pages(access, aligned_iova, + PAGE_ALIGN(length + offset)); + iommufd_access_detach_internal(access); + iommufd_access_destroy_internal(ictx, access); +} + +void iommufd_hw_queue_destroy(struct iommufd_object *obj) +{ + struct iommufd_hw_queue *hw_queue = + container_of(obj, struct iommufd_hw_queue, obj); + + if (hw_queue->destroy) + hw_queue->destroy(hw_queue); + if (hw_queue->access) + iommufd_hw_queue_destroy_access(hw_queue->viommu->ictx, + hw_queue->access, + hw_queue->base_addr, + hw_queue->length); + if (hw_queue->viommu) + refcount_dec(&hw_queue->viommu->obj.users); +} + +/* + * When the HW accesses the guest queue via physical addresses, the underlying + * physical pages of the guest queue must be contiguous. Also, for the security + * concern that IOMMUFD_CMD_IOAS_UNMAP could potentially remove the mappings of + * the guest queue from the nesting parent iopt while the HW is still accessing + * the guest queue memory physically, such a HW queue must require an access to + * pin the underlying pages and prevent that from happening. + */ +static struct iommufd_access * +iommufd_hw_queue_alloc_phys(struct iommu_hw_queue_alloc *cmd, + struct iommufd_viommu *viommu, phys_addr_t *base_pa) +{ + u64 aligned_iova = PAGE_ALIGN_DOWN(cmd->nesting_parent_iova); + u64 offset = cmd->nesting_parent_iova - aligned_iova; + struct iommufd_access *access; + struct page **pages; + size_t max_npages; + size_t length; + size_t i; + int rc; + + /* max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE) */ + if (check_add_overflow(offset, cmd->length, &length)) + return ERR_PTR(-ERANGE); + if (check_add_overflow(length, PAGE_SIZE - 1, &length)) + return ERR_PTR(-ERANGE); + max_npages = length / PAGE_SIZE; + /* length needs to be page aligned too */ + length = max_npages * PAGE_SIZE; + + /* + * Use kvcalloc() to avoid memory fragmentation for a large page array. + * Set __GFP_NOWARN to avoid syzkaller blowups + */ + pages = kvcalloc(max_npages, sizeof(*pages), GFP_KERNEL | __GFP_NOWARN); + if (!pages) + return ERR_PTR(-ENOMEM); + + access = iommufd_access_create_internal(viommu->ictx); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_free; + } + + rc = iommufd_access_attach_internal(access, viommu->hwpt->ioas); + if (rc) + goto out_destroy; + + rc = iommufd_access_pin_pages(access, aligned_iova, length, pages, 0); + if (rc) + goto out_detach; + + /* Validate if the underlying physical pages are contiguous */ + for (i = 1; i < max_npages; i++) { + if (page_to_pfn(pages[i]) == page_to_pfn(pages[i - 1]) + 1) + continue; + rc = -EFAULT; + goto out_unpin; + } + + *base_pa = (page_to_pfn(pages[0]) << PAGE_SHIFT) + offset; + kfree(pages); + return access; + +out_unpin: + iommufd_access_unpin_pages(access, aligned_iova, length); +out_detach: + iommufd_access_detach_internal(access); +out_destroy: + iommufd_access_destroy_internal(viommu->ictx, access); +out_free: + kfree(pages); + return ERR_PTR(rc); +} + +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_hw_queue_alloc *cmd = ucmd->cmd; + struct iommufd_hw_queue *hw_queue; + struct iommufd_viommu *viommu; + struct iommufd_access *access; + size_t hw_queue_size; + phys_addr_t base_pa; + u64 last; + int rc; + + if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT) + return -EOPNOTSUPP; + if (!cmd->length) + return -EINVAL; + if (check_add_overflow(cmd->nesting_parent_iova, cmd->length - 1, + &last)) + return -EOVERFLOW; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + if (!viommu->ops || !viommu->ops->get_hw_queue_size || + !viommu->ops->hw_queue_init_phys) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + hw_queue_size = viommu->ops->get_hw_queue_size(viommu, cmd->type); + if (!hw_queue_size) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + /* + * It is a driver bug for providing a hw_queue_size smaller than the + * core HW queue structure size + */ + if (WARN_ON_ONCE(hw_queue_size < sizeof(*hw_queue))) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + hw_queue = (struct iommufd_hw_queue *)_iommufd_object_alloc_ucmd( + ucmd, hw_queue_size, IOMMUFD_OBJ_HW_QUEUE); + if (IS_ERR(hw_queue)) { + rc = PTR_ERR(hw_queue); + goto out_put_viommu; + } + + access = iommufd_hw_queue_alloc_phys(cmd, viommu, &base_pa); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_put_viommu; + } + + hw_queue->viommu = viommu; + refcount_inc(&viommu->obj.users); + hw_queue->access = access; + hw_queue->type = cmd->type; + hw_queue->length = cmd->length; + hw_queue->base_addr = cmd->nesting_parent_iova; + + rc = viommu->ops->hw_queue_init_phys(hw_queue, cmd->index, base_pa); + if (rc) + goto out_put_viommu; + + cmd->out_hw_queue_id = hw_queue->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index a565b9e40f4a6..3e724e7f10f02 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -446,22 +446,13 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct iommu_fwspec *fwspec = NULL; struct of_phandle_args iommu_spec; struct mtk_iommu_v1_data *data; int err, idx = 0, larbid, larbidx; struct device_link *link; struct device *larbdev; - /* - * In the deferred case, free the existed fwspec. - * Always initialize the fwspec internally. - */ - if (fwspec) { - iommu_fwspec_free(dev); - fwspec = dev_iommu_fwspec_get(dev); - } - while (!of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells", idx, &iommu_spec)) { @@ -476,6 +467,9 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) idx++; } + if (!fwspec) + return ERR_PTR(-ENODEV); + data = dev_iommu_priv_get(dev); /* Link the consumer device with the smi-larb device(supplier) */ diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 97987cd78da93..e10a68b5ffde1 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -116,6 +116,7 @@ static void of_pci_check_device_ats(struct device *dev, struct device_node *np) int of_iommu_configure(struct device *dev, struct device_node *master_np, const u32 *id) { + bool dev_iommu_present; int err; if (!master_np) @@ -127,6 +128,7 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, mutex_unlock(&iommu_probe_device_lock); return 0; } + dev_iommu_present = dev->iommu; /* * We don't currently walk up the tree looking for a parent IOMMU. @@ -147,8 +149,10 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, err = of_iommu_configure_device(master_np, dev, id); } - if (err) + if (err && dev_iommu_present) iommu_fwspec_free(dev); + else if (err && dev->iommu) + dev_iommu_free(dev); mutex_unlock(&iommu_probe_device_lock); if (!err && dev->bus) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 7f633bb5efef1..69d353e1df843 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -846,7 +846,6 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev, err = ops->of_xlate(dev, args); if (err < 0) { dev_err(dev, "failed to parse SW group ID: %d\n", err); - iommu_fwspec_free(dev); return err; } diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index b85ce6310ddbd..ecd41fb03e5a5 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -48,6 +48,7 @@ struct viommu_dev { u64 pgsize_bitmap; u32 first_domain; u32 last_domain; + u32 identity_domain_id; /* Supported MAP flags */ u32 map_flags; u32 probe_size; @@ -62,7 +63,6 @@ struct viommu_mapping { struct viommu_domain { struct iommu_domain domain; struct viommu_dev *viommu; - struct mutex mutex; /* protects viommu pointer */ unsigned int id; u32 map_flags; @@ -70,7 +70,6 @@ struct viommu_domain { struct rb_root_cached mappings; unsigned long nr_endpoints; - bool bypass; }; struct viommu_endpoint { @@ -97,6 +96,8 @@ struct viommu_event { }; }; +static struct viommu_domain viommu_identity_domain; + #define to_viommu_domain(domain) \ container_of(domain, struct viommu_domain, domain) @@ -305,6 +306,22 @@ static int viommu_send_req_sync(struct viommu_dev *viommu, void *buf, return ret; } +static int viommu_send_attach_req(struct viommu_dev *viommu, struct device *dev, + struct virtio_iommu_req_attach *req) +{ + int ret; + unsigned int i; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + for (i = 0; i < fwspec->num_ids; i++) { + req->endpoint = cpu_to_le32(fwspec->ids[i]); + ret = viommu_send_req_sync(viommu, req, sizeof(*req)); + if (ret) + return ret; + } + return 0; +} + /* * viommu_add_mapping - add a mapping to the internal tree * @@ -637,71 +654,45 @@ static void viommu_event_handler(struct virtqueue *vq) /* IOMMU API */ -static struct iommu_domain *viommu_domain_alloc(unsigned type) +static struct iommu_domain *viommu_domain_alloc_paging(struct device *dev) { + struct viommu_endpoint *vdev = dev_iommu_priv_get(dev); + struct viommu_dev *viommu = vdev->viommu; + unsigned long viommu_page_size; struct viommu_domain *vdomain; - - if (type != IOMMU_DOMAIN_UNMANAGED && - type != IOMMU_DOMAIN_DMA && - type != IOMMU_DOMAIN_IDENTITY) - return NULL; - - vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL); - if (!vdomain) - return NULL; - - mutex_init(&vdomain->mutex); - spin_lock_init(&vdomain->mappings_lock); - vdomain->mappings = RB_ROOT_CACHED; - - return &vdomain->domain; -} - -static int viommu_domain_finalise(struct viommu_endpoint *vdev, - struct iommu_domain *domain) -{ int ret; - unsigned long viommu_page_size; - struct viommu_dev *viommu = vdev->viommu; - struct viommu_domain *vdomain = to_viommu_domain(domain); viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap); if (viommu_page_size > PAGE_SIZE) { dev_err(vdev->dev, "granule 0x%lx larger than system page size 0x%lx\n", viommu_page_size, PAGE_SIZE); - return -ENODEV; + return ERR_PTR(-ENODEV); } - ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain, - viommu->last_domain, GFP_KERNEL); - if (ret < 0) - return ret; + vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL); + if (!vdomain) + return ERR_PTR(-ENOMEM); - vdomain->id = (unsigned int)ret; + spin_lock_init(&vdomain->mappings_lock); + vdomain->mappings = RB_ROOT_CACHED; - domain->pgsize_bitmap = viommu->pgsize_bitmap; - domain->geometry = viommu->geometry; + ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain, + viommu->last_domain, GFP_KERNEL); + if (ret < 0) { + kfree(vdomain); + return ERR_PTR(ret); + } - vdomain->map_flags = viommu->map_flags; - vdomain->viommu = viommu; + vdomain->id = (unsigned int)ret; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - if (virtio_has_feature(viommu->vdev, - VIRTIO_IOMMU_F_BYPASS_CONFIG)) { - vdomain->bypass = true; - return 0; - } + vdomain->domain.pgsize_bitmap = viommu->pgsize_bitmap; + vdomain->domain.geometry = viommu->geometry; - ret = viommu_domain_map_identity(vdev, vdomain); - if (ret) { - ida_free(&viommu->domain_ids, vdomain->id); - vdomain->viommu = NULL; - return ret; - } - } + vdomain->map_flags = viommu->map_flags; + vdomain->viommu = viommu; - return 0; + return &vdomain->domain; } static void viommu_domain_free(struct iommu_domain *domain) @@ -717,29 +708,37 @@ static void viommu_domain_free(struct iommu_domain *domain) kfree(vdomain); } +static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev) +{ + struct viommu_endpoint *vdev = dev_iommu_priv_get(dev); + struct iommu_domain *domain; + int ret; + + if (virtio_has_feature(vdev->viommu->vdev, + VIRTIO_IOMMU_F_BYPASS_CONFIG)) + return &viommu_identity_domain.domain; + + domain = viommu_domain_alloc_paging(dev); + if (IS_ERR(domain)) + return domain; + + ret = viommu_domain_map_identity(vdev, to_viommu_domain(domain)); + if (ret) { + viommu_domain_free(domain); + return ERR_PTR(ret); + } + return domain; +} + static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) { - int i; int ret = 0; struct virtio_iommu_req_attach req; - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct viommu_endpoint *vdev = dev_iommu_priv_get(dev); struct viommu_domain *vdomain = to_viommu_domain(domain); - mutex_lock(&vdomain->mutex); - if (!vdomain->viommu) { - /* - * Properly initialize the domain now that we know which viommu - * owns it. - */ - ret = viommu_domain_finalise(vdev, domain); - } else if (vdomain->viommu != vdev->viommu) { - ret = -EINVAL; - } - mutex_unlock(&vdomain->mutex); - - if (ret) - return ret; + if (vdomain->viommu != vdev->viommu) + return -EINVAL; /* * In the virtio-iommu device, when attaching the endpoint to a new @@ -761,16 +760,9 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) .domain = cpu_to_le32(vdomain->id), }; - if (vdomain->bypass) - req.flags |= cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS); - - for (i = 0; i < fwspec->num_ids; i++) { - req.endpoint = cpu_to_le32(fwspec->ids[i]); - - ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req)); - if (ret) - return ret; - } + ret = viommu_send_attach_req(vdomain->viommu, dev, &req); + if (ret) + return ret; if (!vdomain->nr_endpoints) { /* @@ -788,6 +780,40 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) return 0; } +static int viommu_attach_identity_domain(struct iommu_domain *domain, + struct device *dev) +{ + int ret = 0; + struct virtio_iommu_req_attach req; + struct viommu_endpoint *vdev = dev_iommu_priv_get(dev); + struct viommu_domain *vdomain = to_viommu_domain(domain); + + req = (struct virtio_iommu_req_attach) { + .head.type = VIRTIO_IOMMU_T_ATTACH, + .domain = cpu_to_le32(vdev->viommu->identity_domain_id), + .flags = cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS), + }; + + ret = viommu_send_attach_req(vdev->viommu, dev, &req); + if (ret) + return ret; + + if (vdev->vdomain) + vdev->vdomain->nr_endpoints--; + vdomain->nr_endpoints++; + vdev->vdomain = vdomain; + return 0; +} + +static struct viommu_domain viommu_identity_domain = { + .domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = viommu_attach_identity_domain, + }, + }, +}; + static void viommu_detach_dev(struct viommu_endpoint *vdev) { int i; @@ -1062,7 +1088,8 @@ static bool viommu_capable(struct device *dev, enum iommu_cap cap) static struct iommu_ops viommu_ops = { .capable = viommu_capable, - .domain_alloc = viommu_domain_alloc, + .domain_alloc_identity = viommu_domain_alloc_identity, + .domain_alloc_paging = viommu_domain_alloc_paging, .probe_device = viommu_probe_device, .release_device = viommu_release_device, .device_group = viommu_device_group, @@ -1184,6 +1211,12 @@ static int viommu_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_IOMMU_F_MMIO)) viommu->map_flags |= VIRTIO_IOMMU_MAP_F_MMIO; + /* Reserve an ID to use as the bypass domain */ + if (virtio_has_feature(viommu->vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) { + viommu->identity_domain_id = viommu->first_domain; + viommu->first_domain++; + } + viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap; virtio_device_ready(vdev); diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index c11b9965c4ad9..64658a1c3aa18 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -28,6 +28,7 @@ config ARM_GIC_V2M select ARM_GIC select IRQ_MSI_LIB select PCI_MSI + select IRQ_MSI_IOMMU config GIC_NON_BANKED bool @@ -38,12 +39,14 @@ config ARM_GIC_V3 select PARTITION_PERCPU select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP select HAVE_ARM_SMCCC_DISCOVERY + select IRQ_MSI_IOMMU config ARM_GIC_V3_ITS bool select GENERIC_MSI_IRQ select IRQ_MSI_LIB default ARM_GIC_V3 + select IRQ_MSI_IOMMU config ARM_GIC_V3_ITS_FSL_MC bool @@ -408,6 +411,7 @@ config LS_EXTIRQ config LS_SCFG_MSI def_bool y if SOC_LS1021A || ARCH_LAYERSCAPE + select IRQ_MSI_IOMMU depends on PCI_MSI config PARTITION_PERCPU diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index a1e370d0200f1..34f437207adf7 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -87,9 +87,6 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct v2m_data *v2m = irq_data_get_irq_chip_data(data); phys_addr_t addr = gicv2m_get_msi_addr(v2m, data->hwirq); - msg->address_hi = upper_32_bits(addr); - msg->address_lo = lower_32_bits(addr); - if (v2m->flags & GICV2M_GRAVITON_ADDRESS_ONLY) msg->data = 0; else @@ -97,7 +94,7 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) if (v2m->flags & GICV2M_NEEDS_SPI_OFFSET) msg->data -= v2m->spi_offset; - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, addr); } static struct irq_chip gicv2m_irq_chip = { diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index f30ed281882ff..0115ad6c82593 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1811,17 +1811,10 @@ static u64 its_irq_get_msi_base(struct its_device *its_dev) static void its_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); - struct its_node *its; - u64 addr; - - its = its_dev->its; - addr = its->get_msi_base(its_dev); - - msg->address_lo = lower_32_bits(addr); - msg->address_hi = upper_32_bits(addr); - msg->data = its_get_event_id(d); - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg); + msg->data = its_get_event_id(d); + msi_msg_set_addr(irq_data_get_msi_desc(d), msg, + its_dev->its->get_msi_base(its_dev)); } static int its_irq_set_irqchip_state(struct irq_data *d, diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c index 3fe870f8ee174..a6510128611e0 100644 --- a/drivers/irqchip/irq-gic-v3-mbi.c +++ b/drivers/irqchip/irq-gic-v3-mbi.c @@ -147,22 +147,18 @@ static const struct irq_domain_ops mbi_domain_ops = { static void mbi_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - msg[0].address_hi = upper_32_bits(mbi_phys_base + GICD_SETSPI_NSR); - msg[0].address_lo = lower_32_bits(mbi_phys_base + GICD_SETSPI_NSR); msg[0].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[0], + mbi_phys_base + GICD_SETSPI_NSR); } static void mbi_compose_mbi_msg(struct irq_data *data, struct msi_msg *msg) { mbi_compose_msi_msg(data, msg); - msg[1].address_hi = upper_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); - msg[1].address_lo = lower_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); msg[1].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), &msg[1]); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[1], + mbi_phys_base + GICD_CLRSPI_NSR); } static bool mbi_init_dev_msi_info(struct device *dev, struct irq_domain *domain, diff --git a/drivers/irqchip/irq-ls-scfg-msi.c b/drivers/irqchip/irq-ls-scfg-msi.c index c0e1aafe468c3..3cb80796cc7cc 100644 --- a/drivers/irqchip/irq-ls-scfg-msi.c +++ b/drivers/irqchip/irq-ls-scfg-msi.c @@ -87,8 +87,6 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { struct ls_scfg_msi *msi_data = irq_data_get_irq_chip_data(data); - msg->address_hi = upper_32_bits(msi_data->msiir_addr); - msg->address_lo = lower_32_bits(msi_data->msiir_addr); msg->data = data->hwirq; if (msi_affinity_flag) { @@ -98,7 +96,8 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) msg->data |= cpumask_first(mask); } - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, + msi_data->msiir_addr); } static int ls_scfg_msi_set_affinity(struct irq_data *irq_data, diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 043221363f164..00603c2c4ff0e 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -540,4 +540,37 @@ int pci_max_pasids(struct pci_dev *pdev) return (1 << FIELD_GET(PCI_PASID_CAP_WIDTH, supported)); } EXPORT_SYMBOL_GPL(pci_max_pasids); + +/** + * pci_pasid_status - Check the PASID status + * @pdev: PCI device structure + * + * Returns a negative value when no PASID capability is present. + * Otherwise the value of the control register is returned. + * Status reported are: + * + * PCI_PASID_CTRL_ENABLE - PASID enabled + * PCI_PASID_CTRL_EXEC - Execute permission enabled + * PCI_PASID_CTRL_PRIV - Privileged mode enabled + */ +int pci_pasid_status(struct pci_dev *pdev) +{ + int pasid; + u16 ctrl; + + if (pdev->is_virtfn) + pdev = pci_physfn(pdev); + + pasid = pdev->pasid_cap; + if (!pasid) + return -EINVAL; + + pci_read_config_word(pdev, pasid + PCI_PASID_CTRL, &ctrl); + + ctrl &= PCI_PASID_CTRL_ENABLE | PCI_PASID_CTRL_EXEC | + PCI_PASID_CTRL_PRIV; + + return ctrl; +} +EXPORT_SYMBOL_GPL(pci_pasid_status); #endif /* CONFIG_PCI_PASID */ diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index bb1817bd4ff31..281a8dc3ed497 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -162,9 +162,9 @@ void vfio_df_unbind_iommufd(struct vfio_device_file *df) int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, struct vfio_device_attach_iommufd_pt __user *arg) { - struct vfio_device *device = df->device; struct vfio_device_attach_iommufd_pt attach; - unsigned long minsz; + struct vfio_device *device = df->device; + unsigned long minsz, xend = 0; int ret; minsz = offsetofend(struct vfio_device_attach_iommufd_pt, pt_id); @@ -172,11 +172,34 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, if (copy_from_user(&attach, arg, minsz)) return -EFAULT; - if (attach.argsz < minsz || attach.flags) + if (attach.argsz < minsz) return -EINVAL; + if (attach.flags & ~VFIO_DEVICE_ATTACH_PASID) + return -EINVAL; + + if (attach.flags & VFIO_DEVICE_ATTACH_PASID) { + if (!device->ops->pasid_attach_ioas) + return -EOPNOTSUPP; + xend = offsetofend(struct vfio_device_attach_iommufd_pt, pasid); + } + + if (xend) { + if (attach.argsz < xend) + return -EINVAL; + + if (copy_from_user((void *)&attach + minsz, + (void __user *)arg + minsz, xend - minsz)) + return -EFAULT; + } + mutex_lock(&device->dev_set->lock); - ret = device->ops->attach_ioas(device, &attach.pt_id); + if (attach.flags & VFIO_DEVICE_ATTACH_PASID) + ret = device->ops->pasid_attach_ioas(device, + attach.pasid, + &attach.pt_id); + else + ret = device->ops->attach_ioas(device, &attach.pt_id); if (ret) goto out_unlock; @@ -198,20 +221,41 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, struct vfio_device_detach_iommufd_pt __user *arg) { - struct vfio_device *device = df->device; struct vfio_device_detach_iommufd_pt detach; - unsigned long minsz; + struct vfio_device *device = df->device; + unsigned long minsz, xend = 0; minsz = offsetofend(struct vfio_device_detach_iommufd_pt, flags); if (copy_from_user(&detach, arg, minsz)) return -EFAULT; - if (detach.argsz < minsz || detach.flags) + if (detach.argsz < minsz) return -EINVAL; + if (detach.flags & ~VFIO_DEVICE_DETACH_PASID) + return -EINVAL; + + if (detach.flags & VFIO_DEVICE_DETACH_PASID) { + if (!device->ops->pasid_detach_ioas) + return -EOPNOTSUPP; + xend = offsetofend(struct vfio_device_detach_iommufd_pt, pasid); + } + + if (xend) { + if (detach.argsz < xend) + return -EINVAL; + + if (copy_from_user((void *)&detach + minsz, + (void __user *)arg + minsz, xend - minsz)) + return -EFAULT; + } + mutex_lock(&device->dev_set->lock); - device->ops->detach_ioas(device); + if (detach.flags & VFIO_DEVICE_DETACH_PASID) + device->ops->pasid_detach_ioas(device, detach.pasid); + else + device->ops->detach_ioas(device); mutex_unlock(&device->dev_set->lock); return 0; diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 516294fd901be..c8c3a2d53f86e 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -119,16 +119,24 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, if (IS_ERR(idev)) return PTR_ERR(idev); vdev->iommufd_device = idev; + ida_init(&vdev->pasids); return 0; } EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind); void vfio_iommufd_physical_unbind(struct vfio_device *vdev) { + int pasid; + lockdep_assert_held(&vdev->dev_set->lock); + while ((pasid = ida_find_first(&vdev->pasids)) >= 0) { + iommufd_device_detach(vdev->iommufd_device, pasid); + ida_free(&vdev->pasids, pasid); + } + if (vdev->iommufd_attached) { - iommufd_device_detach(vdev->iommufd_device); + iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID); vdev->iommufd_attached = false; } iommufd_device_unbind(vdev->iommufd_device); @@ -146,9 +154,11 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) return -EINVAL; if (vdev->iommufd_attached) - rc = iommufd_device_replace(vdev->iommufd_device, pt_id); + rc = iommufd_device_replace(vdev->iommufd_device, + IOMMU_NO_PASID, pt_id); else - rc = iommufd_device_attach(vdev->iommufd_device, pt_id); + rc = iommufd_device_attach(vdev->iommufd_device, + IOMMU_NO_PASID, pt_id); if (rc) return rc; vdev->iommufd_attached = true; @@ -163,11 +173,53 @@ void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev) if (WARN_ON(!vdev->iommufd_device) || !vdev->iommufd_attached) return; - iommufd_device_detach(vdev->iommufd_device); + iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID); vdev->iommufd_attached = false; } EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas); +int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev, + u32 pasid, u32 *pt_id) +{ + int rc; + + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device)) + return -EINVAL; + + if (ida_exists(&vdev->pasids, pasid)) + return iommufd_device_replace(vdev->iommufd_device, + pasid, pt_id); + + rc = ida_alloc_range(&vdev->pasids, pasid, pasid, GFP_KERNEL); + if (rc < 0) + return rc; + + rc = iommufd_device_attach(vdev->iommufd_device, pasid, pt_id); + if (rc) + ida_free(&vdev->pasids, pasid); + + return rc; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_attach_ioas); + +void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, + u32 pasid) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device)) + return; + + if (!ida_exists(&vdev->pasids, pasid)) + return; + + iommufd_device_detach(vdev->iommufd_device, pasid); + ida_free(&vdev->pasids, pasid); +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_detach_ioas); + /* * The emulated standard ops mean that vfio_device is going to use the * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this diff --git a/drivers/vfio/pci/nvgrace-gpu/Kconfig b/drivers/vfio/pci/nvgrace-gpu/Kconfig index a7f624b37e410..d5773bbd22f5e 100644 --- a/drivers/vfio/pci/nvgrace-gpu/Kconfig +++ b/drivers/vfio/pci/nvgrace-gpu/Kconfig @@ -1,8 +1,19 @@ # SPDX-License-Identifier: GPL-2.0-only +config NVGRACE_EGM + tristate "EGM driver for NVIDIA Grace Hopper and Blackwell Superchip" + depends on ARM64 || (COMPILE_TEST && 64BIT) + help + Extended GPU Memory (EGM) support for the GPU in the NVIDIA Grace + based chips required to avail the CPU memory as additional + cross-node/cross-socket memory for GPU using KVM/qemu. + + If you don't know what to do here, say N. + config NVGRACE_GPU_VFIO_PCI tristate "VFIO support for the GPU in the NVIDIA Grace Hopper Superchip" depends on ARM64 || (COMPILE_TEST && 64BIT) select VFIO_PCI_CORE + select NVGRACE_EGM help VFIO support for the GPU in the NVIDIA Grace Hopper Superchip is required to assign the GPU device to userspace using KVM/qemu/etc. diff --git a/drivers/vfio/pci/nvgrace-gpu/Makefile b/drivers/vfio/pci/nvgrace-gpu/Makefile index 3ca8c187897a9..c99b04a94e770 100644 --- a/drivers/vfio/pci/nvgrace-gpu/Makefile +++ b/drivers/vfio/pci/nvgrace-gpu/Makefile @@ -1,3 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu-vfio-pci.o nvgrace-gpu-vfio-pci-y := main.o + +obj-$(CONFIG_NVGRACE_EGM) += nvgrace-egm.o +nvgrace-egm-y := egm.o diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c new file mode 100644 index 0000000000000..6cdcec03d03f6 --- /dev/null +++ b/drivers/vfio/pci/nvgrace-gpu/egm.c @@ -0,0 +1,568 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include +#include +#include +#include + +#ifdef CONFIG_MEMORY_FAILURE +#include +#include +#endif + +#define MAX_EGM_NODES 256 + +struct gpu_node { + struct list_head list; + struct pci_dev *pdev; +}; + +struct egm_region { + struct list_head list; + int egmpxm; + atomic_t open_count; + phys_addr_t egmphys; + size_t egmlength; + struct device device; + struct cdev cdev; + struct list_head gpus; + DECLARE_HASHTABLE(htbl, 0x10); +#ifdef CONFIG_MEMORY_FAILURE + struct pfn_address_space pfn_address_space; +#endif +}; + +struct h_node { + unsigned long mem_offset; + struct hlist_node node; +}; + +static dev_t dev; +static struct class *class; +static struct list_head egm_list; + +#ifdef CONFIG_MEMORY_FAILURE +static void +nvgrace_egm_pfn_memory_failure(struct pfn_address_space *pfn_space, + unsigned long pfn) +{ + struct egm_region *region = + container_of(pfn_space, struct egm_region, pfn_address_space); + unsigned long mem_offset = PFN_PHYS(pfn - pfn_space->node.start); + struct h_node *ecc; + + if (mem_offset >= region->egmlength) + return; + + /* + * MM has called to notify a poisoned page. Track that in the hastable. + */ + ecc = (struct h_node *)(vzalloc(sizeof(struct h_node))); + if (!ecc) + return; /* Silently fail on allocation error */ + ecc->mem_offset = mem_offset; + hash_add(region->htbl, &ecc->node, ecc->mem_offset); +} + +struct pfn_address_space_ops nvgrace_egm_pas_ops = { + .failure = nvgrace_egm_pfn_memory_failure, +}; + +static int +nvgrace_egm_register_pfn_range(struct egm_region *region, + struct vm_area_struct *vma) +{ + unsigned long nr_pages = region->egmlength >> PAGE_SHIFT; + + region->pfn_address_space.node.start = vma->vm_pgoff; + region->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1; + region->pfn_address_space.ops = &nvgrace_egm_pas_ops; + region->pfn_address_space.mapping = vma->vm_file->f_mapping; + + return register_pfn_address_space(®ion->pfn_address_space); +} + +static vm_fault_t nvgrace_egm_fault(struct vm_fault *vmf) +{ + unsigned long mem_offset = PFN_PHYS(vmf->pgoff - vmf->vma->vm_pgoff); + struct egm_region *region = vmf->vma->vm_file->private_data; + struct h_node *cur; + + /* + * Check if the page is poisoned. + */ + if (mem_offset < region->egmlength) { + hash_for_each_possible(region->htbl, cur, node, mem_offset) { + if (cur->mem_offset == mem_offset) + return VM_FAULT_HWPOISON; + } + } + + return VM_FAULT_ERROR; +} + +static const struct vm_operations_struct nvgrace_egm_mmap_ops = { + .fault = nvgrace_egm_fault, +}; + +#endif + +static int nvgrace_egm_open(struct inode *inode, struct file *file) +{ + void *memaddr; + struct egm_region *region = container_of(inode->i_cdev, + struct egm_region, cdev); + + if (atomic_inc_return(®ion->open_count) > 1) + return 0; + + memaddr = memremap(region->egmphys, region->egmlength, MEMREMAP_WB); + if (!memaddr) { + atomic_dec(®ion->open_count); + return -EINVAL; + } + + memset((u8 *)memaddr, 0, region->egmlength); + memunmap(memaddr); + file->private_data = region; + + return 0; +} + +static int nvgrace_egm_release(struct inode *inode, struct file *file) +{ + struct egm_region *region = container_of(inode->i_cdev, + struct egm_region, cdev); + + if (atomic_dec_and_test(®ion->open_count)) { +#ifdef CONFIG_MEMORY_FAILURE + unregister_pfn_address_space(®ion->pfn_address_space); +#endif + file->private_data = NULL; + } + + return 0; +} + +static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret = 0; + struct egm_region *region = file->private_data; + + if (!region) + return -EINVAL; + + ret = remap_pfn_range(vma, vma->vm_start, + PHYS_PFN(region->egmphys), + (vma->vm_end - vma->vm_start), + vma->vm_page_prot); + if (ret) + return ret; + + vma->vm_pgoff = PHYS_PFN(region->egmphys); + +#ifdef CONFIG_MEMORY_FAILURE + vma->vm_ops = &nvgrace_egm_mmap_ops; + + ret = nvgrace_egm_register_pfn_range(region, vma); +#endif + return ret; +} + +static long nvgrace_egm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + unsigned long minsz = offsetofend(struct egm_bad_pages_list, count); + struct egm_bad_pages_list info; + void __user *uarg = (void __user *)arg; + struct egm_region *region = file->private_data; + + if (copy_from_user(&info, uarg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (!region) + return -EINVAL; + + switch (cmd) { + case EGM_BAD_PAGES_LIST: + { + int ret; + unsigned long bad_page_struct_size = sizeof(struct egm_bad_pages_info); + struct egm_bad_pages_info tmp; + struct h_node *cur_page; + struct hlist_node *tmp_node; + unsigned long bkt; + int count = 0, index = 0; + + hash_for_each_safe(region->htbl, bkt, tmp_node, cur_page, node) + count++; + + if (info.argsz < (minsz + count * bad_page_struct_size)) { + info.argsz = minsz + count * bad_page_struct_size; + info.count = 0; + goto done; + } else { + hash_for_each_safe(region->htbl, bkt, tmp_node, cur_page, node) { + /* + * This check fails if there was an ECC error + * after the usermode app read the count of + * bad pages through this ioctl. + */ + if (minsz + index * bad_page_struct_size >= info.argsz) { + info.argsz = minsz + index * bad_page_struct_size; + info.count = index; + goto done; + } + + tmp.offset = cur_page->mem_offset; + tmp.size = PAGE_SIZE; + + ret = copy_to_user(uarg + minsz + + index * bad_page_struct_size, + &tmp, bad_page_struct_size); + if (ret) + return -EFAULT; + index++; + } + + info.count = index; + } + break; + } + default: + return -EINVAL; + } + +done: + return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0; +} + +static const struct file_operations file_ops = { + .owner = THIS_MODULE, + .open = nvgrace_egm_open, + .release = nvgrace_egm_release, + .mmap = nvgrace_egm_mmap, + .unlocked_ioctl = nvgrace_egm_ioctl, +}; + +static int setup_egm_chardev(struct egm_region *region) +{ + int ret = 0; + + device_initialize(®ion->device); + + /* + * Use the proximity domain number as the device minor + * number. So the EGM corresponding to node X would be + * /dev/egmX. + */ + region->device.devt = MKDEV(MAJOR(dev), region->egmpxm); + region->device.class = class; + cdev_init(®ion->cdev, &file_ops); + region->cdev.owner = THIS_MODULE; + + ret = dev_set_name(®ion->device, "egm%d", region->egmpxm); + if (ret) + return ret; + + ret = cdev_device_add(®ion->cdev, ®ion->device); + + return ret; +} + +static void destroy_egm_chardev(struct egm_region *region) +{ + cdev_device_del(®ion->cdev, ®ion->device); +} + +static int +nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys, + u64 *pegmlength, u64 *pegmpxm) +{ + int ret; + + /* + * The memory information is present in the system ACPI tables as DSD + * properties nvidia,egm-base-pa and nvidia,egmm-size. + */ + ret = device_property_read_u64(&pdev->dev, "nvidia,egm-size", + pegmlength); + if (ret) + return ret; + + if (overflows_type(*pegmlength, size_t)) + return -EOVERFLOW; + + ret = device_property_read_u64(&pdev->dev, "nvidia,egm-base-pa", + pegmphys); + if (ret) + return ret; + + if (overflows_type(*pegmphys, phys_addr_t)) + return -EOVERFLOW; + + ret = device_property_read_u64(&pdev->dev, "nvidia,egm-pxm", + pegmpxm); + if (ret) + return ret; + + if (overflows_type(*pegmpxm, int)) + return -EOVERFLOW; + + return 0; +} + +static void nvgrace_egm_fetch_bad_pages(struct pci_dev *pdev, + struct egm_region *region) +{ + u64 retiredpagesphys, count; + void *memaddr; + int index; + + if (device_property_read_u64(&pdev->dev, + "nvidia,egm-retired-pages-data-base", + &retiredpagesphys)) + return; + + /* Catch firmware bug and avoid a crash */ + if (WARN_ON_ONCE(retiredpagesphys == 0)) + return; + + memaddr = memremap(retiredpagesphys, PAGE_SIZE, MEMREMAP_WB); + if (!memaddr) + return; + + count = *(u64 *)memaddr; + + for (index = 0; index < count; index++) { + struct h_node *retired_page; + + /* + * Since the EGM is linearly mapped, the offset in the + * carveout is the same offset in the VM system memory. + * + * Calculate the offset to communicate to the usermode + * apps. + */ + retired_page = (struct h_node *)(vzalloc(sizeof(struct h_node))); + if (!retired_page) + continue; /* Skip this entry on allocation failure */ + retired_page->mem_offset = *((u64 *)memaddr + index + 1) - + region->egmphys; + hash_add(region->htbl, &retired_page->node, retired_page->mem_offset); + } + + memunmap(memaddr); +} + +static ssize_t gpu_devices_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct egm_region *region = + container_of(dev, struct egm_region, device); + struct gpu_node *node, *temp_node; + int len = 0; + + list_for_each_entry_safe(node, temp_node, ®ion->gpus, list) { + struct pci_dev *pdev = node->pdev; + + len += sysfs_emit_at(buf, len, "%04x:%02x:%02x.%x\n", + pci_domain_nr(pdev->bus), + pdev->bus->number, + PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn)); + } + + return len; +} + +static DEVICE_ATTR_RO(gpu_devices); + +static ssize_t egm_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct egm_region *region = + container_of(dev, struct egm_region, device); + return sysfs_emit(buf, "0x%lx\n", region->egmlength); +} + +static DEVICE_ATTR_RO(egm_size); + +static struct attribute *attrs[] = { + &dev_attr_gpu_devices.attr, + &dev_attr_egm_size.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static int add_gpu(struct egm_region *region, struct pci_dev *pdev) +{ + struct gpu_node *node; + + node = kvzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + node->pdev = pdev; + + list_add_tail(&node->list, ®ion->gpus); + return 0; +} + +static void remove_gpu(struct egm_region *region, struct pci_dev *pdev) +{ + struct gpu_node *node, *tmp; + + list_for_each_entry_safe(node, tmp, ®ion->gpus, list) { + if (node->pdev == pdev) { + list_del(&node->list); + kvfree(node); + } + } +} + +int register_egm_node(struct pci_dev *pdev) +{ + struct egm_region *region = NULL; + u64 egmphys, egmlength, egmpxm; + int ret; + + ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength, &egmpxm); + if (ret) + return ret; + + /* Check if region already exists */ + list_for_each_entry(region, &egm_list, list) { + if (region->egmphys == egmphys) { + /* Add GPU to existing region */ + return add_gpu(region, pdev); + } + } + + /* Create new region */ + region = kvzalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return -ENOMEM; + + region->egmphys = egmphys; + region->egmlength = egmlength; + region->egmpxm = egmpxm; + + hash_init(region->htbl); + INIT_LIST_HEAD(®ion->gpus); + + atomic_set(®ion->open_count, 0); + + nvgrace_egm_fetch_bad_pages(pdev, region); + + ret = setup_egm_chardev(region); + if (ret) + goto err_free_region; + + list_add_tail(®ion->list, &egm_list); + + ret = sysfs_create_group(®ion->device.kobj, &attr_group); + if (ret) + goto err_remove_from_list; + + ret = add_gpu(region, pdev); + if (ret) + goto err_remove_sysfs; + + return 0; + +err_remove_sysfs: + sysfs_remove_group(®ion->device.kobj, &attr_group); +err_remove_from_list: + list_del(®ion->list); + destroy_egm_chardev(region); +err_free_region: + kfree(region); + return ret; +} +EXPORT_SYMBOL_GPL(register_egm_node); + +void unregister_egm_node(struct pci_dev *pdev) +{ + struct egm_region *region, *temp_region; + struct h_node *cur_page; + unsigned long bkt; + struct hlist_node *temp_node; + u64 egmphys, egmlength, egmpxm; + int ret; + + ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength, &egmpxm); + if (ret) + return; + + list_for_each_entry_safe(region, temp_region, &egm_list, list) { + if (egmpxm == region->egmpxm) { + remove_gpu(region, pdev); + if (!list_empty(®ion->gpus)) + break; + + hash_for_each_safe(region->htbl, bkt, temp_node, cur_page, node) { + hash_del(&cur_page->node); + vfree(cur_page); + } + + sysfs_remove_group(®ion->device.kobj, &attr_group); + destroy_egm_chardev(region); + list_del(®ion->list); + kfree(region); + } + } +} +EXPORT_SYMBOL_GPL(unregister_egm_node); + +static char *egm_devnode(const struct device *device, umode_t *mode) +{ + if (mode) + *mode = 0600; + + return NULL; +} + +static int __init nvgrace_egm_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&dev, + 0, MAX_EGM_NODES, "egm"); + if (ret < 0) + return ret; + + class = class_create("egm"); + if (IS_ERR(class)) { + unregister_chrdev_region(dev, MAX_EGM_NODES); + return PTR_ERR(class); + } + + class->devnode = egm_devnode; + + INIT_LIST_HEAD(&egm_list); + + return 0; +} + +static void __exit nvgrace_egm_cleanup(void) +{ + class_destroy(class); + unregister_chrdev_region(dev, MAX_EGM_NODES); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ankit Agrawal "); +MODULE_DESCRIPTION("NVGRACE EGM - Helper module of NVGRACE GPU to support Extended GPU Memory"); + +module_init(nvgrace_egm_init); +module_exit(nvgrace_egm_cleanup); diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e5ac39c4cc6b6..749f12b51f025 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,19 @@ #include #include #include +#include +#include + +#ifdef CONFIG_MEMORY_FAILURE +#include +#include +#include +#endif + +struct h_node { + unsigned long mem_offset; + struct hlist_node node; +}; /* * The device memory usable to the workloads running in the VM is cached @@ -47,6 +60,10 @@ struct mem_region { void *memaddr; void __iomem *ioaddr; }; /* Base virtual address of the region */ +#ifdef CONFIG_MEMORY_FAILURE + struct pfn_address_space pfn_address_space; + DECLARE_HASHTABLE(htbl, 8); +#endif }; struct nvgrace_gpu_pci_core_device { @@ -58,7 +75,101 @@ struct nvgrace_gpu_pci_core_device { /* Lock to control device memory kernel mapping */ struct mutex remap_lock; bool has_mig_hw_bug; + int egm_node; +}; + +static bool egm_enabled; + +#ifdef CONFIG_MEMORY_FAILURE +static void +nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space, + unsigned long pfn) +{ + struct mem_region *region = container_of(pfn_space, + struct mem_region, pfn_address_space); + unsigned long mem_offset = pfn - pfn_space->node.start; + struct h_node *ecc; + + if (mem_offset >= region->memlength) + return; + + /* + * MM has called to notify a poisoned page. Track that in the hastable. + */ + ecc = (struct h_node *)(vzalloc(sizeof(struct h_node))); + ecc->mem_offset = mem_offset; + hash_add(region->htbl, &ecc->node, ecc->mem_offset); +} + +struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = { + .failure = nvgrace_gpu_vfio_pci_pfn_memory_failure, +}; + +static int +nvgrace_gpu_vfio_pci_register_pfn_range(struct mem_region *region, + struct vm_area_struct *vma) +{ + unsigned long nr_pages; + int ret = 0; + + nr_pages = region->memlength >> PAGE_SHIFT; + + region->pfn_address_space.node.start = vma->vm_pgoff; + region->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1; + region->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops; + region->pfn_address_space.mapping = vma->vm_file->f_mapping; + + ret = register_pfn_address_space(®ion->pfn_address_space); + + return ret; +} + +extern struct vfio_device *vfio_device_from_file(struct file *file); + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff; + struct vfio_device *core_vdev; + struct nvgrace_gpu_pci_core_device *nvdev; + struct h_node *cur; + + if (!(vmf->vma->vm_file)) + goto error_exit; + + core_vdev = vfio_device_from_file(vmf->vma->vm_file); + + if (!core_vdev) + goto error_exit; + + nvdev = container_of(core_vdev, + struct nvgrace_gpu_pci_core_device, + core_device.vdev); + + /* + * Check if the page is poisoned. + */ + if (mem_offset < (nvdev->resmem.memlength >> PAGE_SHIFT)) { + hash_for_each_possible(nvdev->resmem.htbl, cur, node, mem_offset) { + if (cur->mem_offset == mem_offset) + return VM_FAULT_HWPOISON; + } + } + + if (mem_offset < (nvdev->usemem.memlength >> PAGE_SHIFT)) { + hash_for_each_possible(nvdev->usemem.htbl, cur, node, mem_offset) { + if (cur->mem_offset == mem_offset) + return VM_FAULT_HWPOISON; + } + } + +error_exit: + return VM_FAULT_ERROR; +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, }; +#endif static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) { @@ -127,6 +238,11 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) mutex_destroy(&nvdev->remap_lock); +#ifdef CONFIG_MEMORY_FAILURE + if (nvdev->resmem.memlength) + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); + unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); +#endif vfio_pci_core_close_device(core_vdev); } @@ -202,7 +318,17 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_pgoff = start_pfn; - return 0; +#ifdef CONFIG_MEMORY_FAILURE + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; + + if (index == VFIO_PCI_BAR2_REGION_INDEX) { + WARN_ON_ONCE(!nvdev->has_mig_hw_bug); + ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->resmem, vma); + } else + ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->usemem, vma); +#endif + + return ret; } static long @@ -736,7 +862,7 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, if (ret) return ret; - if (*pmemphys > type_max(phys_addr_t)) + if (overflows_type(*pmemphys, phys_addr_t)) return -EOVERFLOW; ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size", @@ -744,7 +870,7 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, if (ret) return ret; - if (*pmemlength > type_max(size_t)) + if (overflows_type(*pmemlength, size_t)) return -EOVERFLOW; /* @@ -757,6 +883,13 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, return ret; } +static int +nvgrace_gpu_has_egm_property(struct pci_dev *pdev, u64 *pegmpxm) +{ + return device_property_read_u64(&pdev->dev, "nvidia,egm-pxm", + pegmpxm); +} + static int nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, struct nvgrace_gpu_pci_core_device *nvdev, @@ -935,6 +1068,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops; struct nvgrace_gpu_pci_core_device *nvdev; u64 memphys, memlength; + u64 egmpxm; int ret; ret = nvgrace_gpu_wait_device_ready(pdev); @@ -942,9 +1076,14 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, return ret; ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); - if (!ret) + if (!ret) { ops = &nvgrace_gpu_pci_ops; + ret = nvgrace_gpu_has_egm_property(pdev, &egmpxm); + if (!ret) + egm_enabled = true; + } + nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev, &pdev->dev, ops); if (IS_ERR(nvdev)) @@ -963,14 +1102,34 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, memphys, memlength); if (ret) goto out_put_vdev; + + if (egm_enabled) { + ret = register_egm_node(pdev); + if (ret) + goto out_put_vdev; + + nvdev->egm_node = egmpxm; + } + } ret = vfio_pci_core_register_device(&nvdev->core_device); if (ret) - goto out_put_vdev; + goto out_egm_unreg; + +#ifdef CONFIG_MEMORY_FAILURE + /* + * Initialize the hashtable tracking the poisoned pages. + */ + hash_init(nvdev->resmem.htbl); + hash_init(nvdev->usemem.htbl); +#endif return ret; +out_egm_unreg: + if (egm_enabled) + unregister_egm_node(pdev); out_put_vdev: vfio_put_device(&nvdev->core_device.vdev); return ret; @@ -979,6 +1138,28 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, static void nvgrace_gpu_remove(struct pci_dev *pdev) { struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); + +#ifdef CONFIG_MEMORY_FAILURE + struct h_node *cur; + unsigned long bkt; + struct hlist_node *tmp_node; + + hash_for_each_safe(nvdev->resmem.htbl, bkt, tmp_node, cur, node) { + hash_del(&cur->node); + vfree(cur); + } + + hash_for_each_safe(nvdev->usemem.htbl, bkt, tmp_node, cur, node) { + hash_del(&cur->node); + vfree(cur); + } +#endif + + if (egm_enabled) + unregister_egm_node(pdev); vfio_pci_core_unregister_device(core_device); vfio_put_device(&core_device->vdev); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index e727941f589de..6f7ae7e5b7b07 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -144,6 +144,8 @@ static const struct vfio_device_ops vfio_pci_ops = { .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, .detach_ioas = vfio_iommufd_physical_detach_ioas, + .pasid_attach_ioas = vfio_iommufd_physical_pasid_attach_ioas, + .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 1fd261efc582d..a9081ffd7a8ca 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1365,7 +1365,7 @@ const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -static struct vfio_device *vfio_device_from_file(struct file *file) +struct vfio_device *vfio_device_from_file(struct file *file) { struct vfio_device_file *df = file->private_data; @@ -1373,6 +1373,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file) return NULL; return df->device; } +EXPORT_SYMBOL_GPL(vfio_device_from_file); /** * vfio_file_is_valid - True if the file is valid vfio file diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6a..718f9b1b91afa 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -257,6 +257,7 @@ struct ida { int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); +int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max); /** * ida_alloc() - Allocate an unused ID. @@ -328,4 +329,14 @@ static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } + +static inline bool ida_exists(struct ida *ida, unsigned int id) +{ + return ida_find_first_range(ida, id, id) == id; +} + +static inline int ida_find_first(struct ida *ida) +{ + return ida_find_first_range(ida, 0, ~0); +} #endif /* __IDR_H__ */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9058e0ce5a190..5d743cdec881f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -14,6 +14,7 @@ #include #include #include +#include #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) @@ -41,9 +42,12 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; +struct iommu_dma_msi_cookie; struct iommu_fault_param; struct iommufd_ctx; struct iommufd_viommu; +struct msi_desc; +struct msi_msg; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -163,6 +167,15 @@ struct iommu_domain_geometry { bool force_aperture; /* DMA only allowed in mappable range? */ }; +enum iommu_domain_cookie_type { + IOMMU_COOKIE_NONE, + IOMMU_COOKIE_DMA_IOVA, + IOMMU_COOKIE_DMA_MSI, + IOMMU_COOKIE_FAULT_HANDLER, + IOMMU_COOKIE_SVA, + IOMMU_COOKIE_IOMMUFD, +}; + /* Domain feature flags */ #define __IOMMU_DOMAIN_PAGING (1U << 0) /* Support for iommu_map/unmap */ #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API @@ -209,15 +222,18 @@ struct iommu_domain_geometry { struct iommu_domain { unsigned type; + enum iommu_domain_cookie_type cookie_type; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; - struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); - void *fault_data; - union { + + union { /* cookie */ + struct iommu_dma_cookie *iova_cookie; + struct iommu_dma_msi_cookie *msi_cookie; + struct iommufd_hw_pagetable *iommufd_hwpt; struct { iommu_fault_handler_t handler; void *handler_token; @@ -547,16 +563,57 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, return 0; } +/** + * __iommu_copy_struct_to_user - Report iommu driver specific user space data + * @dst_data: Pointer to a struct iommu_user_data for user space data location + * @src_data: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @data_type: The data type of the @src_data. Must match with @dst_data.type + * @data_len: Length of current user data structure, i.e. sizeof(struct _src) + * @min_len: Initial length of user data structure for backward compatibility. + * This should be offsetofend using the last member in the user data + * struct that was initially added to include/uapi/linux/iommufd.h + */ +static inline int +__iommu_copy_struct_to_user(const struct iommu_user_data *dst_data, + void *src_data, unsigned int data_type, + size_t data_len, size_t min_len) +{ + if (WARN_ON(!dst_data || !src_data)) + return -EINVAL; + if (dst_data->type != data_type) + return -EINVAL; + if (dst_data->len < min_len || data_len < dst_data->len) + return -EINVAL; + return copy_struct_to_user(dst_data->uptr, dst_data->len, src_data, + data_len, NULL); +} + +/** + * iommu_copy_struct_to_user - Report iommu driver specific user space data + * @user_data: Pointer to a struct iommu_user_data for user space data location + * @ksrc: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @data_type: The data type of the @ksrc. Must match with @user_data->type + * @min_last: The last member of the data structure @ksrc points in the initial + * version. + * Return 0 for success, otherwise -error. + */ +#define iommu_copy_struct_to_user(user_data, ksrc, data_type, min_last) \ + __iommu_copy_struct_to_user(user_data, ksrc, data_type, sizeof(*ksrc), \ + offsetofend(typeof(*ksrc), min_last)) + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability * @hw_info: report iommu hardware information. The data buffer returned by this * op is allocated in the iommu driver and freed by the caller after - * use. The information type is one of enum iommu_hw_info_type defined - * in include/uapi/linux/iommufd.h. - * @domain_alloc: allocate and return an iommu domain if success. Otherwise - * NULL is returned. The domain is not fully initialized until - * the caller iommu_domain_alloc() returns. + * use. @type can input a requested type and output a supported type. + * Driver should reject an unsupported data @type input + * @domain_alloc: Do not use in new drivers + * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to + * use identity_domain instead. This should only be used + * if dynamic logic is necessary. * @domain_alloc_paging_flags: Allocate an iommu domain corresponding to the * input parameters as defined in * include/uapi/linux/iommufd.h. The @user_data can be @@ -587,14 +644,16 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting * @default_domain_ops: the default ops for domains - * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind - * the @dev, as the set of virtualization resources shared/passed - * to user space IOMMU instance. And associate it with a nesting - * @parent_domain. The @viommu_type must be defined in the header - * include/uapi/linux/iommufd.h - * It is required to call iommufd_viommu_alloc() helper for - * a bundled allocation of the core and the driver structures, - * using the given @ictx pointer. + * @get_viommu_size: Get the size of a driver-level vIOMMU structure for a given + * @dev corresponding to @viommu_type. Driver should return 0 + * if vIOMMU isn't supported accordingly. It is required for + * driver to use the VIOMMU_STRUCT_SIZE macro to sanitize the + * driver-level vIOMMU structure related to the core one + * @viommu_init: Init the driver-level struct of an iommufd_viommu on a physical + * IOMMU instance @viommu->iommu_dev, as the set of virtualization + * resources shared/passed to user space IOMMU instance. Associate + * it with a nesting @parent_domain. It is required for driver to + * set @viommu->ops pointing to its own viommu_ops * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops * @identity_domain: An always available, always attachable identity @@ -611,10 +670,14 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, */ struct iommu_ops { bool (*capable)(struct device *dev, enum iommu_cap); - void *(*hw_info)(struct device *dev, u32 *length, u32 *type); + void *(*hw_info)(struct device *dev, u32 *length, + enum iommu_hw_info_type *type); /* Domain allocation and freeing by the iommu driver */ +#if IS_ENABLED(CONFIG_FSL_PAMU) struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); +#endif + struct iommu_domain *(*domain_alloc_identity)(struct device *dev); struct iommu_domain *(*domain_alloc_paging_flags)( struct device *dev, u32 flags, const struct iommu_user_data *user_data); @@ -645,9 +708,11 @@ struct iommu_ops { int (*def_domain_type)(struct device *dev); - struct iommufd_viommu *(*viommu_alloc)( - struct device *dev, struct iommu_domain *parent_domain, - struct iommufd_ctx *ictx, unsigned int viommu_type); + size_t (*get_viommu_size)(struct device *dev, + enum iommu_viommu_type viommu_type); + int (*viommu_init)(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data); const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; @@ -696,8 +761,6 @@ struct iommu_ops { * specific mechanisms. * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. - * @get_msi_mapping_domain: Return the related iommu_domain that should hold the - * MSI cookie and accept mapping(s). */ struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); @@ -727,8 +790,6 @@ struct iommu_domain_ops { unsigned long quirks); void (*free)(struct iommu_domain *domain); - struct iommu_domain * - (*get_msi_mapping_domain)(struct iommu_domain *domain); }; /** @@ -1083,7 +1144,6 @@ struct iommu_mm_data { }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); -void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) @@ -1394,10 +1454,6 @@ static inline int iommu_fwspec_init(struct device *dev, return -ENODEV; } -static inline void iommu_fwspec_free(struct device *dev) -{ -} - static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { @@ -1474,6 +1530,18 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) static inline void iommu_free_global_pasid(ioasid_t pasid) {} #endif /* CONFIG_IOMMU_API */ +#ifdef CONFIG_IRQ_MSI_IOMMU +#ifdef CONFIG_IOMMU_API +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); +#else +static inline int iommu_dma_prepare_msi(struct msi_desc *desc, + phys_addr_t msi_addr) +{ + return 0; +} +#endif /* CONFIG_IOMMU_API */ +#endif /* CONFIG_IRQ_MSI_IOMMU */ + #if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) void iommu_group_mutex_assert(struct device *dev); #else @@ -1507,32 +1575,12 @@ static inline void iommu_debugfs_setup(void) {} #endif #ifdef CONFIG_IOMMU_DMA -#include - int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); - -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg); - #else /* CONFIG_IOMMU_DMA */ - -struct msi_desc; -struct msi_msg; - static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) { return -ENODEV; } - -static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) -{ - return 0; -} - -static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -} - #endif /* CONFIG_IOMMU_DMA */ /* diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 11110c7492009..e3a0cd47384d0 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -8,9 +8,11 @@ #include #include +#include #include #include #include +#include struct device; struct file; @@ -34,6 +36,8 @@ enum iommufd_object_type { IOMMUFD_OBJ_FAULT, IOMMUFD_OBJ_VIOMMU, IOMMUFD_OBJ_VDEVICE, + IOMMUFD_OBJ_VEVENTQ, + IOMMUFD_OBJ_HW_QUEUE, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -52,9 +56,11 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); void iommufd_device_unbind(struct iommufd_device *idev); -int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); -int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id); -void iommufd_device_detach(struct iommufd_device *idev); +int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id); +int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id); +void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); u32 iommufd_device_to_id(struct iommufd_device *idev); @@ -93,8 +99,39 @@ struct iommufd_viommu { const struct iommufd_viommu_ops *ops; struct xarray vdevs; + struct list_head veventqs; + struct rw_semaphore veventqs_rwsem; - unsigned int type; + enum iommu_viommu_type type; +}; + +struct iommufd_vdevice { + struct iommufd_object obj; + struct iommufd_viommu *viommu; + struct device *dev; + + /* + * Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID of + * AMD IOMMU, and vRID of Intel VT-d + */ + u64 virt_id; + + /* Clean up all driver-specific parts of an iommufd_vdevice */ + void (*destroy)(struct iommufd_vdevice *vdev); +}; + +struct iommufd_hw_queue { + struct iommufd_object obj; + struct iommufd_viommu *viommu; + struct iommufd_access *access; + + u64 base_addr; /* in guest physical address space */ + size_t length; + + enum iommu_hw_queue_type type; + + /* Clean up all driver-specific parts of an iommufd_hw_queue */ + void (*destroy)(struct iommufd_hw_queue *hw_queue); }; /** @@ -113,6 +150,30 @@ struct iommufd_viommu { * array->entry_num to report the number of handled requests. * The data structure of the array entry must be defined in * include/uapi/linux/iommufd.h + * @vdevice_size: Size of the driver-defined vDEVICE structure per this vIOMMU + * @vdevice_init: Initialize the driver-level structure of a vDEVICE object, or + * related HW procedure. @vdev is already initialized by iommufd + * core: vdev->dev and vdev->viommu pointers; vdev->id carries a + * per-vIOMMU virtual ID (refer to struct iommu_vdevice_alloc in + * include/uapi/linux/iommufd.h) + * If driver has a deinit function to revert what vdevice_init op + * does, it should set it to the @vdev->destroy function pointer + * @get_hw_queue_size: Get the size of a driver-defined HW queue structure for a + * given @viommu corresponding to @queue_type. Driver should + * return 0 if HW queue aren't supported accordingly. It is + * required for driver to use the HW_QUEUE_STRUCT_SIZE macro + * to sanitize the driver-level HW queue structure related + * to the core one + * @hw_queue_init_phys: Initialize the driver-level structure of a HW queue that + * is initialized with its core-level structure that holds + * all the info about a guest queue memory. + * Driver providing this op indicates that HW accesses the + * guest queue memory via physical addresses. + * @index carries the logical HW QUEUE ID per vIOMMU in a + * guest VM, for a multi-queue model. @base_addr_pa carries + * the physical location of the guest queue + * If driver has a deinit function to revert what this op + * does, it should set it to the @hw_queue->destroy pointer */ struct iommufd_viommu_ops { void (*destroy)(struct iommufd_viommu *viommu); @@ -121,6 +182,13 @@ struct iommufd_viommu_ops { const struct iommu_user_data *user_data); int (*cache_invalidate)(struct iommufd_viommu *viommu, struct iommu_user_data_array *array); + const size_t vdevice_size; + int (*vdevice_init)(struct iommufd_vdevice *vdev); + size_t (*get_hw_queue_size)(struct iommufd_viommu *viommu, + enum iommu_hw_queue_type queue_type); + /* AMD's HW will add hw_queue_init simply using @hw_queue->base_addr */ + int (*hw_queue_init_phys)(struct iommufd_hw_queue *hw_queue, u32 index, + phys_addr_t base_addr_pa); }; #if IS_ENABLED(CONFIG_IOMMUFD) @@ -164,8 +232,9 @@ static inline void iommufd_access_unpin_pages(struct iommufd_access *access, { } -static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, - void *data, size_t len, unsigned int flags) +static inline int iommufd_access_rw(struct iommufd_access *access, + unsigned long iova, void *data, size_t len, + unsigned int flags) { return -EOPNOTSUPP; } @@ -182,17 +251,47 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) #endif /* CONFIG_IOMMUFD */ #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE) -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type); +int _iommufd_object_depend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended); +void _iommufd_object_undepend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended); +int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, struct iommufd_object *owner, + phys_addr_t mmio_addr, size_t length, + unsigned long *offset); +void _iommufd_destroy_mmap(struct iommufd_ctx *ictx, + struct iommufd_object *owner, unsigned long offset); struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); +int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, unsigned long *vdev_id); +int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, void *event_data, + size_t data_len); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ -static inline struct iommufd_object * -_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, - enum iommufd_object_type type) +static inline int _iommufd_object_depend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended) +{ + return -EOPNOTSUPP; +} + +static inline void +_iommufd_object_undepend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended) +{ +} + +static inline int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, + struct iommufd_object *owner, + phys_addr_t mmio_addr, size_t length, + unsigned long *offset) +{ + return -EOPNOTSUPP; +} + +static inline void _iommufd_destroy_mmap(struct iommufd_ctx *ictx, + struct iommufd_object *owner, + unsigned long offset) { - return ERR_PTR(-EOPNOTSUPP); } static inline struct device * @@ -200,23 +299,89 @@ iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) { return NULL; } + +static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, + unsigned long *vdev_id) +{ + return -ENOENT; +} + +static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, + void *event_data, size_t data_len) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ +#define VIOMMU_STRUCT_SIZE(drv_struct, member) \ + (sizeof(drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) + \ + BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_viommu, \ + ((drv_struct *)NULL)->member))) + +#define VDEVICE_STRUCT_SIZE(drv_struct, member) \ + (sizeof(drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) + \ + BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_vdevice, \ + ((drv_struct *)NULL)->member))) + +#define HW_QUEUE_STRUCT_SIZE(drv_struct, member) \ + (sizeof(drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) + \ + BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_hw_queue, \ + ((drv_struct *)NULL)->member))) + /* - * Helpers for IOMMU driver to allocate driver structures that will be freed by - * the iommufd core. The free op will be called prior to freeing the memory. + * Helpers for IOMMU driver to build/destroy a dependency between two sibling + * structures created by one of the allocators above */ -#define iommufd_viommu_alloc(ictx, drv_struct, member, viommu_ops) \ +#define iommufd_hw_queue_depend(dependent, depended, member) \ ({ \ - drv_struct *ret; \ + int ret = -EINVAL; \ \ - static_assert(__same_type(struct iommufd_viommu, \ - ((drv_struct *)NULL)->member)); \ - static_assert(offsetof(drv_struct, member.obj) == 0); \ - ret = (drv_struct *)_iommufd_object_alloc( \ - ictx, sizeof(drv_struct), IOMMUFD_OBJ_VIOMMU); \ - if (!IS_ERR(ret)) \ - ret->member.ops = viommu_ops; \ + static_assert(__same_type(struct iommufd_hw_queue, \ + dependent->member)); \ + static_assert(__same_type(typeof(*dependent), *depended)); \ + if (!WARN_ON_ONCE(dependent->member.viommu != \ + depended->member.viommu)) \ + ret = _iommufd_object_depend(&dependent->member.obj, \ + &depended->member.obj); \ ret; \ }) + +#define iommufd_hw_queue_undepend(dependent, depended, member) \ + ({ \ + static_assert(__same_type(struct iommufd_hw_queue, \ + dependent->member)); \ + static_assert(__same_type(typeof(*dependent), *depended)); \ + WARN_ON_ONCE(dependent->member.viommu != \ + depended->member.viommu); \ + _iommufd_object_undepend(&dependent->member.obj, \ + &depended->member.obj); \ + }) + +/* + * Helpers for IOMMU driver to alloc/destroy an mmapable area for a structure. + * + * To support an mmappable MMIO region, kernel driver must first register it to + * iommufd core to allocate an @offset, during a driver-structure initialization + * (e.g. viommu_init op). Then, it should report to user space this @offset and + * the @length of the MMIO region for mmap syscall. + */ +static inline int iommufd_viommu_alloc_mmap(struct iommufd_viommu *viommu, + phys_addr_t mmio_addr, + size_t length, + unsigned long *offset) +{ + return _iommufd_alloc_mmap(viommu->ictx, &viommu->obj, mmio_addr, + length, offset); +} + +static inline void iommufd_viommu_destroy_mmap(struct iommufd_viommu *viommu, + unsigned long offset) +{ + _iommufd_destroy_mmap(viommu->ictx, &viommu->obj, offset); +} #endif diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 0000000000000..9a579960972aa --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include + +struct pfn_address_space; + +struct pfn_address_space_ops { + void (*failure)(struct pfn_address_space *pfn_space, unsigned long pfn); +}; + +struct pfn_address_space { + struct interval_tree_node node; + const struct pfn_address_space_ops *ops; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 72dbec7ed901e..318d33b54949c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4033,6 +4033,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; diff --git a/include/linux/msi.h b/include/linux/msi.h index 59a421fc42bf0..862cfec024763 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -165,6 +165,10 @@ struct msi_desc_data { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor + * @iommu_msi_iova: Optional shifted IOVA from the IOMMU to override the msi_addr. + * Only used if iommu_msi_shift != 0 + * @iommu_msi_shift: Indicates how many bits of the original address should be + * preserved when using iommu_msi_iova. * @sysfs_attr: Pointer to sysfs device attribute * * @write_msi_msg: Callback that may be called when the MSI message @@ -183,7 +187,8 @@ struct msi_desc { struct msi_msg msg; struct irq_affinity_desc *affinity; #ifdef CONFIG_IRQ_MSI_IOMMU - const void *iommu_cookie; + u64 iommu_msi_iova : 58; + u64 iommu_msi_shift : 6; #endif #ifdef CONFIG_SYSFS struct device_attribute *sysfs_attrs; @@ -284,28 +289,42 @@ struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid, #define msi_desc_to_dev(desc) ((desc)->dev) -#ifdef CONFIG_IRQ_MSI_IOMMU -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) +static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_iova, + unsigned int msi_shift) { - return desc->iommu_cookie; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ - desc->iommu_cookie = iommu_cookie; -} -#else -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) -{ - return NULL; +#ifdef CONFIG_IRQ_MSI_IOMMU + desc->iommu_msi_iova = msi_iova >> msi_shift; + desc->iommu_msi_shift = msi_shift; +#endif } -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) +/** + * msi_msg_set_addr() - Set MSI address in an MSI message + * + * @desc: MSI descriptor that may carry an IOVA base address for MSI via @iommu_msi_iova/shift + * @msg: Target MSI message to set its address_hi and address_lo + * @msi_addr: Physical address to set the MSI message + * + * Notes: + * - Override @msi_addr using the IOVA base address in the @desc if @iommu_msi_shift is set + * - Otherwise, simply set @msi_addr to @msg + */ +static inline void msi_msg_set_addr(struct msi_desc *desc, struct msi_msg *msg, + phys_addr_t msi_addr) { -} +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; + + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msi_addr & ((1 << desc->iommu_msi_shift) - 1)); + return; + } #endif + msg->address_hi = upper_32_bits(msi_addr); + msg->address_lo = lower_32_bits(msi_addr); +} int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); diff --git a/include/linux/nvgrace-egm.h b/include/linux/nvgrace-egm.h new file mode 100644 index 0000000000000..4bbd383a02732 --- /dev/null +++ b/include/linux/nvgrace-egm.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _NVGRACE_EGM_H +#define _NVGRACE_EGM_H + +int register_egm_node(struct pci_dev *pdev); +void unregister_egm_node(struct pci_dev *pdev); + +#endif /* _NVGRACE_EGM_H */ diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index 0e8b74e63767a..75c6c86cf09dc 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -42,6 +42,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features); void pci_disable_pasid(struct pci_dev *pdev); int pci_pasid_features(struct pci_dev *pdev); int pci_max_pasids(struct pci_dev *pdev); +int pci_pasid_status(struct pci_dev *pdev); #else /* CONFIG_PCI_PASID */ static inline int pci_enable_pasid(struct pci_dev *pdev, int features) { return -EINVAL; } @@ -50,6 +51,8 @@ static inline int pci_pasid_features(struct pci_dev *pdev) { return -EINVAL; } static inline int pci_max_pasids(struct pci_dev *pdev) { return -EINVAL; } +static inline int pci_pasid_status(struct pci_dev *pdev) +{ return -EINVAL; } #endif /* CONFIG_PCI_PASID */ #endif /* LINUX_PCI_ATS_H */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 000a6cab2d318..707b00772ce1f 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -67,6 +67,7 @@ struct vfio_device { struct inode *inode; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; + struct ida pasids; u8 iommufd_attached:1; #endif u8 cdev_opened:1; @@ -91,6 +92,8 @@ struct vfio_device { * bound iommufd. Undo in unbind_iommufd if @detach_ioas is not * called. * @detach_ioas: Opposite of attach_ioas + * @pasid_attach_ioas: The pasid variation of attach_ioas + * @pasid_detach_ioas: Opposite of pasid_attach_ioas * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -115,6 +118,9 @@ struct vfio_device_ops { void (*unbind_iommufd)(struct vfio_device *vdev); int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); void (*detach_ioas)(struct vfio_device *vdev); + int (*pasid_attach_ioas)(struct vfio_device *vdev, u32 pasid, + u32 *pt_id); + void (*pasid_detach_ioas)(struct vfio_device *vdev, u32 pasid); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -139,6 +145,10 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, void vfio_iommufd_physical_unbind(struct vfio_device *vdev); int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev); +int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev, + u32 pasid, u32 *pt_id); +void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, + u32 pasid); int vfio_iommufd_emulated_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); @@ -166,6 +176,10 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) #define vfio_iommufd_physical_detach_ioas \ ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_physical_pasid_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 pasid, u32 *pt_id)) NULL) +#define vfio_iommufd_physical_pasid_detach_ioas \ + ((void (*)(struct vfio_device *vdev, u32 pasid)) NULL) #define vfio_iommufd_emulated_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ u32 *out_device_id)) NULL) diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index e5f7ee0864e78..bb1127f3e5b7e 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -373,6 +373,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/include/uapi/linux/egm.h b/include/uapi/linux/egm.h new file mode 100644 index 0000000000000..8a808e45c2052 --- /dev/null +++ b/include/uapi/linux/egm.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _UAPIEGM_H +#define _UAPIEGM_H + +#define EGM_TYPE ('E') + +struct egm_bad_pages_info { + __aligned_u64 offset; + __aligned_u64 size; +}; + +struct egm_bad_pages_list { + __u32 argsz; + /* out */ + __u32 count; + /* out */ + struct egm_bad_pages_info bad_pages[]; +}; + +#define EGM_BAD_PAGES_LIST _IO(EGM_TYPE, 100) + +#endif /* _UAPIEGM_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 78747b24bd0fb..83204a5d5b95d 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -55,6 +55,8 @@ enum { IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, + IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, + IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94, }; /** @@ -392,6 +394,9 @@ struct iommu_vfio_ioas { * Any domain attached to the non-PASID part of the * device must also be flagged, otherwise attaching a * PASID will blocked. + * For the user that wants to attach PASID, ioas is + * not recommended for both the non-PASID part + * and PASID part of the device. * If IOMMU does not support PASID it will return * error (-EOPNOTSUPP). */ @@ -586,17 +591,44 @@ struct iommu_hw_info_arm_smmuv3 { __u32 aidr; }; +/** + * struct iommu_hw_info_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Hardware + * Information (IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV) + * + * @flags: Must be 0 + * @version: Version number for the CMDQ-V HW for PARAM bits[03:00] + * @log2vcmdqs: Log2 of the total number of VCMDQs for PARAM bits[07:04] + * @log2vsids: Log2 of the total number of SID replacements for PARAM bits[15:12] + * @__reserved: Must be 0 + * + * VMM can use these fields directly in its emulated global PARAM register. Note + * that only one Virtual Interface (VINTF) should be exposed to a VM, i.e. PARAM + * bits[11:08] should be set to 0 for log2 of the total number of VINTFs. + */ +struct iommu_hw_info_tegra241_cmdqv { + __u32 flags; + __u8 version; + __u8 log2vcmdqs; + __u8 log2vsids; + __u8 __reserved; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types - * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware + * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware * info + * @IOMMU_HW_INFO_TYPE_DEFAULT: Input to request for a default type * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type + * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, + IOMMU_HW_INFO_TYPE_DEFAULT = 0, IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, + IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3, }; /** @@ -608,9 +640,26 @@ enum iommu_hw_info_type { * IOMMU_HWPT_GET_DIRTY_BITMAP * IOMMU_HWPT_SET_DIRTY_TRACKING * + * @IOMMU_HW_CAP_PCI_PASID_EXEC: Execute Permission Supported, user ignores it + * when the struct + * iommu_hw_info::out_max_pasid_log2 is zero. + * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it + * when the struct + * iommu_hw_info::out_max_pasid_log2 is zero. */ enum iommufd_hw_capabilities { IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, + IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, + IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, +}; + +/** + * enum iommufd_hw_info_flags - Flags for iommu_hw_info + * @IOMMU_HW_INFO_FLAG_INPUT_TYPE: If set, @in_data_type carries an input type + * for user space to request for a specific info + */ +enum iommufd_hw_info_flags { + IOMMU_HW_INFO_FLAG_INPUT_TYPE = 1 << 0, }; /** @@ -622,10 +671,19 @@ enum iommufd_hw_capabilities { * data that kernel supports * @data_uptr: User pointer to a user-space buffer used by the kernel to fill * the iommu type specific hardware information data + * @in_data_type: This shares the same field with @out_data_type, making it be + * a bidirectional field. When IOMMU_HW_INFO_FLAG_INPUT_TYPE is + * set, an input type carried via this @in_data_type field will + * be valid, requesting for the info data to the given type. If + * IOMMU_HW_INFO_FLAG_INPUT_TYPE is unset, any input value will + * be seen as IOMMU_HW_INFO_TYPE_DEFAULT * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. * @out_capabilities: Output the generic iommu capability info type as defined * in the enum iommu_hw_capabilities. + * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support. + * PCI devices turn to out_capabilities to check if the + * specific capabilities is supported or not. * @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -648,8 +706,12 @@ struct iommu_hw_info { __u32 dev_id; __u32 data_len; __aligned_u64 data_uptr; - __u32 out_data_type; - __u32 __reserved; + union { + __u32 in_data_type; + __u32 out_data_type; + }; + __u8 out_max_pasid_log2; + __u8 __reserved[3]; __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) @@ -935,10 +997,29 @@ struct iommu_fault_alloc { * enum iommu_viommu_type - Virtual IOMMU Type * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type + * @IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) enabled ARM SMMUv3 type */ enum iommu_viommu_type { IOMMU_VIOMMU_TYPE_DEFAULT = 0, IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2, +}; + +/** + * struct iommu_viommu_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Virtual Interface + * (IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV) + * @out_vintf_mmap_offset: mmap offset argument for VINTF's page0 + * @out_vintf_mmap_length: mmap length argument for VINTF's page0 + * + * Both @out_vintf_mmap_offset and @out_vintf_mmap_length are reported by kernel + * for user space to mmap the VINTF page0 from the host physical address space + * to the guest physical address space so that a guest kernel can directly R/W + * access to the VINTF page0 in order to control its virtual command queues. + */ +struct iommu_viommu_tegra241_cmdqv { + __aligned_u64 out_vintf_mmap_offset; + __aligned_u64 out_vintf_mmap_length; }; /** @@ -949,6 +1030,9 @@ enum iommu_viommu_type { * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU * @hwpt_id: ID of a nesting parent HWPT to associate to * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * @data_len: Length of the type specific data + * @__reserved: Must be 0 + * @data_uptr: User pointer to a driver-specific virtual IOMMU data * * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's * virtualization support that is a security-isolated slice of the real IOMMU HW @@ -969,6 +1053,9 @@ struct iommu_viommu_alloc { __u32 dev_id; __u32 hwpt_id; __u32 out_viommu_id; + __u32 data_len; + __u32 __reserved; + __aligned_u64 data_uptr; }; #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) @@ -979,7 +1066,7 @@ struct iommu_viommu_alloc { * @dev_id: The physical device to allocate a virtual instance on the vIOMMU * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID - * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table + * of AMD IOMMU, and vRID of Intel VT-d * * Allocate a virtual device instance (for a physical device) against a vIOMMU. * This instance holds the device's information (related to its vIOMMU) in a VM. @@ -1014,4 +1101,180 @@ struct iommu_ioas_change_process { #define IOMMU_IOAS_CHANGE_PROCESS \ _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS) +/** + * enum iommu_veventq_flag - flag for struct iommufd_vevent_header + * @IOMMU_VEVENTQ_FLAG_LOST_EVENTS: vEVENTQ has lost vEVENTs + */ +enum iommu_veventq_flag { + IOMMU_VEVENTQ_FLAG_LOST_EVENTS = (1U << 0), +}; + +/** + * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status + * @flags: Combination of enum iommu_veventq_flag + * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of + * [0, INT_MAX] where the following index of INT_MAX is 0 + * + * Each iommufd_vevent_header reports a sequence index of the following vEVENT: + * ------------------------------------------------------------------------- + * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN | + * ------------------------------------------------------------------------- + * And this sequence index is expected to be monotonic to the sequence index of + * the previous vEVENT. If two adjacent sequence indexes has a delta larger than + * 1, it means that delta - 1 number of vEVENTs has lost, e.g. two lost vEVENTs: + * ------------------------------------------------------------------------- + * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... | + * ------------------------------------------------------------------------- + * If a vEVENT lost at the tail of the vEVENTQ and there is no following vEVENT + * providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS header + * would be added to the tail, and no data would follow this header: + * --------------------------------------------------------------------------- + * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} | + * --------------------------------------------------------------------------- + */ +struct iommufd_vevent_header { + __u32 flags; + __u32 sequence; +}; + +/** + * enum iommu_veventq_type - Virtual Event Queue Type + * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue + * @IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension IRQ + */ +enum iommu_veventq_type { + IOMMU_VEVENTQ_TYPE_DEFAULT = 0, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1, + IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV = 2, +}; + +/** + * struct iommu_vevent_arm_smmuv3 - ARM SMMUv3 Virtual Event + * (IOMMU_VEVENTQ_TYPE_ARM_SMMUV3) + * @evt: 256-bit ARM SMMUv3 Event record, little-endian. + * Reported event records: (Refer to "7.3 Event records" in SMMUv3 HW Spec) + * - 0x04 C_BAD_STE + * - 0x06 F_STREAM_DISABLED + * - 0x08 C_BAD_SUBSTREAMID + * - 0x0a C_BAD_CD + * - 0x10 F_TRANSLATION + * - 0x11 F_ADDR_SIZE + * - 0x12 F_ACCESS + * - 0x13 F_PERMISSION + * + * StreamID field reports a virtual device ID. To receive a virtual event for a + * device, a vDEVICE must be allocated via IOMMU_VDEVICE_ALLOC. + */ +struct iommu_vevent_arm_smmuv3 { + __aligned_le64 evt[4]; +}; + +/** + * struct iommu_vevent_tegra241_cmdqv - Tegra241 CMDQV IRQ + * (IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV) + * @lvcmdq_err_map: 128-bit logical vcmdq error map, little-endian. + * (Refer to register LVCMDQ_ERR_MAPs per VINTF ) + * + * The 128-bit register value from HW exclusively reflect the error bits for a + * Virtual Interface represented by a vIOMMU object. Read and report directly. + */ +struct iommu_vevent_tegra241_cmdqv { + __aligned_le64 lvcmdq_err_map[2]; +}; + +/** + * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC) + * @size: sizeof(struct iommu_veventq_alloc) + * @flags: Must be 0 + * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with + * @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type + * @veventq_depth: Maximum number of events in the vEVENTQ + * @out_veventq_id: The ID of the new vEVENTQ + * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the + * successfully returned fd after using it + * @__reserved: Must be 0 + * + * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU + * can have multiple FDs for different types, but is confined to one per @type. + * User space should open the @out_veventq_fd to read vEVENTs out of a vEVENTQ, + * if there are vEVENTs available. A vEVENTQ will lose events due to overflow, + * if the number of the vEVENTs hits @veventq_depth. + * + * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by + * a type-specific data structure, in a normal case: + * ------------------------------------------------------------- + * || header0 | data0 | header1 | data1 | ... | headerN | dataN || + * ------------------------------------------------------------- + * unless a tailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to + * struct iommufd_vevent_header). + */ +struct iommu_veventq_alloc { + __u32 size; + __u32 flags; + __u32 viommu_id; + __u32 type; + __u32 veventq_depth; + __u32 out_veventq_id; + __u32 out_veventq_fd; + __u32 __reserved; +}; +#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC) + +/** + * enum iommu_hw_queue_type - HW Queue Type + * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use + * @IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) Virtual Command Queue (VCMDQ) + */ +enum iommu_hw_queue_type { + IOMMU_HW_QUEUE_TYPE_DEFAULT = 0, + /* + * TEGRA241_CMDQV requirements (otherwise, allocation will fail) + * - alloc starts from the lowest @index=0 in ascending order + * - destroy starts from the last allocated @index in descending order + * - @base_addr must be aligned to @length in bytes and mapped in IOAS + * - @length must be a power of 2, with a minimum 32 bytes and a maximum + * 2 ^ idr[1].CMDQS * 16 bytes (use GET_HW_INFO call to read idr[1] + * from struct iommu_hw_info_arm_smmuv3) + * - suggest to back the queue memory with contiguous physical pages or + * a single huge page with alignment of the queue size, and limit the + * emulated vSMMU's IDR1.CMDQS to log2(huge page size / 16 bytes) + */ + IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV = 1, +}; + +/** + * struct iommu_hw_queue_alloc - ioctl(IOMMU_HW_QUEUE_ALLOC) + * @size: sizeof(struct iommu_hw_queue_alloc) + * @flags: Must be 0 + * @viommu_id: Virtual IOMMU ID to associate the HW queue with + * @type: One of enum iommu_hw_queue_type + * @index: The logical index to the HW queue per virtual IOMMU for a multi-queue + * model + * @out_hw_queue_id: The ID of the new HW queue + * @nesting_parent_iova: Base address of the queue memory in the guest physical + * address space + * @length: Length of the queue memory + * + * Allocate a HW queue object for a vIOMMU-specific HW-accelerated queue, which + * allows HW to access a guest queue memory described using @nesting_parent_iova + * and @length. + * + * A vIOMMU can allocate multiple queues, but it must use a different @index per + * type to separate each allocation, e.g:: + * + * Type1 HW queue0, Type1 HW queue1, Type2 HW queue0, ... + */ +struct iommu_hw_queue_alloc { + __u32 size; + __u32 flags; + __u32 viommu_id; + __u32 type; + __u32 index; + __u32 out_hw_queue_id; + __aligned_u64 nesting_parent_iova; + __aligned_u64 length; +}; +#define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC) #endif diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index c8dbf8219c4fc..6899da70b929f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -931,29 +931,34 @@ struct vfio_device_bind_iommufd { * VFIO_DEVICE_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 19, * struct vfio_device_attach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for attach. * @pt_id: Input the target id which can represent an ioas or a hwpt * allocated via iommufd subsystem. * Output the input ioas id or the attached hwpt id which could * be the specified hwpt itself or a hwpt automatically created * for the specified ioas by kernel during the attachment. + * @pasid: The pasid to be attached, only meaningful when + * VFIO_DEVICE_ATTACH_PASID is set in @flags * * Associate the device with an address space within the bound iommufd. * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only * allowed on cdev fds. * - * If a vfio device is currently attached to a valid hw_pagetable, without doing - * a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl - * passing in another hw_pagetable (hwpt) id is allowed. This action, also known - * as a hw_pagetable replacement, will replace the device's currently attached - * hw_pagetable with a new hw_pagetable corresponding to the given pt_id. + * If a vfio device or a pasid of this device is currently attached to a valid + * hw_pagetable (hwpt), without doing a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second + * VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl passing in another hwpt id is allowed. + * This action, also known as a hw_pagetable replacement, will replace the + * currently attached hwpt of the device or the pasid of this device with a new + * hwpt corresponding to the given pt_id. * * Return: 0 on success, -errno on failure. */ struct vfio_device_attach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_ATTACH_PASID (1 << 0) __u32 pt_id; + __u32 pasid; }; #define VFIO_DEVICE_ATTACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 19) @@ -962,17 +967,21 @@ struct vfio_device_attach_iommufd_pt { * VFIO_DEVICE_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 20, * struct vfio_device_detach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for detach. + * @pasid: The pasid to be detached, only meaningful when + * VFIO_DEVICE_DETACH_PASID is set in @flags * - * Remove the association of the device and its current associated address - * space. After it, the device should be in a blocking DMA state. This is only - * allowed on cdev fds. + * Remove the association of the device or a pasid of the device and its current + * associated address space. After it, the device or the pasid should be in a + * blocking DMA state. This is only allowed on cdev fds. * * Return: 0 on success, -errno on failure. */ struct vfio_device_detach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_DETACH_PASID (1 << 0) + __u32 pasid; }; #define VFIO_DEVICE_DETACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 20) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 875f25ed6f710..e126e6ce510e4 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -96,6 +96,7 @@ config GENERIC_MSI_IRQ bool select IRQ_DOMAIN_HIERARCHY +# irqchip drivers should select this if they call iommu_dma_prepare_msi() config IRQ_MSI_IOMMU bool diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 13bef2462e94b..05bedb0f1919d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1039,27 +1039,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_X86_KERNEL_IBT -static unsigned long get_entry_ip(unsigned long fentry_ip) +static inline unsigned long get_entry_ip(unsigned long fentry_ip) { - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. - */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) +#ifdef CONFIG_X86_KERNEL_IBT + if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; +#endif return fentry_ip; } -#else -#define get_entry_ip(fentry_ip) fentry_ip -#endif BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { diff --git a/lib/idr.c b/lib/idr.c index da36054c3ca02..e2adc457abb4b 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -476,6 +476,73 @@ int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, } EXPORT_SYMBOL(ida_alloc_range); +/** + * ida_find_first_range - Get the lowest used ID. + * @ida: IDA handle. + * @min: Lowest ID to get. + * @max: Highest ID to get. + * + * Get the lowest used ID between @min and @max, inclusive. The returned + * ID will not exceed %INT_MAX, even if @max is larger. + * + * Context: Any context. Takes and releases the xa_lock. + * Return: The lowest used ID, or errno if no used ID is found. + */ +int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max) +{ + unsigned long index = min / IDA_BITMAP_BITS; + unsigned int offset = min % IDA_BITMAP_BITS; + unsigned long *addr, size, bit; + unsigned long tmp = 0; + unsigned long flags; + void *entry; + int ret; + + if ((int)min < 0) + return -EINVAL; + if ((int)max < 0) + max = INT_MAX; + + xa_lock_irqsave(&ida->xa, flags); + + entry = xa_find(&ida->xa, &index, max / IDA_BITMAP_BITS, XA_PRESENT); + if (!entry) { + ret = -ENOENT; + goto err_unlock; + } + + if (index > min / IDA_BITMAP_BITS) + offset = 0; + if (index * IDA_BITMAP_BITS + offset > max) { + ret = -ENOENT; + goto err_unlock; + } + + if (xa_is_value(entry)) { + tmp = xa_to_value(entry); + addr = &tmp; + size = BITS_PER_XA_VALUE; + } else { + addr = ((struct ida_bitmap *)entry)->bitmap; + size = IDA_BITMAP_BITS; + } + + bit = find_next_bit(addr, size, offset); + + xa_unlock_irqrestore(&ida->xa, flags); + + if (bit == size || + index * IDA_BITMAP_BITS + bit > max) + return -ENOENT; + + return index * IDA_BITMAP_BITS + bit; + +err_unlock: + xa_unlock_irqrestore(&ida->xa, flags); + return ret; +} +EXPORT_SYMBOL(ida_find_first_range); + /** * ida_free() - Release an allocated ID. * @ida: IDA handle. diff --git a/lib/test_ida.c b/lib/test_ida.c index c80155a1956d2..63078f8dc13f5 100644 --- a/lib/test_ida.c +++ b/lib/test_ida.c @@ -189,6 +189,75 @@ static void ida_check_bad_free(struct ida *ida) IDA_BUG_ON(ida, !ida_is_empty(ida)); } +/* + * Check ida_find_first_range() and varriants. + */ +static void ida_check_find_first(struct ida *ida) +{ + /* IDA is empty; all of the below should be not exist */ + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, ida_exists(ida, 3)); + IDA_BUG_ON(ida, ida_exists(ida, 63)); + IDA_BUG_ON(ida, ida_exists(ida, 1023)); + IDA_BUG_ON(ida, ida_exists(ida, (1 << 20) - 1)); + + /* IDA contains a single value entry */ + IDA_BUG_ON(ida, ida_alloc_min(ida, 3, GFP_KERNEL) != 3); + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, !ida_exists(ida, 3)); + IDA_BUG_ON(ida, ida_exists(ida, 63)); + IDA_BUG_ON(ida, ida_exists(ida, 1023)); + IDA_BUG_ON(ida, ida_exists(ida, (1 << 20) - 1)); + + IDA_BUG_ON(ida, ida_alloc_min(ida, 63, GFP_KERNEL) != 63); + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, !ida_exists(ida, 3)); + IDA_BUG_ON(ida, !ida_exists(ida, 63)); + IDA_BUG_ON(ida, ida_exists(ida, 1023)); + IDA_BUG_ON(ida, ida_exists(ida, (1 << 20) - 1)); + + /* IDA contains a single bitmap */ + IDA_BUG_ON(ida, ida_alloc_min(ida, 1023, GFP_KERNEL) != 1023); + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, !ida_exists(ida, 3)); + IDA_BUG_ON(ida, !ida_exists(ida, 63)); + IDA_BUG_ON(ida, !ida_exists(ida, 1023)); + IDA_BUG_ON(ida, ida_exists(ida, (1 << 20) - 1)); + + /* IDA contains a tree */ + IDA_BUG_ON(ida, ida_alloc_min(ida, (1 << 20) - 1, GFP_KERNEL) != (1 << 20) - 1); + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, !ida_exists(ida, 3)); + IDA_BUG_ON(ida, !ida_exists(ida, 63)); + IDA_BUG_ON(ida, !ida_exists(ida, 1023)); + IDA_BUG_ON(ida, !ida_exists(ida, (1 << 20) - 1)); + + /* Now try to find first */ + IDA_BUG_ON(ida, ida_find_first(ida) != 3); + IDA_BUG_ON(ida, ida_find_first_range(ida, -1, 2) != -EINVAL); + IDA_BUG_ON(ida, ida_find_first_range(ida, 0, 2) != -ENOENT); // no used ID + IDA_BUG_ON(ida, ida_find_first_range(ida, 0, 3) != 3); + IDA_BUG_ON(ida, ida_find_first_range(ida, 1, 3) != 3); + IDA_BUG_ON(ida, ida_find_first_range(ida, 3, 3) != 3); + IDA_BUG_ON(ida, ida_find_first_range(ida, 2, 4) != 3); + IDA_BUG_ON(ida, ida_find_first_range(ida, 4, 3) != -ENOENT); // min > max, fail + IDA_BUG_ON(ida, ida_find_first_range(ida, 4, 60) != -ENOENT); // no used ID + IDA_BUG_ON(ida, ida_find_first_range(ida, 4, 64) != 63); + IDA_BUG_ON(ida, ida_find_first_range(ida, 63, 63) != 63); + IDA_BUG_ON(ida, ida_find_first_range(ida, 64, 1026) != 1023); + IDA_BUG_ON(ida, ida_find_first_range(ida, 1023, 1023) != 1023); + IDA_BUG_ON(ida, ida_find_first_range(ida, 1023, (1 << 20) - 1) != 1023); + IDA_BUG_ON(ida, ida_find_first_range(ida, 1024, (1 << 20) - 1) != (1 << 20) - 1); + IDA_BUG_ON(ida, ida_find_first_range(ida, (1 << 20), INT_MAX) != -ENOENT); + + ida_free(ida, 3); + ida_free(ida, 63); + ida_free(ida, 1023); + ida_free(ida, (1 << 20) - 1); + + IDA_BUG_ON(ida, !ida_is_empty(ida)); +} + static DEFINE_IDA(ida); static int ida_checks(void) @@ -202,6 +271,7 @@ static int ida_checks(void) ida_check_max(&ida); ida_check_conv(&ida); ida_check_bad_free(&ida); + ida_check_find_first(&ida); printk("IDA: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run != tests_passed) ? 0 : -EINVAL; diff --git a/mm/Kconfig b/mm/Kconfig index 1b501db064172..8b17b8533cc1c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -810,6 +810,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select MEMORY_ISOLATION + select INTERVAL_TREE select RAS help Enables code to recover from some memory failures on systems diff --git a/mm/gup.c b/mm/gup.c index 4ededc1133583..c2a6619612d8d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1667,7 +1667,7 @@ int fixup_user_fault(struct mm_struct *mm, } if (ret & VM_FAULT_ERROR) { - int err = vm_fault_to_errno(ret, 0); + int err = vm_fault_to_errno(ret, FOLL_HWPOISON); if (err) return err; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b04fb434b6cf1..9bb055277466e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -60,6 +61,7 @@ #include #include #include +#include #include "swap.h" #include "internal.h" #include "ras/ras_event.h" @@ -154,6 +156,10 @@ static const struct ctl_table memory_failure_table[] = { } }; +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; + +static DEFINE_MUTEX(pfn_space_lock); + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -441,13 +447,22 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * not much we can do. We just print a message and ignore otherwise. */ +#define FSDAX_INVALID_PGOFF ULONG_MAX + /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * + * Notice: @pgoff is used when: + * a. @p is a fsdax page and a filesystem with a memory failure handler + * has claimed the memory_failure event. + * b. pgoff is not backed by struct page. + * In all other cases, page->index and page->mapping are sufficient + * for mapping the page back to its corresponding user virtual address. */ static void __add_to_kill(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long addr) + unsigned long ksm_addr, pgoff_t pgoff) { struct to_kill *tk; @@ -457,11 +472,20 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p, return; } - tk->addr = addr; - if (is_zone_device_page(p)) - tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); - else - tk->size_shift = folio_shift(page_folio(p)); + /* Check for pgoff not backed by struct page */ + if (!(pfn_valid(pgoff)) && (vma->vm_flags | PFN_MAP)) { + tk->addr = vma_address(vma, pgoff, 1); + tk->size_shift = PAGE_SHIFT; + } else { + tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(page_folio(p), p, vma); + if (is_zone_device_page(p)) { + if (pgoff != FSDAX_INVALID_PGOFF) + tk->addr = vma_address(vma, pgoff, 1); + tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); + } else { + tk->size_shift = folio_shift(page_folio(p)); + } + } /* * Send SIGKILL if "tk->addr == -EFAULT". Also, as @@ -474,8 +498,8 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p, * has a mapping for the page. */ if (tk->addr == -EFAULT) { - pr_info("Unable to find user space address %lx in %s\n", - page_to_pfn(p), tsk->comm); + pr_info("Unable to find address %lx in %s\n", + pfn_valid(pgoff) ? page_to_pfn(p) : pgoff, tsk->comm); } else if (tk->size_shift == 0) { kfree(tk); return; @@ -492,7 +516,7 @@ static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p, { if (addr == -EFAULT) return; - __add_to_kill(tsk, p, vma, to_kill, addr); + __add_to_kill(tsk, p, vma, to_kill, addr, FSDAX_INVALID_PGOFF); } #ifdef CONFIG_KSM @@ -514,7 +538,7 @@ void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, unsigned long addr) { if (!task_in_to_kill_list(to_kill, tsk)) - __add_to_kill(tsk, p, vma, to_kill, addr); + __add_to_kill(tsk, p, vma, to_kill, addr, FSDAX_INVALID_PGOFF); } #endif /* @@ -681,21 +705,21 @@ static void collect_procs_file(const struct folio *folio, i_mmap_unlock_read(mapping); } -#ifdef CONFIG_FS_DAX -static void add_to_kill_fsdax(struct task_struct *tsk, const struct page *p, +static void add_to_kill_pgoff(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { unsigned long addr = vma_address(vma, pgoff, 1); - __add_to_kill(tsk, p, vma, to_kill, addr); + __add_to_kill(tsk, p, vma, to_kill, addr, pgoff); } /* - * Collect processes when the error hit a fsdax page. + * Collect processes when the error hit a fsdax page or a PFN not backed by + * struct page. */ -static void collect_procs_fsdax(const struct page *page, - struct address_space *mapping, pgoff_t pgoff, - struct list_head *to_kill, bool pre_remove) +static void collect_procs_pgoff(const struct page *page, + struct address_space *mapping, pgoff_t pgoff, + struct list_head *to_kill, bool pre_remove) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -716,13 +740,12 @@ static void collect_procs_fsdax(const struct page *page, continue; vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_mm == t->mm) - add_to_kill_fsdax(t, page, vma, to_kill, pgoff); + add_to_kill_pgoff(t, page, vma, to_kill, pgoff); } } rcu_read_unlock(); i_mmap_unlock_read(mapping); } -#endif /* CONFIG_FS_DAX */ /* * Collect the processes who have the corrupted page mapped to kill. @@ -943,6 +966,7 @@ static const char * const action_page_types[] = { [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_ALREADY_POISONED] = "already poisoned", + [MF_MSG_PFN_MAP] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1337,7 +1361,8 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, num_poisoned_pages_inc(pfn); - update_per_node_mf_stats(pfn, result); + if (type != MF_MSG_PFN_MAP) + update_per_node_mf_stats(pfn, result); pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -1846,7 +1871,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, * The pre_remove case is revoking access, the memory is still * good and could theoretically be put back into service. */ - collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove); + collect_procs_pgoff(page, mapping, index, &to_kill, pre_remove); unmap_and_kill(&to_kill, page_to_pfn(page), mapping, index, mf_flags); unlock: @@ -2198,6 +2223,83 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, kill_procs(&tokill, true, pfn, flags); } +int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + if (!pfn_space) + return -EINVAL; + + if (!request_mem_region(pfn_space->node.start << PAGE_SHIFT, + (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT, "")) + return -EBUSY; + + mutex_lock(&pfn_space_lock); + interval_tree_insert(&pfn_space->node, &pfn_space_itree); + mutex_unlock(&pfn_space_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_pfn_address_space); + +void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ + if (!pfn_space) + return; + + mutex_lock(&pfn_space_lock); + interval_tree_remove(&pfn_space->node, &pfn_space_itree); + mutex_unlock(&pfn_space_lock); + release_mem_region(pfn_space->node.start << PAGE_SHIFT, + (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(unregister_pfn_address_space); + +static int memory_failure_pfn(unsigned long pfn, int flags) +{ + struct interval_tree_node *node; + int res = MF_FAILED; + LIST_HEAD(tokill); + + mutex_lock(&pfn_space_lock); + /* + * Modules registers with MM the address space mapping to the device memory they + * manage. Iterate to identify exactly which address space has mapped to this + * failing PFN. + */ + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); + /* + * Modules managing the device memory need to be conveyed about the + * memory failure so that the poisoned PFN can be tracked. + */ + if (pfn_space->ops) + pfn_space->ops->failure(pfn_space, pfn); + + collect_procs_pgoff(NULL, pfn_space->mapping, pfn, &tokill, false); + + unmap_mapping_range(pfn_space->mapping, pfn << PAGE_SHIFT, + PAGE_SIZE, 0); + + res = MF_RECOVERED; + } + mutex_unlock(&pfn_space_lock); + + if (res == MF_FAILED) + return action_result(pfn, MF_MSG_PFN_MAP, res); + + /* + * Unlike System-RAM there is no possibility to swap in a different + * physical page at a given virtual address, so all userspace + * consumption of direct PFN memory necessitates SIGBUS (i.e. + * MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + kill_procs(&tokill, true, pfn, flags); + + return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED); +} + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -2237,6 +2339,11 @@ int memory_failure(unsigned long pfn, int flags) if (!(flags & MF_SW_SIMULATED)) hw_memory_failure = true; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + p = pfn_to_online_page(pfn); if (!p) { res = arch_memory_failure(pfn, flags); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index a1b2b657999dc..d59d48022a24a 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -342,12 +342,14 @@ FIXTURE(iommufd_ioas) uint32_t hwpt_id; uint32_t device_id; uint64_t base_iova; + uint32_t device_pasid_id; }; FIXTURE_VARIANT(iommufd_ioas) { unsigned int mock_domains; unsigned int memory_limit; + bool pasid_capable; }; FIXTURE_SETUP(iommufd_ioas) @@ -372,6 +374,12 @@ FIXTURE_SETUP(iommufd_ioas) IOMMU_TEST_DEV_CACHE_DEFAULT); self->base_iova = MOCK_APERTURE_START; } + + if (variant->pasid_capable) + test_cmd_mock_domain_flags(self->ioas_id, + MOCK_FLAGS_DEVICE_PASID, + NULL, NULL, + &self->device_pasid_id); } FIXTURE_TEARDOWN(iommufd_ioas) @@ -387,6 +395,7 @@ FIXTURE_VARIANT_ADD(iommufd_ioas, no_domain) FIXTURE_VARIANT_ADD(iommufd_ioas, mock_domain) { .mock_domains = 1, + .pasid_capable = true, }; FIXTURE_VARIANT_ADD(iommufd_ioas, two_mock_domain) @@ -439,6 +448,10 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) &test_hwpt_id); test_err_hwpt_alloc(EINVAL, self->device_id, self->device_id, 0, &test_hwpt_id); + test_err_hwpt_alloc(EOPNOTSUPP, self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_FAULT_ID_VALID, + &test_hwpt_id); test_cmd_hwpt_alloc(self->device_id, self->ioas_id, IOMMU_HWPT_ALLOC_NEST_PARENT, @@ -748,25 +761,51 @@ TEST_F(iommufd_ioas, get_hw_info) } buffer_smaller; if (self->device_id) { + uint8_t max_pasid = 0; + /* Provide a zero-size user_buffer */ - test_cmd_get_hw_info(self->device_id, NULL, 0); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_DEFAULT, NULL, 0); /* Provide a user_buffer with exact size */ - test_cmd_get_hw_info(self->device_id, &buffer_exact, sizeof(buffer_exact)); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_exact, + sizeof(buffer_exact)); + + /* Request for a wrong data_type, and a correct one */ + test_err_get_hw_info(EOPNOTSUPP, self->device_id, + IOMMU_HW_INFO_TYPE_SELFTEST + 1, + &buffer_exact, sizeof(buffer_exact)); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_SELFTEST, &buffer_exact, + sizeof(buffer_exact)); /* * Provide a user_buffer with size larger than the exact size to check if * kernel zero the trailing bytes. */ - test_cmd_get_hw_info(self->device_id, &buffer_larger, sizeof(buffer_larger)); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_larger, + sizeof(buffer_larger)); /* * Provide a user_buffer with size smaller than the exact size to check if * the fields within the size range still gets updated. */ - test_cmd_get_hw_info(self->device_id, &buffer_smaller, sizeof(buffer_smaller)); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_DEFAULT, + &buffer_smaller, sizeof(buffer_smaller)); + test_cmd_get_hw_info_pasid(self->device_id, &max_pasid); + ASSERT_EQ(0, max_pasid); + if (variant->pasid_capable) { + test_cmd_get_hw_info_pasid(self->device_pasid_id, + &max_pasid); + ASSERT_EQ(MOCK_PASID_WIDTH, max_pasid); + } } else { test_err_get_hw_info(ENOENT, self->device_id, - &buffer_exact, sizeof(buffer_exact)); + IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_exact, + sizeof(buffer_exact)); test_err_get_hw_info(ENOENT, self->device_id, - &buffer_larger, sizeof(buffer_larger)); + IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_larger, + sizeof(buffer_larger)); } } @@ -2153,8 +2192,7 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability) test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, 0, &hwpt_id); test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); - test_cmd_get_hw_capabilities(self->idev_id, caps, - IOMMU_HW_CAP_DIRTY_TRACKING); + test_cmd_get_hw_capabilities(self->idev_id, caps); ASSERT_EQ(IOMMU_HW_CAP_DIRTY_TRACKING, caps & IOMMU_HW_CAP_DIRTY_TRACKING); @@ -2666,7 +2704,7 @@ FIXTURE_SETUP(iommufd_viommu) /* Allocate a vIOMMU taking refcount of the parent hwpt */ test_cmd_viommu_alloc(self->device_id, self->hwpt_id, - IOMMU_VIOMMU_TYPE_SELFTEST, + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, &self->viommu_id); /* Allocate a regular nested hwpt */ @@ -2705,24 +2743,27 @@ TEST_F(iommufd_viommu, viommu_negative_tests) if (self->device_id) { /* Negative test -- invalid hwpt (hwpt_id=0) */ test_err_viommu_alloc(ENOENT, device_id, 0, - IOMMU_VIOMMU_TYPE_SELFTEST, NULL); + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, + NULL); /* Negative test -- not a nesting parent hwpt */ test_cmd_hwpt_alloc(device_id, ioas_id, 0, &hwpt_id); test_err_viommu_alloc(EINVAL, device_id, hwpt_id, - IOMMU_VIOMMU_TYPE_SELFTEST, NULL); + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, + NULL); test_ioctl_destroy(hwpt_id); /* Negative test -- unsupported viommu type */ test_err_viommu_alloc(EOPNOTSUPP, device_id, self->hwpt_id, - 0xdead, NULL); + 0xdead, NULL, 0, NULL); EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, self->hwpt_id)); EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, self->viommu_id)); } else { test_err_viommu_alloc(ENOENT, self->device_id, self->hwpt_id, - IOMMU_VIOMMU_TYPE_SELFTEST, NULL); + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, + NULL); } } @@ -2736,6 +2777,7 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf) uint32_t iopf_hwpt_id; uint32_t fault_id; uint32_t fault_fd; + uint32_t vdev_id; if (self->device_id) { test_ioctl_fault_alloc(&fault_id, &fault_fd); @@ -2752,6 +2794,10 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf) &iopf_hwpt_id, IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + /* Must allocate vdevice before attaching to a nested hwpt */ + test_err_mock_domain_replace(ENOENT, self->stdev_id, + iopf_hwpt_id); + test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id); test_cmd_mock_domain_replace(self->stdev_id, iopf_hwpt_id); EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, iopf_hwpt_id)); @@ -2764,20 +2810,85 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf) } } +TEST_F(iommufd_viommu, viommu_alloc_with_data) +{ + struct iommu_viommu_selftest data = { + .in_data = 0xbeef, + }; + uint32_t *test; + + if (!self->device_id) + SKIP(return, "Skipping test for variant no_viommu"); + + test_cmd_viommu_alloc(self->device_id, self->hwpt_id, + IOMMU_VIOMMU_TYPE_SELFTEST, &data, sizeof(data), + &self->viommu_id); + ASSERT_EQ(data.out_data, data.in_data); + + /* Negative mmap tests -- offset and length cannot be changed */ + test_err_mmap(ENXIO, data.out_mmap_length, + data.out_mmap_offset + PAGE_SIZE); + test_err_mmap(ENXIO, data.out_mmap_length, + data.out_mmap_offset + PAGE_SIZE * 2); + test_err_mmap(ENXIO, data.out_mmap_length / 2, data.out_mmap_offset); + test_err_mmap(ENXIO, data.out_mmap_length * 2, data.out_mmap_offset); + + /* Now do a correct mmap for a loopback test */ + test = mmap(NULL, data.out_mmap_length, PROT_READ | PROT_WRITE, + MAP_SHARED, self->fd, data.out_mmap_offset); + ASSERT_NE(MAP_FAILED, test); + ASSERT_EQ(data.in_data, *test); + + /* The owner of the mmap region should be blocked */ + EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, self->viommu_id)); + munmap(test, data.out_mmap_length); +} + TEST_F(iommufd_viommu, vdevice_alloc) { uint32_t viommu_id = self->viommu_id; uint32_t dev_id = self->device_id; uint32_t vdev_id = 0; + uint32_t veventq_id; + uint32_t veventq_fd; + int prev_seq = -1; if (dev_id) { + /* Must allocate vdevice before attaching to a nested hwpt */ + test_err_mock_domain_replace(ENOENT, self->stdev_id, + self->nested_hwpt_id); + + /* Allocate a vEVENTQ with veventq_depth=2 */ + test_cmd_veventq_alloc(viommu_id, IOMMU_VEVENTQ_TYPE_SELFTEST, + &veventq_id, &veventq_fd); + test_err_veventq_alloc(EEXIST, viommu_id, + IOMMU_VEVENTQ_TYPE_SELFTEST, NULL, NULL); /* Set vdev_id to 0x99, unset it, and set to 0x88 */ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id); + test_cmd_mock_domain_replace(self->stdev_id, + self->nested_hwpt_id); + test_cmd_trigger_vevents(dev_id, 1); + test_cmd_read_vevents(veventq_fd, 1, 0x99, &prev_seq); test_err_vdevice_alloc(EEXIST, viommu_id, dev_id, 0x99, &vdev_id); + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); test_ioctl_destroy(vdev_id); + + /* Try again with 0x88 */ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x88, &vdev_id); + test_cmd_mock_domain_replace(self->stdev_id, + self->nested_hwpt_id); + /* Trigger an overflow with three events */ + test_cmd_trigger_vevents(dev_id, 3); + test_err_read_vevents(EOVERFLOW, veventq_fd, 3, 0x88, + &prev_seq); + /* Overflow must be gone after the previous reads */ + test_cmd_trigger_vevents(dev_id, 1); + test_cmd_read_vevents(veventq_fd, 1, 0x88, &prev_seq); + close(veventq_fd); + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); test_ioctl_destroy(vdev_id); + test_ioctl_destroy(veventq_id); } else { test_err_vdevice_alloc(ENOENT, viommu_id, dev_id, 0x99, NULL); } @@ -2956,4 +3067,369 @@ TEST_F(iommufd_viommu, vdevice_cache) } } +TEST_F(iommufd_viommu, hw_queue) +{ + __u64 iova = MOCK_APERTURE_START, iova2; + uint32_t viommu_id = self->viommu_id; + uint32_t hw_queue_id[2]; + + if (!viommu_id) + SKIP(return, "Skipping test for variant no_viommu"); + + /* Fail IOMMU_HW_QUEUE_TYPE_DEFAULT */ + test_err_hw_queue_alloc(EOPNOTSUPP, viommu_id, + IOMMU_HW_QUEUE_TYPE_DEFAULT, 0, iova, PAGE_SIZE, + &hw_queue_id[0]); + /* Fail queue addr and length */ + test_err_hw_queue_alloc(EINVAL, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, + 0, iova, 0, &hw_queue_id[0]); + test_err_hw_queue_alloc(EOVERFLOW, viommu_id, + IOMMU_HW_QUEUE_TYPE_SELFTEST, 0, ~(uint64_t)0, + PAGE_SIZE, &hw_queue_id[0]); + /* Fail missing iova */ + test_err_hw_queue_alloc(ENOENT, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, + 0, iova, PAGE_SIZE, &hw_queue_id[0]); + + /* Map iova */ + test_ioctl_ioas_map(buffer, PAGE_SIZE, &iova); + test_ioctl_ioas_map(buffer + PAGE_SIZE, PAGE_SIZE, &iova2); + + /* Fail index=1 and =MAX; must start from index=0 */ + test_err_hw_queue_alloc(EIO, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, 1, + iova, PAGE_SIZE, &hw_queue_id[0]); + test_err_hw_queue_alloc(EINVAL, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, + IOMMU_TEST_HW_QUEUE_MAX, iova, PAGE_SIZE, + &hw_queue_id[0]); + + /* Allocate index=0, declare ownership of the iova */ + test_cmd_hw_queue_alloc(viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, 0, + iova, PAGE_SIZE, &hw_queue_id[0]); + /* Fail duplicated index */ + test_err_hw_queue_alloc(EEXIST, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, + 0, iova, PAGE_SIZE, &hw_queue_id[0]); + /* Fail unmap, due to iova ownership */ + test_err_ioctl_ioas_unmap(EBUSY, iova, PAGE_SIZE); + /* The 2nd page is not pinned, so it can be unmmap */ + test_ioctl_ioas_unmap(iova2, PAGE_SIZE); + + /* Allocate index=1, with an unaligned case */ + test_cmd_hw_queue_alloc(viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, 1, + iova + PAGE_SIZE / 2, PAGE_SIZE / 2, + &hw_queue_id[1]); + /* Fail to destroy, due to dependency */ + EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hw_queue_id[0])); + + /* Destroy in descending order */ + test_ioctl_destroy(hw_queue_id[1]); + test_ioctl_destroy(hw_queue_id[0]); + /* Now it can unmap the first page */ + test_ioctl_ioas_unmap(iova, PAGE_SIZE); +} + +FIXTURE(iommufd_device_pasid) +{ + int fd; + uint32_t ioas_id; + uint32_t hwpt_id; + uint32_t stdev_id; + uint32_t device_id; + uint32_t no_pasid_stdev_id; + uint32_t no_pasid_device_id; +}; + +FIXTURE_VARIANT(iommufd_device_pasid) +{ + bool pasid_capable; +}; + +FIXTURE_SETUP(iommufd_device_pasid) +{ + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); + test_ioctl_ioas_alloc(&self->ioas_id); + + test_cmd_mock_domain_flags(self->ioas_id, + MOCK_FLAGS_DEVICE_PASID, + &self->stdev_id, &self->hwpt_id, + &self->device_id); + if (!variant->pasid_capable) + test_cmd_mock_domain_flags(self->ioas_id, 0, + &self->no_pasid_stdev_id, NULL, + &self->no_pasid_device_id); +} + +FIXTURE_TEARDOWN(iommufd_device_pasid) +{ + teardown_iommufd(self->fd, _metadata); +} + +FIXTURE_VARIANT_ADD(iommufd_device_pasid, no_pasid) +{ + .pasid_capable = false, +}; + +FIXTURE_VARIANT_ADD(iommufd_device_pasid, has_pasid) +{ + .pasid_capable = true, +}; + +TEST_F(iommufd_device_pasid, pasid_attach) +{ + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; + uint32_t nested_hwpt_id[3] = {}; + uint32_t parent_hwpt_id = 0; + uint32_t fault_id, fault_fd; + uint32_t s2_hwpt_id = 0; + uint32_t iopf_hwpt_id; + uint32_t pasid = 100; + uint32_t viommu_id; + + /* + * Negative, detach pasid without attaching, this is not expected. + * But it should not result in failure anyway. + */ + test_cmd_pasid_detach(pasid); + + /* Allocate two nested hwpts sharing one common parent hwpt */ + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + &parent_hwpt_id); + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, + IOMMU_HWPT_ALLOC_PASID, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, + &data, sizeof(data)); + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, + IOMMU_HWPT_ALLOC_PASID, + &nested_hwpt_id[1], + IOMMU_HWPT_DATA_SELFTEST, + &data, sizeof(data)); + + /* Fault related preparation */ + test_ioctl_fault_alloc(&fault_id, &fault_fd); + test_cmd_hwpt_alloc_iopf(self->device_id, parent_hwpt_id, fault_id, + IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID, + &iopf_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + /* Allocate a regular nested hwpt based on viommu */ + test_cmd_viommu_alloc(self->device_id, parent_hwpt_id, + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, &viommu_id); + test_cmd_hwpt_alloc_nested(self->device_id, viommu_id, + IOMMU_HWPT_ALLOC_PASID, + &nested_hwpt_id[2], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_PASID, + &s2_hwpt_id); + + /* Attach RID to non-pasid compat domain, */ + test_cmd_mock_domain_replace(self->stdev_id, parent_hwpt_id); + /* then attach to pasid should fail */ + test_err_pasid_attach(EINVAL, pasid, s2_hwpt_id); + + /* Attach RID to pasid compat domain, */ + test_cmd_mock_domain_replace(self->stdev_id, s2_hwpt_id); + /* then attach to pasid should succeed, */ + test_cmd_pasid_attach(pasid, nested_hwpt_id[0]); + /* but attach RID to non-pasid compat domain should fail now. */ + test_err_mock_domain_replace(EINVAL, self->stdev_id, parent_hwpt_id); + /* + * Detach hwpt from pasid 100, and check if the pasid 100 + * has null domain. + */ + test_cmd_pasid_detach(pasid); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, 0)); + /* RID is attached to pasid-comapt domain, pasid path is not used */ + + if (!variant->pasid_capable) { + /* + * PASID-compatible domain can be used by non-PASID-capable + * device. + */ + test_cmd_mock_domain_replace(self->no_pasid_stdev_id, nested_hwpt_id[0]); + test_cmd_mock_domain_replace(self->no_pasid_stdev_id, self->ioas_id); + /* + * Attach hwpt to pasid 100 of non-PASID-capable device, + * should fail, no matter domain is pasid-comapt or not. + */ + EXPECT_ERRNO(EINVAL, + _test_cmd_pasid_attach(self->fd, self->no_pasid_stdev_id, + pasid, parent_hwpt_id)); + EXPECT_ERRNO(EINVAL, + _test_cmd_pasid_attach(self->fd, self->no_pasid_stdev_id, + pasid, s2_hwpt_id)); + } + + /* + * Attach non pasid compat hwpt to pasid-capable device, should + * fail, and have null domain. + */ + test_err_pasid_attach(EINVAL, pasid, parent_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, 0)); + + /* + * Attach ioas to pasid 100, should fail, domain should + * be null. + */ + test_err_pasid_attach(EINVAL, pasid, self->ioas_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, 0)); + + /* + * Attach the s2_hwpt to pasid 100, should succeed, domain should + * be valid. + */ + test_cmd_pasid_attach(pasid, s2_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* + * Try attach pasid 100 with another hwpt, should FAIL + * as attach does not allow overwrite, use REPLACE instead. + */ + test_err_pasid_attach(EBUSY, pasid, nested_hwpt_id[0]); + + /* + * Detach hwpt from pasid 100 for next test, should succeed, + * and have null domain. + */ + test_cmd_pasid_detach(pasid); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, 0)); + + /* + * Attach nested hwpt to pasid 100, should succeed, domain + * should be valid. + */ + test_cmd_pasid_attach(pasid, nested_hwpt_id[0]); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, nested_hwpt_id[0])); + + /* Attach to pasid 100 which has been attached, should fail. */ + test_err_pasid_attach(EBUSY, pasid, nested_hwpt_id[0]); + + /* cleanup pasid 100 */ + test_cmd_pasid_detach(pasid); + + /* Replace tests */ + + pasid = 200; + /* + * Replace pasid 200 without attaching it, should fail + * with -EINVAL. + */ + test_err_pasid_replace(EINVAL, pasid, s2_hwpt_id); + + /* + * Attach the s2 hwpt to pasid 200, should succeed, domain should + * be valid. + */ + test_cmd_pasid_attach(pasid, s2_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* + * Replace pasid 200 with self->ioas_id, should fail + * and domain should be the prior s2 hwpt. + */ + test_err_pasid_replace(EINVAL, pasid, self->ioas_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* + * Replace a nested hwpt for pasid 200, should succeed, + * and have valid domain. + */ + test_cmd_pasid_replace(pasid, nested_hwpt_id[0]); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, nested_hwpt_id[0])); + + /* + * Replace with another nested hwpt for pasid 200, should + * succeed, and have valid domain. + */ + test_cmd_pasid_replace(pasid, nested_hwpt_id[1]); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, nested_hwpt_id[1])); + + /* cleanup pasid 200 */ + test_cmd_pasid_detach(pasid); + + /* Negative Tests for pasid replace, use pasid 1024 */ + + /* + * Attach the s2 hwpt to pasid 1024, should succeed, domain should + * be valid. + */ + pasid = 1024; + test_cmd_pasid_attach(pasid, s2_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* + * Replace pasid 1024 with nested_hwpt_id[0], should fail, + * but have the old valid domain. This is a designed + * negative case. Normally, this shall succeed. + */ + test_err_pasid_replace(ENOMEM, pasid, nested_hwpt_id[0]); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* cleanup pasid 1024 */ + test_cmd_pasid_detach(pasid); + + /* Attach to iopf-capable hwpt */ + + /* + * Attach an iopf hwpt to pasid 2048, should succeed, domain should + * be valid. + */ + pasid = 2048; + test_cmd_pasid_attach(pasid, iopf_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, iopf_hwpt_id)); + + test_cmd_trigger_iopf_pasid(self->device_id, pasid, fault_fd); + + /* + * Replace with s2_hwpt_id for pasid 2048, should + * succeed, and have valid domain. + */ + test_cmd_pasid_replace(pasid, s2_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* cleanup pasid 2048 */ + test_cmd_pasid_detach(pasid); + + test_ioctl_destroy(iopf_hwpt_id); + close(fault_fd); + test_ioctl_destroy(fault_id); + + /* Detach the s2_hwpt_id from RID */ + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 64b1f8e1b0cf1..651fc9f13c081 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -209,12 +209,16 @@ FIXTURE(basic_fail_nth) { int fd; uint32_t access_id; + uint32_t stdev_id; + uint32_t pasid; }; FIXTURE_SETUP(basic_fail_nth) { self->fd = -1; self->access_id = 0; + self->stdev_id = 0; + self->pasid = 0; //test should use a non-zero value } FIXTURE_TEARDOWN(basic_fail_nth) @@ -226,6 +230,8 @@ FIXTURE_TEARDOWN(basic_fail_nth) rc = _test_cmd_destroy_access(self->access_id); assert(rc == 0); } + if (self->pasid && self->stdev_id) + _test_cmd_pasid_detach(self->fd, self->stdev_id, self->pasid); teardown_iommufd(self->fd, _metadata); } @@ -620,13 +626,15 @@ TEST_FAIL_NTH(basic_fail_nth, device) }; struct iommu_test_hw_info info; uint32_t fault_id, fault_fd; + uint32_t veventq_id, veventq_fd; uint32_t fault_hwpt_id; + uint32_t test_hwpt_id; uint32_t ioas_id; uint32_t ioas_id2; - uint32_t stdev_id; uint32_t idev_id; uint32_t hwpt_id; uint32_t viommu_id; + uint32_t hw_queue_id; uint32_t vdev_id; __u64 iova; @@ -654,35 +662,46 @@ TEST_FAIL_NTH(basic_fail_nth, device) fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL, - &idev_id)) + if (_test_cmd_mock_domain_flags(self->fd, ioas_id, + MOCK_FLAGS_DEVICE_PASID, + &self->stdev_id, NULL, &idev_id)) return -1; - if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL)) + if (_test_cmd_get_hw_info(self->fd, idev_id, IOMMU_HW_INFO_TYPE_DEFAULT, + &info, sizeof(info), NULL, NULL)) return -1; - if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, 0, &hwpt_id, + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, + IOMMU_HWPT_ALLOC_PASID, &hwpt_id, IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; - if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) + if (_test_cmd_mock_domain_replace(self->fd, self->stdev_id, ioas_id2, NULL)) return -1; - if (_test_cmd_mock_domain_replace(self->fd, stdev_id, hwpt_id, NULL)) + if (_test_cmd_mock_domain_replace(self->fd, self->stdev_id, hwpt_id, NULL)) return -1; if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, - IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id, + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_PASID, + &hwpt_id, IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; - if (_test_cmd_viommu_alloc(self->fd, idev_id, hwpt_id, - IOMMU_VIOMMU_TYPE_SELFTEST, 0, &viommu_id)) + if (_test_cmd_viommu_alloc(self->fd, idev_id, hwpt_id, 0, + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, + &viommu_id)) return -1; if (_test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, 0, &vdev_id)) return -1; + if (_test_cmd_hw_queue_alloc(self->fd, viommu_id, + IOMMU_HW_QUEUE_TYPE_SELFTEST, 0, iova, + PAGE_SIZE, &hw_queue_id)) + return -1; + if (_test_ioctl_fault_alloc(self->fd, &fault_id, &fault_fd)) return -1; close(fault_fd); @@ -692,6 +711,37 @@ TEST_FAIL_NTH(basic_fail_nth, device) IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data))) return -1; + if (_test_cmd_veventq_alloc(self->fd, viommu_id, + IOMMU_VEVENTQ_TYPE_SELFTEST, &veventq_id, + &veventq_fd)) + return -1; + close(veventq_fd); + + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, + IOMMU_HWPT_ALLOC_PASID, + &test_hwpt_id, + IOMMU_HWPT_DATA_NONE, 0, 0)) + return -1; + + /* Tests for pasid attach/replace/detach */ + + self->pasid = 200; + + if (_test_cmd_pasid_attach(self->fd, self->stdev_id, + self->pasid, hwpt_id)) { + self->pasid = 0; + return -1; + } + + if (_test_cmd_pasid_replace(self->fd, self->stdev_id, + self->pasid, test_hwpt_id)) + return -1; + + if (_test_cmd_pasid_detach(self->fd, self->stdev_id, self->pasid)) + return -1; + + self->pasid = 0; + return 0; } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index d979f5b0efe83..5384852ce038c 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "../kselftest_harness.h" #include "../../../../drivers/iommu/iommufd/iommufd_test.h" @@ -55,6 +56,10 @@ static unsigned long PAGE_SIZE; #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) +#define test_err_mmap(_errno, length, offset) \ + EXPECT_ERRNO(_errno, (long)mmap(NULL, length, PROT_READ | PROT_WRITE, \ + MAP_SHARED, self->fd, offset)) + static inline void *memfd_mmap(size_t length, int prot, int flags, int *mfd_p) { int mfd_flags = (flags & MAP_HUGETLB) ? MFD_HUGETLB : 0; @@ -756,19 +761,24 @@ static void teardown_iommufd(int fd, struct __test_metadata *_metadata) #endif /* @data can be NULL */ -static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, - size_t data_len, uint32_t *capabilities) +static int _test_cmd_get_hw_info(int fd, __u32 device_id, __u32 data_type, + void *data, size_t data_len, + uint32_t *capabilities, uint8_t *max_pasid) { struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data; struct iommu_hw_info cmd = { .size = sizeof(cmd), .dev_id = device_id, .data_len = data_len, + .in_data_type = data_type, .data_uptr = (uint64_t)data, .out_capabilities = 0, }; int ret; + if (data_type != IOMMU_HW_INFO_TYPE_DEFAULT) + cmd.flags |= IOMMU_HW_INFO_FLAG_INPUT_TYPE; + ret = ioctl(fd, IOMMU_GET_HW_INFO, &cmd); if (ret) return ret; @@ -802,22 +812,33 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, assert(!info->flags); } + if (max_pasid) + *max_pasid = cmd.out_max_pasid_log2; + if (capabilities) *capabilities = cmd.out_capabilities; return 0; } -#define test_cmd_get_hw_info(device_id, data, data_len) \ - ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data, \ - data_len, NULL)) +#define test_cmd_get_hw_info(device_id, data_type, data, data_len) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data_type, \ + data, data_len, NULL, NULL)) -#define test_err_get_hw_info(_errno, device_id, data, data_len) \ - EXPECT_ERRNO(_errno, _test_cmd_get_hw_info(self->fd, device_id, data, \ - data_len, NULL)) +#define test_err_get_hw_info(_errno, device_id, data_type, data, data_len) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_get_hw_info(self->fd, device_id, data_type, \ + data, data_len, NULL, NULL)) -#define test_cmd_get_hw_capabilities(device_id, caps, mask) \ - ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps)) +#define test_cmd_get_hw_capabilities(device_id, caps) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, \ + IOMMU_HW_INFO_TYPE_DEFAULT, NULL, \ + 0, &caps, NULL)) + +#define test_cmd_get_hw_info_pasid(device_id, max_pasid) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, \ + IOMMU_HW_INFO_TYPE_DEFAULT, NULL, \ + 0, NULL, max_pasid)) static int _test_ioctl_fault_alloc(int fd, __u32 *fault_id, __u32 *fault_fd) { @@ -842,14 +863,15 @@ static int _test_ioctl_fault_alloc(int fd, __u32 *fault_id, __u32 *fault_fd) ASSERT_NE(0, *(fault_fd)); \ }) -static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd) +static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 pasid, + __u32 fault_fd) { struct iommu_test_cmd trigger_iopf_cmd = { .size = sizeof(trigger_iopf_cmd), .op = IOMMU_TEST_OP_TRIGGER_IOPF, .trigger_iopf = { .dev_id = device_id, - .pasid = 0x1, + .pasid = pasid, .grpid = 0x2, .perm = IOMMU_PGFAULT_PERM_READ | IOMMU_PGFAULT_PERM_WRITE, .addr = 0xdeadbeaf, @@ -880,10 +902,14 @@ static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd) } #define test_cmd_trigger_iopf(device_id, fault_fd) \ - ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, fault_fd)) + ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, 0x1, fault_fd)) +#define test_cmd_trigger_iopf_pasid(device_id, pasid, fault_fd) \ + ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, \ + pasid, fault_fd)) static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id, - __u32 type, __u32 flags, __u32 *viommu_id) + __u32 flags, __u32 type, void *data, + __u32 data_len, __u32 *viommu_id) { struct iommu_viommu_alloc cmd = { .size = sizeof(cmd), @@ -891,6 +917,8 @@ static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id, .type = type, .dev_id = device_id, .hwpt_id = hwpt_id, + .data_uptr = (uint64_t)data, + .data_len = data_len, }; int ret; @@ -902,13 +930,15 @@ static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id, return 0; } -#define test_cmd_viommu_alloc(device_id, hwpt_id, type, viommu_id) \ - ASSERT_EQ(0, _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \ - type, 0, viommu_id)) -#define test_err_viommu_alloc(_errno, device_id, hwpt_id, type, viommu_id) \ - EXPECT_ERRNO(_errno, \ - _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \ - type, 0, viommu_id)) +#define test_cmd_viommu_alloc(device_id, hwpt_id, type, data, data_len, \ + viommu_id) \ + ASSERT_EQ(0, _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, 0, \ + type, data, data_len, viommu_id)) +#define test_err_viommu_alloc(_errno, device_id, hwpt_id, type, data, \ + data_len, viommu_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, 0, \ + type, data, data_len, viommu_id)) static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id, __u64 virt_id, __u32 *vdev_id) @@ -936,3 +966,235 @@ static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id, EXPECT_ERRNO(_errno, \ _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \ virt_id, vdev_id)) + +static int _test_cmd_hw_queue_alloc(int fd, __u32 viommu_id, __u32 type, + __u32 idx, __u64 base_addr, __u64 length, + __u32 *hw_queue_id) +{ + struct iommu_hw_queue_alloc cmd = { + .size = sizeof(cmd), + .viommu_id = viommu_id, + .type = type, + .index = idx, + .nesting_parent_iova = base_addr, + .length = length, + }; + int ret; + + ret = ioctl(fd, IOMMU_HW_QUEUE_ALLOC, &cmd); + if (ret) + return ret; + if (hw_queue_id) + *hw_queue_id = cmd.out_hw_queue_id; + return 0; +} + +#define test_cmd_hw_queue_alloc(viommu_id, type, idx, base_addr, len, out_qid) \ + ASSERT_EQ(0, _test_cmd_hw_queue_alloc(self->fd, viommu_id, type, idx, \ + base_addr, len, out_qid)) +#define test_err_hw_queue_alloc(_errno, viommu_id, type, idx, base_addr, len, \ + out_qid) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_hw_queue_alloc(self->fd, viommu_id, type, idx, \ + base_addr, len, out_qid)) + +static int _test_cmd_veventq_alloc(int fd, __u32 viommu_id, __u32 type, + __u32 *veventq_id, __u32 *veventq_fd) +{ + struct iommu_veventq_alloc cmd = { + .size = sizeof(cmd), + .type = type, + .veventq_depth = 2, + .viommu_id = viommu_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_VEVENTQ_ALLOC, &cmd); + if (ret) + return ret; + if (veventq_id) + *veventq_id = cmd.out_veventq_id; + if (veventq_fd) + *veventq_fd = cmd.out_veventq_fd; + return 0; +} + +#define test_cmd_veventq_alloc(viommu_id, type, veventq_id, veventq_fd) \ + ASSERT_EQ(0, _test_cmd_veventq_alloc(self->fd, viommu_id, type, \ + veventq_id, veventq_fd)) +#define test_err_veventq_alloc(_errno, viommu_id, type, veventq_id, \ + veventq_fd) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_veventq_alloc(self->fd, viommu_id, type, \ + veventq_id, veventq_fd)) + +static int _test_cmd_trigger_vevents(int fd, __u32 dev_id, __u32 nvevents) +{ + struct iommu_test_cmd trigger_vevent_cmd = { + .size = sizeof(trigger_vevent_cmd), + .op = IOMMU_TEST_OP_TRIGGER_VEVENT, + .trigger_vevent = { + .dev_id = dev_id, + }, + }; + int ret; + + while (nvevents--) { + ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT), + &trigger_vevent_cmd); + if (ret < 0) + return -1; + } + return ret; +} + +#define test_cmd_trigger_vevents(dev_id, nvevents) \ + ASSERT_EQ(0, _test_cmd_trigger_vevents(self->fd, dev_id, nvevents)) + +static int _test_cmd_read_vevents(int fd, __u32 event_fd, __u32 nvevents, + __u32 virt_id, int *prev_seq) +{ + struct pollfd pollfd = { .fd = event_fd, .events = POLLIN }; + struct iommu_viommu_event_selftest *event; + struct iommufd_vevent_header *hdr; + ssize_t bytes; + void *data; + int ret, i; + + ret = poll(&pollfd, 1, 1000); + if (ret < 0) + return -1; + + data = calloc(nvevents, sizeof(*hdr) + sizeof(*event)); + if (!data) { + errno = ENOMEM; + return -1; + } + + bytes = read(event_fd, data, + nvevents * (sizeof(*hdr) + sizeof(*event))); + if (bytes <= 0) { + errno = EFAULT; + ret = -1; + goto out_free; + } + + for (i = 0; i < nvevents; i++) { + hdr = data + i * (sizeof(*hdr) + sizeof(*event)); + + if (hdr->flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS || + hdr->sequence - *prev_seq > 1) { + *prev_seq = hdr->sequence; + errno = EOVERFLOW; + ret = -1; + goto out_free; + } + *prev_seq = hdr->sequence; + event = data + sizeof(*hdr); + if (event->virt_id != virt_id) { + errno = EINVAL; + ret = -1; + goto out_free; + } + } + + ret = 0; +out_free: + free(data); + return ret; +} + +#define test_cmd_read_vevents(event_fd, nvevents, virt_id, prev_seq) \ + ASSERT_EQ(0, _test_cmd_read_vevents(self->fd, event_fd, nvevents, \ + virt_id, prev_seq)) +#define test_err_read_vevents(_errno, event_fd, nvevents, virt_id, prev_seq) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_read_vevents(self->fd, event_fd, nvevents, \ + virt_id, prev_seq)) + +static int _test_cmd_pasid_attach(int fd, __u32 stdev_id, __u32 pasid, + __u32 pt_id) +{ + struct iommu_test_cmd test_attach = { + .size = sizeof(test_attach), + .op = IOMMU_TEST_OP_PASID_ATTACH, + .id = stdev_id, + .pasid_attach = { + .pasid = pasid, + .pt_id = pt_id, + }, + }; + + return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_PASID_ATTACH), + &test_attach); +} + +#define test_cmd_pasid_attach(pasid, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_pasid_attach(self->fd, self->stdev_id, \ + pasid, hwpt_id)) + +#define test_err_pasid_attach(_errno, pasid, hwpt_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_pasid_attach(self->fd, self->stdev_id, \ + pasid, hwpt_id)) + +static int _test_cmd_pasid_replace(int fd, __u32 stdev_id, __u32 pasid, + __u32 pt_id) +{ + struct iommu_test_cmd test_replace = { + .size = sizeof(test_replace), + .op = IOMMU_TEST_OP_PASID_REPLACE, + .id = stdev_id, + .pasid_replace = { + .pasid = pasid, + .pt_id = pt_id, + }, + }; + + return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_PASID_REPLACE), + &test_replace); +} + +#define test_cmd_pasid_replace(pasid, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_pasid_replace(self->fd, self->stdev_id, \ + pasid, hwpt_id)) + +#define test_err_pasid_replace(_errno, pasid, hwpt_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_pasid_replace(self->fd, self->stdev_id, \ + pasid, hwpt_id)) + +static int _test_cmd_pasid_detach(int fd, __u32 stdev_id, __u32 pasid) +{ + struct iommu_test_cmd test_detach = { + .size = sizeof(test_detach), + .op = IOMMU_TEST_OP_PASID_DETACH, + .id = stdev_id, + .pasid_detach = { + .pasid = pasid, + }, + }; + + return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_PASID_DETACH), + &test_detach); +} + +#define test_cmd_pasid_detach(pasid) \ + ASSERT_EQ(0, _test_cmd_pasid_detach(self->fd, self->stdev_id, pasid)) + +static int test_cmd_pasid_check_hwpt(int fd, __u32 stdev_id, __u32 pasid, + __u32 hwpt_id) +{ + struct iommu_test_cmd test_pasid_check = { + .size = sizeof(test_pasid_check), + .op = IOMMU_TEST_OP_PASID_CHECK_HWPT, + .id = stdev_id, + .pasid_check = { + .pasid = pasid, + .hwpt_id = hwpt_id, + }, + }; + + return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_PASID_CHECK_HWPT), + &test_pasid_check); +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ba0327e2d0d33..bef00c86cc86f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2933,8 +2933,11 @@ kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) r = hva_to_pfn_remapped(vma, kfp, &pfn); if (r == -EAGAIN) goto retry; - if (r < 0) + if (r < 0) { pfn = KVM_PFN_ERR_FAULT; + if (r == -EHWPOISON) + pfn = KVM_PFN_ERR_HWPOISON; + } } else { if ((kfp->flags & FOLL_NOWAIT) && vma_is_valid(vma, kfp->flags & FOLL_WRITE))