diff --git a/Cargo.lock b/Cargo.lock
index 4d54b5aeb4e1d..71ec774a33582 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4121,6 +4121,7 @@ dependencies = [
 name = "rustc_monomorphize"
 version = "0.0.0"
 dependencies = [
+ "rustc_ast",
  "rustc_data_structures",
  "rustc_errors",
  "rustc_fluent_macro",
@@ -4129,6 +4130,7 @@ dependencies = [
  "rustc_middle",
  "rustc_session",
  "rustc_span",
+ "rustc_symbol_mangling",
  "rustc_target",
  "serde",
  "serde_json",
diff --git a/compiler/rustc_builtin_macros/src/errors.rs b/compiler/rustc_builtin_macros/src/errors.rs
index f8e65661e52e2..8ca2867fc6304 100644
--- a/compiler/rustc_builtin_macros/src/errors.rs
+++ b/compiler/rustc_builtin_macros/src/errors.rs
@@ -217,6 +217,60 @@ mod ad_fallback {
     }
 }
 
+#[derive(Diagnostic)]
+#[diag(builtin_macros_autodiff_unknown_activity)] // Fluent slug that selects the message text
+pub(crate) struct AutoDiffUnknownActivity {
+    #[primary_span] // the span the diagnostic is primarily attached to
+    pub(crate) span: Span,
+    pub(crate) act: String, // available to the Fluent message as `{$act}`
+}
+#[derive(Diagnostic)]
+#[diag(builtin_macros_autodiff_ty_activity)]
+pub(crate) struct AutoDiffInvalidTypeForActivity {
+    #[primary_span]
+    pub(crate) span: Span,
+    pub(crate) act: String,
+}
+#[derive(Diagnostic)]
+#[diag(builtin_macros_autodiff_number_activities)]
+pub(crate) struct AutoDiffInvalidNumberActivities {
+    #[primary_span]
+    pub(crate) span: Span,
+    pub(crate) expected: usize, // available to the Fluent message as `{$expected}`
+    pub(crate) found: usize, // available to the Fluent message as `{$found}`
+}
+#[derive(Diagnostic)]
+#[diag(builtin_macros_autodiff_mode_activity)]
+pub(crate) struct AutoDiffInvalidApplicationModeAct {
+    #[primary_span]
+    pub(crate) span: Span,
+    pub(crate) mode: String,
+    pub(crate) act: String,
+}
+
+#[derive(Diagnostic)]
+#[diag(builtin_macros_autodiff_mode)]
+pub(crate) struct AutoDiffInvalidMode {
+    #[primary_span]
+    pub(crate) span: Span,
+    pub(crate) mode: String,
+}
+
+#[derive(Diagnostic)]
+#[diag(builtin_macros_autodiff)]
+pub(crate) struct AutoDiffInvalidApplication {
+    #[primary_span]
+    pub(crate) span: Span,
+}
+
+#[cfg(not(llvm_enzyme))] // this diagnostic only exists when the `llvm_enzyme` cfg is not set
+#[derive(Diagnostic)]
+#[diag(builtin_macros_autodiff_not_build)]
+pub(crate) struct AutoDiffSupportNotBuild {
+    #[primary_span]
+    pub(crate) span: Span,
+}
+
 #[derive(Diagnostic)]
 #[diag(builtin_macros_concat_bytes_invalid)]
 pub(crate) struct ConcatBytesInvalid {
diff --git a/compiler/rustc_builtin_macros/src/lib.rs b/compiler/rustc_builtin_macros/src/lib.rs
index 377d7f542cf46..24e2ab254654e 100644
--- a/compiler/rustc_builtin_macros/src/lib.rs
+++ b/compiler/rustc_builtin_macros/src/lib.rs
@@ -16,6 +16,7 @@
 #![feature(proc_macro_internals)]
 #![feature(proc_macro_quote)]
 #![feature(rustdoc_internals)]
+#![cfg_attr(not(bootstrap), feature(autodiff))]
 #![feature(try_blocks)]
 #![warn(unreachable_pub)]
 // tidy-alphabetical-end
diff --git a/compiler/rustc_codegen_llvm/messages.ftl b/compiler/rustc_codegen_llvm/messages.ftl
index 0950e4bb26bac..26d699d5cc1d1 100644
--- a/compiler/rustc_codegen_llvm/messages.ftl
+++ b/compiler/rustc_codegen_llvm/messages.ftl
@@ -51,6 +51,10 @@ codegen_llvm_prepare_thin_lto_module_with_llvm_err = failed to prepare thin LTO
 codegen_llvm_run_passes = failed to run LLVM passes
 codegen_llvm_run_passes_with_llvm_err = failed to run LLVM passes: {$llvm_err}
 
+codegen_llvm_prepare_autodiff = failed to prepare AutoDiff: src: {$src}, target: {$target}, {$error}
+codegen_llvm_prepare_autodiff_with_llvm_err = failed to prepare AutoDiff: {$llvm_err}, src: {$src}, target: {$target}, {$error}
+codegen_llvm_autodiff_without_lto = using the autodiff feature requires using fat-lto
+
 codegen_llvm_sanitizer_memtag_requires_mte =
     `-Zsanitizer=memtag` requires `-Ctarget-feature=+mte`
 
diff --git a/compiler/rustc_codegen_llvm/src/abi.rs b/compiler/rustc_codegen_llvm/src/abi.rs
index 2fe5ed32daa31..e300af961e896 100644
--- a/compiler/rustc_codegen_llvm/src/abi.rs
+++ b/compiler/rustc_codegen_llvm/src/abi.rs
@@ -244,6 +244,7 @@ impl<'ll, 'tcx> ArgAbiExt<'ll, 'tcx> for ArgAbi<'tcx, Ty<'tcx>> {
                     scratch_align,
                     bx.const_usize(copy_bytes),
                     MemFlags::empty(),
+                    None,
                 );
                 bx.lifetime_end(llscratch, scratch_size);
             }
diff --git a/compiler/rustc_codegen_llvm/src/attributes.rs b/compiler/rustc_codegen_llvm/src/attributes.rs
index 2c5ec9dad59f1..a94ef2d5e8eea 100644
--- a/compiler/rustc_codegen_llvm/src/attributes.rs
+++ b/compiler/rustc_codegen_llvm/src/attributes.rs
@@ -1,5 +1,6 @@
 //! Set and unset common attributes on LLVM values.
 
+use rustc_ast::expand::autodiff_attrs::AutoDiffAttrs;
 use rustc_attr::{InlineAttr, InstructionSetAttr, OptimizeAttr};
 use rustc_codegen_ssa::traits::*;
 use rustc_hir::def_id::DefId;
@@ -332,6 +333,7 @@ pub(crate) fn llfn_attrs_from_instance<'ll, 'tcx>(
     instance: ty::Instance<'tcx>,
 ) {
     let codegen_fn_attrs = cx.tcx.codegen_fn_attrs(instance.def_id());
+    let autodiff_attrs: &AutoDiffAttrs = cx.tcx.autodiff_attrs(instance.def_id());
 
     let mut to_add = SmallVec::<[_; 16]>::new();
 
@@ -349,6 +351,8 @@ pub(crate) fn llfn_attrs_from_instance<'ll, 'tcx>(
     let inline =
         if codegen_fn_attrs.inline == InlineAttr::None && instance.def.requires_inline(cx.tcx) {
             InlineAttr::Hint
+        } else if autodiff_attrs.is_active() {
+            InlineAttr::Never
         } else {
             codegen_fn_attrs.inline
         };
diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs
index 1f7a923dd2c68..8a69650f07e92 100644
--- a/compiler/rustc_codegen_llvm/src/back/lto.rs
+++ b/compiler/rustc_codegen_llvm/src/back/lto.rs
@@ -616,7 +616,12 @@ pub(crate) fn run_pass_manager(
         }
         let opt_stage = if thin { llvm::OptStage::ThinLTO } else { llvm::OptStage::FatLTO };
         let opt_level = config.opt_level.unwrap_or(config::OptLevel::No);
-        write::llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage)?;
+
+        // We will run this again with different values in the context of automatic differentiation.
+        let first_run = true;
+        let noop = false;
+        debug!("running llvm pm opt pipeline");
+        write::llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage, first_run, noop)?;
     }
     debug!("lto done");
     Ok(())
@@ -723,7 +728,12 @@ pub(crate) unsafe fn optimize_thin_module(
     let llcx = unsafe { llvm::LLVMRustContextCreate(cgcx.fewer_names) };
     let llmod_raw = parse_module(llcx, module_name, thin_module.data(), dcx)? as *const _;
     let mut module = ModuleCodegen {
-        module_llvm: ModuleLlvm { llmod_raw, llcx, tm: ManuallyDrop::new(tm) },
+        module_llvm: ModuleLlvm {
+            llmod_raw,
+            llcx,
+            tm: ManuallyDrop::new(tm),
+            typetrees: Default::default(),
+        },
         name: thin_module.name().to_string(),
         kind: ModuleKind::Regular,
     };
diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs
index afdd2b581b86e..3f7eec3531def 100644
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@@ -4,10 +4,13 @@ use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::{fs, slice, str};
 
-use libc::{c_char, c_int, c_void, size_t};
+use libc::{c_char, c_int, c_uint, c_void, size_t};
 use llvm::{
-    LLVMRustLLVMHasZlibCompressionForDebugSymbols, LLVMRustLLVMHasZstdCompressionForDebugSymbols,
+    IntPredicate, LLVMRustLLVMHasZlibCompressionForDebugSymbols,
+    LLVMRustLLVMHasZstdCompressionForDebugSymbols,
 };
+use rustc_ast::expand::autodiff_attrs::{AutoDiffItem, DiffActivity, DiffMode};
+use rustc_ast::expand::typetree::FncTree;
 use rustc_codegen_ssa::back::link::ensure_removed;
 use rustc_codegen_ssa::back::write::{
     BitcodeSection, CodegenContext, EmitObj, ModuleConfig, TargetMachineFactoryConfig,
@@ -15,6 +18,7 @@ use rustc_codegen_ssa::back::write::{
 };
 use rustc_codegen_ssa::traits::*;
 use rustc_codegen_ssa::{CompiledModule, ModuleCodegen};
+use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::profiling::SelfProfilerRef;
 use rustc_data_structures::small_c_str::SmallCStr;
 use rustc_errors::{DiagCtxtHandle, FatalError, Level};
@@ -22,12 +26,13 @@ use rustc_fs_util::{link_or_copy, path_to_c_string};
 use rustc_middle::ty::TyCtxt;
 use rustc_session::Session;
 use rustc_session::config::{
-    self, Lto, OutputType, Passes, RemapPathScopeComponents, SplitDwarfKind, SwitchWithOptPath,
+    self, AutoDiff, Lto, OutputType, Passes, RemapPathScopeComponents, SplitDwarfKind,
+    SwitchWithOptPath,
 };
 use rustc_span::InnerSpan;
 use rustc_span::symbol::sym;
 use rustc_target::spec::{CodeModel, RelocModel, SanitizerSet, SplitDebuginfo, TlsModel};
-use tracing::debug;
+use tracing::{debug, trace};
 
 use crate::back::lto::ThinBuffer;
 use crate::back::owned_target_machine::OwnedTargetMachine;
@@ -39,9 +44,24 @@ use crate::errors::{
     WithLlvmError, WriteBytecode,
 };
 use crate::llvm::diagnostic::OptimizationDiagnosticKind::*;
-use crate::llvm::{self, DiagnosticInfo, PassManager};
+use crate::llvm::{
+    self, AttributeKind, CreateEnzymeLogic, CreateTypeAnalysis, DiagnosticInfo, EnzymeLogicRef,
+    EnzymeTypeAnalysisRef, FreeTypeAnalysis, LLVMAppendBasicBlockInContext, LLVMBuildCall2,
+    LLVMBuildCondBr, LLVMBuildExtractValue, LLVMBuildICmp, LLVMBuildRet, LLVMBuildRetVoid,
+    LLVMCountParams, LLVMCountStructElementTypes, LLVMCreateBuilderInContext,
+    LLVMCreateStringAttribute, LLVMDisposeBuilder, LLVMDumpModule, LLVMGetFirstBasicBlock,
+    LLVMGetFirstFunction, LLVMGetNextFunction, LLVMGetParams, LLVMGetReturnType,
+    LLVMGetStringAttributeAtIndex, LLVMGlobalGetValueType, LLVMIsEnumAttribute,
+    LLVMIsStringAttribute, LLVMMetadataAsValue, LLVMPositionBuilderAtEnd,
+    LLVMRemoveStringAttributeAtIndex, LLVMRustAddEnumAttributeAtIndex,
+    LLVMRustAddFunctionAttributes, LLVMRustDIGetInstMetadata, LLVMRustEraseInstBefore,
+    LLVMRustEraseInstFromParent, LLVMRustGetEnumAttributeAtIndex, LLVMRustGetFunctionType,
+    LLVMRustGetLastInstruction, LLVMRustGetTerminator, LLVMRustHasMetadata,
+    LLVMRustRemoveEnumAttributeAtIndex, LLVMVerifyFunction, LLVMVoidTypeInContext, PassManager,
+    Value, enzyme_rust_forward_diff, enzyme_rust_reverse_diff,
+};
 use crate::type_::Type;
-use crate::{LlvmCodegenBackend, ModuleLlvm, base, common, llvm_util};
+use crate::{DiffTypeTree, LlvmCodegenBackend, ModuleLlvm, base, common, llvm_util};
 
 pub(crate) fn llvm_err<'a>(dcx: DiagCtxtHandle<'_>, err: LlvmError<'a>) -> FatalError {
     match llvm::last_error() {
@@ -515,9 +535,38 @@ pub(crate) unsafe fn llvm_optimize(
     config: &ModuleConfig,
     opt_level: config::OptLevel,
     opt_stage: llvm::OptStage,
+    first_run: bool,
+    noop: bool,
 ) -> Result<(), FatalError> {
-    let unroll_loops =
-        opt_level != config::OptLevel::Size && opt_level != config::OptLevel::SizeMin;
+    if noop {
+        return Ok(());
+    }
+    // Enzyme:
+    // The whole point of compiler based AD is to differentiate optimized IR instead of unoptimized
+    // source code. However, benchmarks show that optimizations increasing the code size
+    // tend to reduce AD performance. Therefore deactivate them before AD, then differentiate the code
+    // and finally re-optimize the module, now with all optimizations available.
+    // TODO: In a future update we could figure out how to only optimize functions getting
+    // differentiated.
+
+    let unroll_loops;
+    let vectorize_slp;
+    let vectorize_loop;
+
+    if first_run {
+        unroll_loops = false;
+        vectorize_slp = false;
+        vectorize_loop = false;
+    } else {
+        unroll_loops =
+            opt_level != config::OptLevel::Size && opt_level != config::OptLevel::SizeMin;
+        vectorize_slp = config.vectorize_slp;
+        vectorize_loop = config.vectorize_loop;
+    }
+    trace!(
+        "Enzyme: Running with unroll_loops: {}, vectorize_slp: {}, vectorize_loop: {}",
+        unroll_loops, vectorize_slp, vectorize_loop
+    );
     let using_thin_buffers = opt_stage == llvm::OptStage::PreLinkThinLTO || config.bitcode_needed();
     let pgo_gen_path = get_pgo_gen_path(config);
     let pgo_use_path = get_pgo_use_path(config);
@@ -581,8 +630,8 @@ pub(crate) unsafe fn llvm_optimize(
             using_thin_buffers,
             config.merge_functions,
             unroll_loops,
-            config.vectorize_slp,
-            config.vectorize_loop,
+            vectorize_slp,
+            vectorize_loop,
             config.no_builtins,
             config.emit_lifetime_markers,
             sanitizer_options.as_ref(),
@@ -605,6 +654,655 @@ pub(crate) unsafe fn llvm_optimize(
     result.into_result().map_err(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
 }
 
+/// Collects the parameter values of `fnc`, as written out by `LLVMGetParams`.
+fn get_params(fnc: &Value) -> Vec<&Value> {
+    let count = unsafe { LLVMCountParams(fnc) } as usize;
+    let mut params: Vec<&Value> = Vec::with_capacity(count);
+    unsafe {
+        LLVMGetParams(fnc, params.as_mut_ptr());
+        params.set_len(count);
+    }
+    params
+}
+
+// DESIGN:
+// Today we have our placeholder function, and our Enzyme generated one.
+// We create a wrapper function and delete the placeholder body. You can see the
+// placeholder by running `cargo expand` on an autodiff invocation. We call the wrapper
+// from the placeholder. This function is a bit longer, because it matches the Rust level
+// autodiff macro with LLVM level Enzyme autodiff expectations.
+//
+// Think of computing the derivative with respect to &[f32] by marking it as duplicated.
+// The user will then pass an extra &mut [f32] and we want to add the derivative to that.
+// On LLVM/Enzyme level, &[f32] however becomes `ptr, i64` and we mark ptr as duplicated,
+// and i64 (len) as const. Enzyme will then expect `ptr, ptr, i64` as arguments. See how the
+// second i64 from the mut slice isn't used? That's why we add a safety check to assert
+// that the second (mut) slice is at least as long as the first (const) slice. Otherwise,
+// Enzyme would write out of bounds if the first (const) slice is longer than the second.
+
+/// Replaces the body of the placeholder `tgt` with a call to the Enzyme-generated `src`.
+///
+/// If both functions take the same number of parameters, the arguments are forwarded as-is.
+/// Otherwise the extra length integer of each shadow slice is dropped from the call and,
+/// unless `AutoDiff::NoSafetyChecks` is set, a runtime check is emitted which calls the
+/// panic function found by `get_panic_name` if a shadow slice is shorter than its primal.
+///
+/// # Panics
+///
+/// Panics if the parameters of `tgt` and `src` do not line up in one of the two ways
+/// described above, or if the first basic block of `tgt` has no last instruction.
+unsafe fn create_call<'a>(
+    tgt: &'a Value,
+    src: &'a Value,
+    llmod: &'a llvm::Module,
+    llcx: &llvm::Context,
+    // FIXME: Instead of recomputing the positions as we do it below, we should
+    // start using this list of positions that indicate length integers.
+    _size_positions: &[usize],
+    ad: &[AutoDiff],
+) {
+    unsafe {
+        // First, remove the terminator of the placeholder's first basic block.
+        let bb = LLVMGetFirstBasicBlock(tgt);
+        let br = LLVMRustGetTerminator(bb);
+        LLVMRustEraseInstFromParent(br);
+
+        // now add a call to inner.
+        // append call to src at end of bb.
+        let f_ty = LLVMRustGetFunctionType(src);
+
+        let inner_param_num = LLVMCountParams(src);
+        let outer_param_num = LLVMCountParams(tgt);
+        let outer_args: Vec<&Value> = get_params(tgt);
+        let inner_args: Vec<&Value> = get_params(src);
+        let mut call_args: Vec<&Value> = vec![];
+
+        let mut safety_vals = vec![];
+        let builder = LLVMCreateBuilderInContext(llcx);
+        // Remember the last remaining placeholder instruction: its debug location is copied
+        // to the new call and it marks the end of the placeholder code that gets erased.
+        let last_inst = LLVMRustGetLastInstruction(bb).unwrap();
+        LLVMPositionBuilderAtEnd(builder, bb);
+
+        let safety_run_checks = !ad.contains(&AutoDiff::NoSafetyChecks);
+
+        if inner_param_num == outer_param_num {
+            call_args = outer_args;
+        } else {
+            trace!("Different number of args, adjusting");
+            let mut outer_pos: usize = 0;
+            let mut inner_pos: usize = 0;
+            // copy over if they are identical.
+            // If not, skip the outer arg (and assert it's int).
+            while outer_pos < outer_param_num as usize {
+                let inner_arg = inner_args[inner_pos];
+                let outer_arg = outer_args[outer_pos];
+                let inner_arg_ty = llvm::LLVMTypeOf(inner_arg);
+                let outer_arg_ty = llvm::LLVMTypeOf(outer_arg);
+                if inner_arg_ty == outer_arg_ty {
+                    call_args.push(outer_arg);
+                    inner_pos += 1;
+                    outer_pos += 1;
+                } else {
+                    // out: rust: (&[f32], &mut [f32])
+                    // out: llvm: (ptr, <>int1, ptr, int2)
+                    // inner: (ptr, <>ptr, int)
+                    // goal: call (ptr, ptr, int1), skipping int2
+                    // we are here: <>
+                    assert!(llvm::LLVMRustGetTypeKind(outer_arg_ty) == llvm::TypeKind::Integer);
+                    assert!(llvm::LLVMRustGetTypeKind(inner_arg_ty) == llvm::TypeKind::Pointer);
+                    let next_outer_arg = outer_args[outer_pos + 1];
+                    let next_inner_arg = inner_args[inner_pos + 1];
+                    let next_outer_arg_ty = llvm::LLVMTypeOf(next_outer_arg);
+                    let next_inner_arg_ty = llvm::LLVMTypeOf(next_inner_arg);
+                    assert!(
+                        llvm::LLVMRustGetTypeKind(next_outer_arg_ty) == llvm::TypeKind::Pointer
+                    );
+                    assert!(
+                        llvm::LLVMRustGetTypeKind(next_inner_arg_ty) == llvm::TypeKind::Integer
+                    );
+                    let next2_outer_arg = outer_args[outer_pos + 2];
+                    let next2_outer_arg_ty = llvm::LLVMTypeOf(next2_outer_arg);
+                    assert!(
+                        llvm::LLVMRustGetTypeKind(next2_outer_arg_ty) == llvm::TypeKind::Integer
+                    );
+                    call_args.push(next_outer_arg);
+                    call_args.push(outer_arg);
+
+                    outer_pos += 3;
+                    inner_pos += 2;
+
+                    if safety_run_checks {
+                        // Now we assert if int1 <= int2
+                        let res = LLVMBuildICmp(
+                            builder,
+                            IntPredicate::IntULE as u32,
+                            outer_arg,
+                            next2_outer_arg,
+                            c"safety_check".as_ptr(),
+                        );
+                        safety_vals.push(res);
+                    }
+                }
+            }
+        }
+
+        if inner_param_num as usize != call_args.len() {
+            panic!(
+                "Args len shouldn't differ. Please report this. {} : {}",
+                inner_param_num,
+                call_args.len()
+            );
+        }
+
+        // Now add the safety checks.
+        if !safety_vals.is_empty() {
+            trace!("Adding safety checks");
+            assert!(safety_run_checks);
+            // Create two more bbs, one for the fail and one for the success case. Names
+            // handed to LLVM must be NUL-terminated, hence the C string literals.
+            let fail_bb = LLVMAppendBasicBlockInContext(llcx, tgt, c"ad_safety_fail".as_ptr());
+            let success_bb =
+                LLVMAppendBasicBlockInContext(llcx, tgt, c"ad_safety_success".as_ptr());
+            for i in 1..safety_vals.len() {
+                // 'and' all safety checks together, every single one of them has to hold.
+                // Doing some binary tree style and'ing here would be more efficient,
+                // but I assume LLVM will opt it anyway
+                let prev = safety_vals[i - 1];
+                let curr = safety_vals[i];
+                let res = llvm::LLVMBuildAnd(builder, prev, curr, c"safety_check".as_ptr());
+                safety_vals[i] = res;
+            }
+            // Continue in the success block only if the combined check holds.
+            LLVMBuildCondBr(builder, safety_vals.last().unwrap(), success_bb, fail_bb);
+            LLVMPositionBuilderAtEnd(builder, fail_bb);
+
+            // Failure path: call the panic function with the message global as argument.
+            let panic_name: CString = get_panic_name(llmod);
+
+            let mut arg_vec = vec![add_panic_msg_to_global(llmod, llcx)];
+
+            let fnc1 = llvm::LLVMGetNamedFunction(llmod, panic_name.as_ptr()).unwrap();
+            let ty = LLVMRustGetFunctionType(fnc1);
+            let call = LLVMBuildCall2(
+                builder,
+                ty,
+                fnc1,
+                arg_vec.as_mut_ptr(),
+                arg_vec.len(),
+                panic_name.as_ptr() as *const c_char,
+            );
+            llvm::LLVMSetTailCall(call, 1);
+            llvm::LLVMBuildUnreachable(builder);
+            LLVMPositionBuilderAtEnd(builder, success_bb);
+        }
+
+        let inner_fnc_name = llvm::get_value_name(src);
+        let c_inner_fnc_name = CString::new(inner_fnc_name).unwrap();
+
+        let mut struct_ret = LLVMBuildCall2(
+            builder,
+            f_ty,
+            src,
+            call_args.as_mut_ptr(),
+            call_args.len(),
+            c_inner_fnc_name.as_ptr(),
+        );
+
+        // Add dummy dbg info to our newly generated call, if we have any.
+        let md_ty = llvm::LLVMGetMDKindIDInContext(
+            llcx,
+            "dbg".as_ptr() as *const c_char,
+            "dbg".len() as c_uint,
+        );
+
+        if LLVMRustHasMetadata(last_inst, md_ty) {
+            let md = LLVMRustDIGetInstMetadata(last_inst);
+            let md_val = LLVMMetadataAsValue(llcx, md);
+            let _md2 = llvm::LLVMSetMetadata(struct_ret, md_ty, md_val);
+        } else {
+            trace!("No dbg info");
+        }
+
+        // Now clean up placeholder code.
+        LLVMRustEraseInstBefore(bb, last_inst);
+
+        let f_return_type = LLVMGetReturnType(LLVMGlobalGetValueType(src));
+        let f_is_struct = llvm::LLVMRustIsStructType(f_return_type);
+        let void_type = LLVMVoidTypeInContext(llcx);
+        // Now unwrap the struct_ret if it's actually a struct
+        if f_is_struct {
+            let num_elem_in_ret_struct = LLVMCountStructElementTypes(f_return_type);
+            if num_elem_in_ret_struct == 1 {
+                struct_ret = LLVMBuildExtractValue(builder, struct_ret, 0, c"foo".as_ptr());
+            }
+        }
+        if f_return_type != void_type {
+            let _ret = LLVMBuildRet(builder, struct_ret);
+        } else {
+            let _ret = LLVMBuildRetVoid(builder);
+        }
+        LLVMDisposeBuilder(builder);
+        // Abort the process if the rewritten function does not pass the LLVM verifier.
+        let _fnc_ok =
+            LLVMVerifyFunction(tgt, llvm::LLVMVerifierFailureAction::LLVMAbortProcessAction);
+    }
+}
+
+/// Returns the name of the first function in `llmod` whose symbol starts with one of the
+/// two hard-coded mangled-name prefixes below.
+///
+/// Panics if the module contains no such function.
+unsafe fn get_panic_name(llmod: &llvm::Module) -> CString {
+    // The names are mangled and their ending changes based on a hash, so just take whichever.
+    const PREFIXES: [&[u8]; 2] =
+        [b"_ZN4core9panicking14panic_explicit", b"_RN4core9panicking14panic_explicit"];
+    let mut f = unsafe { LLVMGetFirstFunction(llmod) };
+    while let Some(lf) = f {
+        // Compare raw bytes so that unrelated, non-UTF-8 symbol names cannot abort the scan.
+        let fnc_name = llvm::get_value_name(lf);
+        if PREFIXES.iter().any(|prefix| fnc_name.starts_with(prefix)) {
+            return CString::new(fnc_name).unwrap();
+        }
+        f = unsafe { LLVMGetNextFunction(lf) };
+    }
+    panic!("Could not find panic function");
+}
+
+// This code is called when Enzyme detects at runtime that one of the safety invariants is violated.
+// For now we only check if shadow arguments are large enough. In this case we look for a Rust
+// panic function in the module and call it. Due to hashing we can't hardcode the function name.
+// Note: This worked even for panic=abort tests so seems solid enough for now.
+// FIXME: Pick a panic function which allows displaying an error message.
+// FIXME: We probably want to keep a handle at higher level and pass it down instead of searching.
+/// Adds a private global named `ad_safety_msg` to `llmod` and returns it.
+///
+/// The global is a one-field struct whose field is an `i8` array initialized with the bytes
+/// of the safety-check failure message (without a trailing NUL).
+///
+/// # Safety
+///
+/// NOTE(review): presumably `llmod` must belong to `llcx` — confirm against the caller.
+unsafe fn add_panic_msg_to_global<'a>(
+    llmod: &'a llvm::Module,
+    llcx: &'a llvm::Context,
+) -> &'a llvm::Value {
+    unsafe {
+        use llvm::*;
+
+        // The message and the name of the global that will hold it.
+        let msg = "autodiff safety check failed!";
+        let msg_len = msg.len();
+        let cmsg_global_name = CString::new("ad_safety_msg").unwrap();
+
+        // The element type, shared by the array type and its initializer.
+        let i8_type = LLVMInt8TypeInContext(llcx);
+
+        // Create the array type
+        let i8_array_type = LLVMArrayType2(i8_type, msg_len as u64);
+
+        // Create the array initializer, one `i8` constant per byte of the message.
+        let mut array_elems: Vec<_> = Vec::with_capacity(msg_len);
+        for byte in msg.bytes() {
+            array_elems.push(LLVMConstInt(i8_type, byte as u64, 0));
+        }
+        let array_initializer = LLVMConstArray2(i8_type, array_elems.as_mut_ptr(), msg_len as u64);
+
+        // Create the struct type
+        let global_type = LLVMStructTypeInContext(llcx, [i8_array_type].as_mut_ptr(), 1, 0);
+
+        // Create the struct initializer
+        let struct_initializer =
+            LLVMConstStructInContext(llcx, [array_initializer].as_mut_ptr(), 1, 0);
+
+        // Add the global variable to the module. The name pointer is kept as the
+        // `*const c_char` that `CString::as_ptr` returns; `c_char` is `u8` on some targets.
+        let global_var = LLVMAddGlobal(llmod, global_type, cmsg_global_name.as_ptr());
+        LLVMRustSetLinkage(global_var, Linkage::PrivateLinkage);
+        LLVMSetInitializer(global_var, struct_initializer);
+
+        global_var
+    }
+}
+use rustc_errors::DiagCtxt;
+
+/// Differentiates `item.source` with Enzyme and redirects `item.target` to the result.
+///
+/// Both functions are looked up by name in `llmod`. Depending on `item.attrs.mode`, the
+/// source is handed to `enzyme_rust_forward_diff` or `enzyme_rust_reverse_diff`, and the
+/// target is then rewritten by `create_call` to call the generated function.
+///
+/// # Errors
+///
+/// Returns a `FatalError` built from `LlvmError::PrepareAutoDiff` if either function is
+/// not present in `llmod`.
+///
+/// # Panics
+///
+/// Panics if the target takes fewer parameters than the source, or on an unexpected mode.
+#[allow(unused_variables)]
+#[allow(unused)]
+pub(crate) unsafe fn enzyme_ad(
+    llmod: &llvm::Module,
+    llcx: &llvm::Context,
+    diag_handler: &DiagCtxt,
+    item: AutoDiffItem,
+    logic_ref: EnzymeLogicRef,
+    ad: &[AutoDiff],
+) -> Result<(), FatalError> {
+    let autodiff_mode = item.attrs.mode;
+    let rust_name = item.source;
+    let rust_name2 = &item.target;
+
+    let args_activity = item.attrs.input_activity.clone();
+    let ret_activity: DiffActivity = item.attrs.ret_activity;
+
+    // get target and source function
+    let name = CString::new(rust_name.to_owned()).unwrap();
+    let name2 = CString::new(rust_name2.clone()).unwrap();
+    let src_fnc_opt = unsafe { llvm::LLVMGetNamedFunction(llmod, name.as_c_str().as_ptr()) };
+    let src_fnc = match src_fnc_opt {
+        Some(x) => x,
+        None => {
+            return Err(llvm_err(diag_handler.handle(), LlvmError::PrepareAutoDiff {
+                src: rust_name.to_owned(),
+                target: rust_name2.to_owned(),
+                error: "could not find src function".to_owned(),
+            }));
+        }
+    };
+    let target_fnc_opt = unsafe { llvm::LLVMGetNamedFunction(llmod, name2.as_ptr()) };
+    let target_fnc = match target_fnc_opt {
+        Some(x) => x,
+        None => {
+            return Err(llvm_err(diag_handler.handle(), LlvmError::PrepareAutoDiff {
+                src: rust_name.to_owned(),
+                target: rust_name2.to_owned(),
+                error: "could not find target function".to_owned(),
+            }));
+        }
+    };
+    let src_num_args = unsafe { llvm::LLVMCountParams(src_fnc) };
+    let target_num_args = unsafe { llvm::LLVMCountParams(target_fnc) };
+    // A really simple check
+    assert!(src_num_args <= target_num_args);
+
+    let type_analysis: EnzymeTypeAnalysisRef =
+        unsafe { CreateTypeAnalysis(logic_ref, std::ptr::null_mut(), std::ptr::null_mut(), 0) };
+
+    llvm::set_strict_aliasing(false);
+
+    if ad.contains(&AutoDiff::PrintTA) {
+        llvm::set_print_type(true);
+    }
+    if ad.contains(&AutoDiff::PrintPerf) {
+        llvm::set_print_perf(true);
+    }
+    if ad.contains(&AutoDiff::Print) {
+        llvm::set_print(true);
+    }
+
+    unsafe {
+        let void_type = LLVMVoidTypeInContext(llcx);
+        let return_type = LLVMGetReturnType(LLVMGlobalGetValueType(src_fnc));
+        let void_ret = void_type == return_type;
+        let tmp = match autodiff_mode {
+            DiffMode::Forward | DiffMode::ForwardFirst => enzyme_rust_forward_diff(
+                logic_ref,
+                type_analysis,
+                src_fnc,
+                args_activity,
+                ret_activity,
+                void_ret,
+            ),
+            DiffMode::Reverse | DiffMode::ReverseFirst => enzyme_rust_reverse_diff(
+                logic_ref,
+                type_analysis,
+                src_fnc,
+                args_activity,
+                ret_activity,
+            ),
+            _ => unreachable!(),
+        };
+        let res: &Value = tmp.0;
+        // res is getting wrapped, but we don't want the perf overhead of a fnc call indirection.
+        // So we'll add an alwaysinline attribute to let llvm handle it for us.
+        //
+        // TODO(ZuseZ4): enable this, but with the right position of the arg
+        //let always_inline = llvm::AttributeKind::AlwaysInline;
+        //let attr = llvm::LLVMRustCreateAttrNoValue(llcx, always_inline);
+        //llvm::LLVMRustAddFunctionAttributes(res, 9, &attr, 1);
+
+        let size_positions: Vec<usize> = tmp.1;
+
+        create_call(target_fnc, res, llmod, llcx, &size_positions, ad);
+        // TODO: implement drop for wrapper type?
+        FreeTypeAnalysis(type_analysis);
+    }
+
+    Ok(())
+}
+
+pub(crate) unsafe fn differentiate(
+    module: &ModuleCodegen<ModuleLlvm>,
+    cgcx: &CodegenContext<LlvmCodegenBackend>,
+    diff_items: Vec<AutoDiffItem>,
+    _typetrees: FxHashMap<String, DiffTypeTree>,
+    config: &ModuleConfig,
+) -> Result<(), FatalError> {
+    for item in &diff_items {
+        trace!("{}", item);
+    }
+
+    let llmod = module.module_llvm.llmod();
+    let llcx = &module.module_llvm.llcx;
+    let diag_handler = cgcx.create_dcx();
+
+    llvm::set_strict_aliasing(false);
+
+    let ad = &config.autodiff;
+
+    if ad.contains(&AutoDiff::LooseTypes) {
+        dbg!("Setting loose types to true");
+        llvm::set_loose_types(true);
+    }
+
+    // Before dumping the module, we want all the tt to become part of the module.
+    for (i, item) in diff_items.iter().enumerate() {
+        let tt: FncTree = FncTree { args: item.inputs.clone(), ret: item.output.clone() };
+        let name = CString::new(item.source.clone()).unwrap();
+        let fn_def: &llvm::Value =
+            unsafe { llvm::LLVMGetNamedFunction(llmod, name.as_ptr()).unwrap() };
+        crate::builder::add_tt2(llmod, llcx, fn_def, tt);
+
+        // Before dumping the module, we also might want to add dummy functions, which will
+        // trigger the LLVMEnzyme pass to run on them, if we invoke the opt binary.
+        // This is super helpful if we want to create an MWE bug reproducer, e.g. to run in
+        // Enzyme's compiler explorer. TODO: Can we run llvm-extract on the module to remove all other functions?
+        if ad.contains(&AutoDiff::OPT) {
+            dbg!("Enable extra debug helper to debug Enzyme through the opt plugin");
+            crate::builder::add_opt_dbg_helper(llmod, llcx, fn_def, item.attrs.clone(), i);
+        }
+    }
+
+    if ad.contains(&AutoDiff::PrintModBefore) || ad.contains(&AutoDiff::OPT) {
+        unsafe {
+            LLVMDumpModule(llmod);
+        }
+    }
+
+    if ad.contains(&AutoDiff::Inline) {
+        dbg!("Setting inline to true");
+        llvm::set_inline(true);
+    }
+
+    if ad.contains(&AutoDiff::RuntimeActivity) {
+        dbg!("Setting runtime activity check to true");
+        llvm::set_runtime_activity_check(true);
+    }
+
+    for val in ad {
+        match &val {
+            AutoDiff::TTDepth(depth) => {
+                assert!(*depth >= 1);
+                llvm::set_max_int_offset(*depth);
+            }
+            AutoDiff::TTWidth(width) => {
+                assert!(*width >= 1);
+                llvm::set_max_type_offset(*width);
+            }
+            _ => {}
+        }
+    }
+
+    let differentiate = !diff_items.is_empty();
+    let mut first_order_items: Vec<AutoDiffItem> = vec![];
+    let mut higher_order_items: Vec<AutoDiffItem> = vec![];
+    for item in diff_items {
+        if item.attrs.mode == DiffMode::ForwardFirst || item.attrs.mode == DiffMode::ReverseFirst {
+            first_order_items.push(item);
+        } else {
+            // default
+            higher_order_items.push(item);
+        }
+    }
+
+    let fnc_opt = ad.contains(&AutoDiff::EnableFncOpt);
+
+    // If a function is a base for some higher order ad, always optimize
+    let fnc_opt_base = true;
+    let logic_ref_opt: EnzymeLogicRef = unsafe { CreateEnzymeLogic(fnc_opt_base as u8) };
+
+    for item in first_order_items {
+        let res =
+            unsafe { enzyme_ad(llmod, llcx, &diag_handler.handle(), item, logic_ref_opt, ad) };
+        assert!(res.is_ok());
+    }
+
+    // For the rest, follow the user choice on debug vs release.
+    // Reuse the opt one if possible for better compile time (Enzyme internal caching).
+    let logic_ref = match fnc_opt {
+        true => {
+            dbg!("Enable extra optimizations for Enzyme");
+            logic_ref_opt
+        }
+        false => unsafe { CreateEnzymeLogic(fnc_opt as u8) },
+    };
+    for item in higher_order_items {
+        let res = unsafe { enzyme_ad(llmod, llcx, &diag_handler.handle(), item, logic_ref, ad) };
+        assert!(res.is_ok());
+    }
+
+    unsafe {
+        let mut f = LLVMGetFirstFunction(llmod);
+        loop {
+            if let Some(lf) = f {
+                f = LLVMGetNextFunction(lf);
+                let myhwattr = "enzyme_hw";
+                let attr = LLVMGetStringAttributeAtIndex(
+                    lf,
+                    c_uint::MAX,
+                    myhwattr.as_ptr() as *const c_char,
+                    myhwattr.as_bytes().len() as c_uint,
+                );
+                if LLVMIsStringAttribute(attr) {
+                    LLVMRemoveStringAttributeAtIndex(
+                        lf,
+                        c_uint::MAX,
+                        myhwattr.as_ptr() as *const c_char,
+                        myhwattr.as_bytes().len() as c_uint,
+                    );
+                } else {
+                    LLVMRustRemoveEnumAttributeAtIndex(
+                        lf,
+                        c_uint::MAX,
+                        AttributeKind::SanitizeHWAddress,
+                    );
+                }
+            } else {
+                break;
+            }
+        }
+        if ad.contains(&AutoDiff::PrintModAfterEnzyme) {
+            LLVMDumpModule(llmod);
+        }
+    }
+
+    if ad.contains(&AutoDiff::NoModOptAfter) || !differentiate {
+        trace!("Skipping module optimization after automatic differentiation");
+    } else {
+        if let Some(opt_level) = config.opt_level {
+            let opt_stage = match cgcx.lto {
+                Lto::Fat => llvm::OptStage::PreLinkFatLTO,
+                Lto::Thin | Lto::ThinLocal => llvm::OptStage::PreLinkThinLTO,
+                _ if cgcx.opts.cg.linker_plugin_lto.enabled() => llvm::OptStage::PreLinkThinLTO,
+                _ => llvm::OptStage::PreLinkNoLTO,
+            };
+            let mut first_run = false;
+            dbg!("Running Module Optimization after differentiation");
+            if ad.contains(&AutoDiff::NoVecUnroll) {
+                // disables vectorization and loop unrolling
+                first_run = true;
+            }
+            if ad.contains(&AutoDiff::AltPipeline) {
+                dbg!("Running first postAD optimization");
+                first_run = true;
+            }
+            let noop = false;
+            unsafe {
+                llvm_optimize(
+                    cgcx,
+                    diag_handler.handle(),
+                    module,
+                    config,
+                    opt_level,
+                    opt_stage,
+                    first_run,
+                    noop,
+                )?
+            };
+        }
+        if ad.contains(&AutoDiff::AltPipeline) {
+            dbg!("Running Second postAD optimization");
+            if let Some(opt_level) = config.opt_level {
+                let opt_stage = match cgcx.lto {
+                    Lto::Fat => llvm::OptStage::PreLinkFatLTO,
+                    Lto::Thin | Lto::ThinLocal => llvm::OptStage::PreLinkThinLTO,
+                    _ if cgcx.opts.cg.linker_plugin_lto.enabled() => llvm::OptStage::PreLinkThinLTO,
+                    _ => llvm::OptStage::PreLinkNoLTO,
+                };
+                let mut first_run = false;
+                dbg!("Running Module Optimization after differentiation");
+                if ad.contains(&AutoDiff::NoVecUnroll) {
+                    // enables vectorization and loop unrolling
+                    first_run = false;
+                }
+                let noop = false;
+                unsafe {
+                    llvm_optimize(
+                        cgcx,
+                        diag_handler.handle(),
+                        module,
+                        config,
+                        opt_level,
+                        opt_stage,
+                        first_run,
+                        noop,
+                    )?
+                };
+            }
+        }
+    }
+
+    if ad.contains(&AutoDiff::PrintModAfterOpts) {
+        unsafe {
+            LLVMDumpModule(llmod);
+        }
+    }
+
+    Ok(())
+}
+
 // Unsafe due to LLVM calls.
 pub(crate) unsafe fn optimize(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
@@ -627,6 +1325,47 @@ pub(crate) unsafe fn optimize(
         unsafe { llvm::LLVMWriteBitcodeToFile(llmod, out.as_ptr()) };
     }
 
+    // This code enables Enzyme to differentiate code containing Rust enums.
+    // By adding the SanitizeHWAddress attribute we prevent LLVM from optimizing
+    // away the enums and allow Enzyme to understand why a value can be of different types in
+    // different code sections. We remove this attribute after Enzyme is done, to not affect the
+    // rest of the compilation.
+    // TODO: only enable this code when at least one function gets differentiated.
+    unsafe {
+        let mut f = LLVMGetFirstFunction(llmod);
+        loop {
+            if let Some(lf) = f {
+                f = LLVMGetNextFunction(lf);
+                let myhwattr = "enzyme_hw";
+                let myhwv = "";
+                let prevattr = LLVMRustGetEnumAttributeAtIndex(
+                    lf,
+                    c_uint::MAX,
+                    AttributeKind::SanitizeHWAddress,
+                );
+                if LLVMIsEnumAttribute(prevattr) {
+                    let attr = LLVMCreateStringAttribute(
+                        llcx,
+                        myhwattr.as_ptr() as *const c_char,
+                        myhwattr.as_bytes().len() as c_uint,
+                        myhwv.as_ptr() as *const c_char,
+                        myhwv.as_bytes().len() as c_uint,
+                    );
+                    LLVMRustAddFunctionAttributes(lf, c_uint::MAX, &attr, 1);
+                } else {
+                    LLVMRustAddEnumAttributeAtIndex(
+                        llcx,
+                        lf,
+                        c_uint::MAX,
+                        AttributeKind::SanitizeHWAddress,
+                    );
+                }
+            } else {
+                break;
+            }
+        }
+    }
+
     if let Some(opt_level) = config.opt_level {
         let opt_stage = match cgcx.lto {
             Lto::Fat => llvm::OptStage::PreLinkFatLTO,
@@ -634,7 +1373,19 @@ pub(crate) unsafe fn optimize(
             _ if cgcx.opts.cg.linker_plugin_lto.enabled() => llvm::OptStage::PreLinkThinLTO,
             _ => llvm::OptStage::PreLinkNoLTO,
         };
-        return unsafe { llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage) };
+
+        // Second run only relevant for AD
+        let first_run = true;
+        let noop = false;
+        //if ad.contains(&AutoDiff::AltPipeline) {
+        //    noop = true;
+        //    dbg!("Skipping PreAD optimization");
+        //} else {
+        //    noop = false;
+        //}
+        return unsafe {
+            llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage, first_run, noop)
+        };
     }
     Ok(())
 }
diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs
index 8702532c36eee..f640ee2ba357f 100644
--- a/compiler/rustc_codegen_llvm/src/builder.rs
+++ b/compiler/rustc_codegen_llvm/src/builder.rs
@@ -5,6 +5,8 @@ use std::{iter, ptr};
 use libc::{c_char, c_uint};
 use rustc_abi as abi;
 use rustc_abi::{Align, Size, WrappingRange};
+use rustc_ast::expand::autodiff_attrs::{AutoDiffAttrs, DiffActivity, DiffMode};
+use rustc_ast::expand::typetree::{FncTree, TypeTree};
 use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::common::{IntPredicate, RealPredicate, SynchronizationScope, TypeKind};
 use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
@@ -24,7 +26,7 @@ use rustc_span::Span;
 use rustc_target::abi::call::FnAbi;
 use rustc_target::spec::{HasTargetSpec, SanitizerSet, Target};
 use smallvec::SmallVec;
-use tracing::{debug, instrument};
+use tracing::{debug, instrument, trace};
 
 use crate::abi::FnAbiLlvmExt;
 use crate::attributes;
@@ -33,7 +35,204 @@ use crate::context::CodegenCx;
 use crate::llvm::{self, AtomicOrdering, AtomicRmwBinOp, BasicBlock, False, True};
 use crate::type_::Type;
 use crate::type_of::LayoutLlvmExt;
+use crate::typetree::to_enzyme_typetree;
 use crate::value::Value;
+use crate::llvm::Metadata;
+
+/// Attaches an `enzyme_type` string attribute to every parameter of `fn_def` (from
+/// `tt.args`, by index) and to its return value (from `tt.ret`), each holding the string
+/// form of the corresponding Enzyme type tree.
+pub(crate) fn add_tt2<'ll>(
+    llmod: &'ll llvm::Module,
+    llcx: &'ll llvm::Context,
+    fn_def: &'ll Value,
+    tt: FncTree,
+) {
+    let FncTree { args: inputs, ret: ret_tt } = tt;
+    let data_layout_ptr: *const c_char = unsafe { llvm::LLVMGetDataLayoutStr(&*llmod) };
+    let data_layout = unsafe { std::ffi::CStr::from_ptr(data_layout_ptr) }
+        .to_str()
+        .expect("got a non-UTF8 data-layout from LLVM");
+    let c_attr_name = std::ffi::CString::new("enzyme_type").unwrap();
+    let attr_name_len = c_attr_name.as_bytes().len() as c_uint;
+    // Parameters: one attribute per entry of `inputs`, attached at the entry's index.
+    for (idx, input) in inputs.iter().enumerate() {
+        let c_tt = to_enzyme_typetree(input.clone(), data_layout, llcx);
+        unsafe {
+            let tt_str = std::ffi::CStr::from_ptr(llvm::EnzymeTypeTreeToString(c_tt.inner));
+            let attr = llvm::LLVMCreateStringAttribute(
+                llcx,
+                c_attr_name.as_ptr(),
+                attr_name_len,
+                tt_str.as_ptr(),
+                tt_str.to_bytes().len() as c_uint,
+            );
+            llvm::LLVMRustAddFncParamAttr(fn_def, idx as u32, attr);
+            llvm::EnzymeTypeTreeToStringFree(tt_str.as_ptr());
+        }
+    }
+    // Return value: same procedure, but the string is freed before the attribute is attached.
+    let ret_attr = unsafe {
+        let c_tt = to_enzyme_typetree(ret_tt, data_layout, llcx);
+        let tt_str = std::ffi::CStr::from_ptr(llvm::EnzymeTypeTreeToString(c_tt.inner));
+        let attr = llvm::LLVMCreateStringAttribute(
+            llcx,
+            c_attr_name.as_ptr(),
+            attr_name_len,
+            tt_str.as_ptr(),
+            tt_str.to_bytes().len() as c_uint,
+        );
+        llvm::EnzymeTypeTreeToStringFree(tt_str.as_ptr());
+        attr
+    };
+    unsafe { llvm::LLVMRustAddRetFncAttr(fn_def, ret_attr) };
+}
+
+/// Adds a wrapper `enzyme_opt_helper_<i>` to `llmod` that forwards its parameters to a
+/// variadic `__enzyme_autodiff` / `__enzyme_fwddiff` call on `val`.
+///
+/// Each parameter is preceded by its activity as metadata; duplicated ones are passed twice.
+///
+/// # Panics
+///
+/// Panics on an unsupported `DiffMode` or activity, or if `attrs.input_activity` is too short.
+#[allow(unused)]
+pub(crate) fn add_opt_dbg_helper<'ll>(
+    llmod: &'ll llvm::Module,
+    llcx: &'ll llvm::Context,
+    val: &'ll Value,
+    attrs: AutoDiffAttrs,
+    i: usize,
+) {
+    let inputs = attrs.input_activity;
+    let ad_name = match attrs.mode {
+        DiffMode::Forward | DiffMode::ForwardFirst => "__enzyme_fwddiff",
+        DiffMode::Reverse | DiffMode::ReverseFirst => "__enzyme_autodiff",
+        _ => panic!("Why are we here?"),
+    };
+
+    // Assuming that our val is the fnc square, want to generate the following llvm-ir:
+    // declare double @__enzyme_autodiff(...)
+    //
+    // define double @dsquare(double %x) {
+    // entry:
+    //   %0 = tail call double (...) @__enzyme_autodiff(double (double)* nonnull @square, double %x)
+    //   ret double %0
+    // }
+
+    unsafe {
+        let fn_ty = llvm::LLVMRustGetFunctionType(val);
+        let ret_ty = llvm::LLVMGetReturnType(fn_ty);
+
+        // First we add the declaration of the __enzyme function
+        let enzyme_ty = llvm::LLVMFunctionType(ret_ty, ptr::null(), 0, True);
+        let ad_fn = llvm::LLVMRustGetOrInsertFunction(
+            llmod,
+            ad_name.as_ptr() as *const c_char,
+            ad_name.len().try_into().unwrap(),
+            enzyme_ty,
+        );
+
+        let wrapper_name = format!("enzyme_opt_helper_{i}");
+        let wrapper_fn = llvm::LLVMRustGetOrInsertFunction(
+            llmod,
+            wrapper_name.as_ptr() as *const c_char,
+            wrapper_name.len().try_into().unwrap(),
+            fn_ty,
+        );
+        // The block name is read as a C string, so the literal carries its own NUL terminator.
+        let entry = llvm::LLVMAppendBasicBlockInContext(
+            llcx,
+            wrapper_fn,
+            b"entry\0".as_ptr() as *const c_char,
+        );
+        let builder = llvm::LLVMCreateBuilderInContext(llcx);
+        llvm::LLVMPositionBuilderAtEnd(builder, entry);
+        let num_args = llvm::LLVMCountParams(wrapper_fn);
+        let mut args = Vec::with_capacity(2 * num_args as usize + 1);
+        args.push(val);
+        let md_string =
+            |s: &str| llvm::LLVMMDStringInContext2(llcx, s.as_ptr() as *const c_char, s.len());
+        let enzyme_const = md_string("enzyme_const");
+        let enzyme_out = md_string("enzyme_out");
+        let enzyme_dup = md_string("enzyme_dup");
+        let enzyme_dupnoneed = md_string("enzyme_dupnoneed");
+        for param_idx in 0..num_args {
+            let arg = llvm::LLVMGetParam(wrapper_fn, param_idx);
+            let (activity, duplicated): (&Metadata, bool) = match inputs[param_idx as usize] {
+                DiffActivity::None => panic!(),
+                DiffActivity::Const => (enzyme_const, false),
+                DiffActivity::Active => (enzyme_out, false),
+                DiffActivity::ActiveOnly => (enzyme_out, false),
+                DiffActivity::Dual => (enzyme_dup, true),
+                DiffActivity::DualOnly => (enzyme_dupnoneed, true),
+                DiffActivity::Duplicated => (enzyme_dup, true),
+                DiffActivity::DuplicatedOnly => (enzyme_dupnoneed, true),
+                DiffActivity::FakeActivitySize => (enzyme_const, false),
+            };
+            args.push(llvm::LLVMMetadataAsValue(llcx, activity));
+            args.push(arg);
+            if duplicated {
+                args.push(arg);
+            }
+        }
+
+        // declare void @__enzyme_autodiff(...)
+        // define void @enzyme_opt_helper_0(ptr %0, ptr %1) {
+        //   call void (...) @__enzyme_autodiff(ptr @ffff, ptr %0, ptr %1)
+        //   ret void
+        // }
+
+        // The call stays unnamed (empty C string), matching the `%0` in the sketch further up.
+        let call = llvm::LLVMBuildCall2(
+            builder,
+            enzyme_ty,
+            ad_fn,
+            args.as_mut_ptr(),
+            args.len(),
+            b"\0".as_ptr() as *const c_char,
+        );
+        let void_ty = llvm::LLVMVoidTypeInContext(llcx);
+        if llvm::LLVMTypeOf(call) != void_ty {
+            llvm::LLVMBuildRet(builder, call);
+        } else {
+            llvm::LLVMBuildRetVoid(builder);
+        }
+        llvm::LLVMDisposeBuilder(builder);
+
+        let _fnc_ok = llvm::LLVMVerifyFunction(
+            wrapper_fn,
+            llvm::LLVMVerifierFailureAction::LLVMAbortProcessAction,
+        );
+    }
+}
+
+/// Attaches an `enzyme_type` string attribute, built from the type tree of each entry in
+/// `tt.args`, to `val` at the entry's index via `LLVMRustAddParamAttr`. `tt.ret` is not used.
+fn add_tt<'ll>(llmod: &'ll llvm::Module, llcx: &'ll llvm::Context, val: &'ll Value, tt: FncTree) {
+    let data_layout_ptr: *const c_char = unsafe { llvm::LLVMGetDataLayoutStr(&*llmod) };
+    let data_layout = unsafe { std::ffi::CStr::from_ptr(data_layout_ptr) }
+        .to_str()
+        .expect("got a non-UTF8 data-layout from LLVM");
+    let c_attr_name = std::ffi::CString::new("enzyme_type").unwrap();
+    let attr_name_len = c_attr_name.as_bytes().len() as c_uint;
+    // One attribute per entry of `tt.args`; the type-tree string is freed once it is attached.
+    for (idx, input) in tt.args.iter().enumerate() {
+        let c_tt = to_enzyme_typetree(input.clone(), data_layout, llcx);
+        unsafe {
+            let tt_str = std::ffi::CStr::from_ptr(llvm::EnzymeTypeTreeToString(c_tt.inner));
+            let attr = llvm::LLVMCreateStringAttribute(
+                llcx,
+                c_attr_name.as_ptr(),
+                attr_name_len,
+                tt_str.as_ptr(),
+                tt_str.to_bytes().len() as c_uint,
+            );
+            llvm::LLVMRustAddParamAttr(val, idx as u32, attr);
+            llvm::EnzymeTypeTreeToStringFree(tt_str.as_ptr());
+        }
+    }
+}
 
 // All Builders must have an llfn associated with them
 #[must_use]
@@ -884,11 +1083,12 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
         src_align: Align,
         size: &'ll Value,
         flags: MemFlags,
+        tt: Option<FncTree>,
     ) {
         assert!(!flags.contains(MemFlags::NONTEMPORAL), "non-temporal memcpy not supported");
         let size = self.intcast(size, self.type_isize(), false);
         let is_volatile = flags.contains(MemFlags::VOLATILE);
-        unsafe {
+        let val = unsafe {
             llvm::LLVMRustBuildMemCpy(
                 self.llbuilder,
                 dst,
@@ -897,7 +1097,15 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
                 src_align.bytes() as c_uint,
                 size,
                 is_volatile,
-            );
+            )
+        };
+
+        if let Some(tt) = tt {
+            let llmod = self.cx.llmod;
+            let llcx = self.cx.llcx;
+            add_tt(llmod, llcx, val, tt);
+        } else {
+            trace!("builder: no tt");
         }
     }
 
@@ -909,11 +1117,12 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
         src_align: Align,
         size: &'ll Value,
         flags: MemFlags,
+        tt: Option<FncTree>,
     ) {
         assert!(!flags.contains(MemFlags::NONTEMPORAL), "non-temporal memmove not supported");
         let size = self.intcast(size, self.type_isize(), false);
         let is_volatile = flags.contains(MemFlags::VOLATILE);
-        unsafe {
+        let val = unsafe {
             llvm::LLVMRustBuildMemMove(
                 self.llbuilder,
                 dst,
@@ -922,7 +1131,13 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
                 src_align.bytes() as c_uint,
                 size,
                 is_volatile,
-            );
+            )
+        };
+
+        if let Some(tt) = tt {
+            let llmod = self.cx.llmod;
+            let llcx = self.cx.llcx;
+            add_tt(llmod, llcx, val, tt);
         }
     }
 
@@ -933,10 +1148,11 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
         size: &'ll Value,
         align: Align,
         flags: MemFlags,
+        tt: Option<FncTree>,
     ) {
         assert!(!flags.contains(MemFlags::NONTEMPORAL), "non-temporal memset not supported");
         let is_volatile = flags.contains(MemFlags::VOLATILE);
-        unsafe {
+        let val = unsafe {
             llvm::LLVMRustBuildMemSet(
                 self.llbuilder,
                 ptr,
@@ -944,7 +1160,13 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
                 fill_byte,
                 size,
                 is_volatile,
-            );
+            )
+        };
+
+        if let Some(tt) = tt {
+            let llmod = self.cx.llmod;
+            let llcx = self.cx.llcx;
+            add_tt(llmod, llcx, val, tt);
         }
     }
 
diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs
index fb845c0087b1f..7d36a490c68e6 100644
--- a/compiler/rustc_codegen_llvm/src/context.rs
+++ b/compiler/rustc_codegen_llvm/src/context.rs
@@ -736,6 +736,11 @@ impl<'ll, 'tcx> MiscCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {
             None
         }
     }
+
+    /// Returns an empty list. TODO(Manuel): maybe drop this and build the empty vec on the fly?
+    fn create_autodiff(&self) -> Vec<Self::Function> {
+        Vec::new()
+    }
 }
 
 impl<'ll> CodegenCx<'ll, '_> {
diff --git a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs
index 61e474031bb08..0545e79b80da0 100644
--- a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs
+++ b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs
@@ -427,7 +427,7 @@ struct UsageSets<'tcx> {
 /// Prepare sets of definitions that are relevant to deciding whether something
 /// is an "unused function" for coverage purposes.
 fn prepare_usage_sets<'tcx>(tcx: TyCtxt<'tcx>) -> UsageSets<'tcx> {
-    let (all_mono_items, cgus) = tcx.collect_and_partition_mono_items(());
+    let (all_mono_items, _, cgus) = tcx.collect_and_partition_mono_items(());
 
     // Obtain a MIR body for each function participating in codegen, via an
     // arbitrary instance.
diff --git a/compiler/rustc_codegen_llvm/src/errors.rs b/compiler/rustc_codegen_llvm/src/errors.rs
index 0d436e1891ece..718504cd2c7fe 100644
--- a/compiler/rustc_codegen_llvm/src/errors.rs
+++ b/compiler/rustc_codegen_llvm/src/errors.rs
@@ -80,6 +80,11 @@ impl<G: EmissionGuarantee> Diagnostic<'_, G> for ParseTargetMachineConfig<'_> {
     }
 }
 
+#[derive(Diagnostic)]
+#[diag(codegen_llvm_autodiff_without_lto)]
+#[note]
+pub(crate) struct AutoDiffWithoutLTO; // emitted by `autodiff` when `cgcx.lto != Lto::Fat`
+
 #[derive(Diagnostic)]
 #[diag(codegen_llvm_lto_disallowed)]
 pub(crate) struct LtoDisallowed;
@@ -122,6 +127,8 @@ pub enum LlvmError<'a> {
     PrepareThinLtoModule,
     #[diag(codegen_llvm_parse_bitcode)]
     ParseBitcode,
+    #[diag(codegen_llvm_prepare_autodiff)]
+    PrepareAutoDiff { src: String, target: String, error: String },
 }
 
 pub(crate) struct WithLlvmError<'a>(pub LlvmError<'a>, pub String);
@@ -143,6 +150,7 @@ impl<G: EmissionGuarantee> Diagnostic<'_, G> for WithLlvmError<'_> {
             }
             PrepareThinLtoModule => fluent::codegen_llvm_prepare_thin_lto_module_with_llvm_err,
             ParseBitcode => fluent::codegen_llvm_parse_bitcode_with_llvm_err,
+            PrepareAutoDiff { .. } => fluent::codegen_llvm_prepare_autodiff_with_llvm_err,
         };
         self.0
             .into_diag(dcx, level)
diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs
index b85d28a2f1f71..538c0965104b7 100644
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@@ -27,23 +27,26 @@ use std::mem::ManuallyDrop;
 
 use back::owned_target_machine::OwnedTargetMachine;
 use back::write::{create_informational_target_machine, create_target_machine};
-use errors::ParseTargetMachineConfig;
+use errors::{AutoDiffWithoutLTO, ParseTargetMachineConfig};
+#[allow(unused_imports)]
+use llvm::TypeTree;
 pub use llvm_util::target_features;
 use rustc_ast::expand::allocator::AllocatorKind;
+use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule};
 use rustc_codegen_ssa::back::write::{
     CodegenContext, FatLtoInput, ModuleConfig, TargetMachineFactoryConfig, TargetMachineFactoryFn,
 };
 use rustc_codegen_ssa::traits::*;
 use rustc_codegen_ssa::{CodegenResults, CompiledModule, ModuleCodegen};
-use rustc_data_structures::fx::FxIndexMap;
+use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
 use rustc_errors::{DiagCtxtHandle, ErrorGuaranteed, FatalError};
 use rustc_metadata::EncodedMetadata;
 use rustc_middle::dep_graph::{WorkProduct, WorkProductId};
 use rustc_middle::ty::TyCtxt;
 use rustc_middle::util::Providers;
 use rustc_session::Session;
-use rustc_session::config::{OptLevel, OutputFilenames, PrintKind, PrintRequest};
+use rustc_session::config::{Lto, OptLevel, OutputFilenames, PrintKind, PrintRequest};
 use rustc_span::symbol::Symbol;
 
 mod back {
@@ -69,6 +72,7 @@ mod debuginfo;
 mod declare;
 mod errors;
 mod intrinsic;
+mod typetree;
 
 // The following is a workaround that replaces `pub mod llvm;` and that fixes issue 53912.
 #[path = "llvm/mod.rs"]
@@ -164,6 +168,7 @@ impl WriteBackendMethods for LlvmCodegenBackend {
     type TargetMachineError = crate::errors::LlvmError<'static>;
     type ThinData = back::lto::ThinData;
     type ThinBuffer = back::lto::ThinBuffer;
+    type TypeTree = DiffTypeTree;
     fn print_pass_timings(&self) {
         unsafe {
             let mut size = 0;
@@ -250,6 +255,26 @@ impl WriteBackendMethods for LlvmCodegenBackend {
     fn serialize_module(module: ModuleCodegen<Self::Module>) -> (String, Self::ModuleBuffer) {
         (module.name, back::lto::ModuleBuffer::new(module.module_llvm.llmod()))
     }
+    /// Differentiates `diff_fncs` in `module` via `back::write::differentiate`.
+    /// Fails with an `AutoDiffWithoutLTO` diagnostic unless fat LTO is enabled.
+    fn autodiff(
+        cgcx: &CodegenContext<Self>,
+        module: &ModuleCodegen<Self::Module>,
+        diff_fncs: Vec<AutoDiffItem>,
+        typetrees: FxHashMap<String, Self::TypeTree>,
+        config: &ModuleConfig,
+    ) -> Result<(), FatalError> {
+        if !matches!(cgcx.lto, Lto::Fat) {
+            return Err(cgcx.create_dcx().handle().emit_almost_fatal(AutoDiffWithoutLTO));
+        }
+        unsafe { back::write::differentiate(module, cgcx, diff_fncs, typetrees, config) }
+    }
+
+    /// Moves all type trees out of `module`, leaving its map empty.
+    fn typetrees(module: &mut Self::Module) -> FxHashMap<String, Self::TypeTree> {
+        // Taking the whole map avoids iterating it, so hash-map ordering never comes into play.
+        std::mem::take(&mut module.typetrees)
+    }
 }
 
 unsafe impl Send for LlvmCodegenBackend {} // Llvm is on a per-thread basis
@@ -405,6 +430,13 @@ impl CodegenBackend for LlvmCodegenBackend {
     }
 }
 
+#[derive(Clone, Debug)]
+pub struct DiffTypeTree {
+    pub ret_tt: TypeTree, // presumably the type tree of the return value; TODO confirm
+    pub input_tt: Vec<TypeTree>, // presumably one type tree per input, in order; TODO confirm
+}
+
+#[allow(dead_code)]
 pub struct ModuleLlvm {
     llcx: &'static mut llvm::Context,
     llmod_raw: *const llvm::Module,
@@ -412,6 +444,7 @@ pub struct ModuleLlvm {
     // This field is `ManuallyDrop` because it is important that the `TargetMachine`
     // is disposed prior to the `Context` being disposed otherwise UAFs can occur.
     tm: ManuallyDrop<OwnedTargetMachine>,
+    typetrees: FxHashMap<String, DiffTypeTree>,
 }
 
 unsafe impl Send for ModuleLlvm {}
@@ -426,6 +459,7 @@ impl ModuleLlvm {
                 llmod_raw,
                 llcx,
                 tm: ManuallyDrop::new(create_target_machine(tcx, mod_name)),
+                typetrees: Default::default(),
             }
         }
     }
@@ -438,6 +472,7 @@ impl ModuleLlvm {
                 llmod_raw,
                 llcx,
                 tm: ManuallyDrop::new(create_informational_target_machine(tcx.sess, false)),
+                typetrees: Default::default(),
             }
         }
     }
@@ -459,7 +494,12 @@ impl ModuleLlvm {
                 }
             };
 
-            Ok(ModuleLlvm { llmod_raw, llcx, tm: ManuallyDrop::new(tm) })
+            Ok(ModuleLlvm {
+                llmod_raw,
+                llcx,
+                tm: ManuallyDrop::new(tm),
+                typetrees: Default::default(),
+            })
         }
     }
 
diff --git a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
new file mode 100644
index 0000000000000..f40d98883f746
--- /dev/null
+++ b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
@@ -0,0 +1,809 @@
+#![allow(non_camel_case_types)]
+
+use libc::{c_char, c_uint, size_t};
+use rustc_ast::expand::autodiff_attrs::DiffActivity;
+use tracing::trace;
+
+use super::ffi::*;
+
+extern "C" {
+    // Extra LLVM bindings used by the Enzyme (autodiff) integration.
+    // NOTE(review): presumably the `LLVMRust*` names are rustc's own C++ wrappers and the plain
+    // `LLVM*` names are LLVM-C functions — confirm against the wrapper sources.
+    pub fn LLVMRustAddFncParamAttr<'a>(F: &'a Value, index: c_uint, Attr: &'a Attribute);
+
+    pub fn LLVMRustAddRetFncAttr(F: &Value, attr: &Attribute);
+    pub fn LLVMRustHasMetadata(I: &Value, KindID: c_uint) -> bool;
+    pub fn LLVMRustEraseInstBefore(BB: &BasicBlock, I: &Value);
+    // NOTE(review): the returned lifetime `'a` is not tied to any argument here (nor in
+    // `LLVMRustGetTerminator` below), so the caller picks it freely.
+    pub fn LLVMRustGetLastInstruction<'a>(BB: &BasicBlock) -> Option<&'a Value>;
+    pub fn LLVMRustDIGetInstMetadata(I: &Value) -> &Metadata;
+    pub fn LLVMRustEraseInstFromParent(V: &Value);
+    pub fn LLVMRustGetTerminator<'a>(B: &BasicBlock) -> &'a Value;
+    pub fn LLVMGetReturnType(T: &Type) -> &Type;
+    pub fn LLVMRustIsStructType(T: &Type) -> bool;
+    pub fn LLVMDumpModule(M: &Module);
+    pub fn LLVMCountStructElementTypes(T: &Type) -> c_uint;
+    pub fn LLVMVerifyFunction(V: &Value, action: LLVMVerifierFailureAction) -> bool;
+    // `parms` is an out-buffer; per the LLVM-C documentation it must have room for at least
+    // `LLVMCountParams(Fnc)` entries.
+    pub fn LLVMGetParams(Fnc: &Value, parms: *mut &Value);
+    pub fn LLVMBuildCall2<'a>(
+        arg1: &Builder<'a>,
+        ty: &Type,
+        func: &Value,
+        args: *mut &Value,
+        num_args: size_t,
+        name: *const c_char,
+    ) -> &'a Value;
+    // Function lookup: by iteration (`None` marks the end) or by name (`None` if absent).
+    pub fn LLVMGetFirstFunction(M: &Module) -> Option<&Value>;
+    pub fn LLVMGetNextFunction(V: &Value) -> Option<&Value>;
+    pub fn LLVMGetNamedFunction(M: &Module, Name: *const c_char) -> Option<&Value>;
+    //pub fn LLVMGlobalGetValueType(val: &Value) -> &Type;
+    pub fn LLVMRustGetFunctionType(fnc: &Value) -> &Type;
+
+    // String attributes are addressed by a key given as pointer `K` plus length `KLen`.
+    pub fn LLVMRemoveStringAttributeAtIndex(F: &Value, Idx: c_uint, K: *const c_char, KLen: c_uint);
+    pub fn LLVMGetStringAttributeAtIndex(
+        F: &Value,
+        Idx: c_uint,
+        K: *const c_char,
+        KLen: c_uint,
+    ) -> &Attribute;
+    pub fn LLVMIsEnumAttribute(A: &Attribute) -> bool;
+    pub fn LLVMIsStringAttribute(A: &Attribute) -> bool;
+    // Enum attributes are addressed by rustc's `AttributeKind`.
+    pub fn LLVMRustAddEnumAttributeAtIndex(
+        C: &Context,
+        V: &Value,
+        index: c_uint,
+        attr: AttributeKind,
+    );
+    pub fn LLVMRustRemoveEnumAttributeAtIndex(V: &Value, index: c_uint, attr: AttributeKind);
+    pub fn LLVMRustGetEnumAttributeAtIndex(
+        V: &Value,
+        index: c_uint,
+        attr: AttributeKind,
+    ) -> &Attribute;
+
+    pub fn LLVMRustAddParamAttr<'a>(Instr: &'a Value, index: c_uint, Attr: &'a Attribute);
+
+}
+
+/// Mirrors the LLVM-C `LLVMVerifierFailureAction` enum; passed to `LLVMVerifyFunction` to
+/// select what the verifier does when it finds a problem.
+#[repr(C)]
+pub enum LLVMVerifierFailureAction {
+    /// The verifier prints its message to stderr and aborts the process.
+    LLVMAbortProcessAction,
+    /// The verifier prints its message to stderr and reports failure to the caller.
+    LLVMPrintMessageAction,
+    /// The verifier prints nothing and only reports failure to the caller.
+    LLVMReturnStatusAction,
+}
+
+/// Asks Enzyme to generate the forward-mode derivative of `fnc`.
+///
+/// `input_diffactivity` must hold exactly one activity per LLVM parameter of `fnc`. The
+/// value returned by `EnzymeCreateForwardDiff` is passed through; the accompanying vector is
+/// always empty in forward mode.
+///
+/// # Panics
+///
+/// * if the return activity maps to `DFT_OUT_DIFF`,
+/// * if an input activity maps to anything but `DFT_CONSTANT`, `DFT_DUP_ARG` or
+///   `DFT_DUP_NONEED` (or cannot be mapped at all, see `cdiffe_from`),
+/// * if the number of activities differs from the parameter count of `fnc`.
+///
+/// # Safety
+///
+/// `logic_ref` and `type_analysis` are handed to Enzyme unchecked.
+/// NOTE(review): presumably both must be live handles obtained from `CreateEnzymeLogic` and
+/// `CreateTypeAnalysis` — confirm.
+pub(crate) unsafe fn enzyme_rust_forward_diff(
+    logic_ref: EnzymeLogicRef,
+    type_analysis: EnzymeTypeAnalysisRef,
+    fnc: &Value,
+    input_diffactivity: Vec<DiffActivity>,
+    ret_diffactivity: DiffActivity,
+    void_ret: bool,
+) -> (&Value, Vec<usize>) {
+    let ret_activity = cdiffe_from(ret_diffactivity);
+    assert!(ret_activity != CDIFFE_TYPE::DFT_OUT_DIFF);
+
+    // Translate every input activity, rejecting the ones forward mode cannot handle.
+    let input_activity: Vec<CDIFFE_TYPE> = input_diffactivity
+        .into_iter()
+        .map(|activity| {
+            let act = cdiffe_from(activity);
+            assert!(
+                act == CDIFFE_TYPE::DFT_CONSTANT
+                    || act == CDIFFE_TYPE::DFT_DUP_ARG
+                    || act == CDIFFE_TYPE::DFT_DUP_NONEED
+            );
+            act
+        })
+        .collect();
+
+    // A function without a return value can never hand back a primal result.
+    let ret_primary_ret = !void_ret
+        && match ret_activity {
+            CDIFFE_TYPE::DFT_CONSTANT | CDIFFE_TYPE::DFT_DUP_ARG => true,
+            CDIFFE_TYPE::DFT_DUP_NONEED => false,
+            _ => panic!("Implementation error in enzyme_rust_forward_diff."),
+        };
+    trace!("ret_primary_ret: {}", &ret_primary_ret);
+
+    let n_inputs = input_activity.len();
+
+    // Volatile / extern / (global?) values are not supported yet: they have not been tested
+    // and seemed less urgent. Hence no argument is marked as uncacheable.
+    let args_uncacheable = vec![0; n_inputs];
+    let num_fnc_args = unsafe { LLVMCountParams(fnc) };
+    trace!("num_fnc_args: {}", num_fnc_args);
+    trace!("input_activity.len(): {}", input_activity.len());
+    assert!(num_fnc_args == input_activity.len() as u32);
+
+    // Placeholder type information: no known values, and freshly created trees everywhere.
+    let mut known_values = vec![IntList { data: std::ptr::null_mut(), size: 0 }; n_inputs];
+
+    // Both trees have to stay bound to a name until after the Enzyme call: dropping a
+    // `TypeTree` frees the handle whose raw copy is stored in `dummy_type`.
+    let arg_tree = TypeTree::new();
+    let mut args_tree = vec![arg_tree.inner; n_inputs];
+    let ret_tree = TypeTree::new();
+    let dummy_type = CFnTypeInfo {
+        Arguments: args_tree.as_mut_ptr(),
+        Return: ret_tree.inner,
+        KnownValues: known_values.as_mut_ptr(),
+    };
+
+    trace!("ret_activity: {}", &ret_activity);
+    input_activity.iter().for_each(|i| trace!("input_activity i: {}", &i));
+    trace!("before calling Enzyme");
+    let res = unsafe {
+        EnzymeCreateForwardDiff(
+            logic_ref,                        // arg1: Enzyme logic
+            std::ptr::null(),                 // _builderCtx
+            std::ptr::null(),                 // _callerCtx
+            fnc,                              // todiff
+            ret_activity,                     // retType
+            input_activity.as_ptr(),          // constant_args
+            input_activity.len(),             // constant_args_size
+            type_analysis,                    // TA
+            ret_primary_ret as u8,            // returnValue
+            CDerivativeMode::DEM_ForwardMode, // mode
+            1,                                // freeMemory
+            1,                                // width
+            Option::None,                     // additionalArg
+            dummy_type,                       // typeInfo
+            args_uncacheable.as_ptr(),        // _uncacheable_args
+            args_uncacheable.len(),           // uncacheable_args_size
+            std::ptr::null_mut(),             // augmented
+        )
+    };
+    trace!("after calling Enzyme");
+    (res, Vec::new())
+}
+
+/// Asks Enzyme to generate the combined reverse-mode derivative of `fnc`.
+///
+/// `rust_input_activity` must hold exactly one activity per LLVM parameter of `fnc`. Inputs
+/// marked `DiffActivity::FakeActivitySize` are passed to Enzyme as constants and their
+/// indices are returned as the second tuple element; the first element is the value
+/// returned by `EnzymeCreatePrimalAndGradient`.
+///
+/// # Panics
+///
+/// * if `ret_activity` is not one of `Const`, `Active`, `ActiveOnly` or `None`,
+/// * if the number of activities differs from the parameter count of `fnc`.
+///
+/// # Safety
+///
+/// `logic_ref` and `type_analysis` are handed to Enzyme unchecked.
+/// NOTE(review): presumably both must be live handles obtained from `CreateEnzymeLogic` and
+/// `CreateTypeAnalysis` — confirm.
+pub(crate) unsafe fn enzyme_rust_reverse_diff(
+    logic_ref: EnzymeLogicRef,
+    type_analysis: EnzymeTypeAnalysisRef,
+    fnc: &Value,
+    rust_input_activity: Vec<DiffActivity>,
+    ret_activity: DiffActivity,
+) -> (&Value, Vec<usize>) {
+    // First element: whether the primal return value is requested from Enzyme.
+    let (primary_ret, ret_activity) = match ret_activity {
+        DiffActivity::Const => (true, CDIFFE_TYPE::DFT_CONSTANT),
+        DiffActivity::Active => (true, CDIFFE_TYPE::DFT_OUT_DIFF),
+        DiffActivity::ActiveOnly => (false, CDIFFE_TYPE::DFT_OUT_DIFF),
+        DiffActivity::None => (false, CDIFFE_TYPE::DFT_CONSTANT),
+        _ => panic!("Invalid return activity"),
+    };
+    // This only is needed for split-mode AD, which we don't support.
+    // See Julia:
+    // https://github.com/EnzymeAD/Enzyme.jl/blob/a511e4e6979d6161699f5c9919d49801c0764a09/src/compiler.jl#L3132
+    // https://github.com/EnzymeAD/Enzyme.jl/blob/a511e4e6979d6161699f5c9919d49801c0764a09/src/compiler.jl#L3092
+    let diff_ret = false;
+
+    // Size markers have no Enzyme activity of their own: remember where they are and tell
+    // Enzyme to treat the argument as a constant.
+    let mut primal_sizes = vec![];
+    let mut input_activity: Vec<CDIFFE_TYPE> = vec![];
+    for (i, &x) in rust_input_activity.iter().enumerate() {
+        if is_size(x) {
+            primal_sizes.push(i);
+            input_activity.push(CDIFFE_TYPE::DFT_CONSTANT);
+            continue;
+        }
+        input_activity.push(cdiffe_from(x));
+    }
+
+    // Volatile / extern / (global?) values are not supported yet: they have not been tested
+    // and seemed less urgent. Hence no argument is marked as uncacheable.
+    let args_uncacheable = vec![0; input_activity.len()];
+    let num_fnc_args = unsafe { LLVMCountParams(fnc) };
+    // Diagnostics go through `trace!`, like in `enzyme_rust_forward_diff`, so the compiler
+    // never writes them to stdout.
+    trace!("num_fnc_args: {}", num_fnc_args);
+    trace!("input_activity.len(): {}", input_activity.len());
+    assert!(num_fnc_args == input_activity.len() as u32);
+    let kv_tmp = IntList { data: std::ptr::null_mut(), size: 0 };
+
+    let mut known_values = vec![kv_tmp; input_activity.len()];
+
+    // `tree_tmp` and `ret_tt` have to stay alive until after the Enzyme call: dropping a
+    // `TypeTree` frees the handle whose raw copy is stored in `dummy_type`.
+    let tree_tmp = TypeTree::new();
+    let mut args_tree = vec![tree_tmp.inner; input_activity.len()];
+    let ret_tt = TypeTree::new();
+    let dummy_type = CFnTypeInfo {
+        Arguments: args_tree.as_mut_ptr(),
+        Return: ret_tt.inner,
+        KnownValues: known_values.as_mut_ptr(),
+    };
+
+    trace!("primary_ret: {}", &primary_ret);
+    trace!("ret_activity: {}", &ret_activity);
+    for i in &input_activity {
+        trace!("input_activity i: {}", &i);
+    }
+    trace!("before calling Enzyme");
+    let res = unsafe {
+        EnzymeCreatePrimalAndGradient(
+            logic_ref,                                // arg1: Enzyme logic
+            std::ptr::null(),                         // _builderCtx
+            std::ptr::null(),                         // _callerCtx
+            fnc,                                      // todiff
+            ret_activity,                             // retType
+            input_activity.as_ptr(),                  // constant_args
+            input_activity.len(),                     // constant_args_size
+            type_analysis,                            // TA
+            primary_ret as u8,                        // returnValue
+            diff_ret as u8,                           // dretUsed
+            CDerivativeMode::DEM_ReverseModeCombined, // mode
+            1,                                        // width
+            1,                                        // freeMemory
+            Option::None,                             // additionalArg
+            0,                                        // forceAnonymousTape
+            dummy_type,                               // typeInfo
+            args_uncacheable.as_ptr(),                // _uncacheable_args
+            args_uncacheable.len(),                   // uncacheable_args_size
+            std::ptr::null_mut(),                     // augmented
+            0,                                        // AtomicAdd
+        )
+    };
+    trace!("after calling Enzyme");
+    (res, primal_sizes)
+}
+
+#[cfg(not(llvm_enzyme))]
+pub use self::Fallback_AD::*;
+
+/// Stand-ins for the items of `Enzyme_AD`, compiled when `llvm_enzyme` is not set.
+///
+/// Every function keeps the name and parameter list of its Enzyme counterpart and panics
+/// through `unimplemented!()` when called.
+#[cfg(not(llvm_enzyme))]
+pub mod Fallback_AD {
+    #![allow(unused_variables)]
+    use super::*;
+
+    pub fn EnzymeNewTypeTree() -> CTypeTreeRef {
+        unimplemented!()
+    }
+    pub fn EnzymeFreeTypeTree(CTT: CTypeTreeRef) {
+        unimplemented!()
+    }
+    pub fn EnzymeSetCLBool(arg1: *mut ::std::os::raw::c_void, arg2: u8) {
+        unimplemented!()
+    }
+    pub fn EnzymeSetCLInteger(arg1: *mut ::std::os::raw::c_void, arg2: i64) {
+        unimplemented!()
+    }
+
+    pub fn set_inline(val: bool) {
+        unimplemented!()
+    }
+    pub fn set_runtime_activity_check(check: bool) {
+        unimplemented!()
+    }
+    pub fn set_max_int_offset(offset: u64) {
+        unimplemented!()
+    }
+    pub fn set_max_type_offset(offset: u64) {
+        unimplemented!()
+    }
+    pub fn set_max_type_depth(depth: u64) {
+        unimplemented!()
+    }
+    pub fn set_print_perf(print: bool) {
+        unimplemented!()
+    }
+    pub fn set_print_activity(print: bool) {
+        unimplemented!()
+    }
+    pub fn set_print_type(print: bool) {
+        unimplemented!()
+    }
+    pub fn set_print(print: bool) {
+        unimplemented!()
+    }
+    pub fn set_strict_aliasing(strict: bool) {
+        unimplemented!()
+    }
+    pub fn set_loose_types(loose: bool) {
+        unimplemented!()
+    }
+
+    pub fn EnzymeCreatePrimalAndGradient<'a>(
+        arg1: EnzymeLogicRef,
+        _builderCtx: *const u8, // &'a Builder<'_>,
+        _callerCtx: *const u8,  // &'a Value,
+        todiff: &'a Value,
+        retType: CDIFFE_TYPE,
+        constant_args: *const CDIFFE_TYPE,
+        constant_args_size: size_t,
+        TA: EnzymeTypeAnalysisRef,
+        returnValue: u8,
+        dretUsed: u8,
+        mode: CDerivativeMode,
+        width: ::std::os::raw::c_uint,
+        freeMemory: u8,
+        additionalArg: Option<&Type>,
+        forceAnonymousTape: u8,
+        typeInfo: CFnTypeInfo,
+        _uncacheable_args: *const u8,
+        uncacheable_args_size: size_t,
+        augmented: EnzymeAugmentedReturnPtr,
+        AtomicAdd: u8,
+    ) -> &'a Value {
+        unimplemented!()
+    }
+    pub fn EnzymeCreateForwardDiff<'a>(
+        arg1: EnzymeLogicRef,
+        _builderCtx: *const u8, // &'a Builder<'_>,
+        _callerCtx: *const u8,  // &'a Value,
+        todiff: &'a Value,
+        retType: CDIFFE_TYPE,
+        constant_args: *const CDIFFE_TYPE,
+        constant_args_size: size_t,
+        TA: EnzymeTypeAnalysisRef,
+        returnValue: u8,
+        mode: CDerivativeMode,
+        freeMemory: u8,
+        width: ::std::os::raw::c_uint,
+        additionalArg: Option<&Type>,
+        typeInfo: CFnTypeInfo,
+        _uncacheable_args: *const u8,
+        uncacheable_args_size: size_t,
+        augmented: EnzymeAugmentedReturnPtr,
+    ) -> &'a Value {
+        unimplemented!()
+    }
+    pub type CustomRuleType = ::std::option::Option<
+        unsafe extern "C" fn(
+            direction: ::std::os::raw::c_int,
+            ret: CTypeTreeRef,
+            args: *mut CTypeTreeRef,
+            known_values: *mut IntList,
+            num_args: size_t,
+            fnc: &Value,
+            ta: *const ::std::os::raw::c_void,
+        ) -> u8,
+    >;
+    /// Stub like every other function in this module.
+    ///
+    /// It is defined in Rust rather than declared `extern "C"` because no Enzyme library
+    /// provides the symbol in this configuration.
+    ///
+    /// # Safety
+    ///
+    /// There is nothing to uphold here, as the function always panics. It is `unsafe` only
+    /// so that call sites written against the `extern "C"` declaration in `Enzyme_AD`
+    /// compile unchanged.
+    pub unsafe fn CreateTypeAnalysis(
+        Log: EnzymeLogicRef,
+        customRuleNames: *mut *mut ::std::os::raw::c_char,
+        customRules: *mut CustomRuleType,
+        numRules: size_t,
+    ) -> EnzymeTypeAnalysisRef {
+        unimplemented!()
+    }
+    //pub fn ClearTypeAnalysis(arg1: EnzymeTypeAnalysisRef) { unimplemented!() }
+    pub fn FreeTypeAnalysis(arg1: EnzymeTypeAnalysisRef) {
+        unimplemented!()
+    }
+    pub fn CreateEnzymeLogic(PostOpt: u8) -> EnzymeLogicRef {
+        unimplemented!()
+    }
+    pub fn ClearEnzymeLogic(arg1: EnzymeLogicRef) {
+        unimplemented!()
+    }
+    pub fn FreeEnzymeLogic(arg1: EnzymeLogicRef) {
+        unimplemented!()
+    }
+
+    pub fn EnzymeNewTypeTreeCT(arg1: CConcreteType, ctx: &Context) -> CTypeTreeRef {
+        unimplemented!()
+    }
+    pub fn EnzymeNewTypeTreeTR(arg1: CTypeTreeRef) -> CTypeTreeRef {
+        unimplemented!()
+    }
+    pub fn EnzymeMergeTypeTree(arg1: CTypeTreeRef, arg2: CTypeTreeRef) -> bool {
+        unimplemented!()
+    }
+    pub fn EnzymeTypeTreeOnlyEq(arg1: CTypeTreeRef, pos: i64) {
+        unimplemented!()
+    }
+    pub fn EnzymeTypeTreeData0Eq(arg1: CTypeTreeRef) {
+        unimplemented!()
+    }
+    pub fn EnzymeTypeTreeShiftIndiciesEq(
+        arg1: CTypeTreeRef,
+        data_layout: *const c_char,
+        offset: i64,
+        max_size: i64,
+        add_offset: u64,
+    ) {
+        unimplemented!()
+    }
+    pub fn EnzymeTypeTreeToStringFree(arg1: *const c_char) {
+        unimplemented!()
+    }
+    pub fn EnzymeTypeTreeToString(arg1: CTypeTreeRef) -> *const c_char {
+        unimplemented!()
+    }
+}
+
+// Enzyme-specific, but doesn't require Enzyme to be built.
+pub use self::Shared_AD::*;
+pub mod Shared_AD {
+    // Depending on the AD backend (Enzyme or Fallback), some functions might or might not be
+    // unsafe, so we always call them in an unsafe context.
+    #![allow(unused_unsafe)]
+    #![allow(unused_variables)]
+
+    use core::fmt;
+    use std::ffi::{CStr, CString};
+
+    use libc::size_t;
+    use rustc_ast::expand::autodiff_attrs::DiffActivity;
+
+    use super::Context;
+    #[cfg(llvm_enzyme)]
+    use super::Enzyme_AD::*;
+    #[cfg(not(llvm_enzyme))]
+    use super::Fallback_AD::*;
+    /// Activity of a return value or argument in the form Enzyme expects it (`retType` and
+    /// `constant_args` of the `EnzymeCreate*` functions). `cdiffe_from` produces these from
+    /// a `DiffActivity`. The variant descriptions follow Enzyme's C API header.
+    #[repr(u32)]
+    #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+    pub enum CDIFFE_TYPE {
+        /// The differential is added to an output struct.
+        DFT_OUT_DIFF = 0,
+        /// The argument is duplicated and the differential is stored in the duplicate.
+        DFT_DUP_ARG = 1,
+        /// No differential.
+        DFT_CONSTANT = 2,
+        /// Like `DFT_DUP_ARG`, but the forward (primal) value is not needed.
+        DFT_DUP_NONEED = 3,
+    }
+
+    /// Prints the variant under its C name, e.g. `DFT_CONSTANT`.
+    impl fmt::Display for CDIFFE_TYPE {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            f.write_str(match self {
+                CDIFFE_TYPE::DFT_OUT_DIFF => "DFT_OUT_DIFF",
+                CDIFFE_TYPE::DFT_DUP_ARG => "DFT_DUP_ARG",
+                CDIFFE_TYPE::DFT_CONSTANT => "DFT_CONSTANT",
+                CDIFFE_TYPE::DFT_DUP_NONEED => "DFT_DUP_NONEED",
+            })
+        }
+    }
+
+    /// Translates a `DiffActivity` into the `CDIFFE_TYPE` that is passed to Enzyme.
+    ///
+    /// # Panics
+    ///
+    /// Panics on `DiffActivity::FakeActivitySize`, which has no `CDIFFE_TYPE` counterpart;
+    /// callers have to filter it out beforehand (see `is_size`).
+    pub fn cdiffe_from(act: DiffActivity) -> CDIFFE_TYPE {
+        match act {
+            DiffActivity::None | DiffActivity::Const => CDIFFE_TYPE::DFT_CONSTANT,
+            DiffActivity::Active | DiffActivity::ActiveOnly => CDIFFE_TYPE::DFT_OUT_DIFF,
+            DiffActivity::Dual | DiffActivity::Duplicated => CDIFFE_TYPE::DFT_DUP_ARG,
+            DiffActivity::DualOnly | DiffActivity::DuplicatedOnly => CDIFFE_TYPE::DFT_DUP_NONEED,
+            DiffActivity::FakeActivitySize => panic!("Implementation error"),
+        }
+    }
+
+    /// Tells whether `act` is the `DiffActivity::FakeActivitySize` marker.
+    pub fn is_size(act: DiffActivity) -> bool {
+        act == DiffActivity::FakeActivitySize
+    }
+
+    /// Differentiation mode passed as `mode` to the `EnzymeCreate*` functions. Within this
+    /// file only `DEM_ForwardMode` and `DEM_ReverseModeCombined` are used.
+    #[repr(u32)]
+    #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+    pub enum CDerivativeMode {
+        DEM_ForwardMode = 0,
+        DEM_ReverseModePrimal = 1,
+        DEM_ReverseModeGradient = 2,
+        DEM_ReverseModeCombined = 3,
+        DEM_ForwardModeSplit = 4,
+    }
+    // Opaque Enzyme objects. Each is modelled as a zero-sized `#[repr(C)]` struct and is only
+    // ever handled through the raw-pointer alias that follows it.
+
+    /// Opaque pointee of `EnzymeTypeAnalysisRef`.
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone)]
+    pub struct EnzymeOpaqueTypeAnalysis {
+        _unused: [u8; 0],
+    }
+    /// Handle returned by `CreateTypeAnalysis` and released by `FreeTypeAnalysis`.
+    pub type EnzymeTypeAnalysisRef = *mut EnzymeOpaqueTypeAnalysis;
+    /// Opaque pointee of `EnzymeLogicRef`.
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone)]
+    pub struct EnzymeOpaqueLogic {
+        _unused: [u8; 0],
+    }
+    /// Handle returned by `CreateEnzymeLogic` and released by `FreeEnzymeLogic`.
+    pub type EnzymeLogicRef = *mut EnzymeOpaqueLogic;
+    /// Opaque pointee of `EnzymeAugmentedReturnPtr`.
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone)]
+    pub struct EnzymeOpaqueAugmentedReturn {
+        _unused: [u8; 0],
+    }
+    /// Type of the `augmented` argument of the `EnzymeCreate*` functions.
+    pub type EnzymeAugmentedReturnPtr = *mut EnzymeOpaqueAugmentedReturn;
+    /// C-layout list of `i64` values given as pointer plus length; used for
+    /// `CFnTypeInfo::KnownValues`. An empty list is a null `data` with `size` 0.
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone)]
+    pub struct IntList {
+        /// First element, or null for an empty list.
+        pub data: *mut i64,
+        /// Number of elements behind `data`.
+        pub size: size_t,
+    }
+    /// Scalar kinds a type-tree leaf can have; passed to `EnzymeNewTypeTreeCT` through
+    /// `TypeTree::from_type`.
+    #[repr(u32)]
+    #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+    pub enum CConcreteType {
+        DT_Anything = 0,
+        DT_Integer = 1,
+        DT_Pointer = 2,
+        DT_Half = 3,
+        DT_Float = 4,
+        DT_Double = 5,
+        DT_Unknown = 6,
+    }
+
+    /// Raw handle to an Enzyme type tree.
+    pub type CTypeTreeRef = *mut EnzymeTypeTree;
+    /// Opaque pointee of `CTypeTreeRef`.
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone)]
+    pub struct EnzymeTypeTree {
+        _unused: [u8; 0],
+    }
+    /// Owning wrapper around a `CTypeTreeRef`: the handle is freed with `EnzymeFreeTypeTree`
+    /// when the wrapper is dropped, so raw copies of `inner` must not outlive it.
+    pub struct TypeTree {
+        /// The wrapped raw handle.
+        pub inner: CTypeTreeRef,
+    }
+
+    impl TypeTree {
+        /// Creates a new tree through `EnzymeNewTypeTree`.
+        pub fn new() -> TypeTree {
+            TypeTree { inner: unsafe { EnzymeNewTypeTree() } }
+        }
+
+        /// Creates a tree from the concrete type `t` through `EnzymeNewTypeTreeCT`.
+        #[must_use]
+        pub fn from_type(t: CConcreteType, ctx: &Context) -> TypeTree {
+            TypeTree { inner: unsafe { EnzymeNewTypeTreeCT(t, ctx) } }
+        }
+
+        /// Applies `EnzymeTypeTreeOnlyEq` with position `idx` to the tree in place and
+        /// hands the tree back for chaining.
+        #[must_use]
+        pub fn only(self, idx: isize) -> TypeTree {
+            let pos = idx as i64;
+            unsafe {
+                EnzymeTypeTreeOnlyEq(self.inner, pos);
+            }
+            self
+        }
+
+        /// Applies `EnzymeTypeTreeData0Eq` to the tree in place and hands the tree back for
+        /// chaining.
+        #[must_use]
+        pub fn data0(self) -> TypeTree {
+            let handle = self.inner;
+            unsafe {
+                EnzymeTypeTreeData0Eq(handle);
+            }
+            self
+        }
+
+        /// Merges `other` into `self` through `EnzymeMergeTypeTree` and returns `self`.
+        ///
+        /// The `bool` returned by Enzyme is ignored. `other` is consumed; its handle is
+        /// freed when it goes out of scope at the end of this function.
+        pub fn merge(self, other: Self) -> Self {
+            let _ = unsafe { EnzymeMergeTypeTree(self.inner, other.inner) };
+            self
+        }
+
+        /// Applies `EnzymeTypeTreeShiftIndiciesEq` to the tree in place and hands the tree
+        /// back for chaining.
+        ///
+        /// # Panics
+        ///
+        /// Panics if `layout` contains an interior NUL byte.
+        #[must_use]
+        pub fn shift(
+            self,
+            layout: &str,
+            offset: isize,
+            max_size: isize,
+            add_offset: usize,
+        ) -> Self {
+            // Enzyme wants the data layout as a NUL-terminated C string; `c_layout` has to
+            // outlive the call below.
+            let c_layout = CString::new(layout).unwrap();
+            let offset = offset as i64;
+            let max_size = max_size as i64;
+            let add_offset = add_offset as u64;
+
+            unsafe {
+                EnzymeTypeTreeShiftIndiciesEq(
+                    self.inner,
+                    c_layout.as_ptr(),
+                    offset,
+                    max_size,
+                    add_offset,
+                )
+            }
+
+            self
+        }
+    }
+
+    /// Cloning creates a new Enzyme tree from the existing one via `EnzymeNewTypeTreeTR`, so
+    /// the clone owns its own handle.
+    impl Clone for TypeTree {
+        fn clone(&self) -> Self {
+            let source = self.inner;
+            TypeTree { inner: unsafe { EnzymeNewTypeTreeTR(source) } }
+        }
+    }
+
+    /// Prints the string Enzyme produces for the tree (`EnzymeTypeTreeToString`). If that
+    /// string is not valid UTF-8, `could not parse: <error>` is printed instead.
+    impl fmt::Display for TypeTree {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            let ptr = unsafe { EnzymeTypeTreeToString(self.inner) };
+            let cstr = unsafe { CStr::from_ptr(ptr) };
+            // Keep the outcome of the write instead of returning early with `?`: the C
+            // string has to be freed even when the formatter reports an error.
+            let result = match cstr.to_str() {
+                Ok(x) => write!(f, "{}", x),
+                Err(err) => write!(f, "could not parse: {}", err),
+            };
+
+            // delete C string pointer
+            unsafe { EnzymeTypeTreeToStringFree(ptr) }
+
+            result
+        }
+    }
+
+    /// `Debug` output is the same as the `Display` output.
+    impl fmt::Debug for TypeTree {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            fmt::Display::fmt(self, f)
+        }
+    }
+
+    /// Releases the wrapped handle through `EnzymeFreeTypeTree`.
+    impl Drop for TypeTree {
+        fn drop(&mut self) {
+            let handle = self.inner;
+            unsafe { EnzymeFreeTypeTree(handle) }
+        }
+    }
+
+    /// Type information about a function, passed by value as `typeInfo` to the
+    /// `EnzymeCreate*` functions. It only stores raw pointers, so the arrays and trees they
+    /// refer to must stay alive for as long as the value is in use.
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone)]
+    pub struct CFnTypeInfo {
+        #[doc = " Types of arguments, assumed of size len(Arguments)"]
+        pub Arguments: *mut CTypeTreeRef,
+        #[doc = " Type of return"]
+        pub Return: CTypeTreeRef,
+        #[doc = " The specific constant(s) known to represented by an argument, if constant"]
+        pub KnownValues: *mut IntList,
+    }
+}
+
+#[cfg(llvm_enzyme)]
+pub use self::Enzyme_AD::*;
+
+// Enzyme is an optional component, so we need to provide a fallback (`Fallback_AD`) for when it
+// is not getting compiled. We deny the usage of #[autodiff(..)] on a higher level, so a
+// placeholder implementation there is completely fine. The module below holds the real bindings.
+/// Bindings to the Enzyme library and setters for its options; only compiled when
+/// `llvm_enzyme` is set.
+#[cfg(llvm_enzyme)]
+pub mod Enzyme_AD {
+    use libc::{c_char, c_void, size_t};
+
+    use super::*;
+
+    extern "C" {
+        pub fn EnzymeNewTypeTree() -> CTypeTreeRef;
+        pub fn EnzymeFreeTypeTree(CTT: CTypeTreeRef);
+        // Option setters: `arg1` is the address of one of the statics declared below.
+        pub fn EnzymeSetCLBool(arg1: *mut ::std::os::raw::c_void, arg2: u8);
+        pub fn EnzymeSetCLInteger(arg1: *mut ::std::os::raw::c_void, arg2: i64);
+    }
+
+    // Enzyme's option objects. They are typed as `c_void` because only their addresses are
+    // used: every setter below passes the address to `EnzymeSetCLBool` / `EnzymeSetCLInteger`.
+    // NOTE(review): presumably these are LLVM `cl::opt` globals exported by Enzyme — confirm.
+    extern "C" {
+        static mut MaxIntOffset: c_void;
+        static mut MaxTypeOffset: c_void;
+        static mut EnzymeMaxTypeDepth: c_void;
+
+        static mut EnzymeRuntimeActivityCheck: c_void;
+        static mut EnzymePrintPerf: c_void;
+        static mut EnzymePrintActivity: c_void;
+        static mut EnzymePrintType: c_void;
+        static mut EnzymePrint: c_void;
+        static mut EnzymeStrictAliasing: c_void;
+        static mut looseTypeAnalysis: c_void;
+        static mut EnzymeInline: c_void;
+    }
+    /// Sets Enzyme's `EnzymeRuntimeActivityCheck` option.
+    pub fn set_runtime_activity_check(check: bool) {
+        unsafe {
+            EnzymeSetCLBool(std::ptr::addr_of_mut!(EnzymeRuntimeActivityCheck), check as u8);
+        }
+    }
+    /// Sets Enzyme's `MaxIntOffset` option. Panics if `offset` does not fit into an `i64`.
+    pub fn set_max_int_offset(offset: u64) {
+        let offset = offset.try_into().unwrap();
+        unsafe {
+            EnzymeSetCLInteger(std::ptr::addr_of_mut!(MaxIntOffset), offset);
+        }
+    }
+    /// Sets Enzyme's `MaxTypeOffset` option. Panics if `offset` does not fit into an `i64`.
+    pub fn set_max_type_offset(offset: u64) {
+        let offset = offset.try_into().unwrap();
+        unsafe {
+            EnzymeSetCLInteger(std::ptr::addr_of_mut!(MaxTypeOffset), offset);
+        }
+    }
+    /// Sets Enzyme's `EnzymeMaxTypeDepth` option. Panics if `depth` does not fit into an
+    /// `i64`.
+    pub fn set_max_type_depth(depth: u64) {
+        let depth = depth.try_into().unwrap();
+        unsafe {
+            EnzymeSetCLInteger(std::ptr::addr_of_mut!(EnzymeMaxTypeDepth), depth);
+        }
+    }
+    /// Sets Enzyme's `EnzymePrintPerf` option.
+    pub fn set_print_perf(print: bool) {
+        unsafe {
+            EnzymeSetCLBool(std::ptr::addr_of_mut!(EnzymePrintPerf), print as u8);
+        }
+    }
+    /// Sets Enzyme's `EnzymePrintActivity` option.
+    pub fn set_print_activity(print: bool) {
+        unsafe {
+            EnzymeSetCLBool(std::ptr::addr_of_mut!(EnzymePrintActivity), print as u8);
+        }
+    }
+    /// Sets Enzyme's `EnzymePrintType` option.
+    pub fn set_print_type(print: bool) {
+        unsafe {
+            EnzymeSetCLBool(std::ptr::addr_of_mut!(EnzymePrintType), print as u8);
+        }
+    }
+    /// Sets Enzyme's `EnzymePrint` option.
+    pub fn set_print(print: bool) {
+        unsafe {
+            EnzymeSetCLBool(std::ptr::addr_of_mut!(EnzymePrint), print as u8);
+        }
+    }
+    /// Sets Enzyme's `EnzymeStrictAliasing` option.
+    pub fn set_strict_aliasing(strict: bool) {
+        unsafe {
+            EnzymeSetCLBool(std::ptr::addr_of_mut!(EnzymeStrictAliasing), strict as u8);
+        }
+    }
+    /// Sets Enzyme's `looseTypeAnalysis` option.
+    pub fn set_loose_types(loose: bool) {
+        unsafe {
+            EnzymeSetCLBool(std::ptr::addr_of_mut!(looseTypeAnalysis), loose as u8);
+        }
+    }
+    /// Sets Enzyme's `EnzymeInline` option.
+    pub fn set_inline(val: bool) {
+        unsafe {
+            EnzymeSetCLBool(std::ptr::addr_of_mut!(EnzymeInline), val as u8);
+        }
+    }
+
+    // Entry point used for reverse mode (see `enzyme_rust_reverse_diff`). Note that `width`
+    // comes before `freeMemory` here, the opposite of `EnzymeCreateForwardDiff`.
+    extern "C" {
+        pub fn EnzymeCreatePrimalAndGradient<'a>(
+            arg1: EnzymeLogicRef,
+            _builderCtx: *const u8, // &'a Builder<'_>,
+            _callerCtx: *const u8,  // &'a Value,
+            todiff: &'a Value,
+            retType: CDIFFE_TYPE,
+            constant_args: *const CDIFFE_TYPE,
+            constant_args_size: size_t,
+            TA: EnzymeTypeAnalysisRef,
+            returnValue: u8,
+            dretUsed: u8,
+            mode: CDerivativeMode,
+            width: ::std::os::raw::c_uint,
+            freeMemory: u8,
+            additionalArg: Option<&Type>,
+            forceAnonymousTape: u8,
+            typeInfo: CFnTypeInfo,
+            _uncacheable_args: *const u8,
+            uncacheable_args_size: size_t,
+            augmented: EnzymeAugmentedReturnPtr,
+            AtomicAdd: u8,
+        ) -> &'a Value;
+    }
+    // Entry point used for forward mode (see `enzyme_rust_forward_diff`).
+    extern "C" {
+        pub fn EnzymeCreateForwardDiff<'a>(
+            arg1: EnzymeLogicRef,
+            _builderCtx: *const u8, // &'a Builder<'_>,
+            _callerCtx: *const u8,  // &'a Value,
+            todiff: &'a Value,
+            retType: CDIFFE_TYPE,
+            constant_args: *const CDIFFE_TYPE,
+            constant_args_size: size_t,
+            TA: EnzymeTypeAnalysisRef,
+            returnValue: u8,
+            mode: CDerivativeMode,
+            freeMemory: u8,
+            width: ::std::os::raw::c_uint,
+            additionalArg: Option<&Type>,
+            typeInfo: CFnTypeInfo,
+            _uncacheable_args: *const u8,
+            uncacheable_args_size: size_t,
+            augmented: EnzymeAugmentedReturnPtr,
+        ) -> &'a Value;
+    }
+    /// Optional callback type for the `customRules` array of `CreateTypeAnalysis`.
+    pub type CustomRuleType = ::std::option::Option<
+        unsafe extern "C" fn(
+            direction: ::std::os::raw::c_int,
+            ret: CTypeTreeRef,
+            args: *mut CTypeTreeRef,
+            known_values: *mut IntList,
+            num_args: size_t,
+            fnc: &Value,
+            ta: *const ::std::os::raw::c_void,
+        ) -> u8,
+    >;
+    extern "C" {
+        pub fn CreateTypeAnalysis(
+            Log: EnzymeLogicRef,
+            customRuleNames: *mut *mut ::std::os::raw::c_char,
+            customRules: *mut CustomRuleType,
+            numRules: size_t,
+        ) -> EnzymeTypeAnalysisRef;
+    }
+    // Lifetime management of the logic and type-analysis handles.
+    extern "C" {
+        //pub(super) fn ClearTypeAnalysis(arg1: EnzymeTypeAnalysisRef);
+        pub fn FreeTypeAnalysis(arg1: EnzymeTypeAnalysisRef);
+        pub fn CreateEnzymeLogic(PostOpt: u8) -> EnzymeLogicRef;
+        pub fn ClearEnzymeLogic(arg1: EnzymeLogicRef);
+        pub fn FreeEnzymeLogic(arg1: EnzymeLogicRef);
+    }
+
+    // Type-tree primitives; the safe `TypeTree` wrapper in `Shared_AD` is built on these.
+    extern "C" {
+        pub(super) fn EnzymeNewTypeTreeCT(arg1: CConcreteType, ctx: &Context) -> CTypeTreeRef;
+        pub(super) fn EnzymeNewTypeTreeTR(arg1: CTypeTreeRef) -> CTypeTreeRef;
+        pub(super) fn EnzymeMergeTypeTree(arg1: CTypeTreeRef, arg2: CTypeTreeRef) -> bool;
+        pub(super) fn EnzymeTypeTreeOnlyEq(arg1: CTypeTreeRef, pos: i64);
+        pub(super) fn EnzymeTypeTreeData0Eq(arg1: CTypeTreeRef);
+        pub(super) fn EnzymeTypeTreeShiftIndiciesEq(
+            arg1: CTypeTreeRef,
+            data_layout: *const c_char,
+            offset: i64,
+            max_size: i64,
+            add_offset: u64,
+        );
+        // A string obtained from `EnzymeTypeTreeToString` is released with
+        // `EnzymeTypeTreeToStringFree` (see `Display for TypeTree`).
+        pub fn EnzymeTypeTreeToStringFree(arg1: *const c_char);
+        pub fn EnzymeTypeTreeToString(arg1: CTypeTreeRef) -> *const c_char;
+    }
+}
diff --git a/compiler/rustc_codegen_llvm/src/llvm/mod.rs b/compiler/rustc_codegen_llvm/src/llvm/mod.rs
index e837022044ee0..7e582f949a101 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/mod.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/mod.rs
@@ -20,8 +20,10 @@ pub use self::RealPredicate::*;
 
 pub mod archive_ro;
 pub mod diagnostic;
+pub mod enzyme_ffi;
 mod ffi;
 
+pub use self::enzyme_ffi::*;
 pub use self::ffi::*;
 
 impl LLVMRustResult {
diff --git a/compiler/rustc_codegen_llvm/src/typetree.rs b/compiler/rustc_codegen_llvm/src/typetree.rs
new file mode 100644
index 0000000000000..9612ac335a873
--- /dev/null
+++ b/compiler/rustc_codegen_llvm/src/typetree.rs
@@ -0,0 +1,34 @@
+use rustc_ast::expand::typetree::{Kind, TypeTree};
+
+use crate::llvm;
+
+/// Converts a frontend `TypeTree` into Enzyme's representation.
+///
+/// Every entry of `tree` becomes a leaf of its scalar kind. A non-empty `child` tree is
+/// converted recursively and merged into that leaf. Entries whose `offset` is not `-1` are
+/// shifted (using `llvm_data_layout`, the entry's `size` and its `offset`) before they are
+/// merged into the result.
+///
+/// # Panics
+///
+/// Panics if an entry has a kind other than `Integer`, `Float`, `Double` or `Pointer`.
+pub(crate) fn to_enzyme_typetree(
+    tree: TypeTree,
+    llvm_data_layout: &str,
+    llcx: &llvm::Context,
+) -> llvm::TypeTree {
+    let mut result = llvm::TypeTree::new();
+
+    for entry in tree.0.iter() {
+        let concrete = match entry.kind {
+            Kind::Integer => llvm::CConcreteType::DT_Integer,
+            Kind::Float => llvm::CConcreteType::DT_Float,
+            Kind::Double => llvm::CConcreteType::DT_Double,
+            Kind::Pointer => llvm::CConcreteType::DT_Pointer,
+            _ => panic!("Unknown kind {:?}", entry.kind),
+        };
+
+        let mut leaf = llvm::TypeTree::from_type(concrete, llcx).only(-1);
+
+        // Attach whatever the entry points to.
+        if !entry.child.0.is_empty() {
+            let nested = to_enzyme_typetree(entry.child.clone(), llvm_data_layout, llcx);
+            leaf = leaf.merge(nested.only(-1));
+        }
+
+        // An offset of -1 means the leaf is merged as is, without shifting.
+        if entry.offset != -1 {
+            leaf = leaf.shift(llvm_data_layout, 0, entry.size as isize, entry.offset as usize);
+        }
+
+        result = result.merge(leaf);
+    }
+
+    result
+}
diff --git a/compiler/rustc_codegen_ssa/messages.ftl b/compiler/rustc_codegen_ssa/messages.ftl
index d07274920feaf..d376fdf0f1428 100644
--- a/compiler/rustc_codegen_ssa/messages.ftl
+++ b/compiler/rustc_codegen_ssa/messages.ftl
@@ -351,3 +351,6 @@ codegen_ssa_use_cargo_directive = use the `cargo:rustc-link-lib` directive to sp
 codegen_ssa_version_script_write_failure = failed to write version script: {$error}
 
 codegen_ssa_visual_studio_not_installed = you may need to install Visual Studio build tools with the "C++ build tools" workload
+
+codegen_ssa_autodiff_without_lto = using the autodiff feature requires using fat-lto
+
diff --git a/compiler/rustc_codegen_ssa/src/assert_module_sources.rs b/compiler/rustc_codegen_ssa/src/assert_module_sources.rs
index 11bcd727501c9..27767e498fa36 100644
--- a/compiler/rustc_codegen_ssa/src/assert_module_sources.rs
+++ b/compiler/rustc_codegen_ssa/src/assert_module_sources.rs
@@ -48,7 +48,7 @@ pub fn assert_module_sources(tcx: TyCtxt<'_>, set_reuse: &dyn Fn(&mut CguReuseTr
         }
 
         let available_cgus =
-            tcx.collect_and_partition_mono_items(()).1.iter().map(|cgu| cgu.name()).collect();
+            tcx.collect_and_partition_mono_items(()).2.iter().map(|cgu| cgu.name()).collect();
 
         let mut ams = AssertModuleSource {
             tcx,
diff --git a/compiler/rustc_codegen_ssa/src/back/lto.rs b/compiler/rustc_codegen_ssa/src/back/lto.rs
index ab8b06a05fc74..355a8ffa4a7ea 100644
--- a/compiler/rustc_codegen_ssa/src/back/lto.rs
+++ b/compiler/rustc_codegen_ssa/src/back/lto.rs
@@ -1,11 +1,14 @@
 use std::ffi::CString;
 use std::sync::Arc;
 
+use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
+use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::memmap::Mmap;
 use rustc_errors::FatalError;
 
 use super::write::CodegenContext;
 use crate::ModuleCodegen;
+use crate::back::write::ModuleConfig;
 use crate::traits::*;
 
 pub struct ThinModule<B: WriteBackendMethods> {
@@ -72,6 +75,24 @@ impl<B: WriteBackendMethods> LtoModuleCodegen<B> {
         }
     }
 
+    /// Runs autodiff on a fat LTO module and hands the module back to the caller.
+    ///
+    /// Panics if `self` is not `LtoModuleCodegen::Fat`.
+    pub unsafe fn autodiff(
+        self,
+        cgcx: &CodegenContext<B>,
+        diff_fncs: Vec<AutoDiffItem>,
+        typetrees: FxHashMap<String, B::TypeTree>,
+        config: &ModuleConfig,
+    ) -> Result<LtoModuleCodegen<B>, FatalError> {
+        let LtoModuleCodegen::Fat(module) = &self else {
+            panic!("Unreachable? Autodiff called with non-fat LTO module");
+        };
+        B::autodiff(cgcx, module, diff_fncs, typetrees, config)?;
+
+        Ok(self)
+    }
+
     /// A "gauge" of how costly it is to optimize this module, used to sort
     /// biggest modules first.
     pub fn cost(&self) -> u64 {
diff --git a/compiler/rustc_codegen_ssa/src/back/symbol_export.rs b/compiler/rustc_codegen_ssa/src/back/symbol_export.rs
index 77c35a1fe79e9..c638546f779ff 100644
--- a/compiler/rustc_codegen_ssa/src/back/symbol_export.rs
+++ b/compiler/rustc_codegen_ssa/src/back/symbol_export.rs
@@ -293,7 +293,7 @@ fn exported_symbols_provider_local(
         // external linkage is enough for monomorphization to be linked to.
         let need_visibility = tcx.sess.target.dynamic_linking && !tcx.sess.target.only_cdylib;
 
-        let (_, cgus) = tcx.collect_and_partition_mono_items(());
+        let (_, _, cgus) = tcx.collect_and_partition_mono_items(());
 
         // The symbols created in this loop are sorted below it
         #[allow(rustc::potential_query_instability)]
diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs
index e3d11cfaf4fe3..eb0bc471130a6 100644
--- a/compiler/rustc_codegen_ssa/src/back/write.rs
+++ b/compiler/rustc_codegen_ssa/src/back/write.rs
@@ -8,6 +8,7 @@ use std::{fs, io, mem, str, thread};
 
 use jobserver::{Acquired, Client};
 use rustc_ast::attr;
+use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
 use rustc_data_structures::memmap::Mmap;
 use rustc_data_structures::profiling::{SelfProfilerRef, VerboseTimingGuard};
@@ -41,7 +42,7 @@ use tracing::debug;
 use super::link::{self, ensure_removed};
 use super::lto::{self, SerializedModule};
 use super::symbol_export::symbol_name_for_instance_in_crate;
-use crate::errors::ErrorCreatingRemarkDir;
+use crate::errors::{AutodiffWithoutLto, ErrorCreatingRemarkDir};
 use crate::traits::*;
 use crate::{
     CachedModuleCodegen, CodegenResults, CompiledModule, CrateInfo, ModuleCodegen, ModuleKind,
@@ -120,6 +121,7 @@ pub struct ModuleConfig {
     pub merge_functions: bool,
     pub emit_lifetime_markers: bool,
     pub llvm_plugins: Vec<String>,
+    pub autodiff: Vec<config::AutoDiff>,
 }
 
 impl ModuleConfig {
@@ -280,6 +282,7 @@ impl ModuleConfig {
 
             emit_lifetime_markers: sess.emit_lifetime_markers(),
             llvm_plugins: if_regular!(sess.opts.unstable_opts.llvm_plugins.clone(), vec![]),
+            autodiff: if_regular!(sess.opts.unstable_opts.autodiff.clone(), vec![]),
         }
     }
 
@@ -401,6 +404,8 @@ impl<B: WriteBackendMethods> CodegenContext<B> {
 
 fn generate_lto_work<B: ExtraBackendMethods>(
     cgcx: &CodegenContext<B>,
+    autodiff: Vec<AutoDiffItem>,
+    typetrees: FxHashMap<String, B::TypeTree>,
     needs_fat_lto: Vec<FatLtoInput<B>>,
     needs_thin_lto: Vec<(String, B::ThinBuffer)>,
     import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
@@ -409,11 +414,19 @@ fn generate_lto_work<B: ExtraBackendMethods>(
 
     if !needs_fat_lto.is_empty() {
         assert!(needs_thin_lto.is_empty());
-        let module =
+        let mut module =
             B::run_fat_lto(cgcx, needs_fat_lto, import_only_modules).unwrap_or_else(|e| e.raise());
+        if cgcx.lto == Lto::Fat {
+            let config = cgcx.config(ModuleKind::Regular);
+            module = unsafe { module.autodiff(cgcx, autodiff, typetrees, config).unwrap() };
+        }
         // We are adding a single work item, so the cost doesn't matter.
         vec![(WorkItem::LTO(module), 0)]
     } else {
+        if !autodiff.is_empty() {
+            let dcx = cgcx.create_dcx();
+            dcx.handle().emit_fatal(AutodiffWithoutLto {});
+        }
         assert!(needs_fat_lto.is_empty());
         let (lto_modules, copy_jobs) = B::run_thin_lto(cgcx, needs_thin_lto, import_only_modules)
             .unwrap_or_else(|e| e.raise());
@@ -1041,6 +1054,9 @@ pub(crate) enum Message<B: WriteBackendMethods> {
     /// Sent from a backend worker thread.
     WorkItem { result: Result<WorkItemResult<B>, Option<WorkerFatalError>>, worker_id: usize },
 
+    /// A vector containing all the AutoDiff tasks that we have to pass to Enzyme.
+    AddAutoDiffItems(Vec<AutoDiffItem>),
+
     /// The frontend has finished generating something (backend IR or a
     /// post-LTO artifact) for a codegen unit, and it should be passed to the
     /// backend. Sent from the main thread.
@@ -1367,6 +1383,7 @@ fn start_executing_work<B: ExtraBackendMethods>(
 
         // This is where we collect codegen units that have gone all the way
         // through codegen and LLVM.
+        let mut autodiff_items = Vec::new();
         let mut compiled_modules = vec![];
         let mut compiled_allocator_module = None;
         let mut needs_link = Vec::new();
@@ -1374,6 +1391,7 @@ fn start_executing_work<B: ExtraBackendMethods>(
         let mut needs_thin_lto = Vec::new();
         let mut lto_import_only_modules = Vec::new();
         let mut started_lto = false;
+        let mut typetrees = FxHashMap::<String, B::TypeTree>::default();
 
         /// Possible state transitions:
         /// - Ongoing -> Completed
@@ -1478,9 +1496,14 @@ fn start_executing_work<B: ExtraBackendMethods>(
                     let needs_thin_lto = mem::take(&mut needs_thin_lto);
                     let import_only_modules = mem::take(&mut lto_import_only_modules);
 
-                    for (work, cost) in
-                        generate_lto_work(&cgcx, needs_fat_lto, needs_thin_lto, import_only_modules)
-                    {
+                    for (work, cost) in generate_lto_work(
+                        &cgcx,
+                        autodiff_items.clone(),
+                        typetrees.clone(),
+                        needs_fat_lto,
+                        needs_thin_lto,
+                        import_only_modules,
+                    ) {
                         let insertion_index = work_items
                             .binary_search_by_key(&cost, |&(_, cost)| cost)
                             .unwrap_or_else(|e| e);
@@ -1593,7 +1616,16 @@ fn start_executing_work<B: ExtraBackendMethods>(
                     }
                 }
 
-                Message::CodegenDone { llvm_work_item, cost } => {
+                Message::CodegenDone { mut llvm_work_item, cost } => {
+                    // Collect the typetrees of modules that are queued for optimization.
+                    match &mut llvm_work_item {
+                        WorkItem::Optimize(module) => {
+                            let tt = B::typetrees(&mut module.module_llvm);
+                            typetrees.extend(tt);
+                        }
+                        _ => {}
+                    }
+
                     // We keep the queue sorted by estimated processing cost,
                     // so that more expensive items are processed earlier. This
                     // is good for throughput as it gives the main thread more
@@ -1615,6 +1647,10 @@ fn start_executing_work<B: ExtraBackendMethods>(
                     main_thread_state = MainThreadState::Idle;
                 }
 
+                Message::AddAutoDiffItems(mut items) => {
+                    autodiff_items.append(&mut items);
+                }
+
                 Message::CodegenComplete => {
                     if codegen_state != Aborted {
                         codegen_state = Completed;
@@ -2092,6 +2128,10 @@ impl<B: ExtraBackendMethods> OngoingCodegen<B> {
         drop(self.coordinator.sender.send(Box::new(Message::CodegenComplete::<B>)));
     }
 
+    pub(crate) fn submit_autodiff_items(&self, items: Vec<AutoDiffItem>) {
+        drop(self.coordinator.sender.send(Box::new(Message::AddAutoDiffItems::<B>(items))));
+    }
+
     pub(crate) fn check_for_errors(&self, sess: &Session) {
         self.shared_emitter_main.check(sess, false);
     }
diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs
index a726ee73aaa26..e201d93986e07 100644
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@@ -621,7 +621,8 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
 
     // Run the monomorphization collector and partition the collected items into
     // codegen units.
-    let codegen_units = tcx.collect_and_partition_mono_items(()).1;
+    let (_, autodiff_fncs, codegen_units) = tcx.collect_and_partition_mono_items(());
+    let autodiff_fncs = autodiff_fncs.to_vec();
 
     // Force all codegen_unit queries so they are already either red or green
     // when compile_codegen_unit accesses them. We are not able to re-execute
@@ -692,6 +693,10 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
         );
     }
 
+    if !autodiff_fncs.is_empty() {
+        ongoing_codegen.submit_autodiff_items(autodiff_fncs);
+    }
+
     // For better throughput during parallel processing by LLVM, we used to sort
     // CGUs largest to smallest. This would lead to better thread utilization
     // by, for example, preventing a large CGU from being processed last and
@@ -1051,7 +1056,7 @@ pub(crate) fn provide(providers: &mut Providers) {
             config::OptLevel::SizeMin => config::OptLevel::Default,
         };
 
-        let (defids, _) = tcx.collect_and_partition_mono_items(cratenum);
+        let (defids, _, _) = tcx.collect_and_partition_mono_items(cratenum);
 
         let any_for_speed = defids.items().any(|id| {
             let CodegenFnAttrs { optimize, .. } = tcx.codegen_fn_attrs(*id);
diff --git a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs
index a5bd3adbcddc9..79a208925891e 100644
--- a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs
+++ b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs
@@ -1,4 +1,9 @@
-use rustc_ast::{MetaItemInner, MetaItemKind, ast, attr};
+use std::str::FromStr;
+
+use rustc_ast::expand::autodiff_attrs::{
+    AutoDiffAttrs, DiffActivity, DiffMode, valid_ret_activity, valid_input_activity,
+};
+use rustc_ast::{MetaItemInner, MetaItem, MetaItemKind, ast, attr};
 use rustc_attr::{InlineAttr, InstructionSetAttr, OptimizeAttr, list_contains_name};
 use rustc_data_structures::fx::FxHashMap;
 use rustc_errors::codes::*;
@@ -779,6 +784,133 @@ fn check_link_name_xor_ordinal(
     }
 }
 
+/// Reads the `#[rustc_autodiff]` attribute of `id`, which we generated from the
+/// `#[autodiff(...)]` macros. There are two forms. The pure one without args to mark primal
+/// functions (the functions being differentiated). The other form is
+/// `#[rustc_autodiff(Mode, ActivityList)]` on top of the placeholder functions, where the last
+/// entry of the list is the return activity. We wrote the rustc_autodiff attributes ourselves,
+/// so the errors below should never fire, unless we introduced a bug when parsing the
+/// autodiff macro. Returns `AutoDiffAttrs::error()` if the attribute is absent or malformed.
+fn autodiff_attrs(tcx: TyCtxt<'_>, id: DefId) -> AutoDiffAttrs {
+    let attrs = tcx.get_attrs(id, sym::rustc_autodiff);
+
+    let attrs =
+        attrs.filter(|attr| attr.name_or_empty() == sym::rustc_autodiff).collect::<Vec<_>>();
+
+    // There should be exactly one autodiff attribute on placeholder functions, since we
+    // generate a new placeholder per ad macro.
+    // TODO: reject a second attribute again. We should fix that rustc_autodiff isn't applied
+    // multiple times to the source function. Until then only the first attribute is used.
+    let Some(attr) = attrs.first() else {
+        return AutoDiffAttrs::error();
+    };
+
+    let list = attr.meta_item_list().unwrap_or_default();
+
+    // empty autodiff attribute macros (i.e. `#[autodiff]`) are used to mark source functions
+    if list.is_empty() {
+        return AutoDiffAttrs::source();
+    }
+
+    // The first entry is the mode and the last one the return activity. A list with a single
+    // entry therefore fails to match: a mode alone lacks the return activity.
+    let [mode, input_activities @ .., ret_activity] = &list[..] else {
+        let msg = "autodiff attribute must contain a mode and a return activity";
+        tcx.dcx()
+            .struct_span_err(attr.span, msg)
+            .with_note("Implementation bug in autodiff_attrs. Please report this!")
+            .emit();
+        return AutoDiffAttrs::error();
+    };
+    let mode = if let MetaItemInner::MetaItem(MetaItem { path: ref p1, .. }) = mode {
+        p1.segments.first().unwrap().ident
+    } else {
+        let msg = "autodiff attribute must contain autodiff mode";
+        tcx.dcx().struct_span_err(attr.span, msg).with_note("empty argument list").emit();
+        return AutoDiffAttrs::error();
+    };
+
+    // parse mode
+    let msg_mode = "mode should be either forward or reverse";
+    let mode = match mode.as_str() {
+        "Forward" => DiffMode::Forward,
+        "Reverse" => DiffMode::Reverse,
+        "ForwardFirst" => DiffMode::ForwardFirst,
+        "ReverseFirst" => DiffMode::ReverseFirst,
+        _ => {
+            tcx.dcx().struct_span_err(attr.span, msg_mode).with_note("invalid mode").emit();
+            return AutoDiffAttrs::error();
+        }
+    };
+
+    // First read the ret symbol from the attribute
+    let ret_symbol = if let MetaItemInner::MetaItem(MetaItem { path: ref p1, .. }) = ret_activity {
+        p1.segments.first().unwrap().ident
+    } else {
+        let msg = "autodiff attribute must contain the return activity";
+        tcx.dcx().struct_span_err(attr.span, msg).with_note("missing return activity").emit();
+        return AutoDiffAttrs::error();
+    };
+
+    // Then parse it into an actual DiffActivity
+    let msg_unknown_ret_activity = "unknown return activity";
+    let ret_activity = match DiffActivity::from_str(ret_symbol.as_str()) {
+        Ok(x) => x,
+        Err(_) => {
+            tcx.dcx()
+                .struct_span_err(attr.span, msg_unknown_ret_activity)
+                .with_note("invalid return activity")
+                .emit();
+            return AutoDiffAttrs::error();
+        }
+    };
+
+    // Now parse all the intermediate (input) activities
+    let msg_arg_activity = "autodiff attribute contains a malformed input activity";
+    let msg_unknown_arg_activity = "unknown input activity";
+    let mut arg_activities: Vec<DiffActivity> = vec![];
+    for arg in input_activities {
+        let arg_symbol = if let MetaItemInner::MetaItem(MetaItem { path: ref p2, .. }) = arg {
+            p2.segments.first().unwrap().ident
+        } else {
+            tcx.dcx()
+                .struct_span_err(attr.span, msg_arg_activity)
+                .with_note("Implementation bug, please report this!")
+                .emit();
+            return AutoDiffAttrs::error();
+        };
+
+        match DiffActivity::from_str(arg_symbol.as_str()) {
+            Ok(arg_activity) => arg_activities.push(arg_activity),
+            Err(_) => {
+                tcx.dcx()
+                    .struct_span_err(attr.span, msg_unknown_arg_activity)
+                    .with_note("invalid input activity")
+                    .emit();
+                return AutoDiffAttrs::error();
+            }
+        }
+    }
+
+    // Reject activities the chosen mode does not allow. An invalid return activity takes
+    // precedence over an invalid input activity; of the latter, the last one is reported.
+    let invalid_activity = if !valid_ret_activity(mode, ret_activity) {
+        Some(format!("Invalid return activity {} for {} mode", ret_activity, mode))
+    } else {
+        arg_activities
+            .iter()
+            .rfind(|&&input| !valid_input_activity(mode, input))
+            .map(|input| format!("Invalid input activity {} for {} mode", input, mode))
+    };
+    if let Some(msg) = invalid_activity {
+        tcx.dcx().struct_span_err(attr.span, msg).with_note("invalid activity").emit();
+        return AutoDiffAttrs::error();
+    }
+
+    AutoDiffAttrs { mode, ret_activity, input_activity: arg_activities }
+}
+
 pub(crate) fn provide(providers: &mut Providers) {
-    *providers = Providers { codegen_fn_attrs, should_inherit_track_caller, ..*providers };
+    *providers =
+        Providers { codegen_fn_attrs, should_inherit_track_caller, autodiff_attrs, ..*providers };
 }
diff --git a/compiler/rustc_codegen_ssa/src/errors.rs b/compiler/rustc_codegen_ssa/src/errors.rs
index d67cf0e3a6d5f..e4f60286ee19c 100644
--- a/compiler/rustc_codegen_ssa/src/errors.rs
+++ b/compiler/rustc_codegen_ssa/src/errors.rs
@@ -37,6 +37,10 @@ pub(crate) struct CguNotRecorded<'a> {
     pub cgu_name: &'a str,
 }
 
+#[derive(Diagnostic)]
+#[diag(codegen_ssa_autodiff_without_lto)]
+pub struct AutodiffWithoutLto; // fatal error: autodiff items were submitted without fat LTO
+
 #[derive(Diagnostic)]
 #[diag(codegen_ssa_unknown_reuse_kind)]
 pub(crate) struct UnknownReuseKind {
diff --git a/compiler/rustc_codegen_ssa/src/mir/block.rs b/compiler/rustc_codegen_ssa/src/mir/block.rs
index e3553dc03e106..52436fbaa8311 100644
--- a/compiler/rustc_codegen_ssa/src/mir/block.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/block.rs
@@ -1520,6 +1520,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                     align,
                     bx.const_usize(copy_bytes),
                     MemFlags::empty(),
+                    None,
                 );
                 // ...and then load it with the ABI type.
                 let cast_ty = bx.cast_backend_type(cast);
diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
index 32cc78187b9eb..6906bfca571a9 100644
--- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
@@ -1,9 +1,12 @@
-use rustc_middle::ty::{self, Ty, TyCtxt};
+use rustc_ast::expand::typetree::{FncTree, TypeTree};
+use rustc_middle::ty::layout::HasTyCtxt;
+use rustc_middle::ty::{self, Ty, TyCtxt, typetree_from};
 use rustc_middle::{bug, span_bug};
 use rustc_session::config::OptLevel;
 use rustc_span::{Span, sym};
 use rustc_target::abi::WrappingRange;
 use rustc_target::abi::call::{FnAbi, PassMode};
+use tracing::trace;
 
 use super::FunctionCx;
 use super::operand::OperandRef;
@@ -21,15 +24,21 @@ fn copy_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
     src: Bx::Value,
     count: Bx::Value,
 ) {
+    let tcx: TyCtxt<'_> = bx.cx().tcx();
+    let tt: TypeTree = typetree_from(tcx, ty);
+    let fnc_tree: FncTree =
+        FncTree { args: vec![tt.clone(), tt.clone(), TypeTree::all_ints()], ret: TypeTree::new() };
+
     let layout = bx.layout_of(ty);
     let size = layout.size;
     let align = layout.align.abi;
     let size = bx.mul(bx.const_usize(size.bytes()), count);
     let flags = if volatile { MemFlags::VOLATILE } else { MemFlags::empty() };
+    trace!("copy: mir ty: {:?}, enzyme tt: {:?}", ty, fnc_tree);
     if allow_overlap {
-        bx.memmove(dst, align, src, align, size, flags);
+        bx.memmove(dst, align, src, align, size, flags, Some(fnc_tree));
     } else {
-        bx.memcpy(dst, align, src, align, size, flags);
+        bx.memcpy(dst, align, src, align, size, flags, Some(fnc_tree));
     }
 }
 
@@ -41,12 +50,17 @@ fn memset_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
     val: Bx::Value,
     count: Bx::Value,
 ) {
+    let tcx: TyCtxt<'_> = bx.cx().tcx();
+    let tt: TypeTree = typetree_from(tcx, ty);
+    let fnc_tree: FncTree =
+        FncTree { args: vec![tt.clone(), tt.clone(), TypeTree::all_ints()], ret: TypeTree::new() };
+
     let layout = bx.layout_of(ty);
     let size = layout.size;
     let align = layout.align.abi;
     let size = bx.mul(bx.const_usize(size.bytes()), count);
     let flags = if volatile { MemFlags::VOLATILE } else { MemFlags::empty() };
-    bx.memset(dst, val, size, align, flags);
+    bx.memset(dst, val, size, align, flags, Some(fnc_tree));
 }
 
 impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
diff --git a/compiler/rustc_codegen_ssa/src/mir/operand.rs b/compiler/rustc_codegen_ssa/src/mir/operand.rs
index 88ceff327d0aa..aeacafc3dd1b4 100644
--- a/compiler/rustc_codegen_ssa/src/mir/operand.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/operand.rs
@@ -542,7 +542,7 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
         let neg_address = bx.neg(address);
         let offset = bx.and(neg_address, align_minus_1);
         let dst = bx.inbounds_ptradd(alloca, offset);
-        bx.memcpy(dst, min_align, llptr, min_align, size, MemFlags::empty());
+        bx.memcpy(dst, min_align, llptr, min_align, size, MemFlags::empty(), None);
 
         // Store the allocated region and the extra to the indirect place.
         let indirect_operand = OperandValue::Pair(dst, llextra);
diff --git a/compiler/rustc_codegen_ssa/src/mir/rvalue.rs b/compiler/rustc_codegen_ssa/src/mir/rvalue.rs
index 82fea4c58e191..4f925cd101bc7 100644
--- a/compiler/rustc_codegen_ssa/src/mir/rvalue.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/rvalue.rs
@@ -100,14 +100,14 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                     // Use llvm.memset.p0i8.* to initialize all zero arrays
                     if bx.cx().const_to_opt_u128(v, false) == Some(0) {
                         let fill = bx.cx().const_u8(0);
-                        bx.memset(start, fill, size, dest.val.align, MemFlags::empty());
+                        bx.memset(start, fill, size, dest.val.align, MemFlags::empty(), None);
                         return;
                     }
 
                     // Use llvm.memset.p0i8.* to initialize byte arrays
                     let v = bx.from_immediate(v);
                     if bx.cx().val_ty(v) == bx.cx().type_i8() {
-                        bx.memset(start, v, size, dest.val.align, MemFlags::empty());
+                        bx.memset(start, v, size, dest.val.align, MemFlags::empty(), None);
                         return;
                     }
                 }
diff --git a/compiler/rustc_codegen_ssa/src/mir/statement.rs b/compiler/rustc_codegen_ssa/src/mir/statement.rs
index 6338d16c897f6..1a54796c2cbee 100644
--- a/compiler/rustc_codegen_ssa/src/mir/statement.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/statement.rs
@@ -85,7 +85,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                 let align = pointee_layout.align;
                 let dst = dst_val.immediate();
                 let src = src_val.immediate();
-                bx.memcpy(dst, align, src, align, bytes, crate::MemFlags::empty());
+                bx.memcpy(dst, align, src, align, bytes, crate::MemFlags::empty(), None);
             }
             mir::StatementKind::FakeRead(..)
             | mir::StatementKind::Retag { .. }
diff --git a/compiler/rustc_codegen_ssa/src/traits/builder.rs b/compiler/rustc_codegen_ssa/src/traits/builder.rs
index 50a5171414695..7d5a5025d9a84 100644
--- a/compiler/rustc_codegen_ssa/src/traits/builder.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/builder.rs
@@ -1,6 +1,7 @@
 use std::assert_matches::assert_matches;
 use std::ops::Deref;
 
+use rustc_ast::expand::typetree::FncTree;
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrs;
 use rustc_middle::ty::layout::{FnAbiOf, LayoutOf, TyAndLayout};
 use rustc_middle::ty::{Instance, Ty};
@@ -297,6 +298,7 @@ pub trait BuilderMethods<'a, 'tcx>:
         src_align: Align,
         size: Self::Value,
         flags: MemFlags,
+        tt: Option<FncTree>,
     );
     fn memmove(
         &mut self,
@@ -306,6 +308,7 @@ pub trait BuilderMethods<'a, 'tcx>:
         src_align: Align,
         size: Self::Value,
         flags: MemFlags,
+        tt: Option<FncTree>,
     );
     fn memset(
         &mut self,
@@ -314,6 +317,7 @@ pub trait BuilderMethods<'a, 'tcx>:
         size: Self::Value,
         align: Align,
         flags: MemFlags,
+        tt: Option<FncTree>,
     );
 
     /// *Typed* copy for non-overlapping places.
@@ -353,7 +357,7 @@ pub trait BuilderMethods<'a, 'tcx>:
             temp.val.store_with_flags(self, dst.with_type(layout), flags);
         } else if !layout.is_zst() {
             let bytes = self.const_usize(layout.size.bytes());
-            self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, flags);
+            self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, flags, None);
         }
     }
 
diff --git a/compiler/rustc_codegen_ssa/src/traits/misc.rs b/compiler/rustc_codegen_ssa/src/traits/misc.rs
index 5b33fd7ab1028..ee1ba2e275b20 100644
--- a/compiler/rustc_codegen_ssa/src/traits/misc.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/misc.rs
@@ -28,4 +28,6 @@ pub trait MiscCodegenMethods<'tcx>: BackendTypes {
     /// Declares the extern "C" main function for the entry point. Returns None if the symbol
     /// already exists.
     fn declare_c_main(&self, fn_type: Self::Type) -> Option<Self::Function>;
+    // TODO: Manuel: I think we can drop this and construct the empty vec on the fly?
+    fn create_autodiff(&self) -> Vec<Self::Function>;
 }
diff --git a/compiler/rustc_codegen_ssa/src/traits/write.rs b/compiler/rustc_codegen_ssa/src/traits/write.rs
index aabe9e33c4aa1..5846edbd441c3 100644
--- a/compiler/rustc_codegen_ssa/src/traits/write.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/write.rs
@@ -1,3 +1,5 @@
+use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
+use rustc_data_structures::fx::FxHashMap;
 use rustc_errors::{DiagCtxtHandle, FatalError};
 use rustc_middle::dep_graph::WorkProduct;
 
@@ -12,6 +14,7 @@ pub trait WriteBackendMethods: 'static + Sized + Clone {
     type ModuleBuffer: ModuleBufferMethods;
     type ThinData: Send + Sync;
     type ThinBuffer: ThinBufferMethods;
+    type TypeTree: Clone;
 
     /// Merge all modules into main_module and returning it
     fn run_link(
@@ -61,6 +64,15 @@ pub trait WriteBackendMethods: 'static + Sized + Clone {
         want_summary: bool,
     ) -> (String, Self::ThinBuffer);
     fn serialize_module(module: ModuleCodegen<Self::Module>) -> (String, Self::ModuleBuffer);
+    /// Generate autodiff rules
+    fn autodiff(
+        cgcx: &CodegenContext<Self>,
+        module: &ModuleCodegen<Self::Module>,
+        diff_fncs: Vec<AutoDiffItem>,
+        typetrees: FxHashMap<String, Self::TypeTree>,
+        config: &ModuleConfig,
+    ) -> Result<(), FatalError>;
+    fn typetrees(module: &mut Self::Module) -> FxHashMap<String, Self::TypeTree>;
 }
 
 pub trait ThinBufferMethods: Send + Sync {
diff --git a/compiler/rustc_interface/src/tests.rs b/compiler/rustc_interface/src/tests.rs
index d3762e739db80..04f822d91ebdc 100644
--- a/compiler/rustc_interface/src/tests.rs
+++ b/compiler/rustc_interface/src/tests.rs
@@ -758,6 +758,7 @@ fn test_unstable_options_tracking_hash() {
     tracked!(allow_features, Some(vec![String::from("lang_items")]));
     tracked!(always_encode_mir, true);
     tracked!(assume_incomplete_release, true);
+    tracked!(autodiff, vec![String::from("ad_flags")]);
     tracked!(binary_dep_depinfo, true);
     tracked!(box_noalias, false);
     tracked!(
diff --git a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
index cb75888abd76d..3943b6247e403 100644
--- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
+++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
@@ -336,6 +336,10 @@ extern "C" void LLVMRustAddFunctionAttributes(LLVMValueRef Fn, unsigned Index,
   AddAttributes(F, Index, Attrs, AttrsLen);
 }
 
+extern "C" bool LLVMRustIsStructType(LLVMTypeRef Ty) {
+  return unwrap(Ty)->getTypeID() == Type::StructTyID;
+}
+
 extern "C" void LLVMRustAddCallSiteAttributes(LLVMValueRef Instr,
                                               unsigned Index,
                                               LLVMAttributeRef *Attrs,
@@ -344,11 +348,44 @@ extern "C" void LLVMRustAddCallSiteAttributes(LLVMValueRef Instr,
   AddAttributes(Call, Index, Attrs, AttrsLen);
 }
 
+extern "C" LLVMValueRef LLVMRustGetTerminator(LLVMBasicBlockRef BB) {
+  BasicBlock *Block = unwrap(BB);
+  return wrap(Block->getTerminator());
+}
+
+extern "C" void LLVMRustEraseInstFromParent(LLVMValueRef Instr) {
+  auto *I = dyn_cast<Instruction>(unwrap<Value>(Instr));
+  if (I)
+    I->eraseFromParent();
+}
+
+extern "C" LLVMTypeRef LLVMRustGetFunctionType(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return wrap(F->getFunctionType());
+}
+
+extern "C" void LLVMRustRemoveEnumAttributeAtIndex(LLVMValueRef F, size_t Index,
+                                                   LLVMRustAttribute RustAttr) {
+  LLVMRemoveEnumAttributeAtIndex(F, Index, fromRust(RustAttr));
+}
+
 extern "C" LLVMAttributeRef
 LLVMRustCreateAttrNoValue(LLVMContextRef C, LLVMRustAttribute RustAttr) {
   return wrap(Attribute::get(*unwrap(C), fromRust(RustAttr)));
 }
 
+extern "C" void LLVMRustAddEnumAttributeAtIndex(LLVMContextRef C,
+                                                LLVMValueRef F, size_t Index,
+                                                LLVMRustAttribute RustAttr) {
+  LLVMAddAttributeAtIndex(F, Index, LLVMRustCreateAttrNoValue(C, RustAttr));
+}
+
+extern "C" LLVMAttributeRef
+LLVMRustGetEnumAttributeAtIndex(LLVMValueRef F, size_t Index,
+                                LLVMRustAttribute RustAttr) {
+  return LLVMGetEnumAttributeAtIndex(F, Index, fromRust(RustAttr));
+}
+
 extern "C" LLVMAttributeRef LLVMRustCreateAlignmentAttr(LLVMContextRef C,
                                                         uint64_t Bytes) {
   return wrap(Attribute::getWithAlignment(*unwrap(C), llvm::Align(Bytes)));
@@ -872,6 +909,67 @@ extern "C" bool LLVMRustHasModuleFlag(LLVMModuleRef M, const char *Name,
   return unwrap(M)->getModuleFlag(StringRef(Name, Len)) != nullptr;
 }
 
+// Returns the last instruction of BB, or nullptr (None in Rust) if BB is empty.
+extern "C" LLVMValueRef LLVMRustGetLastInstruction(LLVMBasicBlockRef BB) {
+  BasicBlock *Block = unwrap(BB);
+  if (Block->empty())
+    return nullptr;
+  return wrap(&Block->back());
+}
+
+extern "C" void LLVMRustEraseInstBefore(LLVMBasicBlockRef bb, LLVMValueRef I) {
+  auto &BB = *unwrap(bb);
+  auto &Inst = *unwrap<Instruction>(I);
+  assert(Inst.getParent() == &BB && "instruction is not in the given block");
+  if (Inst.getParent() != &BB)
+    return;
+  auto It = Inst.getIterator();
+  // Erase `I` and everything before it in rev order: no dangling references.
+  while (It != BB.begin()) {
+    auto Prev = std::prev(It);
+    It->eraseFromParent();
+    It = Prev;
+  }
+  It->eraseFromParent();
+}
+
+// Returns true if `inst` is an instruction carrying metadata of kind `kindID`.
+// Values that are not instructions yield false.
+extern "C" bool LLVMRustHasMetadata(LLVMValueRef inst, unsigned kindID) {
+  auto *I = dyn_cast<Instruction>(unwrap<Value>(inst));
+  return I && I->hasMetadata(kindID);
+}
+
+extern "C" void LLVMRustAddFncParamAttr(LLVMValueRef F, unsigned i,
+                                        LLVMAttributeRef RustAttr) {
+  auto *Fn = dyn_cast<Function>(unwrap<Value>(F));
+  if (Fn)
+    Fn->addParamAttr(i, unwrap(RustAttr));
+}
+
+extern "C" void LLVMRustAddRetFncAttr(LLVMValueRef F,
+                                      LLVMAttributeRef RustAttr) {
+  auto *Fn = dyn_cast<Function>(unwrap<Value>(F));
+  if (Fn)
+    Fn->addRetAttr(unwrap(RustAttr));
+}
+
+// Returns the debug location of instruction `x` as a metadata node, or nullptr
+// if `x` is not an instruction.
+extern "C" LLVMMetadataRef LLVMRustDIGetInstMetadata(LLVMValueRef x) {
+  auto *I = dyn_cast<Instruction>(unwrap<Value>(x));
+  if (!I)
+    return nullptr;
+  return wrap(I->getDebugLoc().getAsMDNode());
+}
+
+extern "C" void LLVMRustAddParamAttr(LLVMValueRef call, unsigned i,
+                                     LLVMAttributeRef RustAttr) {
+  auto *CI = dyn_cast<CallInst>(unwrap<Value>(call));
+  if (CI)
+    CI->addParamAttr(i, unwrap(RustAttr));
+}
+
 extern "C" void LLVMRustGlobalAddMetadata(LLVMValueRef Global, unsigned Kind,
                                           LLVMMetadataRef MD) {
   unwrap<GlobalObject>(Global)->addMetadata(Kind, *unwrap<MDNode>(MD));
diff --git a/compiler/rustc_middle/messages.ftl b/compiler/rustc_middle/messages.ftl
index 39485a324f2e2..d1a9235064ae1 100644
--- a/compiler/rustc_middle/messages.ftl
+++ b/compiler/rustc_middle/messages.ftl
@@ -1,3 +1,7 @@
+middle_autodiff_unsafe_inner_const_ref = reading from a `Duplicated` const {$ty} is unsafe
+
+middle_unsupported_union = we don't support unions yet: '{$ty_name}'
+
 middle_adjust_for_foreign_abi_error =
     target architecture {$arch} does not support `extern {$abi}` ABI
 
diff --git a/compiler/rustc_middle/src/arena.rs b/compiler/rustc_middle/src/arena.rs
index 52fe9956b4777..8279348378fd7 100644
--- a/compiler/rustc_middle/src/arena.rs
+++ b/compiler/rustc_middle/src/arena.rs
@@ -86,6 +86,7 @@ macro_rules! arena_types {
             [] dyn_compatibility_violations: rustc_middle::traits::DynCompatibilityViolation,
             [] codegen_unit: rustc_middle::mir::mono::CodegenUnit<'tcx>,
             [decode] attribute: rustc_ast::Attribute,
+            [] autodiff_item: rustc_ast::expand::autodiff_attrs::AutoDiffItem,
             [] name_set: rustc_data_structures::unord::UnordSet<rustc_span::symbol::Symbol>,
             [] ordered_name_set: rustc_data_structures::fx::FxIndexSet<rustc_span::symbol::Symbol>,
             [] pats: rustc_middle::ty::PatternKind<'tcx>,
diff --git a/compiler/rustc_middle/src/error.rs b/compiler/rustc_middle/src/error.rs
index 5c2aa0005d405..aa9a931d42389 100644
--- a/compiler/rustc_middle/src/error.rs
+++ b/compiler/rustc_middle/src/error.rs
@@ -30,6 +30,20 @@ pub struct OpaqueHiddenTypeMismatch<'tcx> {
     pub sub: TypeMismatchReason,
 }
 
+#[derive(Diagnostic)]
+#[diag(middle_unsupported_union)]
+pub struct UnsupportedUnion {
+    pub ty_name: String, // fills `{$ty_name}` in the `middle_unsupported_union` message
+}
+
+#[derive(Diagnostic)]
+#[diag(middle_autodiff_unsafe_inner_const_ref)]
+pub struct AutodiffUnsafeInnerConstRef {
+    #[primary_span]
+    pub span: Span,
+    pub ty: String, // pointer kind ("ref", "ptr" or "box"); fills `{$ty}` in the message
+}
+
 #[derive(Subdiagnostic)]
 pub enum TypeMismatchReason {
     #[label(middle_conflict_types)]
diff --git a/compiler/rustc_middle/src/query/erase.rs b/compiler/rustc_middle/src/query/erase.rs
index 5f8427bd707aa..c4eefa82f0950 100644
--- a/compiler/rustc_middle/src/query/erase.rs
+++ b/compiler/rustc_middle/src/query/erase.rs
@@ -221,6 +221,9 @@ impl<T0, T1> EraseType for (&'_ T0, &'_ [T1]) {
 impl<T0> EraseType for (&'_ T0, Result<(), ErrorGuaranteed>) {
     type Result = [u8; size_of::<(&'static (), Result<(), ErrorGuaranteed>)>()];
 }
+impl<T0, T1, T2> EraseType for (&'_ T0, &'_ [T1], &'_ [T2]) { // collect_and_partition_mono_items
+    type Result = [u8; size_of::<(&'static (), &'static [()], &'static [()])>()];
+}
 
 macro_rules! trivial {
     ($($ty:ty),+ $(,)?) => {
diff --git a/compiler/rustc_middle/src/query/mod.rs b/compiler/rustc_middle/src/query/mod.rs
index d03fc39c9ade1..a5637ae99ef33 100644
--- a/compiler/rustc_middle/src/query/mod.rs
+++ b/compiler/rustc_middle/src/query/mod.rs
@@ -14,6 +14,7 @@ use std::sync::Arc;
 use rustc_arena::TypedArena;
 use rustc_ast::expand::StrippedCfgItem;
 use rustc_ast::expand::allocator::AllocatorKind;
+use rustc_ast::expand::autodiff_attrs::{AutoDiffAttrs, AutoDiffItem};
 use rustc_data_structures::fingerprint::Fingerprint;
 use rustc_data_structures::fx::{FxIndexMap, FxIndexSet};
 use rustc_data_structures::sorted_map::SortedMap;
@@ -1270,6 +1271,13 @@ rustc_queries! {
         feedable
     }
 
+    /// Returns the autodiff attributes of the item `def_id`.
+    query autodiff_attrs(def_id: DefId) -> &'tcx AutoDiffAttrs {
+        desc { |tcx| "computing autodiff attributes of `{}`", tcx.def_path_str(def_id) }
+        arena_cache
+        cache_on_disk_if { def_id.is_local() }
+    }
+
     query asm_target_features(def_id: DefId) -> &'tcx FxIndexSet<Symbol> {
         desc { |tcx| "computing target features for inline asm of `{}`", tcx.def_path_str(def_id) }
     }
@@ -1977,7 +1985,7 @@ rustc_queries! {
         separate_provide_extern
     }
 
-    query collect_and_partition_mono_items(_: ()) -> (&'tcx DefIdSet, &'tcx [CodegenUnit<'tcx>]) {
+    query collect_and_partition_mono_items(_: ()) -> (&'tcx DefIdSet, &'tcx [AutoDiffItem], &'tcx [CodegenUnit<'tcx>]) {
         eval_always
         desc { "collect_and_partition_mono_items" }
     }
diff --git a/compiler/rustc_middle/src/ty/mod.rs b/compiler/rustc_middle/src/ty/mod.rs
index 854147648178f..6a217a71b2082 100644
--- a/compiler/rustc_middle/src/ty/mod.rs
+++ b/compiler/rustc_middle/src/ty/mod.rs
@@ -48,7 +48,7 @@ pub use rustc_session::lint::RegisteredTools;
 use rustc_span::hygiene::MacroKind;
 use rustc_span::symbol::{Ident, Symbol, kw, sym};
 use rustc_span::{ExpnId, ExpnKind, Span};
-use rustc_target::abi::{Align, FieldIdx, Integer, IntegerType, VariantIdx};
+use rustc_target::abi::{Align, FieldIdx, FieldsShape, Integer, IntegerType, VariantIdx};
 pub use rustc_target::abi::{ReprFlags, ReprOptions};
 pub use rustc_type_ir::ConstKind::{
     Bound as BoundCt, Error as ErrorCt, Expr as ExprCt, Infer as InferCt, Param as ParamCt,
@@ -107,6 +107,7 @@ pub use self::typeck_results::{
     CanonicalUserType, CanonicalUserTypeAnnotation, CanonicalUserTypeAnnotations, IsIdentity,
     TypeckResults, UserType, UserTypeAnnotationIndex,
 };
+pub use self::typetree::*;
 pub use self::visit::{TypeSuperVisitable, TypeVisitable, TypeVisitableExt, TypeVisitor};
 use crate::error::{OpaqueHiddenTypeMismatch, TypeMismatchReason};
 use crate::metadata::ModChild;
@@ -134,6 +135,7 @@ pub mod pattern;
 pub mod print;
 pub mod relate;
 pub mod trait_def;
+pub mod typetree;
 pub mod util;
 pub mod visit;
 pub mod vtable;
@@ -213,6 +215,9 @@ pub struct ResolverAstLowering {
 
     pub next_node_id: ast::NodeId,
 
+    /// Mapping of autodiff function IDs
+    pub autodiff_map: FxHashMap<LocalDefId, LocalDefId>,
+
     pub node_id_to_def_id: NodeMap<LocalDefId>,
 
     pub trait_map: NodeMap<Vec<hir::TraitCandidate>>,
diff --git a/compiler/rustc_middle/src/ty/typetree.rs b/compiler/rustc_middle/src/ty/typetree.rs
new file mode 100644
index 0000000000000..86180938c09db
--- /dev/null
+++ b/compiler/rustc_middle/src/ty/typetree.rs
@@ -0,0 +1,330 @@
+use rustc_ast::expand::typetree::{FncTree, Kind, Type, TypeTree};
+use rustc_span::Span;
+use rustc_type_ir::Adt;
+use tracing::trace;
+
+//, Type, Kind, TypeTree, FncTree, FieldsShape};
+use super::context::TyCtxt;
+use super::{ParamEnv, ParamEnvAnd};
+use crate::error::AutodiffUnsafeInnerConstRef;
+use crate::ty::{self, FieldsShape, Ty};
+
+/// Builds the type tree of `ty` and wraps it in a single outer `Kind::Pointer` node.
+pub fn typetree_from<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> TypeTree {
+    let pointee = typetree_from_ty(ty, tcx, 0, false, &mut Vec::new(), None);
+    let outer = Type { offset: -1, kind: Kind::Pointer, size: 8, child: pointee };
+    TypeTree(vec![outer])
+}
+
+use rustc_ast::expand::autodiff_attrs::DiffActivity;
+
+// This function combines three tasks. To avoid traversing each type 3x, we combine them.
+// 1. Create a TypeTree from a Ty. This is the main task.
+// 2. IFF da is not empty, we also want to adjust DiffActivity to account for future MIR->LLVM
+//    lowering. E.g. fat ptr are going to introduce an extra int.
+// 3. IFF da is not empty, we are creating TT for a function directly differentiated (has an
+//    autodiff macro on top). Here we want to make sure that shadows are mutable internally.
+//    We know the outermost ref/ptr indirection is mutable - we generate it like that.
+//    We now have to make sure that inner ptr/ref are mutable too, or issue a warning.
+//    Not an error, because it only causes issues if they are actually read, which we don't check
+//    yet. We should add such analysis to reliably either issue an error or accept without warning.
+//    If only there were some research to do that...
+/// Computes the [`FncTree`] (argument and return type trees) for the function type `fn_ty`.
+///
+/// Returns an empty tree if `fn_ty` is not a function. A reference, raw pointer or box to a
+/// slice contributes two argument trees: the pointer and an 8-byte integer for the length.
+///
+/// If `da` is non-empty it must hold one activity per input and `span` must be `Some`. `da` is
+/// then extended in place with one extra activity per slice argument, inserted right after it.
+///
+/// # Panics
+/// Panics when an input is visited while a non-empty `da` has the wrong length or `span` is
+/// `None`, or when the activity of a slice argument is not `Const`, `Dual*` or `Duplicated*`.
+pub fn fnc_typetrees<'tcx>(
+    tcx: TyCtxt<'tcx>,
+    fn_ty: Ty<'tcx>,
+    da: &mut Vec<DiffActivity>,
+    span: Option<Span>,
+) -> FncTree {
+    if !fn_ty.is_fn() {
+        return FncTree { args: vec![], ret: TypeTree::new() };
+    }
+
+    // If rustc compiles the unmodified primal, we know that this copy of the function
+    // also has correct lifetimes. We know that Enzyme won't free the shadow too early
+    // (or actually at all), so let's strip lifetimes when computing the layout.
+    // Recommended by compiler-errors:
+    // https://discord.com/channels/273534239310479360/957720175619215380/1223454360676208751
+    let sig = tcx.instantiate_bound_regions_with_erased(fn_ty.fn_sig(tcx));
+    let has_activities = !da.is_empty();
+
+    // `(position, activity)` pairs for the length arguments introduced by slices.
+    let mut size_activities = vec![];
+    let mut visited = vec![];
+    let mut args = vec![];
+    for (i, ty) in sig.inputs().iter().enumerate() {
+        // We care about safety checks if an argument gets duplicated and we write into the
+        // shadow. That's equivalent to Duplicated or DuplicatedOnly.
+        let safety = has_activities && {
+            assert!(da.len() == sig.inputs().len(), "{:?} != {:?}", da.len(), sig.inputs().len());
+            // If we have Activities, we also have spans
+            assert!(span.is_some());
+            matches!(da[i], DiffActivity::DuplicatedOnly | DiffActivity::Duplicated)
+        };
+
+        visited.clear();
+        // A reference, raw pointer or box whose pointee is a slice is described as a
+        // (pointer, length) pair of arguments.
+        let slice_ty = (ty.is_unsafe_ptr() || ty.is_ref() || ty.is_box())
+            .then(|| ty.builtin_deref(true).unwrap())
+            .filter(|inner_ty| inner_ty.is_slice());
+        if let Some(inner_ty) = slice_ty {
+            // We know that the length will be passed as extra arg.
+            let child = typetree_from_ty(inner_ty, tcx, 1, safety, &mut visited, span);
+            let ptr_tt = Type { offset: -1, kind: Kind::Pointer, size: 8, child };
+            args.push(TypeTree(vec![ptr_tt]));
+            let len_tt = Type { offset: -1, kind: Kind::Integer, size: 8, child: TypeTree::new() };
+            args.push(TypeTree(vec![len_tt]));
+            if has_activities {
+                // We are looking at a slice. The length of that slice will become an
+                // extra integer on llvm level. Integers are always const.
+                // However, if the slice gets duplicated, we want to know to later check the
+                // size. So we mark the new size argument as FakeActivitySize.
+                let activity = match da[i] {
+                    DiffActivity::DualOnly
+                    | DiffActivity::Dual
+                    | DiffActivity::DuplicatedOnly
+                    | DiffActivity::Duplicated => DiffActivity::FakeActivitySize,
+                    DiffActivity::Const => DiffActivity::Const,
+                    _ => panic!("unexpected activity for ptr/ref"),
+                };
+                size_activities.push((i + 1, activity));
+            }
+            trace!("ABI MATCHING!");
+            continue;
+        }
+        args.push(typetree_from_ty(*ty, tcx, 0, safety, &mut visited, span));
+    }
+
+    // Now add the extra activities coming from slices. Insert in reverse order so that
+    // the positions recorded above are not invalidated.
+    for (pos, activity) in size_activities.into_iter().rev() {
+        da.insert(pos, activity);
+    }
+
+    visited.clear();
+    let ret = typetree_from_ty(sig.output(), tcx, 0, false, &mut visited, span);
+
+    FncTree { args, ret }
+}
+
+/// Recursively builds the [`TypeTree`] of `ty`.
+///
+/// Pointers, references and boxes become a `Kind::Pointer` node around the tree of their
+/// pointee, scalars a single leaf, structs the concatenation of their fields' trees shifted
+/// to the field offsets, and arrays the element tree repeated at every element offset.
+/// Closures, coroutines, functions, enums, unions and SIMD types yield an empty tree.
+///
+/// * `depth` - number of pointer/field/element layers entered so far.
+/// * `safety` - if set, warn at `span` about every non-mutable ref/ptr/box below the
+///   outermost layer (`depth > 0`); `span` must then be `Some`.
+/// * `visited` - the types on the current recursion path; a type that shows up again is
+///   treated as recursive and yields an empty tree. Every return path that pushed `ty`
+///   has to pop it again.
+fn typetree_from_ty<'a>(
+    ty: Ty<'a>,
+    tcx: TyCtxt<'a>,
+    depth: usize,
+    safety: bool,
+    visited: &mut Vec<Ty<'a>>,
+    span: Option<Span>,
+) -> TypeTree {
+    if depth > 20 {
+        trace!("depth > 20 for ty: {}", &ty);
+    }
+    if visited.contains(&ty) {
+        // recursive type
+        trace!("recursive type: {}", &ty);
+        return TypeTree::new();
+    }
+    visited.push(ty);
+
+    // Raw pointers, references and boxes: one pointer layer around the pointee's tree.
+    if ty.is_unsafe_ptr() || ty.is_ref() || ty.is_box() {
+        let inner_ty = ty.builtin_deref(true).unwrap();
+        // Mutability is a property of the pointer itself (`&mut T`, `*mut T`), not of the
+        // type it points to.
+        let is_mut = ty.is_mutable_ptr();
+
+        // Now account for inner mutability.
+        if !is_mut && depth > 0 && safety {
+            let ptr_ty = if ty.is_ref() {
+                "ref"
+            } else if ty.is_unsafe_ptr() {
+                "ptr"
+            } else {
+                assert!(ty.is_box());
+                "box"
+            };
+
+            // If we have mutability, we also have a span
+            assert!(span.is_some());
+            let span = span.unwrap();
+
+            tcx.sess.dcx().emit_warn(AutodiffUnsafeInnerConstRef { span, ty: ptr_ty.to_string() });
+        }
+
+        let child = typetree_from_ty(inner_ty, tcx, depth + 1, safety, visited, span);
+        let tt = Type { offset: -1, kind: Kind::Pointer, size: 8, child };
+        visited.pop();
+        return TypeTree(vec![tt]);
+    }
+
+    // Function-like and fresh inference types are not described.
+    if ty.is_closure() || ty.is_coroutine() || ty.is_fresh() || ty.is_fn() {
+        visited.pop();
+        return TypeTree::new();
+    }
+
+    // Scalars become a single leaf without children.
+    if ty.is_scalar() {
+        let (kind, size) = if ty.is_integral() || ty.is_char() || ty.is_bool() {
+            (Kind::Integer, ty.primitive_size(tcx).bytes_usize())
+        } else if ty.is_floating_point() {
+            // Only `f32` and `f64` are supported; any other float type panics.
+            match ty {
+                x if x == tcx.types.f32 => (Kind::Float, 4),
+                x if x == tcx.types.f64 => (Kind::Double, 8),
+                _ => panic!("floatTy scalar that is neither f32 nor f64"),
+            }
+        } else {
+            panic!("scalar that is neither integral nor floating point");
+        };
+        visited.pop();
+        return TypeTree(vec![Type { offset: -1, child: TypeTree::new(), kind, size }]);
+    }
+
+    // Everything below needs the layout of `ty`.
+    let param_env_and = ParamEnvAnd { param_env: ParamEnv::empty(), value: ty };
+
+    let layout = tcx.layout_of(param_env_and);
+    assert!(layout.is_ok());
+
+    let layout = layout.unwrap().layout;
+    let fields = layout.fields();
+    let max_size = layout.size();
+
+    if ty.is_adt() && !ty.is_simd() {
+        let adt_def = ty.ty_adt_def().unwrap();
+
+        if adt_def.is_struct() {
+            // Only `Arbitrary` field layouts are described; `Array`, `Union` and `Primitive`
+            // shapes yield an empty tree for now.
+            let FieldsShape::Arbitrary { offsets, .. } = fields else {
+                visited.pop();
+                return TypeTree::new();
+            };
+
+            let substs = match ty.kind() {
+                Adt(_, subst_ref) => subst_ref,
+                _ => panic!(""),
+            };
+
+            let fields = adt_def.all_fields();
+            let fields = fields
+                .into_iter()
+                .zip(offsets.into_iter())
+                .filter_map(|(field, offset)| {
+                    let field_ty: Ty<'_> = field.ty(tcx, substs);
+                    let field_ty: Ty<'_> =
+                        tcx.normalize_erasing_regions(ParamEnv::empty(), field_ty);
+
+                    // `PhantomData` fields are skipped.
+                    if field_ty.is_phantom_data() {
+                        return None;
+                    }
+
+                    let mut child =
+                        typetree_from_ty(field_ty, tcx, depth + 1, safety, visited, span).0;
+
+                    // Move the field's entries to the field's offset inside the struct:
+                    // entries with offset `-1` are set to it, all others are shifted by it.
+                    for c in &mut child {
+                        if c.offset == -1 {
+                            c.offset = offset.bytes() as isize
+                        } else {
+                            c.offset += offset.bytes() as isize;
+                        }
+                    }
+
+                    Some(child)
+                })
+                .flatten()
+                .collect::<Vec<Type>>();
+
+            visited.pop();
+            return TypeTree(fields);
+        }
+        // Enzyme can't represent enums, so let it figure it out itself, without seeding a
+        // typetree. Unions are not supported yet. Both end up with the empty tree below.
+    }
+
+    if ty.is_simd() {
+        trace!("simd");
+        let (_size, inner_ty) = ty.simd_size_and_type(tcx);
+        // TODO: describe the lanes like the elements of an array, i.e. repeat the element
+        // tree `_size` times with growing offsets. Until then the element tree is computed
+        // but not used, and SIMD types get an empty tree.
+        let _sub_tt = typetree_from_ty(inner_ty, tcx, depth + 1, safety, visited, span);
+        visited.pop();
+        return TypeTree::new();
+    }
+
+    if ty.is_array() {
+        let (stride, count) = match fields {
+            FieldsShape::Array { stride: s, count: c } => (s, c),
+            _ => panic!(""),
+        };
+        let byte_stride = stride.bytes_usize();
+        let byte_max_size = max_size.bytes_usize();
+
+        // The array's size must be exactly `count` strides.
+        assert!(byte_stride * *count as usize == byte_max_size);
+        if (*count as usize) == 0 {
+            // An empty array has nothing to describe.
+            visited.pop();
+            return TypeTree::new();
+        }
+        let sub_ty = ty.builtin_index().unwrap();
+        let subtt = typetree_from_ty(sub_ty, tcx, depth + 1, safety, visited, span);
+
+        // calculate size of subtree
+        let param_env_and = ParamEnvAnd { param_env: ParamEnv::empty(), value: sub_ty };
+        let size = tcx.layout_of(param_env_and).unwrap().size.bytes() as usize;
+        // Repeat the element tree once per element, shifted to the element's offset.
+        let tt = TypeTree(
+            std::iter::repeat(subtt)
+                .take(*count as usize)
+                .enumerate()
+                .map(|(idx, x)| x.0.into_iter().map(move |x| x.add_offset((idx * size) as isize)))
+                .flatten()
+                .collect(),
+        );
+
+        visited.pop();
+        return tt;
+    }
+
+    if ty.is_slice() {
+        // A slice is described by the tree of its element type.
+        let sub_ty = ty.builtin_index().unwrap();
+        let subtt = typetree_from_ty(sub_ty, tcx, depth + 1, safety, visited, span);
+
+        visited.pop();
+        return subtt;
+    }
+
+    // Anything not handled above (e.g. enums and unions) gets an empty tree.
+    visited.pop();
+    TypeTree::new()
+}
diff --git a/compiler/rustc_monomorphize/Cargo.toml b/compiler/rustc_monomorphize/Cargo.toml
index c7f1b9fa78454..bce8d9c4a9878 100644
--- a/compiler/rustc_monomorphize/Cargo.toml
+++ b/compiler/rustc_monomorphize/Cargo.toml
@@ -5,6 +5,7 @@ edition = "2021"
 
 [dependencies]
 # tidy-alphabetical-start
+rustc_ast = { path = "../rustc_ast" }
 rustc_data_structures = { path = "../rustc_data_structures" }
 rustc_errors = { path = "../rustc_errors" }
 rustc_fluent_macro = { path = "../rustc_fluent_macro" }
@@ -13,6 +14,7 @@ rustc_macros = { path = "../rustc_macros" }
 rustc_middle = { path = "../rustc_middle" }
 rustc_session = { path = "../rustc_session" }
 rustc_span = { path = "../rustc_span" }
+rustc_symbol_mangling = { path = "../rustc_symbol_mangling" }
 rustc_target = { path = "../rustc_target" }
 serde = "1"
 serde_json = "1"
diff --git a/compiler/rustc_monomorphize/src/collector.rs b/compiler/rustc_monomorphize/src/collector.rs
index 3f9a0df030150..8b41fe2582950 100644
--- a/compiler/rustc_monomorphize/src/collector.rs
+++ b/compiler/rustc_monomorphize/src/collector.rs
@@ -250,7 +250,7 @@ pub(crate) enum MonoItemCollectionStrategy {
 
 pub(crate) struct UsageMap<'tcx> {
     // Maps every mono item to the mono items used by it.
-    used_map: UnordMap<MonoItem<'tcx>, Vec<MonoItem<'tcx>>>,
+    pub used_map: UnordMap<MonoItem<'tcx>, Vec<MonoItem<'tcx>>>,
 
     // Maps every mono item to the mono items that use it.
     user_map: UnordMap<MonoItem<'tcx>, Vec<MonoItem<'tcx>>>,
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 9bf7e67417eff..238a9b873608c 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -98,6 +98,7 @@ use std::fs::{self, File};
 use std::io::Write;
 use std::path::{Path, PathBuf};
 
+use rustc_ast::expand::autodiff_attrs::{AutoDiffAttrs, AutoDiffItem, DiffActivity};
 use rustc_data_structures::fx::{FxIndexMap, FxIndexSet};
 use rustc_data_structures::sync;
 use rustc_data_structures::unord::{UnordMap, UnordSet};
@@ -114,13 +115,14 @@ use rustc_middle::mir::mono::{
 };
 use rustc_middle::ty::print::{characteristic_def_id_of_type, with_no_trimmed_paths};
 use rustc_middle::ty::visit::TypeVisitableExt;
-use rustc_middle::ty::{self, InstanceKind, TyCtxt};
+use rustc_middle::ty::{self, InstanceKind, ParamEnv, TyCtxt, fnc_typetrees};
 use rustc_middle::util::Providers;
 use rustc_session::CodegenUnits;
 use rustc_session::config::{DumpMonoStatsFormat, SwitchWithOptPath};
 use rustc_span::symbol::Symbol;
 use rustc_target::spec::SymbolVisibility;
-use tracing::debug;
+use rustc_symbol_mangling::symbol_name_for_instance_in_crate;
+use tracing::{debug, trace};
 
 use crate::collector::{self, MonoItemCollectionStrategy, UsageMap};
 use crate::errors::{CouldntDumpMonoStats, SymbolAlreadyDefined, UnknownCguCollectionMode};
@@ -251,7 +253,14 @@ where
             &mut can_be_internalized,
             export_generics,
         );
-        if visibility == Visibility::Hidden && can_be_internalized {
+
+        // We can't differentiate something that got inlined.
+        let autodiff_active = match characteristic_def_id {
+            Some(def_id) => cx.tcx.autodiff_attrs(def_id).is_active(),
+            None => false,
+        };
+
+        if !autodiff_active && visibility == Visibility::Hidden && can_be_internalized {
             internalization_candidates.insert(mono_item);
         }
         let size_estimate = mono_item.size_estimate(cx.tcx);
@@ -1102,7 +1111,10 @@ where
     }
 }
 
-fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> (&DefIdSet, &[CodegenUnit<'_>]) {
+fn collect_and_partition_mono_items(
+    tcx: TyCtxt<'_>,
+    (): (),
+) -> (&DefIdSet, &[AutoDiffItem], &[CodegenUnit<'_>]) {
     let collection_strategy = match tcx.sess.opts.unstable_opts.print_mono_items {
         Some(ref s) => {
             let mode = s.to_lowercase();
@@ -1164,6 +1176,60 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> (&DefIdSet, &[Co
         })
         .collect();
 
+    let autodiff_items2: Vec<_> = items
+        .iter()
+        .filter_map(|item| match *item {
+            MonoItem::Fn(ref instance) => Some((item, instance)),
+            _ => None,
+        })
+        .collect();
+    let mut autodiff_items: Vec<AutoDiffItem> = vec![];
+
+    // Pair every function that requests autodiff with the active function it uses (its
+    // source) and record both as an `AutoDiffItem`.
+    for (item, instance) in autodiff_items2 {
+        let target_id = instance.def_id();
+        let target_attrs: &AutoDiffAttrs = tcx.autodiff_attrs(target_id);
+        if target_attrs.is_source() {
+            trace!("source found: {:?}", target_id);
+        }
+        if !target_attrs.apply_autodiff() {
+            continue;
+        }
+
+        let target_symbol = symbol_name_for_instance_in_crate(tcx, instance.clone(), LOCAL_CRATE);
+
+        let source =
+            usage_map.used_map.get(&item).unwrap().into_iter().find_map(|item| match *item {
+                MonoItem::Fn(ref instance_s) => {
+                    let source_id = instance_s.def_id();
+                    if tcx.autodiff_attrs(source_id).is_active() {
+                        return Some(instance_s);
+                    }
+                    None
+                }
+                _ => None,
+            });
+        let Some(inst) = source else { continue };
+
+        // Diagnostics go through `tracing`; the compiler must not write to stdout here.
+        trace!("source_id: {:?}", inst.def_id());
+        let fn_ty = inst.ty(tcx, ParamEnv::empty());
+        assert!(fn_ty.is_fn());
+        let span = tcx.def_span(inst.def_id());
+        // Cloned only now that an item is produced; `fnc_typetrees` may extend the list.
+        let mut input_activities: Vec<DiffActivity> = target_attrs.input_activity.clone();
+        let fnc_tree = fnc_typetrees(tcx, fn_ty, &mut input_activities, Some(span));
+        let (inputs, output) = (fnc_tree.args, fnc_tree.ret);
+        let symb = symbol_name_for_instance_in_crate(tcx, inst.clone(), LOCAL_CRATE);
+
+        let mut new_target_attrs = target_attrs.clone();
+        new_target_attrs.input_activity = input_activities;
+        let itm = new_target_attrs.into_item(symb, target_symbol, inputs, output);
+        autodiff_items.push(itm);
+    }
+    let autodiff_items = tcx.arena.alloc_from_iter(autodiff_items);
+
     // Output monomorphization stats per def_id
     if let SwitchWithOptPath::Enabled(ref path) = tcx.sess.opts.unstable_opts.dump_mono_stats {
         if let Err(err) =
@@ -1224,7 +1290,14 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> (&DefIdSet, &[Co
         }
     }
 
-    (tcx.arena.alloc(mono_items), codegen_units)
+    // Log the collected autodiff items. Reading them does not need a mutable reborrow
+    // of the arena slice, so iterate immutably.
+    if !autodiff_items.is_empty() {
+        trace!("AUTODIFF ITEMS EXIST");
+        autodiff_items.iter().for_each(|item| trace!("{}", item));
+    }
+
+    (tcx.arena.alloc(mono_items), autodiff_items, codegen_units)
 }
 
 /// Outputs stats about instantiation counts and estimated size, per `MonoItem`'s
@@ -1308,12 +1381,12 @@ pub(crate) fn provide(providers: &mut Providers) {
     providers.collect_and_partition_mono_items = collect_and_partition_mono_items;
 
     providers.is_codegened_item = |tcx, def_id| {
-        let (all_mono_items, _) = tcx.collect_and_partition_mono_items(());
+        let (all_mono_items, _, _) = tcx.collect_and_partition_mono_items(());
         all_mono_items.contains(&def_id)
     };
 
     providers.codegen_unit = |tcx, name| {
-        let (_, all) = tcx.collect_and_partition_mono_items(());
+        let (_, _, all) = tcx.collect_and_partition_mono_items(());
         all.iter()
             .find(|cgu| cgu.name() == name)
             .unwrap_or_else(|| panic!("failed to find cgu with name {name:?}"))
diff --git a/compiler/rustc_passes/messages.ftl b/compiler/rustc_passes/messages.ftl
index 3f98236595bd7..c1fd3efdb3b7a 100644
--- a/compiler/rustc_passes/messages.ftl
+++ b/compiler/rustc_passes/messages.ftl
@@ -13,6 +13,10 @@ passes_abi_ne =
 passes_abi_of =
     fn_abi_of({$fn_name}) = {$fn_abi}
 
+passes_autodiff_attr =
+    `#[autodiff]` should be applied to a function
+    .label = not a function
+
 passes_allow_incoherent_impl =
     `rustc_allow_incoherent_impl` attribute should be applied to impl items
     .label = the only currently supported targets are inherent methods
@@ -49,10 +53,6 @@ passes_attr_crate_level =
 passes_attr_only_in_functions =
     `{$attr}` attribute can only be used on functions
 
-passes_autodiff_attr =
-    `#[autodiff]` should be applied to a function
-    .label = not a function
-
 passes_both_ffi_const_and_pure =
     `#[ffi_const]` function cannot be `#[ffi_pure]`
 
diff --git a/compiler/rustc_resolve/src/lib.rs b/compiler/rustc_resolve/src/lib.rs
index 35d491cfc18e7..14e6091ef2118 100644
--- a/compiler/rustc_resolve/src/lib.rs
+++ b/compiler/rustc_resolve/src/lib.rs
@@ -1168,6 +1168,8 @@ pub struct Resolver<'ra, 'tcx> {
     node_id_to_def_id: NodeMap<Feed<'tcx, LocalDefId>>,
     def_id_to_node_id: IndexVec<LocalDefId, ast::NodeId>,
 
+    autodiff_map: FxHashMap<LocalDefId, LocalDefId>,
+
     /// Indices of unnamed struct or variant fields with unresolved attributes.
     placeholder_field_indices: FxHashMap<NodeId, usize>,
     /// When collecting definitions from an AST fragment produced by a macro invocation `ExpnId`
@@ -1539,6 +1541,7 @@ impl<'ra, 'tcx> Resolver<'ra, 'tcx> {
             next_node_id: CRATE_NODE_ID,
             node_id_to_def_id,
             def_id_to_node_id,
+            autodiff_map: Default::default(),
             placeholder_field_indices: Default::default(),
             invocation_parents,
             trait_impl_items: Default::default(),
@@ -1668,6 +1671,7 @@ impl<'ra, 'tcx> Resolver<'ra, 'tcx> {
                 .into_items()
                 .map(|(k, f)| (k, f.key()))
                 .collect(),
+            autodiff_map: self.autodiff_map,
             trait_map: self.trait_map,
             lifetime_elision_allowed: self.lifetime_elision_allowed,
             lint_buffer: Steal::new(self.lint_buffer),
diff --git a/compiler/rustc_session/src/config.rs b/compiler/rustc_session/src/config.rs
index d733e32f209db..3c93a7eda36f1 100644
--- a/compiler/rustc_session/src/config.rs
+++ b/compiler/rustc_session/src/config.rs
@@ -194,6 +194,53 @@ impl Default for CoverageLevel {
     }
 }
 
+/// The different settings that the `-Z ad` flag can have.
+#[derive(Clone, Copy, PartialEq, Hash, Debug)]
+pub enum AutoDiff {
+    /// Print TypeAnalysis information.
+    PrintTA,
+    /// Print ActivityAnalysis information.
+    PrintAA,
+    /// Print performance warnings from Enzyme.
+    PrintPerf,
+    /// Combines the three print flags above.
+    Print,
+    /// Print the whole module, before running opts.
+    PrintModBefore,
+    /// Print the whole module just before we pass it to Enzyme.
+    /// For debugging purposes, prefer the `OPT` flag below.
+    PrintModAfterOpts,
+    /// Print the module after Enzyme differentiated everything.
+    PrintModAfterEnzyme,
+
+    /// Enzyme's loose type debug helper (can cause incorrect gradients).
+    LooseTypes,
+    /// Output a module using `__enzyme` calls to prepare it for opt + enzyme pass usage.
+    OPT,
+
+    /// TypeTree width. TODO: Figure out how to let users construct these options,
+    /// or whether we want to leave them in the first place.
+    TTWidth(u64),
+    /// TypeTree depth; see the TODO on `TTWidth`.
+    TTDepth(u64),
+
+    /// NOTE(review): presumably skips the module optimizations after Enzyme — confirm.
+    NoModOptAfter,
+    /// Tell Enzyme to run LLVM opts on each function it generated. Off by default,
+    /// since we already optimize the whole module after Enzyme is done.
+    EnableFncOpt,
+    NoVecUnroll,
+    /// Obviously unsafe, disable the length checks that we have for shadow args.
+    NoSafetyChecks,
+    RuntimeActivity,
+    /// Runs Enzyme-specific inlining.
+    Inline,
+    /// Runs optimization twice after AD, and zero times after (NOTE(review): the first
+    /// "after" presumably means "before" — confirm). This is mainly for benchmarking purposes,
+    /// to show that compiler-based AD has a performance benefit. TODO: fix
+    AltPipeline,
+}
+
 /// Settings for `-Z instrument-xray` flag.
 #[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
 pub struct InstrumentXRay {
@@ -3022,7 +3069,7 @@ pub(crate) mod dep_tracking {
     };
 
     use super::{
-        BranchProtection, CFGuard, CFProtection, CollapseMacroDebuginfo, CoverageOptions,
+        AutoDiff, BranchProtection, CFGuard, CFProtection, CollapseMacroDebuginfo, CoverageOptions,
         CrateType, DebugInfo, DebugInfoCompression, ErrorOutputType, FmtDebug, FunctionReturn,
         InliningThreshold, InstrumentCoverage, InstrumentXRay, LinkerPluginLto, LocationDetail,
         LtoCli, NextSolverConfig, OomStrategy, OptLevel, OutFileName, OutputType, OutputTypes,
@@ -3070,6 +3117,7 @@ pub(crate) mod dep_tracking {
     }
 
     impl_dep_tracking_hash_via_hash!(
+        AutoDiff,
         bool,
         usize,
         NonZero<usize>,
diff --git a/compiler/rustc_session/src/config/cfg.rs b/compiler/rustc_session/src/config/cfg.rs
index ccc01728958b8..d9da9e9f26a45 100644
--- a/compiler/rustc_session/src/config/cfg.rs
+++ b/compiler/rustc_session/src/config/cfg.rs
@@ -176,6 +176,8 @@ pub(crate) fn default_configuration(sess: &Session) -> Cfg {
     // NOTE: These insertions should be kept in sync with
     // `CheckCfg::fill_well_known` below.
 
+    ins_none!(sym::autodiff_fallback);
+
     if sess.opts.debug_assertions {
         ins_none!(sym::debug_assertions);
     }
@@ -339,6 +341,7 @@ impl CheckCfg {
         // Don't forget to update `src/doc/rustc/src/check-cfg.md`
         // in the unstable book as well!
 
+        ins!(sym::autodiff_fallback, no_values);
         ins!(sym::debug_assertions, no_values);
 
         ins!(sym::fmt_debug, empty_values).extend(FmtDebug::all());
diff --git a/compiler/rustc_session/src/options.rs b/compiler/rustc_session/src/options.rs
index 54a4621db2462..81381e7e9855c 100644
--- a/compiler/rustc_session/src/options.rs
+++ b/compiler/rustc_session/src/options.rs
@@ -370,6 +370,7 @@ mod desc {
     pub(crate) const parse_list: &str = "a space-separated list of strings";
     pub(crate) const parse_list_with_polarity: &str =
         "a comma-separated list of strings, with elements beginning with + or -";
+    pub(crate) const parse_autodiff: &str = "a comma-separated list of autodiff flags";
     pub(crate) const parse_comma_list: &str = "a comma-separated list of strings";
     pub(crate) const parse_opt_comma_list: &str = parse_comma_list;
     pub(crate) const parse_number: &str = "a number";
@@ -996,6 +997,38 @@ mod parse {
         }
     }
 
+    /// Parses the comma-separated `autodiff` flag list into `slot`; `false` means an unknown flag.
+    pub(crate) fn parse_autodiff(slot: &mut Vec<AutoDiff>, v: Option<&str>) -> bool {
+        let Some(v) = v else {
+            *slot = vec![]; // No value given: reset to the empty list.
+            return true;
+        };
+        let mut v: Vec<&str> = v.split(",").collect();
+        v.sort_unstable(); // NOTE(review): presumably keeps the tracked hash order-independent.
+        for val in v {
+            slot.push(match val {
+                "PrintTA" => AutoDiff::PrintTA,
+                "PrintAA" => AutoDiff::PrintAA,
+                "PrintPerf" => AutoDiff::PrintPerf,
+                "Print" => AutoDiff::Print,
+                "PrintModBefore" => AutoDiff::PrintModBefore,
+                "PrintModAfterOpts" => AutoDiff::PrintModAfterOpts,
+                "PrintModAfterEnzyme" => AutoDiff::PrintModAfterEnzyme,
+                "LooseTypes" => AutoDiff::LooseTypes,
+                "OPT" => AutoDiff::OPT,
+                "NoModOptAfter" => AutoDiff::NoModOptAfter,
+                "EnableFncOpt" => AutoDiff::EnableFncOpt,
+                "NoVecUnroll" => AutoDiff::NoVecUnroll,
+                "NoSafetyChecks" => AutoDiff::NoSafetyChecks,
+                "RuntimeActivity" => AutoDiff::RuntimeActivity,
+                "Inline" => AutoDiff::Inline,
+                "AltPipeline" => AutoDiff::AltPipeline,
+                _ => return false,
+            });
+        }
+        true
+    }
+
     pub(crate) fn parse_instrument_coverage(
         slot: &mut InstrumentCoverage,
         v: Option<&str>,
@@ -1680,6 +1713,8 @@ options! {
          either `loaded` or `not-loaded`."),
     assume_incomplete_release: bool = (false, parse_bool, [TRACKED],
         "make cfg(version) treat the current version as incomplete (default: no)"),
+    autodiff: Vec<crate::config::AutoDiff> = (Vec::new(), parse_autodiff, [TRACKED],
+        "a list of autodiff flags to enable (comma separated)"),
     #[rustc_lint_opt_deny_field_access("use `Session::binary_dep_depinfo` instead of this field")]
     binary_dep_depinfo: bool = (false, parse_bool, [TRACKED],
         "include artifacts (sysroot, crate dependencies) used during compilation in dep-info \
diff --git a/config.example.toml b/config.example.toml
index a52968e9a414d..440274d51e344 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -158,6 +158,9 @@
 # Whether to build the clang compiler.
 #clang = false
 
+# Whether to build Enzyme as the AutoDiff backend.
+#enzyme = true
+
 # Whether to enable llvm compilation warnings.
 #enable-warnings = false