diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 97b7cdf4..408374b8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,4 +6,4 @@ repos:
- id: ruff
args: [check, --fix, scripts, src, setup.py, setup_data.py]
- id: ruff
- args: [format, scripts, src, setup.py setup_data.py]
\ No newline at end of file
+ args: [format, --check, scripts, src, setup.py, setup_data.py]
\ No newline at end of file
diff --git a/README.md b/README.md
index 986b874f..de808d98 100644
--- a/README.md
+++ b/README.md
@@ -21,18 +21,37 @@
## π Overview
-We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 6+ LLM architectures. Each of these can be easily extended to incorporate more variants.
+We provide efficient and streamlined implementations of the TOFU and MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 9+ evaluation metrics, and 6+ LLM architectures. Each of these can be easily extended to incorporate more variants.
We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field.
+---
+
### π’ Updates
-#### [Mar 27, 2025]
-- **Easier contributions, leaderboard and reproducibility**: We've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can document additions better and also update a leaderboard with their results. See [this section](#-how-to-contribute) for details.
+#### [Apr 6, 2025]
+- **More Metrics!** Added 6 Membership Inference Attack (MIA) metrics: LOSS, ZLib, Reference, GradNorm, MinK, and MinK++, along with Extraction Strength (ES) and Exact Memorization (EM) as additional evaluation metrics (see the toy sketch after this list).
+- **More TOFU Evaluations!** Now includes a holdout set and supports MIA attack-based evaluation. You can now compute MUSE's privleak on TOFU.
+- **More Documentation!** [`docs/links.md`](docs/links.md) contains resources for each of the implemented features and other useful LLM unlearning resources.
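For intuition only, here is a toy sketch (not the repository's implementation) of a MinK-style MIA score computed from per-token log-probabilities; the `k = 0.4` fraction mirrors the default in the new metric configs, and all numbers are made up.

```python
import torch

def min_k_score(token_log_probs: torch.Tensor, k: float = 0.4) -> float:
    """Toy MinK-style membership score: mean of the lowest k-fraction of
    per-token log-probabilities (member sequences tend to score higher)."""
    n = max(1, int(k * token_log_probs.numel()))
    lowest = torch.topk(token_log_probs, n, largest=False).values
    return lowest.mean().item()

# Made-up per-token log-probs for a memorized vs. an unseen sequence
member = torch.tensor([-0.2, -0.1, -0.5, -0.3, -0.2])
non_member = torch.tensor([-2.1, -3.0, -1.8, -2.5, -2.9])
print(min_k_score(member), min_k_score(non_member))  # member score is higher
```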
+
+
+
+<details>
+<summary>Older Updates</summary>
+
+#### [Mar 27, 2025]
+- **More Documentation!** Easier contributions and a leaderboard: we've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can better document their additions and update a leaderboard with their results. See [this section](#-how-to-contribute) for details.
+
+#### [Mar 9, 2025]
+- **More Methods!** Added support for [RMU](https://arxiv.org/abs/2403.03218) (representation-engineering based unlearning).
#### [Feb 27, 2025]
⚠️ **Repository Update**: This repo replaces the original TOFU codebase at [`github.com/locuslab/tofu`](https://github.com/locuslab/tofu), which is no longer maintained.
+
+</details>
+
+---
+
## ποΈ Available Components
We provide several variants for each of the components in the unlearning pipeline.
@@ -41,7 +60,7 @@ We provide several variants for each of the components in the unlearning pipelin
|------------------------|----------------------|
| **Benchmarks** | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/) |
| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU |
-| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, QA-ROUGE, MIA Attacks, TruthRatio, Model Utility |
+| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks |
| **Datasets** | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) |
| **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma |
@@ -77,14 +96,15 @@ pip install --no-build-isolation flash-attn==2.6.3
# data setup
python setup_data.py # saves/eval now contains evaluation results of the uploaded models
-# Downloads log files with metric eval results (incl retain model logs) from the models used in the supported benchmarks.
+# Downloads log files with metric eval results (incl retain model logs) from the models
+# used in the supported benchmarks.
```
---
### π Updated TOFU benchmark
-We've updated Open-Unlearning's TOFU benchmark target models to use a wider variety of newer architectures with sizes varying from 1B to 8B. These include LLaMA 3.2 1B, LLaMA 3.2 3B, LLaMA 3.1 8B, and the original LLaMA-2 7B from [the old version of TOFU](github.com/locuslab/tofu).
+We've updated Open-Unlearning's TOFU benchmark target models to use a wider variety of newer architectures with sizes varying from 1B to 8B. These include LLaMA 3.2 1B, LLaMA 3.2 3B, LLaMA 3.1 8B, and the original LLaMA-2 7B (re-created) target models from [the old version of TOFU](https://github.com/locuslab/tofu).
For each architecture, we have finetuned with four different splits of the TOFU datasets: `full`, `retain90`, `retain95`, `retain99`, for a total of 16 finetuned models. The first serves as the target (base model for unlearning) and the rest are retain models used to measure performance against for each forget split. These models are on [HuggingFace](`https://huggingface.co/collections/open-unlearning/tofu-new-models-67bcf636334ea81727573a9f0`) and the paths to these models can be set in the experimental configs or in command-line overrides.
@@ -112,15 +132,18 @@ python src/train.py --config-name=unlearn.yaml experiment=unlearn/tofu/default \
An example command for launching a TOFU evaluation process on `forget10` split:
```bash
+model=Llama-3.2-1B-Instruct
python src/eval.py --config-name=eval.yaml experiment=eval/tofu/default \
- model=Llama-3.2-1B-Instruct \
- model.model_args.pretrained_model_name_or_path=open-unlearning/tofu_Llama-3.2-1B-Instruct_full \
+ model=${model} \
+ model.model_args.pretrained_model_name_or_path=open-unlearning/tofu_${model}_full \
+ retain_logs_path=saves/eval/tofu_${model}_retain90/TOFU_EVAL.json \
task_name=SAMPLE_EVAL
```
- `experiment`- Path to the evaluation configuration [`configs/experiment/eval/tofu/default.yaml`](configs/experiment/eval/tofu/default.yaml).
- `model`- Sets up the model and tokenizer configs for the `Llama-3.2-1B-Instruct` model.
- `model.model_args.pretrained_model_name_or_path`- Overrides the default experiment config to evaluate a model from a HuggingFace ID (can use a local model checkpoint path as well).
+- `retain_logs_path`- Sets the path to the reference model eval logs that are needed to compute reference-model-based metrics like `forget_quality` in TOFU.
For more details about creating and running evaluations, refer [`docs/evaluation.md`](docs/evaluation.md).
@@ -153,7 +176,8 @@ For more in-depth information on specific aspects of the framework, refer to the
| [`docs/experiments.md`](docs/experiments.md) | Guide on running experiments in various configurations and settings, including distributed training, fine-tuning, and overriding arguments. |
| [`docs/hydra.md`](docs/hydra.md) | Explanation of the Hydra features used in configuration management for experiments. |
| [`community/leaderboard.md`](community/leaderboard.md) | Reference results from various unlearning methods run using this framework on TOFU and MUSE benchmarks. |
-| [`docs/repro.md`](docs/repro.md) (deprecated) | Results are provided solely for reproducibility purposes, without any parameter tuning. |
+| [`docs/links.md`](docs/links.md) | List of all links to the research papers or other sources the implemented features are sourced from. |
+| [`docs/repro.md`](docs/repro.md) | Results are provided solely for reproducibility purposes, without any parameter tuning. |
---
## π Support & Contributors
@@ -197,9 +221,15 @@ If you use OpenUnlearning in your research, please cite OpenUnlearning and the b
### π€ Acknowledgements
- This repo is inspired from [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory).
-- The [TOFU](https://github.com/locuslab/tofu) and [MUSE](https://github.com/jaechan-repo/muse_bench) benchmarks served as the foundation for our re-implementation.
+- The [TOFU](https://github.com/locuslab/tofu) and [MUSE](https://github.com/swj0419/muse_bench) benchmarks served as the foundation for our re-implementation.
---
### π License
This project is licensed under the MIT License. See the [`LICENSE`](LICENSE) file for details.
+
+---
+
+### Star History
+
+[Star History Chart](https://www.star-history.com/#locuslab/open-unlearning&Date)
diff --git a/community/leaderboard.md b/community/leaderboard.md
index 88039321..56c007ee 100644
--- a/community/leaderboard.md
+++ b/community/leaderboard.md
@@ -8,48 +8,35 @@ We encourage the community to develop new methods, optimize them for specific be
To implement a new method, refer to our [contributing guide](../docs/contributing.md).
-> **Note:** The [results.md](../docs/results.md) file is maintained for reproducibility purposes. However, we encourage contributors to update the leaderboard table instead of the reproducibility table. We will continue refining and tuning baseline methods to keep the leaderboard up to date.
+> [!NOTE]
+> The [results.md](../docs/results.md) file is maintained for reproducibility purposes. However, we encourage contributors to update the leaderboard table instead of the reproducibility table. We will continue refining and tuning baseline methods to keep the leaderboard up to date.
-### TOFU unlearning on the `Llama-3.2-1B-Instruct` architecture
+### TOFU unlearning on the `Llama-2-7b-hf-chat` architecture
| Method |
- forget01 |
- forget05 |
forget10 |
|
forget_quality |
model_utility |
- forget_quality |
- model_utility |
- forget_quality |
- model_utility |
| Finetuned |
- 0.01 |
- 0.60 |
- 2.96e-13 |
- 0.6 |
- 8.08e-22 |
- 0.6 |
+ 4.35e-25 |
+ 0.63 |
| Retain |
1.0 |
- 0.60 |
- 1.0 |
- 0.6 |
- 1.0 |
- 0.59 |
+ 0.61 |
| |
@@ -70,37 +57,23 @@ To implement a new method, refer to our [contributing guide](../docs/contributin
| Method |
- forget01 |
- forget05 |
forget10 |
|
forget_quality |
model_utility |
- forget_quality |
- model_utility |
- forget_quality |
- model_utility |
| Finetuned |
- 0.01 |
- 0.60 |
- 2.96e-13 |
- 0.6 |
- 8.08e-22 |
+ 1.66e-21 |
0.6 |
| Retain |
1.0 |
- 0.60 |
- 1.0 |
- 0.6 |
- 1.0 |
0.59 |
@@ -143,7 +116,7 @@ To implement a new method, refer to our [contributing guide](../docs/contributin
| 0.64 |
0.58 |
-99.81 |
- 0.55 |
+ 0.56 |
0.47 |
1.0 |
-57.26 |
@@ -152,7 +125,7 @@ To implement a new method, refer to our [contributing guide](../docs/contributin
| Retain |
0.33 |
- 0.21 |
+ 0.20 |
0 |
0.56 |
0.3 |
diff --git a/configs/data/datasets/MUSE_MIA.yaml b/configs/data/datasets/MUSE_MIA.yaml
new file mode 100644
index 00000000..66e818c6
--- /dev/null
+++ b/configs/data/datasets/MUSE_MIA.yaml
@@ -0,0 +1,22 @@
+MUSE_MIA_holdout:
+ access_key: holdout
+ handler: CompletionDataset
+ args:
+ hf_args:
+ path: "muse-bench/MUSE-News"
+ name: "privleak"
+ split: "holdout"
+ prefix_key: "prompt" # doesn't exist in dataset
+ text_key: "text"
+ max_length: 2048
+MUSE_MIA_forget:
+ access_key: forget
+ handler: CompletionDataset
+ args:
+ hf_args:
+ path: "muse-bench/MUSE-News"
+ name: "privleak"
+ split: "forget"
+ prefix_key: "prompt" # doesn't exist in dataset
+ text_key: "text"
+ max_length: 2048
\ No newline at end of file
diff --git a/configs/data/datasets/MUSE_forget_privleak.yaml b/configs/data/datasets/MUSE_forget_privleak.yaml
deleted file mode 100644
index 4013eb0d..00000000
--- a/configs/data/datasets/MUSE_forget_privleak.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-MUSE_forget_privleak:
- handler: CompletionDataset
- args:
- hf_args:
- path: "muse-bench/MUSE-News"
- name: "privleak"
- split: "forget"
- prefix_key: "prompt" # doesn't exist in dataset
- text_key: "text"
- max_length: 2048
\ No newline at end of file
diff --git a/configs/data/datasets/MUSE_holdout_privleak.yaml b/configs/data/datasets/MUSE_holdout_privleak.yaml
deleted file mode 100644
index 4fcda6e2..00000000
--- a/configs/data/datasets/MUSE_holdout_privleak.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-MUSE_holdout_privleak:
- handler: CompletionDataset
- args:
- hf_args:
- path: "muse-bench/MUSE-News"
- name: "privleak"
- split: "holdout"
- prefix_key: "prompt" # doesn't exist in dataset
- text_key: "text"
- max_length: 2048
\ No newline at end of file
diff --git a/configs/data/datasets/MUSE_retain_privleak.yaml b/configs/data/datasets/MUSE_retain_privleak.yaml
deleted file mode 100644
index e52813cd..00000000
--- a/configs/data/datasets/MUSE_retain_privleak.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-MUSE_retain_privleak:
- handler: PretrainingDataset
- args:
- hf_args:
- path: "muse-bench/MUSE-News"
- name: "privleak"
- split: "retain"
- text_key: "text"
- max_length: 2048
\ No newline at end of file
diff --git a/configs/data/datasets/TOFU_MIA.yaml b/configs/data/datasets/TOFU_MIA.yaml
new file mode 100644
index 00000000..20b6c97e
--- /dev/null
+++ b/configs/data/datasets/TOFU_MIA.yaml
@@ -0,0 +1,22 @@
+TOFU_QA_forget:
+ access_key: forget
+ handler: QADataset
+ args:
+ hf_args:
+ name: "forget10"
+ split: "train"
+ path: "locuslab/TOFU"
+ question_key: "question"
+ answer_key: "answer"
+ max_length: 512
+TOFU_QA_holdout:
+ access_key: holdout
+ handler: QADataset
+ args:
+ hf_args:
+ name: "holdout10"
+ path: "locuslab/TOFU"
+ split: "train"
+ question_key: "question"
+ answer_key: "answer"
+ max_length: 512
\ No newline at end of file
diff --git a/configs/eval.yaml b/configs/eval.yaml
index 06e90a39..fef5ed44 100644
--- a/configs/eval.yaml
+++ b/configs/eval.yaml
@@ -13,4 +13,5 @@ model:
device_map: cuda
mode: eval
-task_name: ???
\ No newline at end of file
+task_name: ???
+seed: 0
\ No newline at end of file
diff --git a/configs/eval/muse.yaml b/configs/eval/muse.yaml
index 6350e423..ecdd98f8 100644
--- a/configs/eval/muse.yaml
+++ b/configs/eval/muse.yaml
@@ -7,6 +7,14 @@ defaults:
- retain_knowmem_ROUGE
- forget_verbmem_ROUGE
- privleak
+ - extraction_strength
+ # - exact_memorization
+ # - mia_min_k_plus_plus
+ # - mia_min_k
+ # - mia_loss
+ # - mia_reference
+ # - mia_zlib
+ # - mia_gradnorm
handler: MUSEEvaluator
output_dir: ${paths.output_dir} # set to default eval directory
diff --git a/configs/eval/muse_metrics/forget_minKpc_neg_logprob.yaml b/configs/eval/muse_metrics/exact_memorization.yaml
similarity index 50%
rename from configs/eval/muse_metrics/forget_minKpc_neg_logprob.yaml
rename to configs/eval/muse_metrics/exact_memorization.yaml
index d9829df4..68b940dd 100644
--- a/configs/eval/muse_metrics/forget_minKpc_neg_logprob.yaml
+++ b/configs/eval/muse_metrics/exact_memorization.yaml
@@ -1,13 +1,12 @@
-# @package eval.muse.metrics.forget_minKpc_neg_logprob
+# @package eval.muse.metrics.exact_memorization
defaults:
- - ../../data/datasets@datasets: MUSE_forget_privleak
+ - ../../data/datasets@datasets: MUSE_forget_verbmem
- ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
-handler: minKpc_negative_logprob
-batch_size: 8
-percentile_K: 40
+handler: exact_memorization
+batch_size: 8
datasets:
- MUSE_forget_privleak:
+ MUSE_forget_verbmem:
args:
hf_args:
path: muse-bench/MUSE-${eval.muse.data_split}
\ No newline at end of file
diff --git a/configs/eval/muse_metrics/extraction_strength.yaml b/configs/eval/muse_metrics/extraction_strength.yaml
new file mode 100644
index 00000000..18d716a5
--- /dev/null
+++ b/configs/eval/muse_metrics/extraction_strength.yaml
@@ -0,0 +1,12 @@
+# @package eval.muse.metrics.extraction_strength
+defaults:
+ - ../../data/datasets@datasets: MUSE_forget_verbmem
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+
+handler: extraction_strength
+batch_size: 8
+datasets:
+ MUSE_forget_verbmem:
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
\ No newline at end of file
diff --git a/configs/eval/muse_metrics/holdout_minKpc_neg_logprob.yaml b/configs/eval/muse_metrics/holdout_minKpc_neg_logprob.yaml
deleted file mode 100644
index 4d3d1fa3..00000000
--- a/configs/eval/muse_metrics/holdout_minKpc_neg_logprob.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-# @package eval.muse.metrics.holdout_minKpc_neg_logprob
-defaults:
- - ../../data/datasets@datasets: MUSE_holdout_privleak
- - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
-handler: minKpc_negative_logprob
-batch_size: 8
-percentile_K: 40
-
-datasets:
- MUSE_holdout_privleak :
- args:
- hf_args:
- path: muse-bench/MUSE-${eval.muse.data_split}
\ No newline at end of file
diff --git a/configs/eval/muse_metrics/mia_gradnorm.yaml b/configs/eval/muse_metrics/mia_gradnorm.yaml
new file mode 100644
index 00000000..89cb8c56
--- /dev/null
+++ b/configs/eval/muse_metrics/mia_gradnorm.yaml
@@ -0,0 +1,18 @@
+# @package eval.muse.metrics.mia_gradnorm
+defaults:
+ - ../../data/datasets@datasets: MUSE_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+datasets:
+ MUSE_MIA_holdout:
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+ MUSE_MIA_forget:
+ access_key: forget
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+
+handler: mia_gradnorm
+batch_size: 1
+p: 2
\ No newline at end of file
diff --git a/configs/eval/muse_metrics/mia_loss.yaml b/configs/eval/muse_metrics/mia_loss.yaml
new file mode 100644
index 00000000..dfca1136
--- /dev/null
+++ b/configs/eval/muse_metrics/mia_loss.yaml
@@ -0,0 +1,17 @@
+# @package eval.muse.metrics.mia_loss
+defaults:
+ - ../../data/datasets@datasets: MUSE_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+datasets:
+ MUSE_MIA_holdout:
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+ MUSE_MIA_forget:
+ access_key: forget
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+
+batch_size: 8
+handler: mia_loss
diff --git a/configs/eval/muse_metrics/mia_min_k.yaml b/configs/eval/muse_metrics/mia_min_k.yaml
new file mode 100644
index 00000000..9b2b14c7
--- /dev/null
+++ b/configs/eval/muse_metrics/mia_min_k.yaml
@@ -0,0 +1,18 @@
+# @package eval.muse.metrics.mia_min_k
+defaults:
+ - ../../data/datasets@datasets: MUSE_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+datasets:
+ MUSE_MIA_holdout:
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+ MUSE_MIA_forget:
+ access_key: forget
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+
+batch_size: 8
+handler: mia_min_k
+k: 0.4
\ No newline at end of file
diff --git a/configs/eval/muse_metrics/mia_min_k_plus_plus.yaml b/configs/eval/muse_metrics/mia_min_k_plus_plus.yaml
new file mode 100644
index 00000000..e497c206
--- /dev/null
+++ b/configs/eval/muse_metrics/mia_min_k_plus_plus.yaml
@@ -0,0 +1,18 @@
+# @package eval.muse.metrics.mia_min_k_plus_plus
+defaults:
+ - ../../data/datasets@datasets: MUSE_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+datasets:
+ MUSE_MIA_holdout:
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+ MUSE_MIA_forget:
+ access_key: forget
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+
+batch_size: 8
+handler: mia_min_k_plus_plus
+k: 0.4
\ No newline at end of file
diff --git a/configs/eval/muse_metrics/mia_reference.yaml b/configs/eval/muse_metrics/mia_reference.yaml
new file mode 100644
index 00000000..ea5d38cd
--- /dev/null
+++ b/configs/eval/muse_metrics/mia_reference.yaml
@@ -0,0 +1,18 @@
+# @package eval.muse.metrics.mia_reference
+defaults:
+ - ../../data/datasets@datasets: MUSE_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+datasets:
+ MUSE_MIA_holdout:
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+ MUSE_MIA_forget:
+ access_key: forget
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+
+batch_size: 8
+handler: mia_reference
+reference_model_path: muse-bench/MUSE-${eval.muse.data_split}_retrain # modify appropriately
diff --git a/configs/eval/muse_metrics/mia_zlib.yaml b/configs/eval/muse_metrics/mia_zlib.yaml
new file mode 100644
index 00000000..d8813320
--- /dev/null
+++ b/configs/eval/muse_metrics/mia_zlib.yaml
@@ -0,0 +1,17 @@
+# @package eval.muse.metrics.mia_zlib
+defaults:
+ - ../../data/datasets@datasets: MUSE_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+datasets:
+ MUSE_MIA_holdout:
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+ MUSE_MIA_forget:
+ access_key: forget
+ args:
+ hf_args:
+ path: muse-bench/MUSE-${eval.muse.data_split}
+
+batch_size: 8
+handler: mia_zlib
\ No newline at end of file
diff --git a/configs/eval/muse_metrics/privleak.yaml b/configs/eval/muse_metrics/privleak.yaml
index d16aa756..048f946e 100644
--- a/configs/eval/muse_metrics/privleak.yaml
+++ b/configs/eval/muse_metrics/privleak.yaml
@@ -1,22 +1,17 @@
# @package eval.muse.metrics.privleak
defaults:
- - .@pre_compute.forget_minKpc_neg_logprob: forget_minKpc_neg_logprob
- - .@pre_compute.holdout_minKpc_neg_logprob: holdout_minKpc_neg_logprob
+ - .@pre_compute.mia_min_k: mia_min_k
pre_compute:
- forget_minKpc_neg_logprob:
+ mia_min_k:
access_key: forget
- holdout_minKpc_neg_logprob:
- access_key: holdout
reference_logs:
retain_model_logs:
path: ${eval.muse.retain_logs_path}
include:
- forget_minKpc_neg_logprob:
+ mia_min_k:
access_key: retain
- holdout_minKpc_neg_logprob:
- access_key: holdout
-handler: relative_auc
-ref_value: 0.5
+handler: privleak
+ref_value: 0.5
\ No newline at end of file
diff --git a/configs/eval/tofu.yaml b/configs/eval/tofu.yaml
index 61aaddf7..bbfea269 100644
--- a/configs/eval/tofu.yaml
+++ b/configs/eval/tofu.yaml
@@ -4,11 +4,19 @@
defaults: # include all defined metrics files
- tofu_metrics: # When you import a metric here, its configuration automatically populates the
# metric key below, enabled by the @package directive at the top of each configuration file.
-
- forget_quality
- forget_Q_A_Prob
- forget_Q_A_ROUGE
- model_utility # populated in the metrics key as metrics.model_utility
+ - privleak
+ - extraction_strength
+ # - exact_memorization
+ # - mia_min_k_plus_plus
+ # - mia_min_k
+ # - mia_loss
+ # - mia_zlib
+ # - mia_gradnorm
+ # - mia_reference # set reference model path appropriately
handler: TOFUEvaluator
output_dir: ${paths.output_dir} # set to default eval directory
@@ -16,4 +24,5 @@ metrics: {} # lists a mapping from each evaluation metric to its config
# populated through the first (@package) line in each metric config
overwrite: false
forget_split: forget10
+holdout_split: holdout10
retain_logs_path: null
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/exact_memorization.yaml b/configs/eval/tofu_metrics/exact_memorization.yaml
new file mode 100644
index 00000000..c8ebb7aa
--- /dev/null
+++ b/configs/eval/tofu_metrics/exact_memorization.yaml
@@ -0,0 +1,14 @@
+# @package eval.tofu.metrics.exact_memorization
+defaults:
+ - ../../data/datasets@datasets: TOFU_QA_forget
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+ # ^ get default dataset and generation config information
+
+handler: exact_memorization
+batch_size: 32
+
+datasets:
+ TOFU_QA_forget:
+ args:
+ hf_args:
+ name: ${eval.tofu.forget_split}
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/extraction_strength.yaml b/configs/eval/tofu_metrics/extraction_strength.yaml
new file mode 100644
index 00000000..654da682
--- /dev/null
+++ b/configs/eval/tofu_metrics/extraction_strength.yaml
@@ -0,0 +1,14 @@
+# @package eval.tofu.metrics.extraction_strength
+defaults:
+ - ../../data/datasets@datasets: TOFU_QA_forget
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+ # ^ get default dataset and generation config information
+
+handler: extraction_strength
+batch_size: 32
+
+datasets:
+ TOFU_QA_forget:
+ args:
+ hf_args:
+ name: ${eval.tofu.forget_split}
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/forget_quality.yaml b/configs/eval/tofu_metrics/forget_quality.yaml
index 5119a5d2..888e8176 100644
--- a/configs/eval/tofu_metrics/forget_quality.yaml
+++ b/configs/eval/tofu_metrics/forget_quality.yaml
@@ -13,4 +13,4 @@ pre_compute:
forget_truth_ratio:
access_key: forget
-handler: forget_quality
\ No newline at end of file
+handler: ks_test
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/mia_gradnorm.yaml b/configs/eval/tofu_metrics/mia_gradnorm.yaml
new file mode 100644
index 00000000..1f2c3b25
--- /dev/null
+++ b/configs/eval/tofu_metrics/mia_gradnorm.yaml
@@ -0,0 +1,18 @@
+# @package eval.tofu.metrics.mia_gradnorm
+defaults:
+ - ../../data/datasets@datasets: TOFU_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+
+handler: mia_gradnorm
+batch_size: 1
+p: 2
+
+datasets:
+ TOFU_QA_forget:
+ args:
+ hf_args:
+ name: ${eval.tofu.forget_split}
+ TOFU_QA_holdout:
+ args:
+ hf_args:
+ name: ${eval.tofu.holdout_split}
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/mia_loss.yaml b/configs/eval/tofu_metrics/mia_loss.yaml
new file mode 100644
index 00000000..ee605377
--- /dev/null
+++ b/configs/eval/tofu_metrics/mia_loss.yaml
@@ -0,0 +1,16 @@
+# @package eval.tofu.metrics.mia_loss
+defaults:
+ - ../../data/datasets@datasets: TOFU_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+batch_size: 32
+handler: mia_loss
+
+datasets:
+ TOFU_QA_forget:
+ args:
+ hf_args:
+ name: ${eval.tofu.forget_split}
+ TOFU_QA_holdout:
+ args:
+ hf_args:
+ name: ${eval.tofu.holdout_split}
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/mia_min_k.yaml b/configs/eval/tofu_metrics/mia_min_k.yaml
new file mode 100644
index 00000000..fb87080c
--- /dev/null
+++ b/configs/eval/tofu_metrics/mia_min_k.yaml
@@ -0,0 +1,17 @@
+# @package eval.tofu.metrics.mia_min_k
+defaults:
+ - ../../data/datasets@datasets: TOFU_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+batch_size: 32
+handler: mia_min_k
+k: 0.4
+
+datasets:
+ TOFU_QA_forget:
+ args:
+ hf_args:
+ name: ${eval.tofu.forget_split}
+ TOFU_QA_holdout:
+ args:
+ hf_args:
+ name: ${eval.tofu.holdout_split}
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml b/configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml
new file mode 100644
index 00000000..c95be8fc
--- /dev/null
+++ b/configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml
@@ -0,0 +1,17 @@
+# @package eval.tofu.metrics.mia_min_k_plus_plus
+defaults:
+ - ../../data/datasets@datasets: TOFU_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+batch_size: 32
+k: 0.4
+handler: mia_min_k_plus_plus
+
+datasets:
+ TOFU_QA_forget:
+ args:
+ hf_args:
+ name: ${eval.tofu.forget_split}
+ TOFU_QA_holdout:
+ args:
+ hf_args:
+ name: ${eval.tofu.holdout_split}
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/mia_reference.yaml b/configs/eval/tofu_metrics/mia_reference.yaml
new file mode 100644
index 00000000..b571d190
--- /dev/null
+++ b/configs/eval/tofu_metrics/mia_reference.yaml
@@ -0,0 +1,17 @@
+# @package eval.tofu.metrics.mia_reference
+defaults:
+ - ../../data/datasets@datasets: TOFU_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+batch_size: 32
+handler: mia_reference
+reference_model_path: ??? # modify appropriately, e.g. open-unlearning/tofu_Llama-3.2-1B-Instruct_retain90
+
+datasets:
+ TOFU_QA_forget:
+ args:
+ hf_args:
+ name: ${eval.tofu.forget_split}
+ TOFU_QA_holdout:
+ args:
+ hf_args:
+ name: ${eval.tofu.holdout_split}
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/mia_zlib.yaml b/configs/eval/tofu_metrics/mia_zlib.yaml
new file mode 100644
index 00000000..68fbe2d0
--- /dev/null
+++ b/configs/eval/tofu_metrics/mia_zlib.yaml
@@ -0,0 +1,16 @@
+# @package eval.tofu.metrics.mia_zlib
+defaults:
+ - ../../data/datasets@datasets: TOFU_MIA
+ - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+batch_size: 32
+handler: mia_zlib
+
+datasets:
+ TOFU_QA_forget:
+ args:
+ hf_args:
+ name: ${eval.tofu.forget_split}
+ TOFU_QA_holdout:
+ args:
+ hf_args:
+ name: ${eval.tofu.holdout_split}
\ No newline at end of file
diff --git a/configs/eval/tofu_metrics/privleak.yaml b/configs/eval/tofu_metrics/privleak.yaml
new file mode 100644
index 00000000..44a461e2
--- /dev/null
+++ b/configs/eval/tofu_metrics/privleak.yaml
@@ -0,0 +1,17 @@
+# @package eval.tofu.metrics.privleak
+defaults:
+ - .@pre_compute.mia_min_k: mia_min_k
+
+pre_compute:
+ mia_min_k:
+ access_key: forget
+
+reference_logs:
+ retain_model_logs:
+ path: ${eval.tofu.retain_logs_path}
+ include:
+ mia_min_k:
+ access_key: retain
+
+handler: privleak
+ref_value: 0.5
diff --git a/configs/experiment/eval/tofu/default.yaml b/configs/experiment/eval/tofu/default.yaml
index 600f2bf9..b2be0205 100644
--- a/configs/experiment/eval/tofu/default.yaml
+++ b/configs/experiment/eval/tofu/default.yaml
@@ -5,6 +5,7 @@ defaults:
- override /eval: tofu
forget_split: forget10
+holdout_split: holdout10
retain_logs_path: null
model:
@@ -14,6 +15,7 @@ model:
eval:
tofu:
forget_split: ${forget_split}
+ holdout_split: ${holdout_split}
retain_logs_path: ${retain_logs_path}
task_name: ???
\ No newline at end of file
diff --git a/configs/experiment/examples/muse_unlearn.yaml b/configs/experiment/examples/muse_unlearn.yaml
index 1535b920..07c6d12f 100644
--- a/configs/experiment/examples/muse_unlearn.yaml
+++ b/configs/experiment/examples/muse_unlearn.yaml
@@ -140,7 +140,7 @@ eval:
index: index
handler: minKpc_negative_logprob
batch_size: 8
- percentile_K: 40
+ k: 0.4
access_key: forget
holdout_minKpc_neg_logprob:
datasets:
@@ -161,7 +161,7 @@ eval:
index: index
handler: minKpc_negative_logprob
batch_size: 8
- percentile_K: 40
+ k: 0.4
access_key: holdout
reference_logs:
retain_model_logs:
@@ -171,7 +171,7 @@ eval:
access_key: retain
holdout_minKpc_neg_logprob:
access_key: holdout
- handler: relative_auc
+ handler: privleak
ref_value: 0.5
handler: MUSEEvaluator
device: cuda
diff --git a/configs/experiment/finetune/tofu/default.yaml b/configs/experiment/finetune/tofu/default.yaml
index 072e86d2..ce4d62bf 100644
--- a/configs/experiment/finetune/tofu/default.yaml
+++ b/configs/experiment/finetune/tofu/default.yaml
@@ -4,6 +4,7 @@ defaults:
- override /model: Llama-3.2-1B-Instruct
- override /trainer: finetune
- override /data/datasets@data.train: TOFU_QA_full
+ - override /eval: tofu
mode: finetune
trainer:
@@ -13,4 +14,17 @@ trainer:
warmup_epochs: 1.0 # custom parameter
num_train_epochs: 5
+
+forget_split: forget10
+holdout_split: holdout10
+retain_logs_path: null
+
+eval:
+ tofu:
+ forget_split: ${forget_split}
+ holdout_split: ${holdout_split}
+ retain_logs_path: ${retain_logs_path}
+ overwrite: true
+
+
task_name: tofu_Llama-3.2-1B-Instruct_full
\ No newline at end of file
diff --git a/configs/experiment/unlearn/tofu/default.yaml b/configs/experiment/unlearn/tofu/default.yaml
index f2e0ab1a..3ea7b4fd 100644
--- a/configs/experiment/unlearn/tofu/default.yaml
+++ b/configs/experiment/unlearn/tofu/default.yaml
@@ -14,11 +14,13 @@ model:
forget_split: forget10
retain_split: retain90
+holdout_split: holdout10
retain_logs_path: null
eval:
tofu:
forget_split: ${forget_split}
+ holdout_split: ${holdout_split}
retain_logs_path: ${retain_logs_path}
overwrite: true
diff --git a/docs/components.md b/docs/components.md
index 016932c9..0c889efd 100644
--- a/docs/components.md
+++ b/docs/components.md
@@ -19,7 +19,8 @@ This process involves three main steps:
6. [Collator](#collator) - Handles data collation logic
7. [Experiment](#experiment) - Combines components into a final experiment config
-> [!Note] adding each component requires Hydra config management features, which are documented in [`docs/hydra.md`](../docs/hydra.md).
+> [!NOTE]
+> Adding each component requires Hydra config management features, which are documented in [`docs/hydra.md`](../docs/hydra.md).
---
@@ -147,7 +148,8 @@ To add a new model architecture:
### Implement and register a handler
For all the models currently supported, HuggingFace's `AutoModelForCausalLM` and `AutoTokenizer` are used, and therefore the user doesn't need to create or register any handler.
-> [!Note]: Currently, we do not support loading models modified with LoRA and related variants. If you wish use such features, please create define and register model handlers for this logic in [`src/model`](../src/model) and provide the config info as discussed next.
+> [!NOTE]
> Currently, we do not support loading models modified with LoRA and related variants. If you wish to use such features, please define and register model handlers for this logic in [`src/model`](../src/model) and provide the config info as discussed next.
### Add to configs
Model configurations contain details required to load the model+tokenizer such as paths, chat templating arguments, LoRA parameters etc. in [`configs/models`](../configs/models/).
diff --git a/docs/contributing.md b/docs/contributing.md
index 3e398f59..583d3f27 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -17,6 +17,8 @@ There are several ways you can contribute to OpenUnlearning:
* Implement new evaluations.
* Contribute to the documentation.
+Once your feature is added, you may also link the relevant paper in [`docs/links.md`](../docs/links.md).
+
## Fixing Issues
If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](#create-a-pull-request) and open a Pull Request!
@@ -61,7 +63,7 @@ Adding a new component listed below requires defining a new class, registering i
6. [Collator](components#collator) - Handles data collation logic
7. [Experiment](components#experiment) - Combines components into a final experiment config
-> **IMPORTANT** π
+> [!IMPORTANT]
> **We especially encourage** contributions of methods and benchmarks that you've created, since you best understand them and know how to use them. We are ready to expedite their integration into OpenUnlearning.
> When facing difficulties implementing any component, please contact the maintainers to join our discord where we can go in detail with the implementations.
@@ -84,7 +86,7 @@ Some methods might involve multiple commands or steps while unlearning: ensure y
### 4. Update Leaderboard and Upload Model
-Don't forget to add your results to the [leaderboard](results.md) and upload your unlearned model to HuggingFace for broader accessibility and reproducibility.
+Don't forget to add your results to the [leaderboard](results.md) and upload your unlearned model to HuggingFace for broader accessibility and reproducibility. Also, if applicable, add a link to your paper in [`docs/links.md`](../docs/links.md).
```bash
pip install huggingface_hub
@@ -116,7 +118,7 @@ Your contributions toward defining or improving evaluation methods can significa
1. **Prepare Datasets & Models** β Create your dataset and train models to generate fine-tuned or retained models.
2. **Define a New Benchmark** (if needed) β Follow the [Benchmark Guide](components.md#benchmark) to implement a new evaluation benchmark.
3. **Run and Tune Baseline Methods** β Evaluate existing unlearning methods on your benchmark and optimize them.
-4. **Document & Share Findings** β Provide detailed steps for reproduction in [`community/benchmarks/`](../community/benchmarks).
+4. **Document & Share Findings** β Provide detailed steps for reproduction in [`community/benchmarks/`](../community/benchmarks). Also, if applicable, add a link to your paper in [`docs/links.md`](../docs/links.md)
---
diff --git a/docs/evaluation.md b/docs/evaluation.md
index ec41a86a..ecce4d2c 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -46,12 +46,12 @@ Other metrics like TOFU's Forget Quality (which is a single score computed over
### Steps to create new metrics:
#### 1. Implement a handler
-Metric handlers are implemented in [`src/evals/metrics`](../src/evals/metrics/), where we define handlers for `probability`, `rouge`, `forget_quality` etc.
+Metric handlers are implemented in [`src/evals/metrics`](../src/evals/metrics/), where we define handlers for `probability`, `rouge`, `privleak` etc.
A metric handler is implemented as a function decorated with `@unlearning_metric`. This decorator wraps the function into an UnlearningMetric object. This provides functionality to automatically load and prepare datasets and collators for `probability` as specified in the eval config ([example](../configs/eval/tofu_metrics/forget_Q_A_Prob.yaml)), so they are readily available for use in the function.
-Example: implementing the `rouge` and `forget_quality` handlers
+Example: implementing the `rouge` and `privleak` handlers
```python
# in src/evals/metrics/memorization.py
@@ -72,16 +72,19 @@ def rouge(model, **kwargs):
}
# in src/evals/metrics/privacy.py
-@unlearning_metric(name="forget_quality")
-def forget_quality(model, **kwargs):
- # the forget quality metric is aggregated from computed statistics of
- # other metrics like truth ratio, which is provided through kwargs
+@unlearning_metric(name="privleak")
+def privleak(model, **kwargs):
+ # the privleak metric is computed from statistics of
+ # other metrics like MIA attack scores, which are provided through kwargs
...
- return {"agg_value": pvalue}
+ return {'agg_value': (score-ref)/(ref+1e-10)*100}
```
- `@unlearning_metric(name="rouge")` - Defines a `rouge` handler.
+> [!NOTE]
+> `kwargs` contains many important attributes that are useful when computing metrics: all the metric-specific parameters defined in the metric's yaml file, as well as the objects created from the other attributes in the metric config, such as the `"tokenizer"`, `"data"` (the preprocessed torch dataset), `"batch_size"`, `"collator"`, `"generation_args"`, `"pre_compute"` (prior metrics the current metric depends on), and `"reference_logs"` (evals from a reference model the current metric can use).
+
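As a rough illustration of how these `kwargs` are typically consumed, below is a minimal sketch of a hypothetical `mean_nll` handler. It mirrors the pattern of the built-in metrics but skips the repository's `run_batchwise_evals` helper for brevity; treat the batch handling (e.g. dropping the collator's `index` field) as an assumption rather than the project's exact API.

```python
import numpy as np
import torch
from torch.utils.data import DataLoader

from evals.metrics.base import unlearning_metric

@unlearning_metric(name="mean_nll")  # hypothetical metric, for illustration only
def mean_nll(model, **kwargs):
    data = kwargs["data"]              # preprocessed torch dataset (from the datasets config)
    collator = kwargs["collators"]     # collator object (from the collators config)
    batch_size = kwargs["batch_size"]  # metric-specific parameter from the metric yaml
    dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator)

    losses = []
    for batch in dataloader:
        # assumed: the index-aware collator adds an "index" entry the model should not see
        batch = {k: v.to(model.device) for k, v in batch.items() if k != "index"}
        with torch.no_grad():
            out = model(**batch)       # batch includes labels, so the LM loss is returned
        losses.append(out.loss.item())
    return {"agg_value": float(np.mean(losses))}
```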
#### 2. Register the metric handler
Register the handler to link the class to the configs via the class name in [`METRIC_REGISTRY`](../src/evals/metrics/__init__.py).
@@ -98,8 +101,7 @@ Metric configurations are in [`configs/eval/tofu_metrics`](../configs/eval/tofu_
Example 1: Creating the config for MUSE's `forget_verbmem_ROUGE` ([`configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml`](../configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml)).
-
+
```yaml
# @package eval.muse.metrics.forget_verbmem_ROUGE
@@ -128,13 +130,9 @@ collators:
generation_args:
max_new_tokens: 128
```
-
Example 2: Creating the config for TOFU's `forget_quality` ([`configs/eval/tofu_metrics/forget_quality.yaml`](../configs/eval/tofu_metrics/forget_quality.yaml)).
-
-
```yaml
# @package eval.tofu.metrics.forget_quality
defaults:
@@ -155,9 +153,9 @@ pre_compute:
forget_truth_ratio:
access_key: forget
-handler: forget_quality
+handler: ks_test # the handler registered in code that implements this metric's logic
```
-
+
### Designing metrics that depend on other metrics
diff --git a/docs/experiments.md b/docs/experiments.md
index b570d3e8..d61009b0 100644
--- a/docs/experiments.md
+++ b/docs/experiments.md
@@ -59,7 +59,8 @@ paths.output_dir=saves/unlearn/NPO/evals
```
-> [!Note]: The unlearning experiments support evaluation during the unlearning finetuning. But this is supported only on a single GPU When multiple GPUs are used to train, checkpoints must be stored and evaluated after training.
+> [!NOTE]
+> The unlearning experiments support evaluation during unlearning finetuning, but only on a single GPU. When multiple GPUs are used for training, checkpoints must be stored and evaluated after training.
---
@@ -70,7 +71,7 @@ To understand the structure of an evaluation config and the kind of available pa
To understand the structure of an unlearning config and the kind of available parameters for overriding, refer to: [`configs/experiment/examples/muse_unlearn.yaml`](../configs/experiment/examples/muse_unlearn.yaml).
The following tables list the most commonly used arguments while running experiments.
-
+
### Model Settings
@@ -242,7 +243,8 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
src/train.py --config-name=unlearn.yaml experiment=unlearn/muse/default.yaml task_name=DISTRIBUTED_TRAIN
```
-> [!Note]: Evaluation runs are designed to work only a single GPU (this includes running evaluation during training). To run an evaluation job, modify your command to make only one GPU visible (assuming one GPU is enough for inference):
+> [!CAUTION]
+> Evaluation runs are designed to work on only a single GPU (this includes running evaluation during training). To run an evaluation job, modify your command to make only one GPU visible (assuming one GPU is enough for inference), as shown below:
```bash
CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/muse/default.yaml task_name=SAMPLE_EVAL
diff --git a/docs/links.md b/docs/links.md
new file mode 100644
index 00000000..9a651a10
--- /dev/null
+++ b/docs/links.md
@@ -0,0 +1,62 @@
+# π Links and References
+
+Links to research papers and resources corresponding to implemented features in this repository. Please feel free to fill in any missing references!
+
+---
+
+## π Table of Contents
+- [Implemented Methods](#implemented-methods)
+- [Benchmarks](#benchmarks)
+- [Evaluation Metrics](#evaluation-metrics)
+- [Useful Links](#useful-links)
+ - [Survey Papers](#survey-papers)
+ - [Other GitHub Repositories](#other-github-repositories)
+
+---
+
+## π Implemented Methods
+
+| Method | Resource |
+|-----------------|----------|
+| GradAscent, GradDiff | Naive baselines found in many papers including MUSE, TOFU etc. |
+| NPO | Paper [π](https://arxiv.org/abs/2404.05868), Code [π](https://github.com/licong-lin/negative-preference-optimization) |
+| SimNPO | Paper [π](https://arxiv.org/abs/2410.07163), Code [π](https://github.com/OPTML-Group/Unlearn-Simple) |
+| IdkDPO | TOFU ([π](https://arxiv.org/abs/2401.06121)) |
+| RMU | WMDP paper ([π](https://github.com/centerforaisafety/wmdp/tree/main/rmu), [π](https://www.wmdp.ai/)), later used in G-effect ([π](https://github.com/tmlr-group/G-effect/blob/main/dataloader.py)) |
+
+---
+
+## π Benchmarks
+
+| Benchmark | Resource |
+|-----------|----------|
+| TOFU | Paper [π](https://arxiv.org/abs/2401.06121) |
+| MUSE | Paper [π](https://arxiv.org/abs/2407.06460) |
+
+---
+
+## π Evaluation Metrics
+
+| Metric | Resource |
+|--------|----------|
+| Verbatim Probability / ROUGE, simple QA-ROUGE | Naive metrics found in many papers including MUSE, TOFU etc. |
+| Membership Inference Attacks (LOSS, ZLib, Reference, GradNorm, MinK, MinK++) | MIMIR ([π](https://github.com/iamgroot42/mimir)), MUSE ([π](https://arxiv.org/abs/2407.06460)) |
+| PrivLeak | MUSE ([π](https://arxiv.org/abs/2407.06460)) |
+| Forget Quality, Truth Ratio, Model Utility | TOFU ([π](https://arxiv.org/abs/2401.06121)) |
+| Extraction Strength (ES) | Carlini et al., 2021 ([π](https://www.usenix.org/conference/usenixsecurity21/presentation/carlini-extracting)), used for unlearning in Wang et al., 2025 ([π](https://openreview.net/pdf?id=wUtCieKuQU)) |
+| Exact Memorization (EM) | Tirumala et al., 2022 ([π](https://proceedings.neurips.cc/paper_files/paper/2022/hash/fa0509f4dab6807e2cb465715bf2d249-Abstract-Conference.html)), used for unlearning in Wang et al., 2025 ([π](https://openreview.net/pdf?id=wUtCieKuQU)) |
+
+---
+
+## π Useful Links
+
+### π Surveys
+- [Machine Unlearning in 2024](https://ai.stanford.edu/~kzliu/blog/unlearning)
+- [Rethinking Machine Unlearning for Large Language Models](https://arxiv.org/abs/2402.08787)
+
+### π Other GitHub Repositories
+- [TOFU Benchmark (original)](https://github.com/locuslab/tofu)
+- [MUSE Benchmark (original)](https://github.com/swj0419/muse_bench)
+- [Awesome LLM Unlearning](https://github.com/chrisliu298/awesome-llm-unlearning)
+- [Awesome Machine Unlearning](https://github.com/tamlhp/awesome-machine-unlearning)
+- [Awesome GenAI Unlearning](https://github.com/franciscoliu/Awesome-GenAI-Unlearning)
\ No newline at end of file
diff --git a/docs/repro.md b/docs/repro.md
index ac64ac38..9bd51037 100644
--- a/docs/repro.md
+++ b/docs/repro.md
@@ -4,7 +4,8 @@
->βFor results where methods have been tuned for optimal performance, please refer to the [`community/leaderboard`](../community/leaderboard.md).
+> [!TIP]
+> This page is for reproducibility. For results where methods have been tuned for optimal performance, please refer to the [`community/leaderboard`](../community/leaderboard.md).
The scripts below execute standard baseline unlearning experiments on the TOFU and MUSE datasets, evaluated using their corresponding benchmarks.
```bash
@@ -22,11 +23,13 @@ For all the experiments below, we used the following setup
|-------------------------|------------|
| **Hardware** | 2 Γ L40s GPUs (48GB each) |
| **Distributed Computing** | [DeepSpeed ZeRO Stage 3 (Accelerate)](https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed) |
-| **Hyperparameters** | Learning Rate (lr) = 1e-5 <br> α = 1, γ = 1, β = 0.1 (where applicable) <br> Number of Epochs = 10 <br> Optimizer: [paged_adamw_32bit](https://huggingface.co/docs/bitsandbytes/main/en/reference/optim/adamw#bitsandbytes.optim.PagedAdamW) |
+| **Hyperparameters** | Learning Rate (lr) = 1e-5 <br> α = 1, γ = 1, β = 0.1 (where applicable) <br> Batch size 32 effectively: 8 per device, 4 grad accum steps <br> Number of Epochs = 10 <br> Optimizer: [paged_adamw_32bit](https://huggingface.co/docs/bitsandbytes/main/en/reference/optim/adamw#bitsandbytes.optim.PagedAdamW) |
-__Note:__
-1. Results may vary even with the same effective hyperparameters when trained with modifications to the distributed training setup, including when training on a single GPU. For example: methods such as SimNPO & RMU can be significantly improved with careful tuning. **Please use these numbers only for reproducibility purposes**.
-2. NPO in MUSE: for NPO, the MUSE implementation is inconsistent with the [original paper](https://github.com/licong-lin/negative-preference-optimization) as discussed [here]( https://github.com/jaechan-repo/muse_bench/issues/2). This inconsistency is carried over into implementations like [SimNPO](https://github.com/OPTML-Group/Unlearn-Simple/issues/5). Here, we use the original NPO implementation with the same loss function expression across datasets.
+
+> [!NOTE]
+> 1. The results in the next section display only an important subset of metrics for each benchmark. For more of the available evaluation metrics, see the `muse*/*_SUMMARY.json` and `tofu*/evals*/*_SUMMARY.json` files on the [HuggingFace space](https://huggingface.co/datasets/open-unlearning/eval).
+> 2. Results may vary even with the same effective hyperparameters when trained with modifications to the distributed training setup, including when training on a single GPU. For example: methods such as SimNPO & RMU can be significantly improved with careful tuning. **Please use the below numbers only for reproducibility purposes**.
+> 3. __NPO inconsistency__: for NPO, the MUSE implementation is inconsistent with the [original paper](https://github.com/licong-lin/negative-preference-optimization) as discussed [here](https://github.com/jaechan-repo/muse_bench/issues/2). This inconsistency is carried over into implementations like [SimNPO](https://github.com/OPTML-Group/Unlearn-Simple/issues/5). Here, we use the original NPO implementation with the same loss function expression across datasets.
@@ -60,7 +63,7 @@ __Note:__
1.27e-03 |
0.63 |
0.53 |
- 1.33e-13 |
+ 5.87e-14 |
0.63 |
0.51 |
4.35e-25 |
@@ -187,12 +190,12 @@ __Note:__
| Finetuned |
0.01 |
- 0.60 |
+ 0.6 |
0.47 |
- 2.96e-13 |
+ 1.33e-13 |
0.6 |
0.47 |
- 8.08e-22 |
+ 1.66e-21 |
0.6 |
0.48 |
@@ -203,7 +206,7 @@ __Note:__
0.65 |
1.0 |
0.6 |
- 0.63 |
+ 0.64 |
1.0 |
0.59 |
0.63 |
@@ -316,7 +319,7 @@ __Note:__
0.64 |
0.58 |
-99.81 |
- 0.55 |
+ 0.56 |
0.47 |
1.0 |
-57.26 |
@@ -325,7 +328,7 @@ __Note:__
| Retain |
0.33 |
- 0.21 |
+ 0.20 |
0 |
0.56 |
0.3 |
diff --git a/requirements.txt b/requirements.txt
index 147f5155..2f39c76e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,6 @@ datasets==3.0.1
accelerate==0.34.2
bitsandbytes==0.44.1
rouge-score==0.1.2
-pre-commit==4.0.1
scipy==1.14.1
tensorboard==2.18.0
scikit-learn==1.5.2
diff --git a/scripts/tofu_finetune.sh b/scripts/tofu_finetune.sh
index bb18fa44..dfb3a692 100644
--- a/scripts/tofu_finetune.sh
+++ b/scripts/tofu_finetune.sh
@@ -11,10 +11,10 @@ models=(
)
per_device_train_batch_size=4 # Effective batch size 32 on two GPUs with gradient_accumulation_steps=8
-forget_retain_splits=(
- "forget01 retain99"
- "forget05 retain95"
- "forget10 retain90"
+splits=(
+ "forget01 holdout01 retain99"
+ "forget05 holdout05 retain95"
+ "forget10 holdout10 retain90"
)
@@ -23,9 +23,10 @@ forget_retain_splits=(
########################################### RETAIN Finetuned TOFU ######################################################
########################################################################################################################
-for split in "${forget_retain_splits[@]}"; do
+for split in "${splits[@]}"; do
forget_split=$(echo $split | cut -d' ' -f1)
- retain_split=$(echo $split | cut -d' ' -f2)
+ holdout_split=$(echo $split | cut -d' ' -f2)
+ retain_split=$(echo $split | cut -d' ' -f3)
for model in "${models[@]}"; do
CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
@@ -41,6 +42,7 @@ for split in "${forget_retain_splits[@]}"; do
CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/tofu/default.yaml \
forget_split=${forget_split} \
+ holdout_split=${holdout_split} \
task_name=tofu_${model}_${retain_split} \
model=${model} \
model.model_args.pretrained_model_name_or_path=saves/finetune/tofu_${model}_${retain_split}
@@ -65,12 +67,14 @@ for model in "${models[@]}"; do
trainer.args.gradient_checkpointing=true
# Evaluate the full models on each forget split
- for split in "${forget_retain_splits[@]}"; do
+ for split in "${splits[@]}"; do
forget_split=$(echo $split | cut -d' ' -f1)
- retain_split=$(echo $split | cut -d' ' -f2)
+ holdout_split=$(echo $split | cut -d' ' -f2)
+ retain_split=$(echo $split | cut -d' ' -f3)
CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/tofu/default.yaml \
forget_split=${forget_split} \
+ holdout_split=${holdout_split} \
task_name=tofu_${model}_full_${forget_split} \
model=${model} \
model.model_args.pretrained_model_name_or_path=saves/finetune/tofu_${model}_full \
diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh
index ae33189f..87d9a7c1 100644
--- a/scripts/tofu_unlearn.sh
+++ b/scripts/tofu_unlearn.sh
@@ -16,12 +16,13 @@ trainers_experiments=(
"DPO unlearn/tofu/idk.yaml"
"RMU unlearn/tofu/default.yaml"
)
-forget_retain_splits=(
- "forget01 retain99"
- "forget05 retain95"
- "forget10 retain90"
+splits=(
+ "forget01 holdout01 retain99"
+ "forget05 holdout05 retain95"
+ "forget10 holdout10 retain90"
)
+
per_device_train_batch_size=4 # on two gpus would make effective batch size 32
gradient_accumulation_steps=4
@@ -31,9 +32,11 @@ gradient_accumulation_steps=4
########################################################################################################################
-for split in "${forget_retain_splits[@]}"; do
+for split in "${splits[@]}"; do
forget_split=$(echo $split | cut -d' ' -f1)
- retain_split=$(echo $split | cut -d' ' -f2)
+ holdout_split=$(echo $split | cut -d' ' -f2)
+ retain_split=$(echo $split | cut -d' ' -f3)
+
for model in "${models[@]}"; do
for trainer_experiment in "${trainers_experiments[@]}"; do
trainer=$(echo $trainer_experiment | cut -d' ' -f1)
@@ -63,6 +66,7 @@ for split in "${forget_retain_splits[@]}"; do
CUDA_VISIBLE_DEVICES=0 python src/eval.py \
experiment=eval/tofu/default.yaml \
forget_split=${forget_split} \
+ holdout_split=${holdout_split} \
model=${model} \
task_name=${task_name} \
model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
diff --git a/setup.py b/setup.py
index 209335c1..b02a3481 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@
long_description=open("README.md").read(),
long_description_content_type="text/markdown",
url="https://github.com/locuslab/open-unlearning",
+ license="MIT",
packages=find_packages(),
install_requires=requirements, # Uses requirements.txt
extras_require={
diff --git a/src/data/__init__.py b/src/data/__init__.py
index e8388003..a67ce7da 100644
--- a/src/data/__init__.py
+++ b/src/data/__init__.py
@@ -40,9 +40,8 @@ def _load_single_dataset(dataset_name, dataset_cfg: DictConfig, **kwargs):
def get_datasets(dataset_cfgs: Union[Dict, DictConfig], **kwargs):
dataset = {}
for dataset_name, dataset_cfg in dataset_cfgs.items():
- dataset[dataset_name] = _load_single_dataset(
- dataset_name, dataset_cfg, **kwargs
- )
+ access_name = dataset_cfg.get("access_key", dataset_name)
+ dataset[access_name] = _load_single_dataset(dataset_name, dataset_cfg, **kwargs)
if len(dataset) == 1:
# return a single dataset
return list(dataset.values())[0]
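A toy illustration (not the repository's loader) of the remapping introduced above: when a dataset config carries an `access_key`, the returned mapping uses that key, so metrics can address datasets by role (`forget`, `holdout`) instead of by config name.

```python
def key_datasets(dataset_cfgs: dict) -> dict:
    """Mimics only the keying behaviour of get_datasets above."""
    keyed = {}
    for name, cfg in dataset_cfgs.items():
        keyed[cfg.get("access_key", name)] = f"<dataset loaded from {name}>"
    return keyed

cfgs = {
    "TOFU_QA_forget": {"access_key": "forget"},
    "TOFU_QA_holdout": {"access_key": "holdout"},
    "TOFU_QA_full": {},  # no access_key: keyed by its config name, as before
}
print(key_datasets(cfgs))
# {'forget': '<dataset loaded from TOFU_QA_forget>', 'holdout': ..., 'TOFU_QA_full': ...}
```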
diff --git a/src/eval.py b/src/eval.py
index 82aae675..066c8dad 100644
--- a/src/eval.py
+++ b/src/eval.py
@@ -1,6 +1,7 @@
import hydra
from omegaconf import DictConfig
+from trainer.utils import seed_everything
from model import get_model
from evals import get_evaluators
@@ -11,6 +12,7 @@ def main(cfg: DictConfig):
Args:
cfg (DictConfig): Config to train
"""
+ seed_everything(cfg.seed)
model_cfg = cfg.model
template_args = model_cfg.template_args
assert model_cfg is not None, "Invalid model yaml passed in train config."
diff --git a/src/evals/base.py b/src/evals/base.py
index 992c2496..3d794d5a 100644
--- a/src/evals/base.py
+++ b/src/evals/base.py
@@ -32,8 +32,11 @@ def save_logs(self, logs, file):
"""Save the logs in a json file"""
logs = dict(sorted(logs.items()))
os.makedirs(os.path.dirname(file), exist_ok=True)
- with open(file, "w") as f:
- json.dump(logs, f, indent=4)
+ try:
+ with open(file, "w") as f:
+ json.dump(logs, f, indent=4)
+ except Exception as e:
+ raise RuntimeError(f"Failed to save {file}: {e}")
def prepare_model(self, model):
"""Prepare model for evaluation"""
@@ -49,6 +52,8 @@ def summarize(self, logs):
"""Summarize the metrics results"""
metric_summary = {}
for metric_name, metric_results in logs.items():
+ if metric_name not in self.metrics:
+ continue
agg_value = metric_results.get("agg_value", None)
if agg_value is not None:
metric_summary[metric_name] = agg_value
@@ -77,6 +82,7 @@ def evaluate(self, model, output_dir=None, overwrite=None, **kwargs):
logger.info(
f"Result for metric {metric_name}:\t{logs[metric_name]['agg_value']}"
)
+ self.save_logs(self.summarize(logs), summary_file_path)
continue
_ = logs.pop(metric_name, None) # overwriting existing evals if present
kwargs = {
@@ -94,12 +100,7 @@ def evaluate(self, model, output_dir=None, overwrite=None, **kwargs):
)
if "agg_value" in result:
logger.info(f"Result for metric {metric_name}:\t{result['agg_value']}")
- try:
- self.save_logs(logs, logs_file_path)
- except Exception as e:
- raise RuntimeError(f"Failed to save logs: {e}")
- try:
- self.save_logs(self.summarize(logs), summary_file_path)
- except Exception as e:
- raise RuntimeError(f"Failed to save summary: {e}")
+
+ self.save_logs(logs, logs_file_path)
+ self.save_logs(self.summarize(logs), summary_file_path)
return logs
diff --git a/src/evals/metrics/__init__.py b/src/evals/metrics/__init__.py
index 7a4099ea..9441c8dc 100644
--- a/src/evals/metrics/__init__.py
+++ b/src/evals/metrics/__init__.py
@@ -7,11 +7,17 @@
rouge,
truth_ratio,
hm_aggregate,
+ extraction_strength,
+ exact_memorization,
)
-from evals.metrics.privacy import (
- forget_quality,
- minKpc_negative_logprob,
- relative_auc,
+from evals.metrics.privacy import ks_test, privleak, rel_diff
+from evals.metrics.mia import (
+ mia_loss,
+ mia_min_k,
+ mia_min_k_plus_plus,
+ mia_gradnorm,
+ mia_zlib,
+ mia_reference,
)
METRICS_REGISTRY: Dict[str, UnlearningMetric] = {}
@@ -47,7 +53,17 @@ def get_metrics(metric_cfgs: DictConfig, **kwargs):
_register_metric(probability_w_options)
_register_metric(rouge)
_register_metric(truth_ratio)
-_register_metric(forget_quality)
+_register_metric(ks_test)
_register_metric(hm_aggregate)
-_register_metric(minKpc_negative_logprob)
-_register_metric(relative_auc)
+_register_metric(privleak)
+_register_metric(rel_diff)
+_register_metric(exact_memorization)
+_register_metric(extraction_strength)
+
+# Register MIA metrics
+_register_metric(mia_loss)
+_register_metric(mia_min_k)
+_register_metric(mia_min_k_plus_plus)
+_register_metric(mia_gradnorm)
+_register_metric(mia_zlib)
+_register_metric(mia_reference)
diff --git a/src/evals/metrics/base.py b/src/evals/metrics/base.py
index 334db5c6..105ff1c6 100644
--- a/src/evals/metrics/base.py
+++ b/src/evals/metrics/base.py
@@ -120,7 +120,7 @@ def prepare_kwargs_evaluate_metric(self, model, metric_name, cache={}, **kwargs)
reference_logs[reference_log_name][access_name] = _results
if _results is None:
logger.warning(
- f"{key} not present in the {path}, setting it to None!"
+ f"{key} evals not present in the {path}, setting it to None, may result in error soon if code attempts to access."
)
if reference_logs:
kwargs.update({"reference_logs": reference_logs})
diff --git a/src/evals/metrics/memorization.py b/src/evals/metrics/memorization.py
index 95878099..90cd3c29 100644
--- a/src/evals/metrics/memorization.py
+++ b/src/evals/metrics/memorization.py
@@ -1,4 +1,5 @@
import logging
+import torch
import numpy as np
import scipy as sc
from torch.utils.data import DataLoader
@@ -9,6 +10,7 @@
evaluate_probability,
eval_text_similarity,
run_batchwise_evals,
+ tokenwise_vocab_logprobs,
)
from evals.metrics.base import unlearning_metric
@@ -136,3 +138,65 @@ def true_better(arr):
def hm_aggregate(model, **kwargs):
values = [result["agg_value"] for _, result in kwargs["pre_compute"].items()]
return {"agg_value": sc.stats.hmean(values)}
+
+
+@unlearning_metric(name="exact_memorization")
+def exact_memorization(model, **kwargs):
+ data = kwargs["data"]
+ collator = kwargs["collators"]
+ batch_size = kwargs["batch_size"]
+ dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator)
+
+ def _exact_memorization(model, batch):
+ log_probs_batch, labels_batch = tokenwise_vocab_logprobs(
+ model, batch, grad=False, return_labels=True
+ )
+ em_batch = []
+ for log_probs, labels in zip(log_probs_batch, labels_batch):
+ assert len(log_probs) == len(labels)
+ preds = torch.argmax(log_probs, dim=-1)
+ em_score = (preds == labels).sum() / len(labels)
+ em_batch.append({"score": em_score.item()})
+ return em_batch
+
+ fun_args = {}
+ scores_by_index = run_batchwise_evals(
+ model, dataloader, _exact_memorization, fun_args, "Calculating EM"
+ )
+ em_values = np.array([evals["score"] for evals in scores_by_index.values()])
+ em_values = aggregate_to_1D(em_values)
+ return {"agg_value": np.mean(em_values), "value_by_index": scores_by_index}
+
+
+@unlearning_metric(name="extraction_strength")
+def extraction_strength(model, **kwargs):
+ data = kwargs["data"]
+ collator = kwargs["collators"]
+ batch_size = kwargs["batch_size"]
+ dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator)
+
+ def _extraction_strength(model, batch):
+ log_probs_batch, labels_batch = tokenwise_vocab_logprobs(
+ model, batch, grad=False, return_labels=True
+ )
+ es_batch = []
+ for log_probs, labels in zip(log_probs_batch, labels_batch):
+ assert len(log_probs) == len(labels)
+ valid_len = len(labels)
+ preds = torch.argmax(log_probs, dim=-1)
+ for k in range(valid_len):
+ suff_preds = preds[k:]
+ suff_labels = labels[k:]
+ if torch.equal(suff_preds, suff_labels):
+ break
+ es_score = 1 - (k / valid_len)
+ es_batch.append({"score": es_score})
+ return es_batch
+
+ fun_args = {}
+ scores_by_index = run_batchwise_evals(
+ model, dataloader, _extraction_strength, fun_args, "Calculating ES"
+ )
+ es_values = np.array([evals["score"] for evals in scores_by_index.values()])
+ es_values = aggregate_to_1D(es_values)
+ return {"agg_value": np.mean(es_values), "value_by_index": scores_by_index}
diff --git a/src/evals/metrics/mia/__init__.py b/src/evals/metrics/mia/__init__.py
new file mode 100644
index 00000000..5ab869f6
--- /dev/null
+++ b/src/evals/metrics/mia/__init__.py
@@ -0,0 +1,100 @@
+"""
+Attack implementations.
+"""
+
+from transformers import AutoModelForCausalLM
+
+from evals.metrics.base import unlearning_metric
+from evals.metrics.mia.loss import LOSSAttack
+from evals.metrics.mia.min_k import MinKProbAttack
+from evals.metrics.mia.min_k_plus_plus import MinKPlusPlusAttack
+from evals.metrics.mia.gradnorm import GradNormAttack
+from evals.metrics.mia.zlib import ZLIBAttack
+from evals.metrics.mia.reference import ReferenceAttack
+
+from evals.metrics.mia.utils import mia_auc
+import logging
+
+logger = logging.getLogger("metrics")
+
+# NOTE: all MIA attack statistics are signed so that they follow the same trend
+# as loss: the higher the score on an example, the less likely its membership.
+
+
+@unlearning_metric(name="mia_loss")
+def mia_loss(model, **kwargs):
+ return mia_auc(
+ LOSSAttack,
+ model,
+ data=kwargs["data"],
+ collator=kwargs["collators"],
+ batch_size=kwargs["batch_size"],
+ )
+
+
+@unlearning_metric(name="mia_min_k")
+def mia_min_k(model, **kwargs):
+ return mia_auc(
+ MinKProbAttack,
+ model,
+ data=kwargs["data"],
+ collator=kwargs["collators"],
+ batch_size=kwargs["batch_size"],
+ k=kwargs["k"],
+ )
+
+
+@unlearning_metric(name="mia_min_k_plus_plus")
+def mia_min_k_plus_plus(model, **kwargs):
+ return mia_auc(
+ MinKPlusPlusAttack,
+ model,
+ data=kwargs["data"],
+ collator=kwargs["collators"],
+ batch_size=kwargs["batch_size"],
+ k=kwargs["k"],
+ )
+
+
+@unlearning_metric(name="mia_gradnorm")
+def mia_gradnorm(model, **kwargs):
+ return mia_auc(
+ GradNormAttack,
+ model,
+ data=kwargs["data"],
+ collator=kwargs["collators"],
+ batch_size=kwargs["batch_size"],
+ p=kwargs["p"],
+ )
+
+
+@unlearning_metric(name="mia_zlib")
+def mia_zlib(model, **kwargs):
+ return mia_auc(
+ ZLIBAttack,
+ model,
+ data=kwargs["data"],
+ collator=kwargs["collators"],
+ batch_size=kwargs["batch_size"],
+ tokenizer=kwargs.get("tokenizer"),
+ )
+
+
+@unlearning_metric(name="mia_reference")
+def mia_reference(model, **kwargs):
+ if "reference_model_path" not in kwargs:
+ raise ValueError("Reference model must be provided in kwargs")
+ logger.info(f"Loading reference model from {kwargs['reference_model_path']}")
+ reference_model = AutoModelForCausalLM.from_pretrained(
+ kwargs["reference_model_path"],
+ torch_dtype=model.dtype,
+ device_map={"": model.device},
+ )
+ return mia_auc(
+ ReferenceAttack,
+ model,
+ data=kwargs["data"],
+ collator=kwargs["collators"],
+ batch_size=kwargs["batch_size"],
+ reference_model=reference_model,
+ )
diff --git a/src/evals/metrics/mia/all_attacks.py b/src/evals/metrics/mia/all_attacks.py
new file mode 100644
index 00000000..f2d074c2
--- /dev/null
+++ b/src/evals/metrics/mia/all_attacks.py
@@ -0,0 +1,63 @@
+"""
+Enum class for attacks. Also contains the base attack class.
+"""
+
+from enum import Enum
+from torch.utils.data import DataLoader
+import numpy as np
+from tqdm import tqdm
+
+
+# Attack definitions
+class AllAttacks(str, Enum):
+ LOSS = "loss"
+ REFERENCE_BASED = "ref"
+ ZLIB = "zlib"
+ MIN_K = "min_k"
+ MIN_K_PLUS_PLUS = "min_k++"
+ GRADNORM = "gradnorm"
+ RECALL = "recall"
+
+
+# Base attack class
+class Attack:
+ def __init__(self, model, data, collator, batch_size, **kwargs):
+ """Initialize attack with model and create dataloader."""
+ self.model = model
+ self.dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator)
+ self.setup(**kwargs)
+
+ def setup(self, **kwargs):
+ """Setup attack-specific parameters."""
+ pass
+
+ def compute_batch_values(self, batch):
+ """Process a batch through model to get needed statistics."""
+ raise NotImplementedError
+
+ def compute_score(self, sample_stats):
+ """Compute MIA score for a single sample."""
+ raise NotImplementedError
+
+ def attack(self):
+ """Run full MIA attack."""
+ all_scores = []
+ all_indices = []
+
+ for batch in tqdm(self.dataloader, total=len(self.dataloader)):
+ indices = batch.pop("index").cpu().numpy().tolist()
+ batch_values = self.compute_batch_values(batch)
+ scores = [self.compute_score(values) for values in batch_values]
+
+ all_scores.extend(scores)
+ all_indices.extend(indices)
+
+ scores_by_index = {
+ str(idx): {"score": float(score)}
+ for idx, score in zip(all_indices, all_scores)
+ }
+
+ return {
+ "agg_value": float(np.mean(all_scores)),
+ "value_by_index": scores_by_index,
+ }
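+
+
+# Illustrative only: Attack follows a template-method pattern, so a new attack just
+# overrides compute_batch_values / compute_score. The hypothetical attack below (not
+# registered or used anywhere) scores each sample by its number of labeled tokens,
+# assuming the collator provides a "labels" tensor with -100 for ignored positions,
+# purely to show the expected shapes of the two hooks.
+class ExampleTokenCountAttack(Attack):
+    def compute_batch_values(self, batch):
+        """Return one raw statistic per sample in the batch."""
+        return [int((labels != -100).sum()) for labels in batch["labels"]]
+
+    def compute_score(self, sample_stats):
+        """Map a per-sample statistic to a scalar MIA score."""
+        return float(sample_stats)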
diff --git a/src/evals/metrics/mia/gradnorm.py b/src/evals/metrics/mia/gradnorm.py
new file mode 100644
index 00000000..dcb529cb
--- /dev/null
+++ b/src/evals/metrics/mia/gradnorm.py
@@ -0,0 +1,36 @@
+"""
+Gradient-norm attack. Proposed for MIA in multiple settings, and particularly
+experimented for pre-training data and LLMs in https://arxiv.org/abs/2402.17012
+"""
+
+import torch
+from evals.metrics.mia.all_attacks import Attack
+from evals.metrics.utils import tokenwise_logprobs
+
+
+# WARNING: GradNormAttack calls model.zero_grad() while computing its statistic,
+# so do not run it while the model still holds accumulated gradients that have not
+# yet been consumed by an optimizer step.
+class GradNormAttack(Attack):
+ def setup(self, p, **kwargs):
+ if p not in [1, 2, float("inf")]:
+ raise ValueError(f"Invalid p-norm value: {p}")
+ self.p = p
+
+ def compute_batch_values(self, batch):
+ """Compute gradients of examples w.r.t model parameters. More grad norm => more loss."""
+ batch_log_probs = tokenwise_logprobs(self.model, batch, grad=True)
+ batch_loss = [-torch.mean(lps) for lps in batch_log_probs]
+ batch_grad_norms = []
+ for sample_loss in batch_loss:
+ sample_grad_norms = []
+ self.model.zero_grad()
+ sample_loss.backward()
+ for param in self.model.parameters():
+ if param.grad is not None:
+ sample_grad_norms.append(param.grad.detach().norm(p=self.p))
+ batch_grad_norms.append(torch.stack(sample_grad_norms).mean())
+ return batch_grad_norms
+
+ def compute_score(self, sample_stats):
+ """Return negative gradient norm as the attack score."""
+ return sample_stats.cpu().to(torch.float32).numpy()
diff --git a/src/evals/metrics/mia/loss.py b/src/evals/metrics/mia/loss.py
new file mode 100644
index 00000000..bcfd204a
--- /dev/null
+++ b/src/evals/metrics/mia/loss.py
@@ -0,0 +1,16 @@
+"""
+Straightforward LOSS attack, as described in https://ieeexplore.ieee.org/abstract/document/8429311
+"""
+
+from evals.metrics.mia.all_attacks import Attack
+from evals.metrics.utils import evaluate_probability
+
+
+class LOSSAttack(Attack):
+ def compute_batch_values(self, batch):
+ """Compute probabilities and losses for the batch."""
+ return evaluate_probability(self.model, batch)
+
+ def compute_score(self, sample_stats):
+ """Return the average loss for the sample."""
+ return sample_stats["avg_loss"]
diff --git a/src/evals/metrics/mia/min_k.py b/src/evals/metrics/mia/min_k.py
new file mode 100644
index 00000000..8b8d4ecf
--- /dev/null
+++ b/src/evals/metrics/mia/min_k.py
@@ -0,0 +1,26 @@
+"""
+Min-k % Prob Attack: https://arxiv.org/pdf/2310.16789.pdf
+"""
+
+import numpy as np
+from evals.metrics.mia.all_attacks import Attack
+from evals.metrics.utils import tokenwise_logprobs
+
+
+class MinKProbAttack(Attack):
+ def setup(self, k=0.2, **kwargs):
+ self.k = k
+
+ def compute_batch_values(self, batch):
+ """Get token-wise log probabilities for the batch."""
+ return tokenwise_logprobs(self.model, batch, grad=False)
+
+ def compute_score(self, sample_stats):
+ """Score single sample using min-k negative log probs scores attack."""
+ lp = sample_stats.cpu().numpy()
+ if lp.size == 0:
+ return 0
+
+ num_k = max(1, int(len(lp) * self.k))
+ sorted_vals = np.sort(lp)
+ return -np.mean(sorted_vals[:num_k])
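+
+
+# Illustrative only: a quick numeric check of the scoring rule above. With k = 0.2
+# and ten token log-probs, the score is the negated mean of the two lowest values.
+if __name__ == "__main__":
+    _lp = np.array([-0.1, -0.2, -3.0, -0.4, -0.5, -2.5, -0.3, -0.2, -0.1, -0.6])
+    _num_k = max(1, int(len(_lp) * 0.2))
+    print(-np.mean(np.sort(_lp)[:_num_k]))  # -> 2.75 (negated mean of -3.0 and -2.5)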
diff --git a/src/evals/metrics/mia/min_k_plus_plus.py b/src/evals/metrics/mia/min_k_plus_plus.py
new file mode 100644
index 00000000..cfc85dea
--- /dev/null
+++ b/src/evals/metrics/mia/min_k_plus_plus.py
@@ -0,0 +1,39 @@
+import torch
+import numpy as np
+from evals.metrics.mia.min_k import MinKProbAttack
+from evals.metrics.utils import tokenwise_vocab_logprobs, tokenwise_logprobs
+
+
+class MinKPlusPlusAttack(MinKProbAttack):
+ def compute_batch_values(self, batch):
+ """Get both token-wise and vocab-wise log probabilities for the batch."""
+ vocab_log_probs = tokenwise_vocab_logprobs(self.model, batch, grad=False)
+ token_log_probs = tokenwise_logprobs(self.model, batch, grad=False)
+ return [
+ {"vocab_log_probs": vlp, "token_log_probs": tlp}
+ for vlp, tlp in zip(vocab_log_probs, token_log_probs)
+ ]
+
+ def compute_score(self, sample_stats):
+ """Score using min-k negative log probs scores with vocab-wise normalization."""
+ all_probs = sample_stats["vocab_log_probs"]
+ target_prob = sample_stats["token_log_probs"]
+
+ if len(target_prob) == 0:
+ return 0
+
+ # Compute normalized scores using vocab distribution
+ mu = (torch.exp(all_probs) * all_probs).sum(-1)
+ sigma = (torch.exp(all_probs) * torch.square(all_probs)).sum(-1) - torch.square(
+ mu
+ )
+
+ # Handle numerical stability
+ sigma = torch.clamp(sigma, min=1e-6)
+ scores = (target_prob.cpu().numpy() - mu.cpu().numpy()) / torch.sqrt(
+ sigma
+ ).cpu().numpy()
+
+ # Take bottom k% as the attack score
+ num_k = max(1, int(len(scores) * self.k))
+ return -np.mean(sorted(scores)[:num_k])
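+
+
+# Illustrative only: the standardization above uses the mean and variance of the
+# full vocabulary distribution at each position. A minimal check with a single
+# position over a 3-token "vocabulary":
+if __name__ == "__main__":
+    _vocab_lp = torch.log(torch.tensor([[0.7, 0.2, 0.1]]))  # one position, V = 3
+    _target_lp = torch.log(torch.tensor([0.2]))  # log-prob of the observed token
+    _mu = (torch.exp(_vocab_lp) * _vocab_lp).sum(-1)
+    _var = torch.clamp(
+        (torch.exp(_vocab_lp) * torch.square(_vocab_lp)).sum(-1) - torch.square(_mu),
+        min=1e-6,
+    )
+    print((_target_lp - _mu) / torch.sqrt(_var))  # standardized (z-scored) log-prob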
diff --git a/src/evals/metrics/mia/reference.py b/src/evals/metrics/mia/reference.py
new file mode 100644
index 00000000..3faeb6d6
--- /dev/null
+++ b/src/evals/metrics/mia/reference.py
@@ -0,0 +1,25 @@
+"""
+Reference-based attacks.
+"""
+
+from evals.metrics.mia.all_attacks import Attack
+from evals.metrics.utils import evaluate_probability
+
+
+class ReferenceAttack(Attack):
+ def setup(self, reference_model, **kwargs):
+ """Setup reference model."""
+ self.reference_model = reference_model
+
+ def compute_batch_values(self, batch):
+ """Compute loss scores for both target and reference models."""
+ ref_results = evaluate_probability(self.reference_model, batch)
+ target_results = evaluate_probability(self.model, batch)
+ return [
+ {"target_loss": t["avg_loss"], "ref_loss": r["avg_loss"]}
+ for t, r in zip(target_results, ref_results)
+ ]
+
+ def compute_score(self, sample_stats):
+ """Score using difference between target and reference model losses."""
+ return sample_stats["target_loss"] - sample_stats["ref_loss"]
diff --git a/src/evals/metrics/mia/utils.py b/src/evals/metrics/mia/utils.py
new file mode 100644
index 00000000..15eb19b4
--- /dev/null
+++ b/src/evals/metrics/mia/utils.py
@@ -0,0 +1,70 @@
+from evals.metrics.mia.all_attacks import AllAttacks
+from evals.metrics.mia.loss import LOSSAttack
+from evals.metrics.mia.reference import ReferenceAttack
+from evals.metrics.mia.zlib import ZLIBAttack
+from evals.metrics.mia.min_k import MinKProbAttack
+from evals.metrics.mia.min_k_plus_plus import MinKPlusPlusAttack
+from evals.metrics.mia.gradnorm import GradNormAttack
+
+from sklearn.metrics import roc_auc_score
+
+
+import numpy as np
+
+
+def get_attacker(attack: str):
+ mapping = {
+ AllAttacks.LOSS: LOSSAttack,
+ AllAttacks.REFERENCE_BASED: ReferenceAttack,
+ AllAttacks.ZLIB: ZLIBAttack,
+ AllAttacks.MIN_K: MinKProbAttack,
+ AllAttacks.MIN_K_PLUS_PLUS: MinKPlusPlusAttack,
+ AllAttacks.GRADNORM: GradNormAttack,
+ }
+ attack_cls = mapping.get(attack, None)
+ if attack_cls is None:
+ raise ValueError(f"Attack {attack} not found")
+ return attack_cls
+
+
+def mia_auc(attack_cls, model, data, collator, batch_size, **kwargs):
+ """
+ Compute MIA attack scores on the forget and holdout splits and their ROC AUC.
+
+ Parameters:
+ - attack_cls: the attack class to use.
+ - model: the target model.
+ - data: a dict with keys "forget" and "holdout".
+ - collator: data collator.
+ - batch_size: batch size.
+ - kwargs: additional optional parameters (e.g. k, p, tokenizer, reference_model).
+
+ Returns a dict with the per-split attack outputs ("forget", "holdout") plus "auc" and "agg_value".
+
+ Note on convention: the AUC approaches 1 when the forget data is scored as much more likely (lower, loss-like scores) than the holdout data.
+ """
+ # Build attack arguments from common parameters and any extras.
+ attack_args = {
+ "model": model,
+ "collator": collator,
+ "batch_size": batch_size,
+ }
+ attack_args.update(kwargs)
+
+ output = {
+ "forget": attack_cls(data=data["forget"], **attack_args).attack(),
+ "holdout": attack_cls(data=data["holdout"], **attack_args).attack(),
+ }
+ forget_scores = [
+ elem["score"] for elem in output["forget"]["value_by_index"].values()
+ ]
+ holdout_scores = [
+ elem["score"] for elem in output["holdout"]["value_by_index"].values()
+ ]
+ scores = np.array(forget_scores + holdout_scores)
+ labels = np.array(
+ [0] * len(forget_scores) + [1] * len(holdout_scores)
+ ) # see note above
+ auc_value = roc_auc_score(labels, scores)
+ output["auc"], output["agg_value"] = auc_value, auc_value
+ return output
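+
+
+# Illustrative only: a minimal check of the labeling convention used above. Forget
+# examples get label 0 and holdout examples label 1; because the attack scores are
+# loss-like, a model that still "remembers" the forget set assigns it lower scores
+# and the AUC approaches 1.
+if __name__ == "__main__":
+    _scores = np.array([0.1, 0.2, 0.3, 0.8, 0.9, 1.0])  # forget scores, then holdout scores
+    _labels = np.array([0, 0, 0, 1, 1, 1])
+    print(roc_auc_score(_labels, _scores))  # -> 1.0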
diff --git a/src/evals/metrics/mia/zlib.py b/src/evals/metrics/mia/zlib.py
new file mode 100644
index 00000000..5a8f7ba3
--- /dev/null
+++ b/src/evals/metrics/mia/zlib.py
@@ -0,0 +1,29 @@
+"""
+zlib-normalization Attack: https://www.usenix.org/system/files/sec21-carlini-extracting.pdf
+"""
+
+import zlib
+
+from evals.metrics.mia.all_attacks import Attack
+from evals.metrics.utils import (
+ evaluate_probability,
+ extract_target_texts_from_processed_data,
+)
+
+
+class ZLIBAttack(Attack):
+ def setup(self, tokenizer=None, **kwargs):
+ """Setup tokenizer."""
+ self.tokenizer = tokenizer or self.model.tokenizer
+
+ def compute_batch_values(self, batch):
+ """Get loss and text for batch."""
+ eval_results = evaluate_probability(self.model, batch)
+ texts = extract_target_texts_from_processed_data(self.tokenizer, batch)
+ return [{"loss": r["avg_loss"], "text": t} for r, t in zip(eval_results, texts)]
+
+ def compute_score(self, sample_stats):
+ """Score using loss normalized by compressed text length."""
+ text = sample_stats["text"]
+ zlib_entropy = len(zlib.compress(text.encode("utf-8")))
+ return sample_stats["loss"] / zlib_entropy
diff --git a/src/evals/metrics/privacy.py b/src/evals/metrics/privacy.py
index fcaab25a..1d9bdfbe 100644
--- a/src/evals/metrics/privacy.py
+++ b/src/evals/metrics/privacy.py
@@ -1,14 +1,12 @@
import numpy as np
from scipy.stats import ks_2samp
-from torch.utils.data import DataLoader
-from sklearn.metrics import auc as get_auc, roc_curve as get_roc_curve
-
from evals.metrics.base import unlearning_metric, logger
-from evals.metrics.utils import run_batchwise_evals, eval_minKpc_neg_logprob
-@unlearning_metric(name="forget_quality")
-def forget_quality(model, **kwargs):
+@unlearning_metric(name="ks_test")
+def ks_test(model, **kwargs):
+ """Compare two forget and retain model distributions with a 2-sample KS-test and report the p-value.
+ Used in the TOFU benchmark as forget_quality when computed over the truth_ratio statistic."""
forget_tr_stats = np.array(
[
evals["score"]
@@ -17,12 +15,11 @@ def forget_quality(model, **kwargs):
)
reference_logs = kwargs.get("reference_logs", None)
if reference_logs:
+ reference_logs = reference_logs["retain_model_logs"]
retain_tr_stats = np.array(
[
evals["score"]
- for evals in kwargs["reference_logs"]["retain_model_logs"]["retain"][
- "value_by_index"
- ].values()
+ for evals in reference_logs["retain"]["value_by_index"].values()
]
)
fq = ks_2samp(forget_tr_stats, retain_tr_stats)
@@ -35,72 +32,35 @@ def forget_quality(model, **kwargs):
return {"agg_value": pvalue}
-@unlearning_metric(name="minKpc_negative_logprob")
-def minKpc_negative_logprob(model, **kwargs):
- """Compute the min-k percentile average of token-wise model probabilities by data points"""
- data = kwargs["data"]
- collator = kwargs["collators"]
- batch_size = kwargs["batch_size"]
-
- dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator)
-
- fun_args = {"percentile": kwargs["percentile_K"]}
- return {
- "value_by_index": run_batchwise_evals(
- model,
- dataloader,
- eval_minKpc_neg_logprob,
- fun_args,
- "Calculating avg token-wise lowest K% percentile logprobs across batches",
+@unlearning_metric(name="privleak")
+def privleak(model, **kwargs):
+ """Compare two forget and retain model scores using a relative comparison of a single statistic.
+ To be used for MIA AUC scores in ensuring consistency and reproducibility of the MUSE benchmark.
+ This function is similar to the rel_diff function below, but due to the MUSE benchmark reporting AUC
+ scores as (1-x) when the more conventional way is x, we do adjustments here to our MIA AUC scores.
+ calculations in the reverse way,"""
+ score = kwargs["pre_compute"]["forget"]["agg_value"]
+ try:
+ ref = kwargs["reference_logs"]["retain_model_logs"]["retain"]["agg_value"]
+ except Exception as _:
+ logger.warning(
+ f"retain_model_logs evals not provided for privleak, using default retain auc of {kwargs['ref_value']}"
)
- }
-
-
-@unlearning_metric(name="relative_auc")
-def relative_auc(model, **kwargs):
- """Compute the auc score of an MIA attack wrt model scores on a victim and holdout set"""
-
- def sweep(ppl, y):
- fpr, tpr, _ = get_roc_curve(y, -ppl)
- acc = np.max(1 - (fpr + (1 - tpr)) / 2)
- return fpr, tpr, get_auc(fpr, tpr), acc
-
- forget_scores = kwargs["pre_compute"]["forget"]["value_by_index"].values()
- forget_scores = [elem["score"] for elem in forget_scores]
- forget_holdout_scores = kwargs["pre_compute"]["holdout"]["value_by_index"].values()
- forget_holdout_scores = [elem["score"] for elem in forget_holdout_scores]
- scores = np.array(forget_scores + forget_holdout_scores)
- # in MUSE the scores are -mean(min k% log-probs) for some reason so flip the 1 and 0
- labels = np.array([0] * len(forget_scores) + [1] * len(forget_holdout_scores))
+ ref = kwargs["ref_value"]
+ score = 1 - score
+ ref = 1 - ref
+ return {"agg_value": (score - ref) / (ref + 1e-10) * 100}
- _, _, auc_score, acc = sweep(scores, labels)
- output = {
- "acc": acc,
- "auc": auc_score,
- }
- retain_auc_score = kwargs["ref_value"]
-
- reference_logs = kwargs.get("reference_logs", None)
- if reference_logs:
- retain_scores = reference_logs["retain_model_logs"]["retain"][
- "value_by_index"
- ].values()
- retain_scores = [elem["score"] for elem in retain_scores]
- retain_holdout_scores = reference_logs["retain_model_logs"]["holdout"][
- "value_by_index"
- ].values()
- retain_holdout_scores = [elem["score"] for elem in retain_holdout_scores]
- scores = np.array(retain_scores + retain_holdout_scores)
- labels = np.array([0] * len(retain_scores) + [1] * len(retain_holdout_scores))
- _, _, retain_auc_score, retain_acc = sweep(scores, labels)
- output.update({"retain_acc": retain_acc, "retain_auc_score": retain_auc_score})
-
- output.update(
- {
- "agg_value": (auc_score - retain_auc_score)
- / (retain_auc_score)
- * 100 # privleak score in muse
- }
- )
- return output
+@unlearning_metric(name="rel_diff")
+def rel_diff(model, **kwargs):
+ """Compare two forget and retain model scores using a relative comparison of a single statistic."""
+ score = kwargs["pre_compute"]["forget"]["agg_value"]
+ try:
+ ref = kwargs["reference_logs"]["retain_model_logs"]["retain"]["agg_value"]
+ except Exception as _:
+ logger.warning(
+ f"retain_model_logs evals not provided for privleak, using default retain auc of {kwargs['ref_value']}"
+ )
+ ref = kwargs["ref_value"]
+ return {"agg_value": (score - ref) / (ref + 1e-10) * 100}
diff --git a/src/evals/metrics/utils.py b/src/evals/metrics/utils.py
index 2fdbe202..92c51bf8 100644
--- a/src/evals/metrics/utils.py
+++ b/src/evals/metrics/utils.py
@@ -103,25 +103,35 @@ def evaluate_probability(model, batch):
]
-def eval_minKpc_neg_logprob(model, batch, percentile):
- """Compute minK% attack score for each sample in a batch."""
+def tokenwise_logprobs(model, batch, grad=False, return_labels=False):
+ """
+ Compute token-wise next token prediction logprobs for all labeled tokens for each sample in a batch.
+ `grad` decides whether gradients are turned on
+ Returns
+ log_probs_batch (List[Tensor]): Tensors of size seq_len where seq_len is length of labeled tokens
+ labels_batch (List[Tensor]): List of tensors of length N. Returned only if return_labels is True
+ """
batch = {k: v.to(model.device) for k, v in batch.items()}
- with torch.no_grad():
+
+ model.train(mode=grad)
+ with torch.set_grad_enabled(grad):
output = model(**batch)
+
logits = output.logits
bsz, seq_len, V = logits.shape
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)[:, :-1, :]
# ^ we don't predict next token for last token, bsz x seq_len-1 x V
next_tokens = batch["input_ids"][:, 1:].unsqueeze(-1) # bsz x seq_len-1 x 1
target_log_probs = torch.gather(log_probs, dim=2, index=next_tokens).squeeze(-1)
- mink_means = []
+ log_probs_batch = []
+ labels_batch = []
for i in range(bsz):
labels = batch["labels"][i][:-1]
# only focus on tokens which have loss on them (i.e. used in labels)
actual_indices = (labels != IGNORE_INDEX).nonzero(as_tuple=True)[0]
num_actual_tokens = actual_indices.numel()
if num_actual_tokens == 0:
- mink_means.append(0)
+ log_probs_batch.append(torch.tensor([0.0], device=labels.device))
continue
start_idx, end_idx = actual_indices[0].item(), actual_indices[-1].item()
if start_idx == 0:
@@ -129,14 +139,52 @@ def eval_minKpc_neg_logprob(model, batch, percentile):
"Index 0 in a datapoint's input_ids must not have loss (unignored labels) on it",
UserWarning,
)
- actual_seq_log_probs = (
- target_log_probs[i, start_idx - 1 : end_idx].cpu().numpy()
- )
- sorted_probs = np.sort(actual_seq_log_probs)
- top_k = max(1, int(percentile / 100 * len(actual_seq_log_probs)))
- mink_mean = -1 * np.mean(sorted_probs[:top_k])
- mink_means.append(mink_mean)
- return [{"score": float(neglogprob)} for neglogprob in mink_means]
+ log_probs_batch.append(target_log_probs[i, start_idx - 1 : end_idx])
+ labels_batch.append(labels[actual_indices])
+
+ return (log_probs_batch, labels_batch) if return_labels else log_probs_batch
+
+
+def tokenwise_vocab_logprobs(model, batch, grad=False, return_labels=False):
+ """Get vocabulary-wise log probabilities for each token in the sequence.
+
+ Returns:
+ log_probs_batch (List[Tensor]): List of tensors of shape (N, V) containing log probabilities
+ for each sequence, where N is the length of labeled tokens and V is vocab size.
+ labels_batch (List[Tensor]): List of tensors of length N. Returned only if return_labels is True
+ """
+ batch = {k: v.to(model.device) for k, v in batch.items()}
+ model.train(mode=grad)
+ with torch.set_grad_enabled(grad):
+ output = model(**batch)
+
+ logits = output.logits
+ bsz, seq_len, V = logits.shape
+ log_probs = torch.nn.functional.log_softmax(logits, dim=-1)[
+ :, :-1, :
+ ] # Don't predict for last token
+
+ # Process each sequence in batch separately
+ log_probs_batch = []
+ labels_batch = []
+ for i in range(bsz):
+ labels = batch["labels"][i][:-1]
+ # Only include positions that have labels
+ actual_indices = (labels != IGNORE_INDEX).nonzero(as_tuple=True)[0]
+ if len(actual_indices) == 0:
+ log_probs_batch.append(torch.zeros(1, V, device=labels.device))
+ continue
+ start_idx, end_idx = actual_indices[0].item(), actual_indices[-1].item()
+ if start_idx == 0:
+ warnings.warn(
+ "Index 0 in a datapoint's input_ids must not have loss (unignored labels) on it",
+ UserWarning,
+ )
+ # Return full distribution for each position: shape (N, V)
+ log_probs_batch.append(log_probs[i, start_idx - 1 : end_idx])
+ labels_batch.append(labels[actual_indices])
+
+ return (log_probs_batch, labels_batch) if return_labels else log_probs_batch
class MultiTokenEOSCriteria(StoppingCriteria):
@@ -277,3 +325,13 @@ def eval_rouge_recall_batch(gen_outputs, ground_truths):
)
]
return scores
+
+
+def extract_target_texts_from_processed_data(tokenizer, batch):
+ """Extract and detokenize text from activated positions in the batch."""
+ labels = batch["labels"]
+ labels = [elem[elem != -100] for elem in labels]
+ texts = [
+ tokenizer.decode(elem.tolist(), skip_special_tokens=True) for elem in labels
+ ]
+ return texts
diff --git a/src/train.py b/src/train.py
index 989e9cf0..a9048e3f 100644
--- a/src/train.py
+++ b/src/train.py
@@ -4,6 +4,7 @@
from model import get_model
from trainer import load_trainer
from evals import get_evaluator
+from trainer.utils import seed_everything
@hydra.main(version_base=None, config_path="../configs", config_name="train.yaml")
@@ -12,6 +13,7 @@ def main(cfg: DictConfig):
Args:
cfg (DictConfig): Config to train
"""
+ seed_everything(cfg.trainer.args.seed)
mode = cfg.get("mode", "train")
model_cfg = cfg.model
template_args = model_cfg.template_args
diff --git a/src/trainer/utils.py b/src/trainer/utils.py
index c5125b78..dfb6876f 100644
--- a/src/trainer/utils.py
+++ b/src/trainer/utils.py
@@ -1,8 +1,19 @@
import torch
+import random
+import numpy as np
from torch import nn
import torch.nn.functional as F
+def seed_everything(seed=42):
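+    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic for reproducible runs."""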
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
+
+
def compute_kl_divergence(model, target_model, inputs):
with torch.no_grad():
ref_outputs = target_model(**inputs)