Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions docs/curate-text/process-data/content-processing/pii.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ The PII de-identification tool helps you remove the following sensitive data fro
| `"US_DRIVER_LICENSE"` | US driver's licenses |
| `"US_PASSPORT"` | US passport numbers |
| `"LOCATION"` | Location information |
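| Custom (user-defined) | Entities detected by recognizers supplied through `custom_analyzer_recognizers`, for example `"CRYPTO"` or `"MEDICAL_LICENSE"` |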

### Redaction Format

Expand Down Expand Up @@ -62,6 +63,48 @@ This consistent formatting makes it easy to identify processed content and under

Here's how to read, de-identify, and write a dataset:

### Custom PII Recognizers

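NeMo Curator supports custom PII entity recognizers: pass a list of Presidio recognizers to `PiiModifier` through the `custom_analyzer_recognizers` parameter, and list the entity names they detect in `supported_entities`. For example: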

```python
from nemo_curator.modifiers.pii_modifier import PiiModifier
from nemo_curator.pii.custom_recognizers_sample import (
    crypto_recognizer,
    iban_generic_recognizer,
    medical_license_recognizer,
)

# The sample recognizers imported above are built with Presidio's
# PatternRecognizer. Their definitions are reproduced here, commented out,
# as a template for writing your own:
#
# from presidio_analyzer import Pattern, PatternRecognizer
#
# crypto_recognizer = PatternRecognizer(
#     supported_entity="CRYPTO",
#     patterns=[
#         Pattern(name="Ethereum wallet", regex="0x[a-fA-F0-9]{40}", score=0.9)
#     ],
# )
#
# medical_license_recognizer = PatternRecognizer(
#     supported_entity="MEDICAL_LICENSE",
#     patterns=[
#         Pattern(name="Medical license", regex="MED[0-9]{7}", score=0.9)
#     ],
# )
#
# iban_generic_recognizer = PatternRecognizer(
#     supported_entity="IBAN_CODE",
#     patterns=[
#         Pattern(
#             name="IBAN Code",
#             regex=r"\b([A-Z]{2})([0-9]{2})([A-Z]{4})([A-Z0-9]{14})\b",
#             score=0.9,
#         )
#     ],
# )

# Hand the recognizers to PiiModifier and name the entities they detect in
# `supported_entities`.
modifier = PiiModifier(
    supported_entities=["CRYPTO", "MEDICAL_LICENSE", "IBAN_CODE"],
    anonymize_action="replace",
    custom_analyzer_recognizers=[
        crypto_recognizer,
        medical_license_recognizer,
        iban_generic_recognizer,
    ],
)
```

As shown above, you can define custom recognizers for Ethereum wallets, medical licenses, IBAN codes, etc., and include them in the PII processing workflow.

::::{tab-set}

:::{tab-item} Python
Expand Down
49 changes: 49 additions & 0 deletions examples/custom_pii_and_deidentify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pandas as pd
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modifiers.pii_modifier import PiiModifier
from nemo_curator.modules.modify import Modify
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.pii.custom_recognizers_sample import crypto_recognizer, medical_license_recognizer, iban_generic_recognizer

import argparse
from nemo_curator.utils.script_utils import ArgumentHelper

def main(args: argparse.Namespace) -> None:
    """Run the custom-recognizer PII de-identification example.

    Creates a client from the parsed command-line arguments, builds a small
    three-row sample dataset, replaces the custom PII entities in it using
    ``PiiModifier`` with the sample recognizers, and writes the result to
    ``./modified_data.csv``.

    Args:
        args: Parsed command-line arguments; forwarded to
            ``ArgumentHelper.parse_client_args`` to configure the client.
    """
    # Client settings (for example "cpu" or "gpu") come from the CLI arguments.
    client = get_client(**ArgumentHelper.parse_client_args(args))

    # One sample sentence per custom entity type.
    sample_texts = [
        "My crypto wallet is 0x32Be343B94f860124dC4fEe278FDCBD38C102D88",
        "My IBAN is GB33BUKB20201555555555",
        "My medical license number is MED1234567",
    ]
    dataframe = pd.DataFrame({"text": sample_texts})

    # Wrap the pandas frame as a single-partition DocumentDataset.
    dataset = DocumentDataset.from_pandas(dataframe, npartitions=1)

    # Custom entities (sample) and the recognizers (sample) that detect them.
    custom_entities = ["CRYPTO", "MEDICAL_LICENSE", "IBAN_CODE"]
    custom_recognizers = [
        crypto_recognizer,
        medical_license_recognizer,
        iban_generic_recognizer,
    ]
    modifier = PiiModifier(
        log_dir="./logs",
        batch_size=2,
        supported_entities=custom_entities,
        anonymize_action="replace",
        custom_analyzer_recognizers=custom_recognizers,
    )

    # Apply the modifier and pull the result back into pandas.
    deidentified = Modify(modifier)(dataset)
    result_frame = deidentified.to_pandas()

    # (Optional) Save the modified dataset
    result_frame.to_csv("./modified_data.csv", index=False)

def attach_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Add the distributed-execution options to ``parser``.

    Args:
        parser: The argument parser to extend.

    Returns:
        The value returned by ``ArgumentHelper.add_distributed_args``.
    """
    helper = ArgumentHelper(parser)
    return helper.add_distributed_args()

if __name__ == "__main__":
    # Build the CLI parser (help output shows defaults), register the
    # distributed options, parse the command line, and run the example.
    cli_parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli_args = attach_args(cli_parser).parse_args()
    main(cli_args)
8 changes: 7 additions & 1 deletion nemo_curator/modifiers/pii_modifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# limitations under the License.


from typing import Optional, List
from presidio_analyzer import EntityRecognizer
import pandas as pd

from nemo_curator.modifiers import DocumentModifier
Expand Down Expand Up @@ -52,6 +54,7 @@ def __init__(
supported_entities: list[str] | None = None,
anonymize_action: str = "redact",
batch_size: int = DEFAULT_BATCH_SIZE,
custom_analyzer_recognizers: Optional[List[EntityRecognizer]] = None,
device: str = "gpu",
**kwargs,
):
Expand All @@ -63,6 +66,7 @@ def __init__(
self.kwargs = kwargs

self.batch_size = batch_size
self.custom_analyzer_recognizers = custom_analyzer_recognizers
self.device = device

@batched
Expand Down Expand Up @@ -102,5 +106,7 @@ def load_deidentifier(self) -> "PiiDeidentifier": # noqa: F821
**self.kwargs,
)
deidentifier.analyzer.nlp_engine.nlp[deidentifier.language].max_length = DEFAULT_MAX_DOC_SIZE

if self.custom_analyzer_recognizers is not None:
for recognizer in self.custom_analyzer_recognizers:
deidentifier.analyzer.registry.add_recognizer(recognizer)
return deidentifier
26 changes: 26 additions & 0 deletions nemo_curator/pii/custom_recognizers_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from presidio_analyzer import PatternRecognizer, Pattern

# Sample pattern-based recognizers for custom PII entities. Each recognizer
# reports a single entity type and is driven by one regular expression that
# is assigned a score of 0.9.

# "0x" followed by exactly 40 hexadecimal characters.
_ETHEREUM_WALLET_PATTERN = Pattern(
    name="Ethereum wallet",
    regex="0x[a-fA-F0-9]{40}",
    score=0.9,
)

# The literal prefix "MED" followed by exactly 7 digits.
_MEDICAL_LICENSE_PATTERN = Pattern(
    name="Medical license",
    regex="MED[0-9]{7}",
    score=0.9,
)

# Word-bounded: 2 capital letters, 2 digits, 4 capital letters, then 14
# capital letters or digits.
_IBAN_PATTERN = Pattern(
    name="IBAN Code",
    regex=r"\b([A-Z]{2})([0-9]{2})([A-Z]{4})([A-Z0-9]{14})\b",
    score=0.9,
)

crypto_recognizer = PatternRecognizer(
    supported_entity="CRYPTO",
    patterns=[_ETHEREUM_WALLET_PATTERN],
)

medical_license_recognizer = PatternRecognizer(
    supported_entity="MEDICAL_LICENSE",
    patterns=[_MEDICAL_LICENSE_PATTERN],
)

iban_generic_recognizer = PatternRecognizer(
    supported_entity="IBAN_CODE",
    patterns=[_IBAN_PATTERN],
)
51 changes: 51 additions & 0 deletions tests/test_custom_pii_and_deidentify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest

def test_custom_pii_detection_and_deidentification():
    """Check that custom recognizers are applied by ``PiiModifier``.

    Builds a three-row dataset containing an Ethereum wallet address, an IBAN,
    and a medical license number, runs ``PiiModifier`` with the sample custom
    recognizers and the "replace" action, and asserts that each value is
    replaced by its ``<ENTITY>`` placeholder.

    Exceptions are deliberately not caught here: import errors, pipeline
    errors, and assertion failures all propagate so that pytest reports the
    real cause with its traceback.
    """
    # Imported inside the test so that an import problem is reported as a
    # failure of this test.
    import pandas as pd
    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.modifiers.pii_modifier import PiiModifier
    from nemo_curator.modules.modify import Modify
    from nemo_curator.pii.custom_recognizers_sample import (
        crypto_recognizer,
        iban_generic_recognizer,
        medical_license_recognizer,
    )
    from nemo_curator.utils.distributed_utils import get_client

    # Bound to a local name so the client stays referenced while the pipeline
    # runs. NOTE(review): this requests a GPU cluster - confirm the test
    # environment provides one.
    client = get_client(cluster_type="gpu")

    dataframe = pd.DataFrame(
        {
            "text": [
                "My crypto wallet is 0x32Be343B94f860124dC4fEe278FDCBD38C102D88",
                "My IBAN is GB33BUKB20201555555555",
                "My medical license number is MED1234567",
            ]
        }
    )
    dataset = DocumentDataset.from_pandas(dataframe, npartitions=1)

    # Entity names reported by the sample custom recognizers.
    supported_entities = [
        "CRYPTO",
        "MEDICAL_LICENSE",
        "IBAN_CODE",
    ]

    modifier = PiiModifier(
        log_dir="./logs",
        batch_size=8,
        supported_entities=supported_entities,
        anonymize_action="replace",
        custom_analyzer_recognizers=[
            crypto_recognizer,
            medical_license_recognizer,
            iban_generic_recognizer,
        ],
    )
    modified_dataset = Modify(modifier)(dataset)

    expected = [
        "My crypto wallet is <CRYPTO>",
        "My IBAN is <IBAN_CODE>",
        "My medical license number is <MEDICAL_LICENSE>",
    ]
    actual = modified_dataset.to_pandas()["text"].tolist()

    # A single comparison; on mismatch the message shows what came back.
    assert actual == expected, (
        "Test Failed custom_pii_and_deidentify, data is not de-identified: "
        f"expected {expected!r}, got {actual!r}"
    )