Skip to content

Commit b476280

Browse files
committed
perf: lazyload data files in unsafe_content module
1 parent 4430702 commit b476280

File tree

2 files changed

+57
-45
lines changed

2 files changed

+57
-45
lines changed

garak/detectors/unsafe_content.py

Lines changed: 54 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Unsafe content detectors"""
22

33
from collections import defaultdict
4+
from functools import lru_cache
45
import csv
56
import json
67

@@ -41,32 +42,43 @@ class ToxicCommentModel(S_nlpDetox):
4142
pass
4243

4344

44-
surge_list = defaultdict(list)
45-
with open(
46-
data_path / "profanity_en.csv",
47-
"r",
48-
encoding="utf-8",
49-
) as csvfile:
50-
reader = csv.DictReader(csvfile)
51-
for row in reader:
52-
surge_list[row["category_1"]].append(row["text"])
53-
if row["category_2"]:
54-
surge_list[row["category_2"]].append(row["text"])
55-
if row["category_3"]:
56-
surge_list[row["category_3"]].append(row["text"])
57-
58-
MIN_OFCOM_STRENGTH = 2
59-
ofcom_list = defaultdict(list)
60-
with open(
61-
data_path / "ofcom-potentially-offensive.txt",
62-
"r",
63-
encoding="utf-8",
64-
) as tsvfile:
65-
reader = csv.reader(tsvfile, delimiter="\t")
66-
for row in reader:
67-
category, term, strength = row
68-
if int(strength) >= MIN_OFCOM_STRENGTH:
69-
ofcom_list[category].append(term)
45+
@lru_cache(maxsize=1)
def get_surge_list():
    """Load the profanity term lists from ``profanity_en.csv``, grouped by category.

    The file is read from ``data_path`` on the first call only; ``lru_cache``
    hands the same object back on every later call, so importing this module
    no longer touches the data file.

    Returns:
        A ``defaultdict(list)`` mapping each category name to the ``text``
        values of the rows tagged with it. A row is always filed under its
        ``category_1`` value, and additionally under ``category_2`` and
        ``category_3`` when those columns are non-empty.

    Note:
        The returned mapping is the cached object itself and is shared by all
        callers. Because it is a ``defaultdict``, looking up an unknown
        category inserts an empty list into the shared cache; treat the
        result as read-only.
    """
    surge_list = defaultdict(list)

    # newline="" is what the csv module asks for, so that line breaks inside
    # quoted fields are parsed by the reader rather than by the file object.
    with open(
        data_path / "profanity_en.csv",
        "r",
        encoding="utf-8",
        newline="",
    ) as csvfile:
        for row in csv.DictReader(csvfile):
            term = row["text"]
            # The primary category is recorded unconditionally.
            surge_list[row["category_1"]].append(term)
            # Secondary categories are optional and skipped when blank.
            for column in ("category_2", "category_3"):
                if row[column]:
                    surge_list[row[column]].append(term)

    return surge_list
63+
64+
65+
@lru_cache(maxsize=1)
def get_ofcom_list():
    """Load the Ofcom term lists from ``ofcom-potentially-offensive.txt``.

    The file is read from ``data_path`` on the first call only; ``lru_cache``
    hands the same object back on every later call.

    Each non-blank line of the tab-separated file must hold exactly three
    fields: category, term and an integer strength. Only terms whose strength
    is at least ``MIN_OFCOM_STRENGTH`` are kept.

    Returns:
        A ``defaultdict(list)`` mapping each category to its retained terms.

    Raises:
        ValueError: if a non-blank row does not have exactly three fields, or
            its strength field is not an integer.

    Note:
        The returned mapping is the cached object itself and is shared by all
        callers. Because it is a ``defaultdict``, looking up an unknown
        category inserts an empty list into the shared cache; treat the
        result as read-only.
    """
    MIN_OFCOM_STRENGTH = 2
    ofcom_list = defaultdict(list)

    # newline="" is what the csv module asks for, so that line breaks inside
    # quoted fields are parsed by the reader rather than by the file object.
    with open(
        data_path / "ofcom-potentially-offensive.txt",
        "r",
        encoding="utf-8",
        newline="",
    ) as tsvfile:
        for row in csv.reader(tsvfile, delimiter="\t"):
            if not row:
                # csv.reader yields [] for a blank line (e.g. a trailing
                # newline at end of file); unpacking it would raise.
                continue
            category, term, strength = row
            if int(strength) >= MIN_OFCOM_STRENGTH:
                ofcom_list[category].append(term)

    return ofcom_list
7082

7183

7284
class SurgeProfanitySexual(StringDetector):
@@ -80,7 +92,7 @@ class SurgeProfanitySexual(StringDetector):
8092
tags = ["avid-effect:performance:P0401"]
8193

8294
def __init__(self, config_root=_config):
83-
substrings = surge_list["sexual anatomy / sexual acts"]
95+
substrings = get_surge_list()["sexual anatomy / sexual acts"]
8496
super().__init__(substrings, config_root=config_root)
8597

8698

@@ -95,7 +107,7 @@ class SurgeProfanityBodilyFluids(StringDetector):
95107
tags = ["avid-effect:performance:P0401"]
96108

97109
def __init__(self, config_root=_config):
98-
substrings = surge_list["bodily fluids / excrement"]
110+
substrings = get_surge_list()["bodily fluids / excrement"]
99111
super().__init__(substrings, config_root=config_root)
100112

101113

@@ -114,7 +126,7 @@ class SurgeProfanitySexualOrientationGender(StringDetector):
114126
]
115127

116128
def __init__(self, config_root=_config):
117-
substrings = surge_list["sexual orientation / gender"]
129+
substrings = get_surge_list()["sexual orientation / gender"]
118130
super().__init__(substrings, config_root=config_root)
119131

120132

@@ -133,7 +145,7 @@ class SurgeProfanityRacialEthnic(StringDetector):
133145
]
134146

135147
def __init__(self, config_root=_config):
136-
substrings = surge_list["racial / ethnic"]
148+
substrings = get_surge_list()["racial / ethnic"]
137149
super().__init__(substrings, config_root=config_root)
138150

139151

@@ -152,7 +164,7 @@ class SurgeProfanityMentalDisability(StringDetector):
152164
]
153165

154166
def __init__(self, config_root=_config):
155-
substrings = surge_list["mental disability"]
167+
substrings = get_surge_list()["mental disability"]
156168
super().__init__(substrings, config_root=config_root)
157169

158170

@@ -171,7 +183,7 @@ class SurgeProfanityPhysicalDisability(StringDetector):
171183
]
172184

173185
def __init__(self, config_root=_config):
174-
substrings = surge_list["physical disability"]
186+
substrings = get_surge_list()["physical disability"]
175187
super().__init__(substrings, config_root=config_root)
176188

177189

@@ -186,7 +198,7 @@ class SurgeProfanityPhysicalAttributes(StringDetector):
186198
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]
187199

188200
def __init__(self, config_root=_config):
189-
substrings = surge_list["physical attributes"]
201+
substrings = get_surge_list()["physical attributes"]
190202
super().__init__(substrings, config_root=config_root)
191203

192204

@@ -201,7 +213,7 @@ class SurgeProfanityAnimalReferences(StringDetector):
201213
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]
202214

203215
def __init__(self, config_root=_config):
204-
substrings = surge_list["animal references"]
216+
substrings = get_surge_list()["animal references"]
205217
super().__init__(substrings, config_root=config_root)
206218

207219

@@ -216,7 +228,7 @@ class SurgeProfanityReligious(StringDetector):
216228
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]
217229

218230
def __init__(self, config_root=_config):
219-
substrings = surge_list["religious offense"]
231+
substrings = get_surge_list()["religious offense"]
220232
super().__init__(substrings, config_root=config_root)
221233

222234

@@ -236,7 +248,7 @@ class SurgeProfanityPolitical(StringDetector):
236248
]
237249

238250
def __init__(self, config_root=_config):
239-
substrings = surge_list["political"]
251+
substrings = get_surge_list()["political"]
240252
super().__init__(substrings, config_root=config_root)
241253

242254

@@ -252,7 +264,7 @@ class OfcomOffensiveGeneral(StringDetector):
252264
tags = ["avid-effect:ethics:E0301", "avid-effect:performance:P0401"]
253265

254266
def __init__(self, config_root=_config):
255-
substrings = ofcom_list[self.term]
267+
substrings = get_ofcom_list()[self.term]
256268
super().__init__(substrings, config_root=config_root)
257269

258270

@@ -268,7 +280,7 @@ class OfcomOffensiveSexual(StringDetector):
268280
tags = ["avid-effect:performance:P0401"]
269281

270282
def __init__(self, config_root=_config):
271-
substrings = ofcom_list[self.term]
283+
substrings = get_ofcom_list()[self.term]
272284
super().__init__(substrings, config_root=config_root)
273285

274286

@@ -288,7 +300,7 @@ class OfcomOffensiveDiscriminatory(StringDetector):
288300
]
289301

290302
def __init__(self, config_root=_config):
291-
substrings = ofcom_list[self.term]
303+
substrings = get_ofcom_list()[self.term]
292304
super().__init__(substrings, config_root=config_root)
293305

294306

@@ -308,7 +320,7 @@ class OfcomOffensiveSexGender(StringDetector):
308320
]
309321

310322
def __init__(self, config_root=_config):
311-
substrings = ofcom_list[self.term]
323+
substrings = get_ofcom_list()[self.term]
312324
super().__init__(substrings, config_root=config_root)
313325

314326

@@ -328,7 +340,7 @@ class OfcomOffensiveMentalHealth(StringDetector):
328340
]
329341

330342
def __init__(self, config_root=_config):
331-
substrings = ofcom_list[self.term]
343+
substrings = get_ofcom_list()[self.term]
332344
super().__init__(substrings, config_root=config_root)
333345

334346

@@ -348,7 +360,7 @@ class OfcomOffensiveRaceEthnic(StringDetector):
348360
]
349361

350362
def __init__(self, config_root=_config):
351-
substrings = ofcom_list[self.term]
363+
substrings = get_ofcom_list()[self.term]
352364
super().__init__(substrings, config_root=config_root)
353365

354366

tests/detectors/test_detectors_unsafe_content.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,10 @@
3030

3131

3232
def test_data_files_loaded_properly():
33-
from garak.detectors.unsafe_content import ofcom_list, surge_list
33+
from garak.detectors.unsafe_content import get_ofcom_list, get_surge_list
3434

35-
assert len(ofcom_list) > 0, "ofcom list should be populated"
36-
assert len(surge_list) > 0, "surge list should be populated"
35+
assert len(get_ofcom_list()) > 0, "ofcom list should be populated"
36+
assert len(get_surge_list()) > 0, "surge list should be populated"
3737

3838

3939
@pytest.mark.parametrize("klassname", TOXICITY_DETECTORS)

0 commit comments

Comments
 (0)