PassTorch/score.py at main · JimTheFrog/PassTorch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
#!/usr/bin/env python
"""
score.py: Score a password against a training checkpoint or inference model.
Updated 2026-03-10. (c) Jim Taylor. Created 2026-02-24. MIT License. Coding assistance from multiple AIs.

Notes:
  Trained model for pwnedtop100k is 1,511,264 bytes vs 831,914 bytes for the password list
  Larger password sets do not usually make the model much larger
Settings that change model size:
  hidden_dim in train.py (line 41): biggest lever
  embedding_dim in train.py (line 40)
  num_layers in train.py (line 42)
  min_char_freq in train.py (line 47): can shrink vocab a bit
No material change to model size:
  number of epochs (this mostly affects training time)
  batch size
  learning rate
  number of training passwords, except indirectly via vocab size
A model could be much smaller. Roughly:
  current 64/256: about 376k params
  32/128: about 106k params
  32/64: about 39k params
"""

from __future__ import annotations

import argparse
import json
import math

import torch

from common import (
    CharVocab,
    PasswordModel,
    SegmentVocab,
    build_model,
    load_model_artifact,
    model_type_from_payload,
    resolve_device,
    vocab_from_payload,
)


# Internal test harness: set TEST to True to score the built-in sample
# passwords returned by test_args() instead of parsing the command line.
TEST = False


def test_args() -> argparse.Namespace:
    """Return the fixed argument set used when TEST is enabled.

    The namespace holds a list under ``passwords`` (rather than a single
    ``password`` value) together with the scoring options.
    """
    sample_passwords = [
        "password",
        "12345",
        "correcthorsebatterystaple",
        "p@ssw0rd",
        "this is a passphrase",
        "1234567890123456789012345678901234567890",
        "randompw~bld9K%nq",
    ]
    options = {
        "model": "pwnedtop100k.infer.pt",
        "passwords": sample_passwords,
        "min_len": 1,
        "max_len": 30,
        "top_next": 32,
        "min_next_prob": 1e-9,
        "min_prob": 0.0,
        "allow_unk": False,
        "device": "auto",
        "json": False,
    }
    return argparse.Namespace(**options)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options for scoring a single password.

    ``--model`` and ``--password`` are required; every other option has a
    default. Options are registered in a fixed order so the generated help
    text lists them in that order.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Compute password probability and membership check "
            "under generate.py pruning constraints."
        ),
    )
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = (
        ("--model", {"required": True, "help": "Path to a training checkpoint or inference model."}),
        ("--password", {"required": True, "help": "Password to score."}),
        ("--min-len", {"type": int, "default": 1}),
        ("--max-len", {"type": int, "default": 30}),
        (
            "--top-next",
            {
                "type": int,
                "default": 32,
                "help": "Require each next token to be inside generate.py's top-N expansion set.",
            },
        ),
        (
            "--min-next-prob",
            {
                "type": float,
                "default": 1e-9,
                "help": "Require each next-token probability to be at least this threshold.",
            },
        ),
        (
            "--min-prob",
            {
                "type": float,
                "default": 0.0,
                "help": "Require cumulative probability to stay above this threshold.",
            },
        ),
        (
            "--allow-unk",
            {"action": "store_true", "help": "Match generate.py's allow-unk switch for path checks."},
        ),
        ("--device", {"default": "auto", "help": "auto, cpu, cuda, cuda:0, ..."}),
        ("--json", {"action": "store_true", "help": "Emit JSON output."}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()


def load_model(
    model_path: str,
    device: torch.device,
) -> tuple[PasswordModel, "CharVocab | SegmentVocab"]:
    """Load a saved model artifact and rebuild the model with its vocabulary.

    Args:
        model_path: Path passed to ``load_model_artifact``.
        device: Device the artifact is loaded for and the model is moved to.

    Returns:
        ``(model, vocab)`` where the model has its saved weights loaded and
        has been switched to evaluation mode.
    """
    artifact = load_model_artifact(model_path, device=device)
    vocab = vocab_from_payload(artifact)
    kind = model_type_from_payload(artifact)
    hparams = artifact["model_hparams"]

    # Use the pad id stored with the hyperparameters when there is one;
    # otherwise fall back to the vocabulary's pad id.
    pad_id = hparams.get("pad_id", vocab.pad_id)

    network = build_model(
        model_type=kind,
        vocab_size=len(vocab),
        pad_id=pad_id,
        model_hparams=hparams,
    )
    network = network.to(device)
    network.load_state_dict(artifact["model_state_dict"])
    network.eval()
    return network, vocab


def model_prefix_limit(model: PasswordModel) -> int | None:
    """Return the model's ``max_seq_len`` attribute, or None if it has none."""
    try:
        return model.max_seq_len
    except AttributeError:
        return None


def add_reason_once(
    membership_reasons: list[str],
    seen_reason_categories: set[str],
    category: str,
    reason: str,
) -> None:
    """Record *reason* unless a reason of the same *category* was already kept.

    Both containers are mutated in place: the category is added to
    ``seen_reason_categories`` and the reason is appended to
    ``membership_reasons`` the first time that category is seen.
    """
    if category not in seen_reason_categories:
        seen_reason_categories.add(category)
        membership_reasons.append(reason)


def add_model_prefix_limit_reason(
    membership_reasons: list[str],
    seen_reason_categories: set[str],
    token_count: int,
    model: PasswordModel,
) -> bool:
    """Flag passwords whose token count exceeds the model's prefix limit.

    Args:
        membership_reasons: Reason list that receives the message.
        seen_reason_categories: Categories already reported.
        token_count: Number of tokens in the tokenized password.
        model: Model whose prefix limit (if any) is consulted.

    Returns:
        True when the limit is exceeded (a "model_prefix_limit" reason is
        recorded via ``add_reason_once``), False when the model reports no
        limit or the password fits.
    """
    limit = model_prefix_limit(model)
    if limit is None:
        return False

    # One more than the token count is needed to score the final EOS step.
    needed = token_count + 1
    if needed <= limit:
        return False

    message = (
        f"password requires prefix length {needed} to score EOS, "
        f"but model max_seq_len is {limit}"
    )
    add_reason_once(membership_reasons, seen_reason_categories, "model_prefix_limit", message)
    return True


@torch.no_grad()
def score_password(
    model: PasswordModel,
    vocab: "CharVocab | SegmentVocab",
    password: str,
    min_len: int,
    max_len: int,
    top_next: int,
    min_next_prob: float,
    min_prob: float,
    allow_unk: bool,
) -> dict:
    """Score one password and report whether it passes every pruning check.

    The password is tokenized (segments for a ``SegmentVocab``, single
    characters otherwise) and walked token by token, followed by EOS. At each
    step the model's log-probability for the actual next token is accumulated
    and the step is tested against the constraints. Only the first failure in
    each constraint category is recorded as a reason.

    Args:
        model: Model providing ``next_token_log_probs(prefix)``.
        vocab: Vocabulary providing ``stoi`` and the pad/bos/eos/unk ids.
        password: Password text to score.
        min_len: Minimum password length in characters.
        max_len: Maximum password length in characters.
        top_next: When positive and smaller than the number of scored tokens,
            each step's token must be among that many top-ranked tokens.
        min_next_prob: Lower bound for each step's token probability.
        min_prob: Lower bound for the cumulative probability; values that are
            not positive disable the check.
        allow_unk: When False, the unknown-token id counts as forbidden.

    Returns:
        A dict with ``password``, ``membership`` (True when no reason was
        recorded), ``membership_reasons``, exact and tokenized probabilities
        with their log forms, the out-of-vocabulary characters, the
        ``settings`` used and, in segment mode, the out-of-vocabulary
        segments.
    """
    reasons: list[str] = []
    flagged_categories: set[str] = set()

    def flag(category: str, reason: str) -> None:
        add_reason_once(reasons, flagged_categories, category, reason)

    # Whole-password length limits are measured in characters.
    n_chars = len(password)
    if n_chars < min_len:
        flag("min_len", f"password length {n_chars} < min_len {min_len}")
    if n_chars > max_len:
        flag("max_len", f"password length {n_chars} > max_len {max_len}")

    # Tokenize the password: segments in segment mode, characters otherwise.
    is_segment_mode = isinstance(vocab, SegmentVocab)
    pieces: list[str] = vocab.segment_password(password) if is_segment_mode else list(password)
    unknown_pieces: list[str] = sorted({piece for piece in pieces if piece not in vocab.stoi})
    if is_segment_mode:
        oov_segs, oov_chars = unknown_pieces, []
        if oov_segs:
            flag("oov_segs", "password contains segments outside the training segment vocabulary")
    else:
        oov_chars, oov_segs = unknown_pieces, []
        if oov_chars:
            flag("oov_chars", "password contains characters outside training vocabulary")

    token_ids = [vocab.stoi.get(piece, vocab.unk_id) for piece in pieces]
    # Lengths are taken from the token text rather than the ids, so
    # out-of-vocabulary segments still count their real character length.
    piece_lens = [len(piece) for piece in pieces]

    exceeds_model_prefix_limit = add_model_prefix_limit_reason(
        reasons,
        flagged_categories,
        len(token_ids),
        model,
    )

    if allow_unk:
        forbidden = {vocab.pad_id, vocab.bos_id}
    else:
        forbidden = {vocab.pad_id, vocab.bos_id, vocab.unk_id}

    min_log_prob = None
    if min_prob > 0:
        min_log_prob = math.log(min_prob)

    prefix = (vocab.bos_id,)
    cumulative_logp = 0.0
    chars_so_far = 0  # characters contributed by tokens before the current step

    if not exceeds_model_prefix_limit:
        # Every password token, then EOS (which contributes no characters).
        steps = zip(token_ids + [vocab.eos_id], piece_lens + [0])
        for step_index, (token_id, piece_len) in enumerate(steps, start=1):
            # chars_so_far excludes the current token; the original author's
            # note says this matches generate.py, which checks the prefix
            # character length before expansion.
            if prefix[-1] != vocab.eos_id and chars_so_far >= max_len:
                flag(
                    "path_max_len",
                    f"prefix length reached max_len={max_len} before completing the password path",
                )

            log_probs = model.next_token_log_probs(prefix)
            token_logp = float(log_probs[token_id].item())
            token_prob = math.exp(token_logp)
            cumulative_logp += token_logp

            if token_id in forbidden:
                flag(
                    "forbidden_token",
                    f"step {step_index} token id {token_id} is forbidden by allow_unk/pad/bos rules",
                )

            if 0 < top_next < log_probs.numel():
                ranked_ids = torch.sort(log_probs, descending=True).indices
                if token_id not in ranked_ids[:top_next].tolist():
                    flag("top_next", f"not in top {top_next} choices at step {step_index}")

            if token_prob < min_next_prob:
                flag(
                    "min_next_prob",
                    f"step {step_index} next-token probability {token_prob:.3e} < min_next_prob {min_next_prob:.3e}",
                )

            if min_log_prob is not None and cumulative_logp < min_log_prob:
                flag(
                    "min_prob",
                    f"step {step_index} cumulative probability {math.exp(cumulative_logp):.3e} < min_prob {min_prob:.3e}",
                )

            if token_id == vocab.eos_id and chars_so_far < min_len:
                flag("eos_before_min_len", f"EOS arrives at length {chars_so_far} < min_len {min_len}")

            prefix = prefix + (token_id,)
            chars_so_far += piece_len

    # The tokenized figures treat unknown tokens as the unk id; the exact
    # figures are withheld whenever any out-of-vocabulary piece is present.
    if exceeds_model_prefix_limit:
        tokenized_probability = 0.0
        tokenized_log_probability = None
    else:
        tokenized_probability = math.exp(cumulative_logp)
        tokenized_log_probability = cumulative_logp

    if oov_chars or oov_segs or exceeds_model_prefix_limit:
        exact_probability = 0.0
        exact_log_probability = None
    else:
        exact_probability = tokenized_probability
        exact_log_probability = cumulative_logp

    settings = {
        "min_len": min_len,
        "max_len": max_len,
        "top_next": top_next,
        "min_next_prob": min_next_prob,
        "min_prob": min_prob,
        "allow_unk": allow_unk,
    }
    result: dict = {
        "password": password,
        "membership": not reasons,
        "membership_reasons": reasons,
        "exact_probability": exact_probability,
        "exact_log_probability": exact_log_probability,
        "tokenized_probability": tokenized_probability,
        "tokenized_log_probability": tokenized_log_probability,
        "contains_oov_chars": bool(oov_chars),
        "oov_chars": oov_chars,
        "settings": settings,
    }
    if is_segment_mode:
        result["contains_oov_segs"] = bool(oov_segs)
        result["oov_segs"] = oov_segs
    return result


@torch.no_grad()
def batch_score_passwords(
    model: PasswordModel,
    vocab: "CharVocab | SegmentVocab",
    passwords: list[str],
    min_len: int,
    max_len: int,
    top_next: int,
    min_next_prob: float,
    min_prob: float,
    allow_unk: bool,
) -> list[dict]:
    """Score several passwords with one set of constraints.

    Each password is scored independently by ``score_password``, so every
    entry has exactly the shape and reasons that function produces. The
    passwords are processed one after another; no padded batch tensor is
    built, because scoring queries the model one prefix at a time.

    Args:
        model: Model providing ``next_token_log_probs(prefix)``.
        vocab: Vocabulary used to tokenize the passwords.
        passwords: Passwords to score; an empty list yields an empty list.
        min_len: Minimum password length in characters.
        max_len: Maximum password length in characters.
        top_next: Top-N constraint applied at every step.
        min_next_prob: Lower bound for each step's token probability.
        min_prob: Lower bound for the cumulative probability.
        allow_unk: When False, the unknown-token id counts as forbidden.

    Returns:
        One result dict per password, in input order.
    """
    return [
        score_password(
            model=model,
            vocab=vocab,
            password=password,
            min_len=min_len,
            max_len=max_len,
            top_next=top_next,
            min_next_prob=min_next_prob,
            min_prob=min_prob,
            allow_unk=allow_unk,
        )
        for password in passwords
    ]


def escape_text(value: str) -> str:
    """Return *value* as ASCII text using Python's ``unicode_escape`` encoding."""
    escaped_bytes = value.encode("unicode_escape")
    return str(escaped_bytes, "ascii")


def print_result(result: dict) -> None:
    """Print one scoring result as ``key=value`` lines.

    The password and membership flag are always printed, followed by the two
    probabilities with 17 significant digits. Membership reasons,
    out-of-vocabulary characters and out-of-vocabulary segments are printed
    only when present.
    """
    print(f"password={escape_text(result['password'])}")
    print(f"membership={result['membership']}")
    for key in ("exact_probability", "tokenized_probability"):
        print(f"{key}={result[key]:.17g}")

    reasons = result["membership_reasons"]
    if reasons:
        print("membership_reasons=")
        for entry in reasons:
            print(f"- {entry}")

    if result["contains_oov_chars"]:
        # Escaped characters are concatenated without a separator.
        print("oov_chars=" + "".join(map(escape_text, result["oov_chars"])))

    # Segment fields exist only for segment-vocabulary results.
    if result.get("contains_oov_segs"):
        print(f"oov_segs={result['oov_segs']}")


def print_test_result(result: dict) -> None:
    """Print a compact list summary of one result for the test harness.

    The list holds the escaped password, the membership flag and the exact
    probability, in that order.
    """
    summary = [escape_text(result["password"])]
    summary.extend(result[key] for key in ("membership", "exact_probability"))
    # Further fields that can be appended while debugging:
    #   result["tokenized_probability"]
    #   result["membership_reasons"]
    #   [escape_text(ch) for ch in result["oov_chars"]]
    print(summary)


def main() -> None:
    """Score the requested password(s) and print the results.

    Arguments come from ``test_args()`` when TEST is set, otherwise from the
    command line. The model is loaded once and every password is scored with
    the same options. Output is JSON when ``--json`` is set, the compact
    harness listing when TEST is set, and ``key=value`` blocks separated by
    blank lines otherwise.
    """
    args = test_args() if TEST else parse_args()
    device = resolve_device(args.device)
    # The test harness supplies a list; the command line supplies one password.
    passwords = args.passwords if hasattr(args, "passwords") else [args.password]

    model, vocab = load_model(args.model, device=device)
    results = [
        score_password(
            model=model,
            vocab=vocab,
            password=password,
            min_len=args.min_len,
            max_len=args.max_len,
            top_next=args.top_next,
            min_next_prob=args.min_next_prob,
            min_prob=args.min_prob,
            allow_unk=args.allow_unk,
        )
        for password in passwords
    ]

    if args.json:
        # A single result is emitted as a bare object; any other count,
        # including zero, is emitted as a list so an empty password list
        # no longer raises IndexError.
        payload = results[0] if len(results) == 1 else results
        print(json.dumps(payload, ensure_ascii=True, sort_keys=True))
        return

    if TEST:
        print('Password / Found / Probability')
        for result in results:
            print_test_result(result)
        return

    for index, result in enumerate(results):
        if index:
            # Blank line between consecutive results.
            print()
        print_result(result)

# Run the scorer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()