Skip to content

Commit 01c4164

Browse files
Moved SpecialTokens assignment after the modification to avoid "Collection Modified" error (#7328)
* Moved special tokens assignment below so the collection won't be modified
* Added safe dictionary inversion
* Added storing the not-normalized special tokens
* Added support for .NET Standard
* Added and updated tests
* Updated without additional memory allocation
1 parent fb7cc25 commit 01c4164

File tree

3 files changed: +123 −21 lines changed

3 files changed: +123 −21 lines changed

src/Microsoft.ML.Tokenizers/Model/BertTokenizer.cs

+29-16
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

@@ -762,15 +762,16 @@ private static BertTokenizer Create(
762762

763763
options.Normalizer ??= options.ApplyBasicTokenization ? new BertNormalizer(options.LowerCaseBeforeTokenization, options.IndividuallyTokenizeCjk, options.RemoveNonSpacingMarks) : null;
764764

765+
IReadOnlyDictionary<string, int>? specialTokensDict = options.SpecialTokens;
765766
if (options.SplitOnSpecialTokens)
766767
{
767768
bool lowerCase = options.ApplyBasicTokenization && options.LowerCaseBeforeTokenization;
768769
if (options.SpecialTokens is not null)
769770
{
770771
if (lowerCase)
771772
{
772-
Dictionary<string, int> dic = options.SpecialTokens.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
773-
options.SpecialTokens = dic;
773+
Dictionary<string, int> tempSpecialTokens = [];
774+
specialTokensDict = tempSpecialTokens;
774775

775776
foreach (var kvp in options.SpecialTokens)
776777
{
@@ -779,37 +780,49 @@ private static BertTokenizer Create(
779780
throw new ArgumentException($"The special token '{kvp.Key}' is not in the vocabulary or assigned id value {id} different than the value {kvp.Value} in the special tokens.");
780781
}
781782

782-
// Ensure that the special tokens are lowercased.
783-
dic[kvp.Key.ToLowerInvariant()] = kvp.Value;
783+
// Add the special token into our dictionary, normalizing it, and adding it into the
784+
// main vocab, if needed.
785+
AddSpecialToken(vocab, tempSpecialTokens, kvp.Key, true);
784786
}
785787
}
786788
}
787789
else
788790
{
789-
// Create a dictionary with the special tokens.
790-
Dictionary<string, int> specialTokens = new Dictionary<string, int>();
791-
options.SpecialTokens = specialTokens;
792-
793-
AddSpecialToken(vocab, specialTokens, options.UnknownToken, lowerCase);
794-
AddSpecialToken(vocab, specialTokens, options.SeparatorToken, lowerCase);
795-
AddSpecialToken(vocab, specialTokens, options.PaddingToken, lowerCase);
796-
AddSpecialToken(vocab, specialTokens, options.ClassificationToken, lowerCase);
797-
AddSpecialToken(vocab, specialTokens, options.MaskingToken, lowerCase);
791+
// Create a dictionary with the special tokens - store the un-normalized forms in the options as
792+
// that field is exposed to the public. In addition, store the normalized form for creating the
793+
// pre-tokenizer.
794+
Dictionary<string, int> tempSpecialTokens = [];
795+
Dictionary<string, int> notNormalizedSpecialTokens = [];
796+
AddSpecialToken(vocab, tempSpecialTokens, options.UnknownToken, lowerCase, notNormalizedSpecialTokens);
797+
AddSpecialToken(vocab, tempSpecialTokens, options.SeparatorToken, lowerCase, notNormalizedSpecialTokens);
798+
AddSpecialToken(vocab, tempSpecialTokens, options.PaddingToken, lowerCase, notNormalizedSpecialTokens);
799+
AddSpecialToken(vocab, tempSpecialTokens, options.ClassificationToken, lowerCase, notNormalizedSpecialTokens);
800+
AddSpecialToken(vocab, tempSpecialTokens, options.MaskingToken, lowerCase, notNormalizedSpecialTokens);
801+
802+
options.SpecialTokens = notNormalizedSpecialTokens;
803+
specialTokensDict = tempSpecialTokens;
798804
}
799805
}
800806

801-
options.PreTokenizer ??= options.ApplyBasicTokenization ? PreTokenizer.CreateWordOrPunctuation(options.SplitOnSpecialTokens ? options.SpecialTokens : null) : PreTokenizer.CreateWhiteSpace();
807+
// We set the PreTokenizer here using the normalized special tokens dict (if relevant), and therefore we can
808+
// keep the not-normalized special tokens dict in the options passed to the WordPieceTokenizer.
809+
options.PreTokenizer ??= options.ApplyBasicTokenization ? PreTokenizer.CreateWordOrPunctuation(options.SplitOnSpecialTokens ? specialTokensDict : null) : PreTokenizer.CreateWhiteSpace();
802810

803811
return new BertTokenizer(vocab, vocabReverse, options);
804812
}
805813

806-
private static void AddSpecialToken(Dictionary<StringSpanOrdinalKey, int> vocab, Dictionary<string, int> specialTokens, string token, bool lowerCase)
814+
private static void AddSpecialToken(Dictionary<StringSpanOrdinalKey, int> vocab, Dictionary<string, int> specialTokens, string token, bool lowerCase, Dictionary<string, int>? notNormalizedSpecialTokens = null)
807815
{
808816
if (token is null || !vocab.TryGetValue(new StringSpanOrdinalKey(token), out int id))
809817
{
810818
throw new ArgumentException($"The special token '{token}' is not in the vocabulary.");
811819
}
812820

821+
if (notNormalizedSpecialTokens is not null)
822+
{
823+
notNormalizedSpecialTokens[token] = id;
824+
}
825+
813826
string normalizedToken = token;
814827
if (lowerCase)
815828
{

src/Microsoft.ML.Tokenizers/Model/WordPieceTokenizer.cs

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

@@ -42,7 +42,7 @@ internal WordPieceTokenizer(
4242
options ??= new();
4343

4444
SpecialTokens = options.SpecialTokens;
45-
SpecialTokensReverse = options.SpecialTokens is not null ? options.SpecialTokens.ToDictionary(kvp => kvp.Value, kvp => kvp.Key) : null;
45+
SpecialTokensReverse = options.SpecialTokens is not null ? options.SpecialTokens.GroupBy(kvp => kvp.Value).ToDictionary(g => g.Key, g => g.First().Key) : null;
4646

4747
if (options.UnknownToken is null)
4848
{
@@ -800,4 +800,4 @@ public OperationStatus Decode(IEnumerable<int> ids, Span<char> destination, bool
800800
return OperationStatus.Done;
801801
}
802802
}
803-
}
803+
}

test/Microsoft.ML.Tokenizers.Tests/BertTokenizerTests.cs

+91-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

@@ -14,6 +14,91 @@ namespace Microsoft.ML.Tokenizers.Tests
1414
{
1515
public class BertTokenizerTests
1616
{
17+
[Fact]
18+
public void TestWithLowerCasingExplicitSpecialTokens()
19+
{
20+
// Add [SPECIAL] token at end (to keep indices as is)
21+
// Ids: 0 1 2 3 4 5 6 7 8 9 10 11 12, 13
22+
string[] vocabTokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "!", ",", "?", "hello", "world", "how", "are", "you", "[SPECIAL]"];
23+
24+
string vocabFile = WordPieceTests.CreateVocabFile(vocabTokens);
25+
26+
Dictionary<string, int> specialTokens = new() {
27+
{ "[PAD]", 0 },
28+
{ "[UNK]", 1 },
29+
{ "[CLS]", 2 },
30+
{ "[SEP]", 3 },
31+
{ "[MASK]", 4 },
32+
{ "[SPECIAL]", 13 },
33+
};
34+
var bertOptions = new BertOptions()
35+
{
36+
SpecialTokens = specialTokens
37+
};
38+
39+
try
40+
{
41+
using Stream vocabStream = File.OpenRead(vocabFile);
42+
BertTokenizer[] bertTokenizers = [BertTokenizer.Create(vocabFile, bertOptions), BertTokenizer.Create(vocabStream, bertOptions)];
43+
44+
foreach (var tokenizer in bertTokenizers)
45+
{
46+
Assert.NotNull(tokenizer.PreTokenizer);
47+
Assert.Equal("[UNK]", tokenizer.UnknownToken);
48+
Assert.Equal(1, tokenizer.UnknownTokenId);
49+
Assert.NotNull(tokenizer.Normalizer);
50+
Assert.NotNull(tokenizer.PreTokenizer);
51+
52+
Assert.True(tokenizer.SpecialTokens!.ContainsKey("[SPECIAL]"));
53+
54+
string text = "Hello, How are you [SPECIAL]?";
55+
var tokens = tokenizer.EncodeToTokens(text, out string? normalizedText);
56+
Assert.Equal("hello, how are you [special]?", normalizedText);
57+
58+
Assert.Equal(
59+
[
60+
new EncodedToken(8, "hello", new Range(0, 5)),
61+
new EncodedToken(6, ",", new Range(5, 6)),
62+
new EncodedToken(10, "how", new Range(7, 10)),
63+
new EncodedToken(11, "are", new Range(11, 14)),
64+
new EncodedToken(12, "you", new Range(15, 18)),
65+
new EncodedToken(13, "[SPECIAL]", new Range(19, 28)),
66+
new EncodedToken(7, "?", new Range(28, 29))
67+
],
68+
tokens);
69+
70+
var ids = tokenizer.EncodeToIds(text);
71+
Assert.Equal([tokenizer.ClassificationTokenId, 8, 6, 10, 11, 12, 13, 7, tokenizer.SeparatorTokenId], ids);
72+
73+
Assert.Equal("[CLS] hello, how are you [SPECIAL]? [SEP]", tokenizer.Decode(ids));
74+
Assert.Equal("hello, how are you?", tokenizer.Decode(ids, skipSpecialTokens: true));
75+
76+
tokens = tokenizer.EncodeToTokens(tokenizer.Decode(ids), out normalizedText);
77+
Assert.Equal("[cls] hello, how are you [special]? [sep]", normalizedText);
78+
Assert.Equal(
79+
[
80+
new EncodedToken(2, "[CLS]", new Range(0, 5)),
81+
new EncodedToken(8, "hello", new Range(6, 11)),
82+
new EncodedToken(6, ",", new Range(11, 12)),
83+
new EncodedToken(10, "how", new Range(13, 16)),
84+
new EncodedToken(11, "are", new Range(17, 20)),
85+
new EncodedToken(12, "you", new Range(21, 24)),
86+
new EncodedToken(13, "[SPECIAL]", new Range(25, 34)),
87+
new EncodedToken(7, "?", new Range(34, 35)),
88+
new EncodedToken(3, "[SEP]", new Range(36, 41))
89+
],
90+
tokens);
91+
92+
ids = tokenizer.EncodeToIds(normalizedText!);
93+
Assert.Equal([tokenizer.ClassificationTokenId, tokenizer.ClassificationTokenId, 8, 6, 10, 11, 12, 13, 7, tokenizer.SeparatorTokenId, tokenizer.SeparatorTokenId], ids);
94+
}
95+
}
96+
finally
97+
{
98+
File.Delete(vocabFile);
99+
}
100+
}
101+
17102
[Fact]
18103
public void TestWithLowerCasing()
19104
{
@@ -35,6 +120,10 @@ public void TestWithLowerCasing()
35120
Assert.NotNull(tokenizer.Normalizer);
36121
Assert.NotNull(tokenizer.PreTokenizer);
37122

123+
// Make sure the SpecialTokens dictionary contains the not-normalized tokens
124+
Assert.True(tokenizer.SpecialTokens!.ContainsKey(tokenizer.UnknownToken));
125+
Assert.True(tokenizer.SpecialTokens!.ContainsKey(tokenizer.ClassificationToken));
126+
38127
string text = "Hello, How are you?";
39128
var tokens = tokenizer.EncodeToTokens(text, out string? normalizedText);
40129
Assert.Equal("hello, how are you?", normalizedText);
@@ -511,4 +600,4 @@ public void TestCreateTokenTypeIdsFromSequences()
511600
}
512601
}
513602
}
514-
}
603+
}

0 commit comments

Comments (0)