mdn · Josh-Cena · Oct 31, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 9, 2024
@@ -31,8 +31,7 @@ jobs:
           yarn content fix-flaws
           yarn fix:md
           yarn fix:fm
-          node scripts/sort_and_unique_file_lines.js .vscode/ignore-list.txt
-          node scripts/sort_and_unique_file_lines.js .vscode/terms-abbreviations.txt
+          node scripts/sort_and_unique_file_lines.js .vscode/dictionaries/*.txt
 
       - name: Create PR with only fixable issues
         if: success()

@@ -29,5 +29,4 @@ jobs:
 
       - name: Check if cSpell word lists are in correct order
         run: |
-          node scripts/sort_and_unique_file_lines.js .vscode/ignore-list.txt --check
-          node scripts/sort_and_unique_file_lines.js .vscode/terms-abbreviations.txt --check
+          node scripts/sort_and_unique_file_lines.js --check .vscode/dictionaries/*.txt
@@ -51,4 +51,4 @@ jobs:
             ${{ env.OUTPUT }}
 
             > [!TIP]
-            > To exclude words from the spellchecker, you can add valid words (web technology terms or abbreviations) to the [terms-abbreviations.txt](https://github.com/mdn/content/blob/main/.vscode/terms-abbreviations.txt) dictionary for IDE autocompletion. To ignore strings that are not words (\`AABBCC\` in code, for instance), you can add them to [ignore-list.txt](https://github.com/mdn/content/blob/main/.vscode/ignore-list.txt).
+            > If the word is actually valid, consider adding it to one of the dictionaries under \`.vscode/dictionaries\`.
@@ -17,10 +17,7 @@ export default {
     `yarn filecheck ${filenames.join(" ")}`,
   ],
   "*": (filenames) => [`node scripts/log-url-issues.js`],
-  ".vscode/ignore-list.txt": (filenames) => [
-    `node scripts/sort_and_unique_file_lines.js .vscode/ignore-list.txt`,
-  ],
-  ".vscode/terms-abbreviations.txt": (filenames) => [
-    `node scripts/sort_and_unique_file_lines.js .vscode/terms-abbreviations.txt`,
+  ".vscode/dictionaries/*.txt": (filenames) => [
+    `node scripts/sort_and_unique_file_lines.js ${filenames.join(" ")}`,
   ],
 };
@@ -6,6 +6,10 @@
   "useGitignore": true,
   "dictionaries": [
     "terms-abbreviations",
+    "cultural-words",
+    "proper-names",
+    "non-english",
+    "code-entities",
     "ignore-list",
     "bash",
     "css",
@@ -32,16 +36,31 @@
   ],
   "ignoreRegExpList": [
     // macros
-    "{{\\s?\\w*\\(",
-    "{{EmbedInteractiveExample\\(.*\\)}}",
-    "{{EmbedLiveSample\\(.*\\)}}",
-    "{{EmbedYouTube\\(\"[\\w-]*\"\\)}}",
-    // TODO - add some details what these match
-    "\\(#\\w*\\)",
+    "{{\\s?\\w*",
+    "{{\\s*EmbedInteractiveExample\\(.*\\)\\s*}}",
+    "{{\\s*EmbedLiveSample\\(.*\\)\\s*}}",
+    "{{\\s*EmbedYouTube\\(.*\\)\\s*}}",
+    "{{\\s*EmbedGHLiveSample\\(.*\\)\\s*}}",
+    // Markdown links
     "\\]\\(\\S*\\)",
-    "\\*\\*\\w\\*\\*\\w*",
-    "\\*\\w\\*\\w*",
+    // Website references
+    "[\\w\\-.]+\\.(com|net|org|ac\\.uk)\\b",
+    // Things like "**J**ava**S**cript"
+    "\\*\\*\\w+\\*\\*\\w*",
+    "\\*\\w+\\*\\w*",
     "#[À-ž\\w-]*",
+    // Old Firefox interfaces
+    "nsIDOM\\w+",
+    // Don't check other scripts
+    "[\\u0370-\\u03FF]+", // Greek
+    "[\\u0400-\\u04FF]+", // Cyrillic
+    "[\\u0590-\\u05FF]+", // Hebrew
+    "[\\u0600-\\u06FF]+", // Arabic
+    "(\\uD835[\\uDC00-\\uDFFF])+", // Mathematical Alphanumeric Symbols
+    "(\\uD83A[\\uDD00-\\uDD5F])+", // Adlam script
+    // Percent-encoding
+    "[A-Za-z]*%[A-F0-9]{2}[A-Za-z]*",
+    // Various HTML attributes that often have non-word values
     "aria-activedescendant=\"(?:[^\\\"]+|\\.)*\"",
     "aria-controls=\"(?:[^\\\"]+|\\.)*\"",
     "aria-describedby=\"(?:[^\\\"]+|\\.)*\"",
@@ -50,29 +69,76 @@
     "aria-flowto=\"(?:[^\\\"]+|\\.)*\"",
     "aria-labelledby=\"(?:[^\\\"]+|\\.)*\"",
     "aria-owns=\"(?:[^\\\"]+|\\.)*\"",
-    "Base64",
     "class=\"(?:[^\\\"]+|\\.)*\"",
     "data-test-id=\"(?:[^\\\"]+|\\.)*\"",
     "for=\"(?:[^\\\"]+|\\.)*\"",
-    "HexValues",
+    "pattern=\"(?:[^\\\"]+|\\.)*\"",
     "href=\"(?:[^\\\"]+|\\.)*\"",
-    "(?<=id)=\"(?:[^\\\"]+|\\.)*\"",
+    "(?<!\\w)id=\"(?:[^\\\"]+|\\.)*\"",
     "lang=\".*\">.*</",
     "src=\"(?:[^\\\"]+|\\.)*\"",
+    "HexValues",
+    "Base64",
+    // Any base64 in data URLs, even those shorter than 40 chars (which don't match Base64 regex)
+    "data:[^\\s;]+;base64,[a-zA-Z0-9/+=…]*",
+    "[Ee][Tt]ag: ([\\w-]+|\"[\\w-]+\")",
+    // Note: we don't add other headers that may contain base64 data, because
+    // they often contain other meaningful directives that we want to spell
+    // check too
     "url\\(\"data\\:image/svg\\+xml.*\"\\)[,;]",
+    "nonce-\\w+",
+    "sessionid=\\w+",
+    "csrftoken=\\w+",
+    "csrfmiddlewaretoken=\\w+",
+    "widget_session=\\w+",
     "Urls",
-    "favourite-colour",
     "ucaf:.*\""
   ],
   "dictionaryDefinitions": [
     {
       "name": "terms-abbreviations",
-      "path": "./terms-abbreviations.txt",
+      "path": "./dictionaries/terms-abbreviations.txt",
+      "description": "Anything that may be used throughout the content: compound words, abbreviations, etc. They are considered as real words and will be suggested.",
+      "addWords": true
+    },
+    {
+      "name": "cultural-words",
+      "path": "./dictionaries/cultural-words.txt",
+      "description": "Culture-specific names: currencies, calendars, languages, big cities, countries, etc.",
       "addWords": true
     },
+    // Dictionaries below will not be suggested.
+    // We are not dogmatic about where to put a word: for example,
+    // sometimes proper names are in terms-abbreviations because you are likely to use them.
+    // There's no difference between these dictionaries; they only provide rough
+    // divisions for easier management. For example, a proper name can be non-English,
+    // and non-English words may be code entities.
+    // We recommend assessing applicability in the order the dictionaries are listed.
+    {
+      "name": "proper-names",
+      "path": "./dictionaries/proper-names.txt",
+      "description": "Proper names: people, small towns, companies, products, fonts, online platform handles.",
+      "addWords": false,
+      "noSuggest": true
+    },
+    {
+      "name": "non-english",
+      "path": "./dictionaries/non-english.txt",
+      "description": "Non-English words. Note that some non-English words denote well-known concepts, such as \"Adlam script\" or \"Addis Ababa\", in which case they should be placed in cultural-words. This dictionary is intended for entire scripts for demonstrating non-English languages.",
+      "addWords": false,
+      "noSuggest": true
+    },
+    {
+      "name": "code-entities",
+      "path": "./dictionaries/code-entities.txt",
+      "description": "This list contains compound words that aren't properly capitalized or obscure abbreviations that are only utilized by particular web APIs (e.g. HTML attributes, language codes, event names, etc.). Only include entities defined by standards or libraries here; variable names, strings, etc. that are created by MDN code examples should be added to ignore-list.",
+      "addWords": false,
+      "noSuggest": true
+    },
     {
       "name": "ignore-list",
-      "path": "./ignore-list.txt",
+      "path": "./dictionaries/ignore-list.txt",
+      "description": "Other gibberish words that are used for specific purposes. For example, placeholder identifiers, random strings, URLs (all lowercase), hashes, filler texts, etc. For purposefully misspelled words and/or words that are likely typos in other contexts, consider using cSpell:ignore instead.",
       "addWords": false,
       "noSuggest": true
     }