Feat: Improve chardict.py to add n-gram counts

Warning: format change to count n-gram occurrences. Can go up to a large n-gram length by changing the `NGRAM_MAX_LENGTH` constant. Switch from the os module to pathlib to manage paths. Fixes #7
OneDeadKey · Dec 18, 2024 · cdc8b29 · cdc8b29
1 parent 44e9733
commit cdc8b29
Showing 1 changed file with 98 additions and 57 deletions.
diff --git a/bin/chardict.py b/bin/chardict.py
@@ -1,78 +1,119 @@
 #!/usr/bin/env python3
-"""Turn corpus texts into dictionaries of symbols, bigrams and trigrams."""
+"""Turn corpus texts into dictionaries of n-grams."""
 
 import json
-from os import listdir, path
+from pathlib import Path
 from sys import argv
 
-IGNORED_CHARS = "1234567890 \t\r\n\ufeff"
-
-
-def parse_corpus(file_path):
-    """Count symbols, bigrams and trigrams in a text file."""
-
-    symbols = {}
-    bigrams = {}
-    trigrams = {}
-    char_count = 0
-    prev_symbol = None
-    prev_prev_symbol = None
-
-    # get a dictionary of all symbols (letters, punctuation marks...)
-    file = open(file_path, "r", encoding="utf-8")
-    for char in file.read():
-        symbol = char.lower()
-        if char not in IGNORED_CHARS:
-            char_count += 1
-            if symbol not in symbols:
-                symbols[symbol] = 0
-            symbols[symbol] += 1
-            if prev_symbol is not None:
-                bigram = prev_symbol + symbol
-                if bigram not in bigrams:
-                    bigrams[bigram] = 0
-                bigrams[bigram] += 1
-                if prev_prev_symbol is not None:
-                    trigram = prev_prev_symbol + bigram
-                    if trigram not in trigrams:
-                        trigrams[trigram] = 0
-                    trigrams[trigram] += 1
-            prev_prev_symbol = prev_symbol
-            prev_symbol = symbol
-        else:
-            prev_symbol = None
-    file.close()
+NGRAM_MAX_LENGTH = 5  # Quadrigrams
+IGNORED_CHARS = "1234567890 \t\r\n\ufeff↵"
+APP_NAME = "kalamine"
+APP_AUTHOR = "1dk"
+
+
+def parse_corpus(txt: str) -> dict:
+    """Count ngrams in a string.
+    returns a dict of ngrams
+        ngrams[1]=symbols
+        ngrams[2]=bigrams
+        ngrams[3]=trigrams
+        etc., up to NGRAM_MAX_LENGTH
+    ngrams[2] is shaped as { "aa": count }
+    """
+
+    ngrams = {}
+    ngrams_count = {}  # ngrams_count counts the total number of ngrams[i] in corpus.
+
+    txt = txt.lower()  # we want to be case **in**sensitive
+
+    for ngram in range(1, NGRAM_MAX_LENGTH):
+        ngrams[ngram] = {}
+        ngrams_count[ngram] = 0
+
+    def get_ngram(txt: str, ngram_start: int, ngram_length: int) -> str:
+        """get a ngram of a given length at given position in txt
+        returns empty string if ngram cannot be provided"""
+        if txt[ngram_start] in IGNORED_CHARS:
+            return ""
+        if ngram_length <= 0:
+            return ""
+        if ngram_start + ngram_length >= len(txt):
+            return ""
+
+        ngram = txt[ngram_start : ngram_start + ngram_length]
+
+        for n in ngram[1:]:  # 1st char already tested
+            if n in IGNORED_CHARS:
+                return ""
+
+        return ngram
+
+    # get all n-grams
+    for ngram_start in range(len(txt)):
+        for ngram_length in range(NGRAM_MAX_LENGTH):
+            _ngram = get_ngram(txt, ngram_start, ngram_length)
+
+            if not _ngram:  # _ngram is ""
+                continue
+
+            if _ngram not in ngrams[ngram_length]:
+                ngrams[ngram_length][_ngram] = 0
+
+            ngrams[ngram_length][_ngram] += 1
+            ngrams_count[ngram_length] += 1
 
     # sort the dictionary by symbol frequency (requires CPython 3.6+)
-    def sort_by_frequency(table, precision=3):
+    def sort_by_frequency(table: dict, char_count: int, precision: int = 3) -> dict:
         sorted_dict = {}
         for key, count in sorted(table.items(), key=lambda x: -x[1]):
             freq = round(100 * count / char_count, precision)
             if freq > 0:
                 sorted_dict[key] = freq
         return sorted_dict
 
-    results = {}
-    results["corpus"] = file_path
-    results["symbols"] = sort_by_frequency(symbols)
-    results["bigrams"] = sort_by_frequency(bigrams, 4)
-    results["trigrams"] = sort_by_frequency(trigrams)
-    return results
+    for ngram in range(1, NGRAM_MAX_LENGTH):
+        ngrams[ngram] = sort_by_frequency(ngrams[ngram], ngrams_count[ngram], 4)
+
+    return ngrams, ngrams_count
+
+
+def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict:
+    """read a .txt file and provide a dictionary of n-grams"""
+    try:
+        path = Path(file_path)
+        if not path.is_file:
+            raise Exception("Error, this is not a file")
+        if not name:
+            name = path.stem
+        with path.open("r", encoding=encoding) as file:
+            corpus_txt = "↵".join(file.readlines())
+
+    except Exception as e:
+        print(f"file does not exist or could not be read.\n {e}")
+
+    ngrams_freq, ngrams_count = parse_corpus(corpus_txt)
+    return {
+        "name": name,
+        #   "text": corpus_txt,
+        "freq": ngrams_freq,
+        "count": ngrams_count,
+    }
 
 
 if __name__ == "__main__":
     if len(argv) == 2:  # convert one file
-        data = parse_corpus(argv[1])
+        file_path = Path(argv[1])
+        data = read_corpus(str(file_path))
+        output_file_path = file_path.parent / f"{file_path.stem}.json"
+        with open(output_file_path, "w", encoding="utf-8") as outfile:
+            json.dump(data, outfile, indent=4, ensure_ascii=False)
         print(json.dumps(data, indent=4, ensure_ascii=False))
+
     else:  # converts all *.txt files in the script directory
-        bin_dir = path.dirname(__file__)
-        destdir = path.join(bin_dir, "..", "txt")
-        txtdir = path.join(bin_dir, "..", "txt")
-        for filename in listdir(txtdir):
-            if filename.endswith(".txt"):
-                basename = filename[:-4]
-                print(f"...  {basename}")
-                data = parse_corpus(path.join(txtdir, filename))
-                destfile = path.join(destdir, basename + ".json")
-                with open(destfile, "w", encoding="utf-8") as outfile:
+        curent_path = Path(__file__).resolve().parent
+        for file in curent_path.glob("*.txt"):
+            if file.is_file():
+                data = read_corpus(str(file))
+                output_file_path = file.parent / f"{file.stem}.json"
+                with open(output_file_path, "w", encoding="utf-8") as outfile:
                     json.dump(data, outfile, indent=4, ensure_ascii=False)