Skip to content

Commit

Permalink
Feat: Improve chardict.py to add n-gram counts
Browse files Browse the repository at this point in the history
Warning format change to count n-gram occurences

Can go up to large n-gram parametter by changing `NGRAM_MAX_LENGTH`
constant.

Switch from os module to pathlib to manage paths.

Fixes #7
  • Loading branch information
Cèd’C committed Dec 18, 2024
1 parent 44e9733 commit cdc8b29
Showing 1 changed file with 98 additions and 57 deletions.
155 changes: 98 additions & 57 deletions bin/chardict.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,119 @@
#!/usr/bin/env python3
"""Turn corpus texts into dictionaries of symbols, bigrams and trigrams."""
"""Turn corpus texts into dictionaries of n-grams."""

import json
from os import listdir, path
from pathlib import Path
from sys import argv

IGNORED_CHARS = "1234567890 \t\r\n\ufeff"


def parse_corpus(file_path):
"""Count symbols, bigrams and trigrams in a text file."""

symbols = {}
bigrams = {}
trigrams = {}
char_count = 0
prev_symbol = None
prev_prev_symbol = None

# get a dictionary of all symbols (letters, punctuation marks...)
file = open(file_path, "r", encoding="utf-8")
for char in file.read():
symbol = char.lower()
if char not in IGNORED_CHARS:
char_count += 1
if symbol not in symbols:
symbols[symbol] = 0
symbols[symbol] += 1
if prev_symbol is not None:
bigram = prev_symbol + symbol
if bigram not in bigrams:
bigrams[bigram] = 0
bigrams[bigram] += 1
if prev_prev_symbol is not None:
trigram = prev_prev_symbol + bigram
if trigram not in trigrams:
trigrams[trigram] = 0
trigrams[trigram] += 1
prev_prev_symbol = prev_symbol
prev_symbol = symbol
else:
prev_symbol = None
file.close()
NGRAM_MAX_LENGTH = 5 # Quadrigrams
IGNORED_CHARS = "1234567890 \t\r\n\ufeff↵"
APP_NAME = "kalamine"
APP_AUTHOR = "1dk"


def parse_corpus(txt: str) -> dict:
"""Count ngrams in a string.
retuns a dict of ngrams
ngrams[1]=symbols
ngrams[2]=bigrames
ngrams[3]=trigrams
etc., up to NGRAM_MAX_LENGTH
ngrams[2] is shaped as { "aa": count }
"""

ngrams = {}
ngrams_count = {} # ngrams_count counts the total number of ngrams[i] in corpus.

txt = txt.lower() # we want to be case **in**sensitive

for ngram in range(1, NGRAM_MAX_LENGTH):
ngrams[ngram] = {}
ngrams_count[ngram] = 0

def get_ngram(txt: str, ngram_start: int, ngram_length: int) -> str:
"""get a ngram of a given length at given position in txt
returns empty string if ngram cannot be provided"""
if txt[ngram_start] in IGNORED_CHARS:
return ""
if ngram_length <= 0:
return ""
if ngram_start + ngram_length >= len(txt):
return ""

ngram = txt[ngram_start : ngram_start + ngram_length]

for n in ngram[1:]: # 1st char already tested
if n in IGNORED_CHARS:
return ""

return ngram

# get all n-grams
for ngram_start in range(len(txt)):
for ngram_length in range(NGRAM_MAX_LENGTH):
_ngram = get_ngram(txt, ngram_start, ngram_length)

if not _ngram: # _ngram is ""
continue

if _ngram not in ngrams[ngram_length]:
ngrams[ngram_length][_ngram] = 0

ngrams[ngram_length][_ngram] += 1
ngrams_count[ngram_length] += 1

# sort the dictionary by symbol frequency (requires CPython 3.6+)
def sort_by_frequency(table, precision=3):
def sort_by_frequency(table: dict, char_count: int, precision: int = 3) -> dict:
sorted_dict = {}
for key, count in sorted(table.items(), key=lambda x: -x[1]):
freq = round(100 * count / char_count, precision)
if freq > 0:
sorted_dict[key] = freq
return sorted_dict

results = {}
results["corpus"] = file_path
results["symbols"] = sort_by_frequency(symbols)
results["bigrams"] = sort_by_frequency(bigrams, 4)
results["trigrams"] = sort_by_frequency(trigrams)
return results
for ngram in range(1, NGRAM_MAX_LENGTH):
ngrams[ngram] = sort_by_frequency(ngrams[ngram], ngrams_count[ngram], 4)

return ngrams, ngrams_count


def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict:
"""read a .txt file and provide a dictionary of n-grams"""
try:
path = Path(file_path)
if not path.is_file:
raise Exception("Error, this is not a file")
if not name:
name = path.stem
with path.open("r", encoding=encoding) as file:
corpus_txt = "↵".join(file.readlines())

except Exception as e:
print(f"file does not exist or could not be read.\n {e}")

ngrams_freq, ngrams_count = parse_corpus(corpus_txt)
return {
"name": name,
# "text": corpus_txt,
"freq": ngrams_freq,
"count": ngrams_count,
}


if __name__ == "__main__":
if len(argv) == 2: # convert one file
data = parse_corpus(argv[1])
file_path = Path(argv[1])
data = read_corpus(str(file_path))
output_file_path = file_path.parent / f"{file_path.stem}.json"
with open(output_file_path, "w", encoding="utf-8") as outfile:
json.dump(data, outfile, indent=4, ensure_ascii=False)
print(json.dumps(data, indent=4, ensure_ascii=False))

else: # converts all *.txt files in the script directory
bin_dir = path.dirname(__file__)
destdir = path.join(bin_dir, "..", "txt")
txtdir = path.join(bin_dir, "..", "txt")
for filename in listdir(txtdir):
if filename.endswith(".txt"):
basename = filename[:-4]
print(f"... {basename}")
data = parse_corpus(path.join(txtdir, filename))
destfile = path.join(destdir, basename + ".json")
with open(destfile, "w", encoding="utf-8") as outfile:
curent_path = Path(__file__).resolve().parent
for file in curent_path.glob("*.txt"):
if file.is_file():
data = read_corpus(str(file))
output_file_path = file.parent / f"{file.stem}.json"
with open(output_file_path, "w", encoding="utf-8") as outfile:
json.dump(data, outfile, indent=4, ensure_ascii=False)

0 comments on commit cdc8b29

Please sign in to comment.