Skip to content

Commit

Permalink
minor fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
fabi1cazenave committed Dec 22, 2024
1 parent cdc8b29 commit f69cf2a
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 15 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ json:
@mkdir -p json
@echo "Creating JSON dicts..."
@bin/chardict.py
@echo "Merging JSON dicts..."
@echo "... de_modern"
@bin/merge.py txt/deu_*.json > json/de_modern.json
@echo "... en_modern"
Expand Down
29 changes: 14 additions & 15 deletions bin/chardict.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pathlib import Path
from sys import argv

# Pre-commit value (the removed line of this diff); the assignment right
# below supersedes it.
NGRAM_MAX_LENGTH = 5 # Quadrigrams
# NOTE(review): presumably an exclusive upper bound on the n-gram length,
# which would make 4 mean "up to trigrams" — confirm against parse_corpus.
NGRAM_MAX_LENGTH = 4 # trigrams
# NOTE(review): presumably the characters skipped while counting n-grams;
# includes digits, whitespace, the BOM and the "↵" line-break marker that
# read_corpus inserts between lines — confirm against parse_corpus.
IGNORED_CHARS = "1234567890 \t\r\n\ufeff↵"
APP_NAME = "kalamine"
APP_AUTHOR = "1dk"
Expand Down Expand Up @@ -77,43 +77,42 @@ def sort_by_frequency(table: dict, char_count: int, precision: int = 3) -> dict:
return ngrams, ngrams_count


def read_corpus(file: Path, name: str = "", encoding="utf-8") -> dict:
    """Read a text corpus and return its n-gram statistics.

    Args:
        file: Path of the corpus text file. A plain string is accepted as
            well and converted to a ``Path``.
        name: Label stored in the result; defaults to the file's stem.
        encoding: Text encoding used to read the file.

    Returns:
        A dict with the keys ``"name"``, ``"freq"`` and ``"count"``; the
        last two hold the pair returned by ``parse_corpus``.

    Raises:
        FileNotFoundError: If ``file`` is not an existing regular file.
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the content does not match ``encoding``.
    """
    path = Path(file)

    # `is_file` has to be *called*: the bare bound method is always truthy,
    # so a guard written as `not path.is_file` can never trigger.
    if not path.is_file():
        raise FileNotFoundError(f"Error, this is not a file: {path}")

    # Read errors are left to propagate. Catching them and carrying on would
    # only hide the real cause behind an unbound `corpus_txt` further down.
    with path.open("r", encoding=encoding) as f:
        # readlines() keeps each line's own newline; "↵" is inserted between
        # consecutive lines on top of that.
        corpus_txt = "↵".join(f.readlines())

    ngrams_freq, ngrams_count = parse_corpus(corpus_txt)
    return {
        "name": name or path.stem,
        # "text": corpus_txt,
        "freq": ngrams_freq,
        "count": ngrams_count,
    }


if __name__ == "__main__":
    if len(argv) != 2:
        # No single file given: convert every *.txt corpus found in the
        # `txt` directory that sits next to this script's parent directory.
        txt_dir = Path(__file__).resolve().parent.parent / "txt"
        for corpus in sorted(txt_dir.glob("*.txt")):
            if not corpus.is_file():
                continue
            print(f"... {corpus.stem}")
            stats = read_corpus(corpus)
            target = txt_dir / f"{corpus.stem}.json"
            with open(target, "w", encoding="utf-8") as out:
                json.dump(stats, out, indent=4, ensure_ascii=False)
    else:
        # Exactly one argument: convert that file, write the JSON next to
        # it, and echo the same JSON on stdout.
        corpus = Path(argv[1])
        stats = read_corpus(corpus)
        target = corpus.parent / f"{corpus.stem}.json"
        with open(target, "w", encoding="utf-8") as out:
            json.dump(stats, out, indent=4, ensure_ascii=False)
        print(json.dumps(stats, indent=4, ensure_ascii=False))

0 comments on commit f69cf2a

Please sign in to comment.