Skip to content

Commit

Permalink
Feat: support the "mix" merge type, which averages by percentage rather than by character count
Browse files Browse the repository at this point in the history
  • Loading branch information
Cèd’C committed Dec 23, 2024
1 parent 207e983 commit 99a06eb
Showing 1 changed file with 60 additions and 47 deletions.
107 changes: 60 additions & 47 deletions bin/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,16 @@


# sort the merged dictionary by symbol frequency (requires CPython 3.6+)
def sort_by_frequency(corpus: dict, precision=3):
def _sort_ngram_by_frequency(table, precision):
sorted_dict = {}
for key, count in sorted(table.items(), key=lambda x: -x[1]):
freq = round(count, precision)
if freq > 0:
sorted_dict[key] = freq
return sorted_dict
def _sort_ngram_by_frequency(table, precision=3):
sorted_dict = {}
for key, count in sorted(table.items(), key=lambda x: -x[1]):
freq = round(count, precision)
if freq > 0:
sorted_dict[key] = freq
return sorted_dict


def sort_by_frequency(corpus: dict, precision=3):
for ngram in range(1, len(corpus["freq"].keys())):
ngram = str(ngram)
corpus["freq"][ngram] = _sort_ngram_by_frequency(
Expand All @@ -24,51 +25,26 @@ def _sort_ngram_by_frequency(table, precision):
return corpus


def merge(filenames, filecount) -> dict:
    """Average the n-gram tables of several corpus JSON files.

    Args:
        filenames: paths of JSON files; each one must contain the
            "symbols", "bigrams" and "trigrams" tables.
        filecount: divisor applied to every value before it is accumulated.

    Returns:
        A dict with an empty "corpus" entry and the three accumulated
        tables, each passed through ``_sort_by_frequency`` (bigrams with 4
        as second argument).

    Raises:
        OSError: a file cannot be opened.
        KeyError: a file lacks one of the three tables.
    """
    sections = ("symbols", "bigrams", "trigrams")
    merged = {section: {} for section in sections}

    for filename in filenames:
        # JSON text is UTF-8; do not depend on the platform locale encoding.
        with open(filename, "r", encoding="utf-8") as corpus:
            data = json.load(corpus)
        for section in sections:
            table = merged[section]
            for key, count in data[section].items():
                table[key] = table.get(key, 0.0) + count / filecount

    return {
        "corpus": "",
        "symbols": _sort_by_frequency(merged["symbols"]),
        "bigrams": _sort_by_frequency(merged["bigrams"], 4),
        "trigrams": _sort_by_frequency(merged["trigrams"]),
    }


def concat(filenames, output_name: str) -> dict:
def merge(filenames, output_name: str, mergetype: str, arg: dict = {}) -> dict:
"""merge liste of JSON files output from `chardict` to get a sigle corpus file
the merge is on the number of character in each file"""

def _concat_dicts(corpus1: dict, corpus2: dict) -> dict:
def _merge_dicts(corpus1: dict, corpus2: dict, mergetype: str, arg: dict) -> dict:
"""concat two corpus dics into one based on number of characters"""
merge_dict = {
"name": "",
"freq": {},
"count": {},
}

# check corpus have the same number of n-grams for merge
if len(corpus1["freq"].keys()) != len(corpus2["freq"].keys()):
print(
f"Error: could not merge {corpus1["name"]} and {corpus2["name"]} :n-grams length is different ({ngram_length}-gram vs. {len(corpus["freq"].keys()+1)}-gram) ; skipping merge"
)
return {}
ngram_length = range(1, len(corpus1["freq"].keys()) + 1)
print(ngram_length)
return corpus1

ngram_length = range(1, len(corpus1["freq"].keys()) + 1)
for n in ngram_length:
n = str(n)

Expand All @@ -86,32 +62,69 @@ def _concat_dicts(corpus1: dict, corpus2: dict) -> dict:
for ngram in missing_ngrams:
corpus2["freq"][n][ngram] = 0

for ngram in all_ngrams:
merge_dict["freq"][n][ngram] = (
corpus1["freq"][n][ngram] * corpus1["count"][n]
+ corpus2["freq"][n][ngram] * corpus2["count"][n]
) / merge_dict["count"][n]
# There are several ways to merge corpuses
match mergetype:
case "concat":
for ngram in all_ngrams:
merge_dict["freq"][n][ngram] = (
corpus1["freq"][n][ngram] * corpus1["count"][n]
+ corpus2["freq"][n][ngram] * corpus2["count"][n]
) / merge_dict["count"][n]

case "mix":
c1_ratio = arg["ratio"]
c2_ratio = 1 - c1_ratio

for ngram in all_ngrams:
merge_dict["freq"][n][ngram] = (
corpus1["freq"][n][ngram] * c1_ratio
+ corpus2["freq"][n][ngram] * c2_ratio
)
return merge_dict

def read_corpus(filename: Path) -> dict:
    """Load a corpus JSON file, falling back to an empty dict on failure.

    Args:
        filename: path of the JSON corpus file to load.

    Returns:
        The decoded JSON content, or ``{}`` when the file cannot be opened,
        decoded as UTF-8 or parsed as JSON. A warning is printed in that
        case so the caller can skip the file.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform locale encoding.
        with open(filename, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(
            f"Warning: cannot open corpus called {filename.stem} ; skipping this file"
        )
        return {}

merged_corpus = _concat_dicts(read_corpus(filenames[0]), read_corpus(filenames[1]))
if mergetype == "mix":
if arg == {}:
arg["ratio"] = 0.5
else:
sum_ratio = 0
for file in arg["ratio"]:
sum_ratio += arg["ratio"][file]
if round(sum_ratio) != 1:
print("Error: Provided merge ratio do not add-up to 1 ; aborting merge")
return {}

merged_corpus = _merge_dicts(
read_corpus(filenames[0]), read_corpus(filenames[1]), mergetype, arg
)

if len(filenames) == 2:
merged_corpus["name"] = output_name
return sort_by_frequency(merged_corpus)

file_count = 2
for filename in filenames[2:]:
corpus = read_corpus(filename)
merged_corpus = _concat_dicts(merged_corpus, corpus)
if corpus == {}:
continue # skipping corpuses we cannot read
merged_corpus = _merge_dicts(
merged_corpus,
corpus,
mergetype,
{"ratio": file_count / (file_count + 1)},
# todo : support custom mix (eg 10% email, 40% book, 50% chat)
)
file_count += 1

merged_corpus["name"] = output_name
return sort_by_frequency(merged_corpus)
Expand All @@ -122,7 +135,7 @@ def read_corpus(filename: Path) -> dict:
if argl >= 2:
dir = Path(__file__).resolve().parent.parent
files = [Path(f) for f in argv[1:]]
corpus = concat(files, "output")
corpus = merge(files, "output", "mix")
with open("output.json", "w", encoding="utf-8") as outfile:
json.dump(corpus, outfile, indent=4, ensure_ascii=False)
print(json.dumps(corpus, indent=4, ensure_ascii=False))
Expand Down

0 comments on commit 99a06eb

Please sign in to comment.