Feat: support a "mix" merge type, which averages frequencies by percentage (ratio) rather than by character count

OneDeadKey · Dec 23, 2024 · 99a06eb · 99a06eb
1 parent 207e983
commit 99a06eb
Showing 1 changed file with 60 additions and 47 deletions.
diff --git a/bin/merge.py b/bin/merge.py
@@ -7,15 +7,16 @@
 
 
 # sort the merged dictionary by symbol frequency (requires CPython 3.6+)
-def sort_by_frequency(corpus: dict, precision=3):
-    def _sort_ngram_by_frequency(table, precision):
-        sorted_dict = {}
-        for key, count in sorted(table.items(), key=lambda x: -x[1]):
-            freq = round(count, precision)
-            if freq > 0:
-                sorted_dict[key] = freq
-        return sorted_dict
+def _sort_ngram_by_frequency(table, precision=3):
+    sorted_dict = {}
+    for key, count in sorted(table.items(), key=lambda x: -x[1]):
+        freq = round(count, precision)
+        if freq > 0:
+            sorted_dict[key] = freq
+    return sorted_dict
+
 
+def sort_by_frequency(corpus: dict, precision=3):
     for ngram in range(1, len(corpus["freq"].keys())):
         ngram = str(ngram)
         corpus["freq"][ngram] = _sort_ngram_by_frequency(
@@ -24,51 +25,26 @@ def _sort_ngram_by_frequency(table, precision):
     return corpus
 
 
-def merge(filenames, filecount) -> dict:
-    merged = {
-        "symbols": {},
-        "bigrams": {},
-        "trigrams": {},
-    }
-
-    # merge dictionaries
-    for filename in filenames:
-        with open(filename, "r") as corpus:
-            data = json.load(corpus)
-            for section in merged.keys():
-                for key, count in data[section].items():
-                    if key not in merged[section]:
-                        merged[section][key] = 0.0
-                    merged[section][key] += count / filecount
-
-    results = {}
-    results["corpus"] = ""
-    results["symbols"] = _sort_by_frequency(merged["symbols"])
-    results["bigrams"] = _sort_by_frequency(merged["bigrams"], 4)
-    results["trigrams"] = _sort_by_frequency(merged["trigrams"])
-    return results
-
-
-def concat(filenames, output_name: str) -> dict:
+def merge(filenames, output_name: str, mergetype: str, arg: dict = {}) -> dict:
     """merge liste of JSON files output from `chardict` to get a sigle corpus file
     the merge is on the number of character in each file"""
 
-    def _concat_dicts(corpus1: dict, corpus2: dict) -> dict:
+    def _merge_dicts(corpus1: dict, corpus2: dict, mergetype: str, arg: dict) -> dict:
         """concat two corpus dics into one based on number of characters"""
         merge_dict = {
             "name": "",
             "freq": {},
             "count": {},
         }
 
+        # check corpus have the same number of n-grams for merge
         if len(corpus1["freq"].keys()) != len(corpus2["freq"].keys()):
             print(
                 f"Error: could not merge {corpus1["name"]} and {corpus2["name"]} :n-grams length is different ({ngram_length}-gram vs. {len(corpus["freq"].keys()+1)}-gram) ; skipping merge"
             )
-            return {}
-        ngram_length = range(1, len(corpus1["freq"].keys()) + 1)
-        print(ngram_length)
+            return corpus1
 
+        ngram_length = range(1, len(corpus1["freq"].keys()) + 1)
         for n in ngram_length:
             n = str(n)
 
@@ -86,32 +62,69 @@ def _concat_dicts(corpus1: dict, corpus2: dict) -> dict:
             for ngram in missing_ngrams:
                 corpus2["freq"][n][ngram] = 0
 
-            for ngram in all_ngrams:
-                merge_dict["freq"][n][ngram] = (
-                    corpus1["freq"][n][ngram] * corpus1["count"][n]
-                    + corpus2["freq"][n][ngram] * corpus2["count"][n]
-                ) / merge_dict["count"][n]
+            # There are several ways to merge corpuses
+            match mergetype:
+                case "concat":
+                    for ngram in all_ngrams:
+                        merge_dict["freq"][n][ngram] = (
+                            corpus1["freq"][n][ngram] * corpus1["count"][n]
+                            + corpus2["freq"][n][ngram] * corpus2["count"][n]
+                        ) / merge_dict["count"][n]
+
+                case "mix":
+                    c1_ratio = arg["ratio"]
+                    c2_ratio = 1 - c1_ratio
+
+                    for ngram in all_ngrams:
+                        merge_dict["freq"][n][ngram] = (
+                            corpus1["freq"][n][ngram] * c1_ratio
+                            + corpus2["freq"][n][ngram] * c2_ratio
+                        )
         return merge_dict
 
     def read_corpus(filename: Path) -> dict:
         try:
             with open(filename) as f:
                 corpus = json.load(f)
+                return corpus
         except:
             print(
                 f"Warning: cannot open corpus called {filename.stem} ; skipping this file"
             )
-        return corpus
+            return {}
 
-    merged_corpus = _concat_dicts(read_corpus(filenames[0]), read_corpus(filenames[1]))
+    if mergetype == "mix":
+        if arg == {}:
+            arg["ratio"] = 0.5
+        else:
+            sum_ratio = 0
+            for file in arg["ratio"]:
+                sum_ratio += arg["ratio"][file]
+            if round(sum_ratio) != 1:
+                print("Error: Provided merge ratio do not add-up to 1 ; aborting merge")
+                return {}
+
+    merged_corpus = _merge_dicts(
+        read_corpus(filenames[0]), read_corpus(filenames[1]), mergetype, arg
+    )
 
     if len(filenames) == 2:
         merged_corpus["name"] = output_name
         return sort_by_frequency(merged_corpus)
 
+    file_count = 2
     for filename in filenames[2:]:
         corpus = read_corpus(filename)
-        merged_corpus = _concat_dicts(merged_corpus, corpus)
+        if corpus == {}:
+            continue  # skipping corpuses we cannot read
+        merged_corpus = _merge_dicts(
+            merged_corpus,
+            corpus,
+            mergetype,
+            {"ratio": file_count / (file_count + 1)},
+            # todo : support custom mix (eg 10% email, 40% book, 50% chat)
+        )
+        file_count += 1
 
     merged_corpus["name"] = output_name
     return sort_by_frequency(merged_corpus)
@@ -122,7 +135,7 @@ def read_corpus(filename: Path) -> dict:
     if argl >= 2:
         dir = Path(__file__).resolve().parent.parent
         files = [Path(f) for f in argv[1:]]
-        corpus = concat(files, "output")
+        corpus = merge(files, "output", "mix")
         with open("output.json", "w", encoding="utf-8") as outfile:
             json.dump(corpus, outfile, indent=4, ensure_ascii=False)
         print(json.dumps(corpus, indent=4, ensure_ascii=False))