Skip to content

Commit 60e3c1e

Browse files
committed
Computing metrics for glottochronology and morphology
1 parent 26af455 commit 60e3c1e

File tree

1 file changed

+42
-0
lines changed

1 file changed

+42
-0
lines changed

lingvodoc/schema/gql_cognate.py

+42
Original file line numberDiff line numberDiff line change
@@ -4371,6 +4371,11 @@ class Arguments:
43714371
embedding_3d = graphene.List(graphene.List(graphene.Float))
43724372
perspective_name_list = graphene.List(graphene.String)
43734373

4374+
dictionary_count = graphene.Int()
4375+
group_count = graphene.Int()
4376+
not_enough_count = graphene.Int()
4377+
transcription_count = graphene.Int()
4378+
43744379
@staticmethod
43754380
def get_entry_text(entry):
43764381
return f"{entry['swadesh']} [ {entry['transcription']} ] {entry['translation']}"
@@ -4773,19 +4778,31 @@ def split_lex(lex):
47734778
# GC
47744779
del data_query
47754780

4781+
group_counter = [0] * len(group_list)
4782+
total_transcription_count = 0
4783+
47764784
# Checking if found entries have links
47774785
means = collections.OrderedDict()
47784786
for perspective_id, entries in entries_set.items():
47794787
means[perspective_id] = collections.defaultdict(set)
47804788
for group_index, group in enumerate(group_list):
4789+
47814790
# Select etymologically linked entries
47824791
linked = entries & group
4792+
# Count non-empty 'linked'
4793+
group_counter[group_index] += (len(linked) > 0)
4794+
47834795
for entry_id in linked:
47844796
result_pool[perspective_id][entry_id]['group'] = group_index
47854797
swadesh = result_pool[perspective_id][entry_id]['swadesh']
47864798
# Store the correspondence: perspective { meanings(1/2/3) { etymological_groups(1.1/1.2/2.1/3.1)
47874799
if not result_pool[perspective_id][entry_id]['borrowed']:
47884800
means[perspective_id][swadesh].add(group_index)
4801+
total_transcription_count += 1
4802+
4803+
not_enough_count = 0
4804+
for count in group_counter:
4805+
not_enough_count += (count < 2)
47894806

47904807
dictionary_count = len(means)
47914808
distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float')
@@ -4863,6 +4880,10 @@ def split_lex(lex):
48634880

48644881
result = html_result,
48654882
xlsx_url = xlsx_url,
4883+
dictionary_count = len(perspective_info_list),
4884+
group_count = len(group_list),
4885+
not_enough_count = not_enough_count,
4886+
transcription_count = total_transcription_count,
48664887
minimum_spanning_tree = mst_list,
48674888
embedding_2d = embedding_2d_pca,
48684889
embedding_3d = embedding_3d_pca,
@@ -5024,6 +5045,11 @@ class Arguments:
50245045
embedding_3d = graphene.List(graphene.List(graphene.Float))
50255046
perspective_name_list = graphene.List(graphene.String)
50265047

5048+
dictionary_count = graphene.Int()
5049+
group_count = graphene.Int()
5050+
not_enough_count = graphene.Int()
5051+
transcription_count = graphene.Int()
5052+
50275053
@staticmethod
50285054
def get_entry_text(entry):
50295055
return f"{'; '.join(entry['affix'])} ( {'; '.join(entry['meaning'])} )"
@@ -5207,17 +5233,29 @@ def morph_cognate_statistics(
52075233
# GC
52085234
del data_query
52095235

5236+
group_counter = [0] * len(group_list)
5237+
total_transcription_count = 0
5238+
52105239
# Checking if found entries have links
52115240
for perspective_id, entries in result_pool.items():
52125241
for group_index, group in enumerate(group_list):
5242+
52135243
# Select etymologically linked entries
52145244
linked = entries.keys() & group
5245+
# Count non-empty 'linked'
5246+
group_counter[group_index] += (len(linked) > 0)
5247+
52155248
for entry_id in linked:
52165249
result_pool[perspective_id][entry_id]['group'] = group_index
52175250
meaning = result_pool[perspective_id][entry_id]['meaning']
5251+
total_transcription_count += 1
52185252
for sub_meaning in meaning:
52195253
meaning_to_links[perspective_id][sub_meaning].add(group_index)
52205254

5255+
not_enough_count = 0
5256+
for count in group_counter:
5257+
not_enough_count += (count < 2)
5258+
52215259
dictionary_count = len(result_pool)
52225260
distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float')
52235261
complex_data_array = numpy.full((dictionary_count, dictionary_count), "n/a", dtype='object')
@@ -5302,6 +5340,10 @@ def morph_cognate_statistics(
53025340

53035341
result = html_result,
53045342
xlsx_url = xlsx_url,
5343+
dictionary_count=len(perspective_info_list),
5344+
group_count=len(group_list),
5345+
not_enough_count = not_enough_count,
5346+
transcription_count=total_transcription_count,
53055347
minimum_spanning_tree = mst_list,
53065348
embedding_2d = embedding_2d_pca,
53075349
embedding_3d = embedding_3d_pca,

0 commit comments

Comments (0)