From c3843bfb3395ef51a73a4339b495a2584e249d73 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Sat, 28 Sep 2024 00:42:24 +0300 Subject: [PATCH 01/17] json result --- lingvodoc/schema/gql_cognate.py | 64 +++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index d115e20a..1afa5374 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -8,6 +8,7 @@ import hashlib import io import itertools +import json import logging import math import os.path @@ -643,6 +644,7 @@ class Arguments: result = graphene.String() xlsx_url = graphene.String() + json_url = graphene.String() distance_list = graphene.Field(ObjectVal) figure_url = graphene.String() @@ -4382,6 +4384,7 @@ class Arguments: result = graphene.String() xlsx_url = graphene.String() + json_url = graphene.String() minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) embedding_2d = graphene.List(graphene.List(graphene.Float)) embedding_3d = graphene.List(graphene.List(graphene.Float)) @@ -4462,16 +4465,17 @@ def export_dataframe(result_pool, distance_data_array, bundles, get_entry_text): } @staticmethod - def export_xlsx( + def export_xlsx_json( result, + distance_dict, base_language_name, storage ): # Exporting analysis results as an Excel file. current_datetime = datetime.datetime.now(datetime.timezone.utc) - xlsx_filename = pathvalidate.sanitize_filename( - '{0} {1} {2:04d}.{3:02d}.{4:02d}.xlsx'.format( + filename = pathvalidate.sanitize_filename( + '{0} {1} {2:04d}.{3:02d}.{4:02d}'.format( base_language_name[:64], 'glottochronology', current_datetime.year, @@ -4479,12 +4483,19 @@ def export_xlsx( current_datetime.day)) cur_time = time.time() + + url_path = ''.join([ + storage['prefix'], storage['static_route'], + 'glottochronology', '/', str(cur_time)]) + storage_dir = os.path.join(storage['path'], 'glottochronology', str(cur_time)) - # Storing Excel file with the results. + xlsx_path = os.path.join(storage_dir, f'{filename}.xlsx') + json_path = os.path.join(storage_dir, f'{filename}') - xlsx_path = os.path.join(storage_dir, xlsx_filename) - os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) + os.makedirs(storage_dir, exist_ok=True) + + # Storing Excel and Json file with the results. 
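For orientation, a minimal sketch (perspective ids, names and numbers are invented) of the JSON payload this new export writes next to the Excel file, mirroring the distance_dict assembled further down in this patch: '__perspectives__' holds (perspective_id, name) pairs, and each 'client_id,object_id' key holds the (similarity, distance) tuples towards the other perspectives.

    import json

    # hypothetical payload in the shape produced by this patch
    distance_dict = {
        '__perspectives__': [
            [[123, 4], 'Dictionary A'],
            [[567, 8], 'Dictionary B']],
        '123,4': [[0.73, 2.25]],   # (similarity c, distance) towards the other perspective
        '567,8': [[0.73, 2.25]],
    }

    with open('example.json', 'w') as json_file:
        json.dump(distance_dict, json_file)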
with pd.ExcelWriter(xlsx_path, engine='xlsxwriter') as writer: header_format = writer.book.add_format({'bold': True, @@ -4520,11 +4531,10 @@ def export_xlsx( if coeff > 1: worksheet.set_row(row_num + 1, 14 * coeff) - xlsx_url = ''.join([ - storage['prefix'], storage['static_route'], - 'glottochronology', '/', str(cur_time), '/', xlsx_filename]) + with open(json_path, 'w') as json_file: + json.dump(distance_dict, json_file) - return xlsx_url + return f'{url_path}/{filename}.xlsx', f'{url_path}/{filename}' @staticmethod def export_html(result, tiny_dicts=None, huge_size=1048576): @@ -4829,11 +4839,16 @@ def split_lex(lex): # So length of this intersection is the similarity of corresponding perspectives # means_total is amount of Swadesh's lexemes met in the both perspectives bundles = set() + distance_dict = {'__perspectives__': []} # Calculate each-to-each distances, exclude self-to-self for n1, (perspective1, means1) in enumerate(means.items()): + pers_data = result_pool[perspective1] + distance_dict['__perspectives__'].append((perspective1, pers_data['name'])) + id_key = f'{perspective1[0]},{perspective1[1]}' + distance_dict[id_key] = [] # Numerate dictionaries - result_pool[perspective1]['name'] = f"{n1 + 1}. {result_pool[perspective1]['name']}" - distance_header_array[n1] = result_pool[perspective1]['name'] + pers_data['name'] = f"{n1 + 1}. {pers_data['name']}" + distance_header_array[n1] = pers_data['name'] for n2, (perspective2, means2) in enumerate(means.items()): if n1 == n2: distance_data_array[n1][n2] = 0 @@ -4863,13 +4878,14 @@ def split_lex(lex): percent = means_linked * 100 // means_total if means_total > 0 else 0 distance_data_array[n1][n2] = round(distance, 2) complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)" - + distance_dict[id_key].append((c, distance)) result = SwadeshAnalysis.export_dataframe(result_pool, complex_data_array, bundles, SwadeshAnalysis.get_entry_text) # GC del result_pool - xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) + (xlsx_url, json_url) = SwadeshAnalysis.export_xlsx_json( + result, distance_dict, base_language_name, storage) # 'lines' field is not needed any more del result['Cognates']['lines'] @@ -4896,6 +4912,7 @@ def split_lex(lex): result = html_result, xlsx_url = xlsx_url, + json_url = json_url, dictionary_count = len(perspective_info_list), group_count = len(group_list), not_enough_count = not_enough_count, @@ -5056,6 +5073,7 @@ class Arguments: result = graphene.String() xlsx_url = graphene.String() + json_url = graphene.String() minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) embedding_2d = graphene.List(graphene.List(graphene.Float)) embedding_3d = graphene.List(graphene.List(graphene.Float)) @@ -5278,11 +5296,16 @@ def morph_cognate_statistics( distance_header_array = numpy.full(dictionary_count, "", dtype='object') bundles = set() + distance_dict = {'__perspectives__': []} # Calculate each-to-each distances, exclude self-to-self for n1, (perspective1, meaning_to_links1) in enumerate(meaning_to_links.items()): + pers_data = result_pool[perspective1] + distance_dict['__perspectives__'].append((perspective1, pers_data['name'])) + id_key = f'{perspective1[0]},{perspective1[1]}' + distance_dict[id_key] = [] # Numerate dictionaries - result_pool[perspective1]['name'] = f"{n1 + 1}. {result_pool[perspective1]['name']}" - distance_header_array[n1] = result_pool[perspective1]['name'] + pers_data['name'] = f"{n1 + 1}. 
{pers_data['name']}" + distance_header_array[n1] = pers_data['name'] to_canon_meaning1 = to_canon_meaning[perspective1] canon_meanings1_set = set(to_canon_meaning1.values()) @@ -5319,17 +5342,21 @@ def morph_cognate_statistics( ''' # meanings_linked > 0 meanings that meanings_total > 0 even more so - distance = math.log(meanings_linked / meanings_total) / -0.14 if meanings_linked > 0 else 50 + c = meanings_linked / meanings_total if meanings_total > 0 else 0 + distance = math.log(c) / -0.14 if c > 0 else 50 + #distance = math.sqrt(math.log(c) / -0.1 / math.sqrt(c)) if c > 0 else 25 percent = meanings_linked * 100 // meanings_total if meanings_total > 0 else 0 distance_data_array[n1][n2] = round(distance, 2) complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)" + distance_dict[id_key].append((c, distance)) result = SwadeshAnalysis.export_dataframe(result_pool, complex_data_array, bundles, MorphCognateAnalysis.get_entry_text) # GC del result_pool - xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) + (xlsx_url, json_url) = SwadeshAnalysis.export_xlsx_json( + result, distance_dict, base_language_name, storage) # 'lines' field is not needed any more del result['Cognates']['lines'] @@ -5356,6 +5383,7 @@ def morph_cognate_statistics( result = html_result, xlsx_url = xlsx_url, + json_url = json_url, dictionary_count=len(perspective_info_list), group_count=len(group_list), not_enough_count = not_enough_count, From c876eda25591bfe9a17ca69ac75ef5ce69be4356 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Sat, 28 Sep 2024 14:56:39 +0300 Subject: [PATCH 02/17] store cognate analysis --- lingvodoc/schema/gql_cognate.py | 47 ++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index 1afa5374..5243cf91 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -3263,6 +3263,7 @@ def perform_cognate_analysis( translation_count = total_translation_count, result = '', xlsx_url = '', + json_url = '', distance_list = [], figure_url = '', intermediate_url_list = None) @@ -3851,8 +3852,8 @@ def perform_cognate_analysis( current_datetime = datetime.datetime.now(datetime.timezone.utc) - xlsx_filename = pathvalidate.sanitize_filename( - '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.xlsx'.format( + filename = pathvalidate.sanitize_filename( + '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}'.format( base_language_name[:64], ' ' + mode if mode else '', current_datetime.year, @@ -3860,24 +3861,28 @@ def perform_cognate_analysis( current_datetime.day)) if storage_dir is None: - cur_time = time.time() storage_dir = os.path.join(storage['path'], 'cognate', str(cur_time)) + os.makedirs(storage_dir, exist_ok = True) + # Storing Excel file with the results. 
- xlsx_path = os.path.join(storage_dir, xlsx_filename) - os.makedirs(os.path.dirname(xlsx_path), exist_ok = True) + xlsx_path = os.path.join(storage_dir, f'{filename}.xlsx') + json_path = os.path.join(storage_dir, f'{filename}') + + url_path = ''.join([ + storage['prefix'], storage['static_route'], + 'cognate', '/', str(cur_time)]) + + xlsx_url = f'{url_path}/{filename}.xlsx' + json_url = f'{url_path}/{filename}' workbook_stream.seek(0) with open(xlsx_path, 'wb') as xlsx_file: shutil.copyfileobj(workbook_stream, xlsx_file) - xlsx_url = ''.join([ - storage['prefix'], storage['static_route'], - 'cognate', '/', str(cur_time), '/', xlsx_filename]) - # Selecting one of the distance matrices, if we have any. distance_header_array = None @@ -3901,6 +3906,27 @@ def perform_cognate_analysis( distance_header_array, distance_data_array) = distance_matrix + # Compute distance_dict to store it into json + + max_diff = 500 + distance_dict = {'__perspectives__': []} + + for p_id, p_name, p_diffs in zip( + perspective_id_list, distance_header_list, distance_data_list): + + distance_dict['__perspectives__'].append((p_id, p_name)) + + p_id_key = f'{p_id[0]},{p_id[1]}' + distance_dict[p_id_key] = [] + + for diff in p_diffs: + + relation = round(1 - int(diff) / max_diff, 2) + distance_dict[p_id_key].append((relation, None)) + + with open(json_path, 'w') as json_file: + json.dump(distance_dict, json_file) + # Generating list of etymological distances to the reference perspective, if required. distance_list = None @@ -3976,7 +4002,7 @@ def perform_cognate_analysis( if task_status is not None: result_link_list = ( - [xlsx_url] + + [xlsx_url, json_url] + ([] if figure_url is None else [figure_url]) + (intermediate_url_list if __intermediate_flag__ else [])) @@ -3997,6 +4023,7 @@ def perform_cognate_analysis( result = wrapped_output, xlsx_url = xlsx_url, + json_url = json_url, distance_list = distance_list, figure_url = figure_url, From 1e89c1bb916b317bd46ebed13294d6521ed649b4 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Sun, 29 Sep 2024 17:29:52 +0300 Subject: [PATCH 03/17] getting language id --- lingvodoc/schema/gql_cognate.py | 57 +++++++++++++++++---------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index 5243cf91..4ddb6aea 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -828,7 +828,7 @@ def tag_data_aggregated( # All tags for tagged lexical entries in specified perspectives. 
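A toy recap (difference counts invented) of how the relation values stored by PATCH 02 are derived from the raw per-perspective differences of the cognate analysis:

    max_diff = 500
    p_diffs = [0, 120, 480]  # invented etymological difference counts

    relations = [round(1 - int(diff) / max_diff, 2) for diff in p_diffs]
    # -> [1.0, 0.76, 0.04]; the second element of each stored tuple stays None here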
- for perspective_id, transcription_field_id, translation_field_id, _ in perspective_info_list: + for _, perspective_id, transcription_field_id, translation_field_id, _ in perspective_info_list: tag_query = ( @@ -1138,7 +1138,7 @@ def tag_data_plpgsql( perspective_id_list = [ perspective_id - for perspective_id, _, _, _ in perspective_info_list] + for _, perspective_id, _, _, _ in perspective_info_list] if not perspective_id_list: @@ -2676,7 +2676,7 @@ def perform_cognate_analysis( hashlib.md5( repr(list(group_field_id) + - [perspective_info[0] for perspective_info in perspective_info_list]) + [perspective_info[1] for perspective_info in perspective_info_list]) .encode('utf-8')) @@ -2780,7 +2780,7 @@ def perform_cognate_analysis( sg_both_count = 0 source_perspective_index = None - for index, (perspective_id, transcription_field_id, translation_field_id, _) in \ + for index, (_, perspective_id, transcription_field_id, translation_field_id, _) in \ enumerate(perspective_info_list): if perspective_id == source_perspective_id: @@ -3097,7 +3097,7 @@ def perform_cognate_analysis( perspective_id_list = [] perspective_name_list = [] - for perspective_id, transcription_field_id, translation_field_id, _ in perspective_info_list: + for _, perspective_id, transcription_field_id, translation_field_id, _ in perspective_info_list: perspective_id_list.append(perspective_id) perspective_data = perspective_dict[perspective_id] @@ -4109,7 +4109,7 @@ def mutate( set( client_id - for (client_id, _), _, _, _ in perspective_info_list)) + for _, (client_id, _), _, _, _ in perspective_info_list)) author_id_check = ( @@ -4264,12 +4264,13 @@ def mutate( perspective_info_list = [ - (tuple(perspective_id), - tuple(transcription_field_id), - tuple(translation_field_id), - None) + (None, + tuple(perspective_id), + tuple(transcription_field_id), + tuple(translation_field_id), + None) - for perspective_id, + for _, perspective_id, transcription_field_id, translation_field_id, _ in perspective_info_list] @@ -4651,7 +4652,7 @@ def split_lex(lex): hashlib.md5( repr(list(group_field_id) + - [perspective_info[0] for perspective_info in perspective_info_list]) + [perspective_info[1] for perspective_info in perspective_info_list]) .encode('utf-8')) @@ -4687,7 +4688,7 @@ def split_lex(lex): swadesh_total = {} result_pool = {} tiny_dicts = set() - for index, (perspective_id, transcription_field_id, translation_field_id, lexeme_field_id) in \ + for index, (_, perspective_id, transcription_field_id, translation_field_id, lexeme_field_id) in \ enumerate(perspective_info_list): # Getting and saving perspective info. 
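A small sketch (placeholder ids) of the perspective_info_list reshaping performed in this patch: every info tuple gains a leading slot, kept as None for now and filled with the language id in the follow-up commits.

    # before: (perspective_id, transcription_field_id, translation_field_id, lexeme_field_id)
    old_info = ((123, 4), (66, 8), (66, 10), None)

    # after: a leading slot reserved for the language id
    new_info = (None,) + old_info

    _, perspective_id, transcription_field_id, translation_field_id, _ = new_info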
@@ -4990,7 +4991,7 @@ def mutate( set( client_id - for (client_id, _), _, _, _ in perspective_info_list)) + for _, (client_id, _), _, _, _ in perspective_info_list)) author_id_check = ( @@ -5049,12 +5050,13 @@ def mutate( perspective_info_list = [ - (tuple(perspective_id), - tuple(transcription_field_id), - tuple(translation_field_id), - tuple(lexeme_field_id)) + (None, + tuple(perspective_id), + tuple(transcription_field_id), + tuple(translation_field_id), + tuple(lexeme_field_id)) - for perspective_id, + for _, perspective_id, transcription_field_id, translation_field_id, lexeme_field_id in perspective_info_list] @@ -5141,7 +5143,7 @@ def morph_cognate_statistics( tag_data_digest = ( hashlib.md5( repr(list(group_field_id) + - [perspective_info[0] for perspective_info in perspective_info_list]) + [perspective_info[1] for perspective_info in perspective_info_list]) .encode('utf-8')) .hexdigest()) @@ -5175,7 +5177,7 @@ def morph_cognate_statistics( meaning_re = re.compile('[.\dA-Z<>]+') meaning_with_comment_re = re.compile('[.\dA-Z<>]+ *\([.,:;\d\w ]+\)') - for index, (perspective_id, affix_field_id, meaning_field_id, _) in \ + for index, (_, perspective_id, affix_field_id, meaning_field_id, _) in \ enumerate(perspective_info_list): # Getting and saving perspective info. @@ -5461,7 +5463,7 @@ def mutate( set( client_id - for (client_id, _), _, _, _ in perspective_info_list)) + for _, (client_id, _), _, _, _ in perspective_info_list)) author_id_check = ( @@ -5520,12 +5522,13 @@ def mutate( perspective_info_list = [ - (tuple(perspective_id), - tuple(affix_field_id), - tuple(meaning_field_id), - None) + (None, + tuple(perspective_id), + tuple(affix_field_id), + tuple(meaning_field_id), + None) - for perspective_id, + for _, perspective_id, affix_field_id, meaning_field_id, _ in perspective_info_list] From 1126916e8d843c2d7041e80bf311ba2c5e71ef26 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Sun, 29 Sep 2024 20:51:31 +0300 Subject: [PATCH 04/17] getting language_id --- lingvodoc/schema/gql_cognate.py | 66 +++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index 4ddb6aea..892bb6aa 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -3094,11 +3094,13 @@ def perform_cognate_analysis( result_list = [[]] + language_id_list = [] perspective_id_list = [] perspective_name_list = [] - for _, perspective_id, transcription_field_id, translation_field_id, _ in perspective_info_list: + for language_id, perspective_id, transcription_field_id, translation_field_id, _ in perspective_info_list: + language_id_list.append(language_id) perspective_id_list.append(perspective_id) perspective_data = perspective_dict[perspective_id] @@ -3886,6 +3888,7 @@ def perform_cognate_analysis( # Selecting one of the distance matrices, if we have any. 
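Because of that reshaping, the tag-data cache digests above now take the perspective id from index 1 of each info tuple; a self-contained sketch with invented ids:

    import hashlib

    group_field_id = (66, 25)
    perspective_info_list = [
        (None, (123, 4), (66, 8), (66, 10), None),
        (None, (567, 8), (66, 8), (66, 10), None)]

    tag_data_digest = (
        hashlib.md5(
            repr(list(group_field_id) +
                [info[1] for info in perspective_info_list])
            .encode('utf-8'))
        .hexdigest())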
distance_header_array = None + distance_dict = {'__perspectives__': []} if distance_matrix_list is not None: @@ -3906,23 +3909,30 @@ def perform_cognate_analysis( distance_header_array, distance_data_array) = distance_matrix - # Compute distance_dict to store it into json + # Compute distance_dict to store it into json - max_diff = 500 - distance_dict = {'__perspectives__': []} + max_diff = 500 - for p_id, p_name, p_diffs in zip( - perspective_id_list, distance_header_list, distance_data_list): + for ( + l_id, + p_id, + p_name, + p_diffs + ) in zip( + language_id_list, + perspective_id_list, + distance_header_list, + distance_data_list): - distance_dict['__perspectives__'].append((p_id, p_name)) + distance_dict['__perspectives__'].append((l_id, p_id, p_name)) - p_id_key = f'{p_id[0]},{p_id[1]}' - distance_dict[p_id_key] = [] + p_id_key = f'{p_id[0]},{p_id[1]}' + distance_dict[p_id_key] = [] - for diff in p_diffs: + for diff in p_diffs: - relation = round(1 - int(diff) / max_diff, 2) - distance_dict[p_id_key].append((relation, None)) + relation = round(1 - int(diff) / max_diff, 2) + distance_dict[p_id_key].append((relation, None)) with open(json_path, 'w') as json_file: json.dump(distance_dict, json_file) @@ -4688,7 +4698,7 @@ def split_lex(lex): swadesh_total = {} result_pool = {} tiny_dicts = set() - for index, (_, perspective_id, transcription_field_id, translation_field_id, lexeme_field_id) in \ + for index, (language_id, perspective_id, transcription_field_id, translation_field_id, lexeme_field_id) in \ enumerate(perspective_info_list): # Getting and saving perspective info. @@ -4786,7 +4796,7 @@ def split_lex(lex): # Grouping translations by lexical entries. entries_set[perspective_id] = set() swadesh_total[perspective_id] = set() - result_pool[perspective_id] = {'name': dictionary_name} + result_pool[perspective_id] = {'name': dictionary_name, 'lang_id': language_id} for row_index, row in enumerate(data_query): entry_id = tuple(row[:2]) transcription_list, translation_list, lexeme_list = row[2:5] @@ -4869,15 +4879,15 @@ def split_lex(lex): bundles = set() distance_dict = {'__perspectives__': []} # Calculate each-to-each distances, exclude self-to-self - for n1, (perspective1, means1) in enumerate(means.items()): - pers_data = result_pool[perspective1] - distance_dict['__perspectives__'].append((perspective1, pers_data['name'])) - id_key = f'{perspective1[0]},{perspective1[1]}' + for n1, (pers1, means1) in enumerate(means.items()): + pers_data = result_pool[pers1] + distance_dict['__perspectives__'].append((pers_data['lang_id'], pers1, pers_data['name'])) + id_key = f'{pers1[0]},{pers1[1]}' distance_dict[id_key] = [] # Numerate dictionaries pers_data['name'] = f"{n1 + 1}. 
{pers_data['name']}" distance_header_array[n1] = pers_data['name'] - for n2, (perspective2, means2) in enumerate(means.items()): + for n2, (pers2, means2) in enumerate(means.items()): if n1 == n2: distance_data_array[n1][n2] = 0 complex_data_array[n1][n2] = "n/a" @@ -4894,7 +4904,7 @@ def split_lex(lex): bundles.update(links_common) means_linked += 1 - means_total = len(swadesh_total[perspective1] & swadesh_total[perspective2]) + means_total = len(swadesh_total[pers1] & swadesh_total[pers2]) if n2 > n1 and means_linked >= means_total: log.debug(f"{n1+1},{n2+1} : " @@ -5177,7 +5187,7 @@ def morph_cognate_statistics( meaning_re = re.compile('[.\dA-Z<>]+') meaning_with_comment_re = re.compile('[.\dA-Z<>]+ *\([.,:;\d\w ]+\)') - for index, (_, perspective_id, affix_field_id, meaning_field_id, _) in \ + for index, (language_id, perspective_id, affix_field_id, meaning_field_id, _) in \ enumerate(perspective_info_list): # Getting and saving perspective info. @@ -5255,7 +5265,7 @@ def morph_cognate_statistics( del meaning_query meaning_to_links[perspective_id] = {} - result_pool[perspective_id] = {'name': dictionary_name} + result_pool[perspective_id] = {'name': dictionary_name, 'lang_id': language_id} for row in data_query: entry_id = tuple(row[:2]) @@ -5327,19 +5337,19 @@ def morph_cognate_statistics( bundles = set() distance_dict = {'__perspectives__': []} # Calculate each-to-each distances, exclude self-to-self - for n1, (perspective1, meaning_to_links1) in enumerate(meaning_to_links.items()): - pers_data = result_pool[perspective1] - distance_dict['__perspectives__'].append((perspective1, pers_data['name'])) - id_key = f'{perspective1[0]},{perspective1[1]}' + for n1, (pers1, meaning_to_links1) in enumerate(meaning_to_links.items()): + pers_data = result_pool[pers1] + distance_dict['__perspectives__'].append((pers_data['lang_id'], pers1, pers_data['name'])) + id_key = f'{pers1[0]},{pers1[1]}' distance_dict[id_key] = [] # Numerate dictionaries pers_data['name'] = f"{n1 + 1}. 
{pers_data['name']}" distance_header_array[n1] = pers_data['name'] - to_canon_meaning1 = to_canon_meaning[perspective1] + to_canon_meaning1 = to_canon_meaning[pers1] canon_meanings1_set = set(to_canon_meaning1.values()) - for n2, (perspective2, meaning_to_links2) in enumerate(meaning_to_links.items()): + for n2, (pers2, meaning_to_links2) in enumerate(meaning_to_links.items()): if n1 == n2: distance_data_array[n1][n2] = 0 complex_data_array[n1][n2] = "n/a" From ab68cccb28de83d5fcf056ff535816e1bcd48fe6 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Sun, 29 Sep 2024 23:58:37 +0300 Subject: [PATCH 05/17] fixes --- lingvodoc/schema/gql_cognate.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index 892bb6aa..53f287ee 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -4274,13 +4274,14 @@ def mutate( perspective_info_list = [ - (None, + (tuple(language_id), tuple(perspective_id), tuple(transcription_field_id), tuple(translation_field_id), None) - for _, perspective_id, + for language_id, + perspective_id, transcription_field_id, translation_field_id, _ in perspective_info_list] @@ -4891,6 +4892,7 @@ def split_lex(lex): if n1 == n2: distance_data_array[n1][n2] = 0 complex_data_array[n1][n2] = "n/a" + distance_dict[id_key].append((1, 0)) else: # Common meanings of entries which have etymological links # but this links may be not mutual @@ -5060,13 +5062,14 @@ def mutate( perspective_info_list = [ - (None, + (tuple(language_id), tuple(perspective_id), tuple(transcription_field_id), tuple(translation_field_id), tuple(lexeme_field_id)) - for _, perspective_id, + for language_id, + perspective_id, transcription_field_id, translation_field_id, lexeme_field_id in perspective_info_list] @@ -5353,6 +5356,7 @@ def morph_cognate_statistics( if n1 == n2: distance_data_array[n1][n2] = 0 complex_data_array[n1][n2] = "n/a" + distance_dict[id_key].append((1, 0)) else: # Compile new meaning_to_links2 using canon_meanings instead of sub_meanings canon_meaning_to_links2 = collections.defaultdict(set) @@ -5532,13 +5536,14 @@ def mutate( perspective_info_list = [ - (None, + (tuple(language_id), tuple(perspective_id), tuple(affix_field_id), tuple(meaning_field_id), None) - for _, perspective_id, + for language_id, + perspective_id, affix_field_id, meaning_field_id, _ in perspective_info_list] From 6fd6fb3420377c75323196cd4eabb44859b66fd1 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Mon, 30 Sep 2024 23:20:05 +0300 Subject: [PATCH 06/17] next steps --- lingvodoc/schema/gql_cognate.py | 197 ++++++++++++++++++++++++++------ 1 file changed, 164 insertions(+), 33 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index 53f287ee..746cb2dc 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -3888,7 +3888,9 @@ def perform_cognate_analysis( # Selecting one of the distance matrices, if we have any. 
distance_header_array = None - distance_dict = {'__perspectives__': []} + n1 = n2 = len(perspective_info_list) + relation_data_array = numpy.full((n1, n2), 1, dtype='float') + distance_dict = {'__perspectives__': [], '__relation_array__': relation_data_array} if distance_matrix_list is not None: @@ -3913,26 +3915,22 @@ def perform_cognate_analysis( max_diff = 500 - for ( + for n1, ( l_id, p_id, - p_name, p_diffs - ) in zip( - language_id_list, - perspective_id_list, - distance_header_list, - distance_data_list): - - distance_dict['__perspectives__'].append((l_id, p_id, p_name)) + ) in enumerate( + zip( + language_id_list, + perspective_id_list, + distance_data_list)): - p_id_key = f'{p_id[0]},{p_id[1]}' - distance_dict[p_id_key] = [] + distance_dict['__perspectives__'].append((l_id, p_id)) - for diff in p_diffs: + for n2, diff in enumerate(p_diffs): relation = round(1 - int(diff) / max_diff, 2) - distance_dict[p_id_key].append((relation, None)) + distance_dict['__relation_array__'][n1][n2] = relation with open(json_path, 'w') as json_file: json.dump(distance_dict, json_file) @@ -4508,6 +4506,7 @@ def export_xlsx_json( result, distance_dict, base_language_name, + analysis_str, storage ): # Exporting analysis results as an Excel file. @@ -4516,7 +4515,7 @@ def export_xlsx_json( filename = pathvalidate.sanitize_filename( '{0} {1} {2:04d}.{3:02d}.{4:02d}'.format( base_language_name[:64], - 'glottochronology', + analysis_str, current_datetime.year, current_datetime.month, current_datetime.day)) @@ -4871,6 +4870,7 @@ def split_lex(lex): dictionary_count = len(means) distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float') + relation_data_array = numpy.full((dictionary_count, dictionary_count), 1, dtype='float') complex_data_array = numpy.full((dictionary_count, dictionary_count), "n/a", dtype='object') distance_header_array = numpy.full(dictionary_count, "", dtype='object') @@ -4878,13 +4878,11 @@ def split_lex(lex): # So length of this intersection is the similarity of corresponding perspectives # means_total is amount of Swadesh's lexemes met in the both perspectives bundles = set() - distance_dict = {'__perspectives__': []} + distance_dict = {'__perspectives__': [], '__relation_array__': relation_data_array} # Calculate each-to-each distances, exclude self-to-self for n1, (pers1, means1) in enumerate(means.items()): pers_data = result_pool[pers1] - distance_dict['__perspectives__'].append((pers_data['lang_id'], pers1, pers_data['name'])) - id_key = f'{pers1[0]},{pers1[1]}' - distance_dict[id_key] = [] + distance_dict['__perspectives__'].append((pers_data['lang_id'], pers1)) # Numerate dictionaries pers_data['name'] = f"{n1 + 1}. 
{pers_data['name']}" distance_header_array[n1] = pers_data['name'] @@ -4892,7 +4890,6 @@ def split_lex(lex): if n1 == n2: distance_data_array[n1][n2] = 0 complex_data_array[n1][n2] = "n/a" - distance_dict[id_key].append((1, 0)) else: # Common meanings of entries which have etymological links # but this links may be not mutual @@ -4918,14 +4915,16 @@ def split_lex(lex): percent = means_linked * 100 // means_total if means_total > 0 else 0 distance_data_array[n1][n2] = round(distance, 2) complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)" - distance_dict[id_key].append((c, distance)) - result = SwadeshAnalysis.export_dataframe(result_pool, complex_data_array, bundles, SwadeshAnalysis.get_entry_text) + distance_dict['__relation_array__'][n1][n2] = c + + result = SwadeshAnalysis.export_dataframe( + result_pool, complex_data_array, bundles, SwadeshAnalysis.get_entry_text) # GC del result_pool (xlsx_url, json_url) = SwadeshAnalysis.export_xlsx_json( - result, distance_dict, base_language_name, storage) + result, distance_dict, base_language_name, 'glottochronology', storage) # 'lines' field is not needed any more del result['Cognates']['lines'] @@ -4990,7 +4989,7 @@ def mutate( # Administrator / perspective author / editing permission check. error_str = ( 'Only administrator, perspective author and users with perspective editing permissions ' - 'can perform Swadesh analysis.') + 'can perform glottochronological analysis.') client_id = info.context.client_id @@ -5338,13 +5337,13 @@ def morph_cognate_statistics( distance_header_array = numpy.full(dictionary_count, "", dtype='object') bundles = set() - distance_dict = {'__perspectives__': []} + n1 = n2 = len(perspective_info_list) + relation_data_array = numpy.full((n1, n2), 1, dtype='float') + distance_dict = {'__perspectives__': [], '__relation_array__': relation_data_array} # Calculate each-to-each distances, exclude self-to-self for n1, (pers1, meaning_to_links1) in enumerate(meaning_to_links.items()): pers_data = result_pool[pers1] - distance_dict['__perspectives__'].append((pers_data['lang_id'], pers1, pers_data['name'])) - id_key = f'{pers1[0]},{pers1[1]}' - distance_dict[id_key] = [] + distance_dict['__perspectives__'].append((pers_data['lang_id'], pers1)) # Numerate dictionaries pers_data['name'] = f"{n1 + 1}. 
{pers_data['name']}" distance_header_array[n1] = pers_data['name'] @@ -5356,7 +5355,6 @@ def morph_cognate_statistics( if n1 == n2: distance_data_array[n1][n2] = 0 complex_data_array[n1][n2] = "n/a" - distance_dict[id_key].append((1, 0)) else: # Compile new meaning_to_links2 using canon_meanings instead of sub_meanings canon_meaning_to_links2 = collections.defaultdict(set) @@ -5391,15 +5389,16 @@ def morph_cognate_statistics( percent = meanings_linked * 100 // meanings_total if meanings_total > 0 else 0 distance_data_array[n1][n2] = round(distance, 2) complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)" - distance_dict[id_key].append((c, distance)) + distance_dict['__relation_array__'][n1][n2] = c - result = SwadeshAnalysis.export_dataframe(result_pool, complex_data_array, bundles, MorphCognateAnalysis.get_entry_text) + result = SwadeshAnalysis.export_dataframe( + result_pool, complex_data_array, bundles, MorphCognateAnalysis.get_entry_text) # GC del result_pool (xlsx_url, json_url) = SwadeshAnalysis.export_xlsx_json( - result, distance_dict, base_language_name, storage) + result, distance_dict, base_language_name, 'morphology', storage) # 'lines' field is not needed any more del result['Cognates']['lines'] @@ -5464,7 +5463,7 @@ def mutate( # Administrator / perspective author / editing permission check. error_str = ( 'Only administrator, perspective author and users with perspective editing permissions ' - 'can perform Swadesh analysis.') + 'can perform morphological analysis.') client_id = info.context.client_id @@ -5574,6 +5573,138 @@ def mutate( 'Exception:\n' + traceback_string) +class BalancedReport(graphene.Mutation): + class Arguments: + + base_language_id = LingvodocID(required = True) + result_pool = graphene.List(ObjectVal, required = True) + debug_flag = graphene.Boolean() + + triumph = graphene.Boolean() + + result = graphene.String() + minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) + embedding_2d = graphene.List(graphene.List(graphene.Float)) + embedding_3d = graphene.List(graphene.List(graphene.Float)) + language_name_list = graphene.List(graphene.String) + + @staticmethod + def export_html(distances, tree_path_list): + html_result = build_table(distances, 'orange_light', width="300px", index=True) + return html_result + + @staticmethod + def mutate( + self, + info, + base_language_id, + result_pool, + debug_flag = False): + + # Registered user check. + client_id = info.context.client_id + + if not client_id: + return ResponseError('Only registered users can get balanced report.') + + user = Client.get_user_by_client_id(client_id) + + # Debug mode check. 
+ if debug_flag and user.id != 1: + return ResponseError('Only administrator can use debug mode.') + + try: + + languages_frame = [] + + # Reduce array size if there are languages duplicates + for analysis in result_pool: + + pers_by_lang = collections.defaultdict(list) + languages_frame.append(pers_by_lang) + + perspectives = numpy.array(analysis.get('__perspectives__', [])) + relation_array = numpy.array(analysis.get('__relation_array__')) + p_num = len(perspectives) + nums_to_delete = [] + + for i, (l1_id, p1_id) in enumerate(perspectives): + pers_by_lang[tuple(l1_id)].append(tuple(p1_id)) + + for j in range((i + 1), p_num): + l2_id, _ = perspectives[j] + + if l2_id == l1_id: + for k in range(p_num): + # Get maximum values for found similar languages + # and assign to first found row and column (the matrix is triangular) + relation_array[i][k] = max(relation_array[[i, j], k]) + relation_array[k][i] = max(relation_array[k, [i, j]]) + + nums_to_delete.append(j) + + # Delete duplicates of languages from perspectives list and from data matrix + numpy.delete(perspectives, nums_to_delete) + numpy.delete(numpy.delete(relation_array, nums_to_delete, 0), nums_to_delete, 1) + + + + + + + + + locale_id = info.context.locale_id + + base_language = DBSession.query(dbLanguage).filter_by( + client_id = base_language_id[0], object_id = base_language_id[1]).first() + + base_language_name = base_language.get_translation(locale_id) + + request = info.context.request + storage = request.registry.settings['storage'] + + # Transforming client/object pair ids from lists to 2-tuples. + + source_perspective_id = tuple(source_perspective_id) + base_language_id = tuple(base_language_id) + group_field_id = tuple(group_field_id) + + perspective_info_list = [ + + (tuple(language_id), + tuple(perspective_id), + tuple(affix_field_id), + tuple(meaning_field_id), + None) + + for language_id, + perspective_id, + affix_field_id, + meaning_field_id, + _ in perspective_info_list] + + result_dict = dict( + triumph=True, + result=build_table(distances, 'orange_light', width="300px", index=True) + ) + + return BalancedReport(**result_dict) + + # Exception occured while we tried to perform swadesh analysis. + except Exception as exception: + + traceback_string = ''.join( + traceback.format_exception( + exception, exception, exception.__traceback__))[:-1] + + log.warning(f'balanced_report {base_language_id}: exception') + log.warning(traceback_string) + + return ResponseError( + message='Exception:\n' + traceback_string) + + class XlsxBulkDisconnect(graphene.Mutation): """ Parses uploaded XLSX file, disconnects highlighted cognates. 
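A toy run (values invented) of the duplicate-language reduction used in the new mutation above: rows and columns of perspectives that share a language are merged by element-wise maximum, then the duplicate index is dropped.

    import numpy

    relation = numpy.array([
        [1.0, 0.4, 0.7],
        [0.4, 1.0, 0.2],
        [0.7, 0.2, 1.0]])

    i, j = 0, 2  # suppose perspectives 0 and 2 belong to the same language
    for k in range(3):
        relation[i][k] = max(relation[[i, j], k])
        relation[k][i] = max(relation[k, [i, j]])

    relation = numpy.delete(numpy.delete(relation, [j], 0), [j], 1)
    # -> [[1.0, 0.4], [0.4, 1.0]]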
From df491bb57662baa5ebaa42ddf5a4b98971d4113e Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Tue, 1 Oct 2024 02:04:31 +0300 Subject: [PATCH 07/17] result computation --- lingvodoc/schema/gql_cognate.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index 746cb2dc..b8efeb49 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -5615,14 +5615,12 @@ def mutate( try: - languages_frame = [] + pers_by_lang = collections.defaultdict(list) + relation_result = {} # Reduce array size if there are languages duplicates for analysis in result_pool: - pers_by_lang = collections.defaultdict(list) - languages_frame.append(pers_by_lang) - perspectives = numpy.array(analysis.get('__perspectives__', [])) relation_array = numpy.array(analysis.get('__relation_array__')) p_num = len(perspectives) @@ -5644,10 +5642,28 @@ def mutate( nums_to_delete.append(j) # Delete duplicates of languages from perspectives list and from data matrix - numpy.delete(perspectives, nums_to_delete) - numpy.delete(numpy.delete(relation_array, nums_to_delete, 0), nums_to_delete, 1) - - + relation_array = ( + numpy.delete(numpy.delete(relation_array, nums_to_delete, 0), nums_to_delete, 1)) + + languages = perspectives[:, 0] + languages = numpy.delete(languages, nums_to_delete) + l_num = len(languages) + + # Collect languages pairs with their relation + relation_dict = {} + for i, l1_id in enumerate(languages): + for j in range((i + 1), l_num): + l2_id = languages[j] + relation_dict[(tuple(l1_id), tuple(l2_id))] = relation_array[i, j] + + union = set(relation_result) | set(relation_dict) + intersection = set(relation_result) & set(relation_dict) + + for pair in union: + if pair in intersection: + relation_result[pair] = (relation_result[pair] + relation_dict[pair]) / 2 + elif pair in relation_dict: + relation_result[pair] = relation_dict[pair] From db4700f99f3c447681b9fa5c3e91af0a3ce3ee70 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Tue, 1 Oct 2024 12:36:19 +0300 Subject: [PATCH 08/17] get balanced matrix --- lingvodoc/schema/gql_cognate.py | 129 ++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 57 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index b8efeb49..c99e3a92 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -5589,86 +5589,101 @@ class Arguments: language_name_list = graphene.List(graphene.String) @staticmethod - def export_html(distances, tree_path_list): - html_result = build_table(distances, 'orange_light', width="300px", index=True) - return html_result + def get_balanced_matrix(result_pool): - @staticmethod - def mutate( - self, - info, - base_language_id, - result_pool, - debug_flag = False): + pers_by_lang = collections.defaultdict(list) + relation_result = {} - # Registered user check. - client_id = info.context.client_id + # Reducing array size if there are languages duplicates + for analysis in result_pool: - if not client_id: - return ResponseError('Only registered users can get balanced report.') + perspectives = numpy.array(analysis.get('__perspectives__', [])) + relation_array = numpy.array(analysis.get('__relation_array__')) + p_num = len(perspectives) + nums_to_delete = [] - user = Client.get_user_by_client_id(client_id) + for i, (l1_id, p1_id) in enumerate(perspectives): + pers_by_lang[tuple(l1_id)].append(tuple(p1_id)) - # Debug mode check. 
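A toy illustration (language ids and values invented) of the merge rule introduced here and carried into get_balanced_matrix: a pair known to both the accumulated result and the current analysis is averaged, a pair known only to the current analysis is adopted as-is.

    relation_result = {((1, 2), (3, 4)): 0.8}
    relation_by_pair = {((1, 2), (3, 4)): 0.6, ((1, 2), (5, 6)): 0.3}

    union = set(relation_result) | set(relation_by_pair)
    intersection = set(relation_result) & set(relation_by_pair)

    for pair in union:
        if pair in intersection:
            relation_result[pair] = (relation_result[pair] + relation_by_pair[pair]) / 2
        elif pair in relation_by_pair:
            relation_result[pair] = relation_by_pair[pair]

    # relation_result == {((1, 2), (3, 4)): 0.7, ((1, 2), (5, 6)): 0.3}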
- if debug_flag and user.id != 1: - return ResponseError('Only administrator can use debug mode.') + for j in range((i + 1), p_num): + l2_id, _ = perspectives[j] - try: + if l2_id == l1_id: + for k in range(p_num): + # Get maximum values for found similar languages + # and assign to first found row and column (the matrix is triangular) + relation_array[i][k] = max(relation_array[[i, j], k]) + relation_array[k][i] = max(relation_array[k, [i, j]]) - pers_by_lang = collections.defaultdict(list) - relation_result = {} + nums_to_delete.append(j) - # Reduce array size if there are languages duplicates - for analysis in result_pool: + # Delete duplicates of languages from perspectives list and from data matrix - perspectives = numpy.array(analysis.get('__perspectives__', [])) - relation_array = numpy.array(analysis.get('__relation_array__')) - p_num = len(perspectives) - nums_to_delete = [] + relation_array = ( + numpy.delete(numpy.delete(relation_array, nums_to_delete, 0), nums_to_delete, 1)) - for i, (l1_id, p1_id) in enumerate(perspectives): - pers_by_lang[tuple(l1_id)].append(tuple(p1_id)) + languages = perspectives[:, 0] + languages = numpy.delete(languages, nums_to_delete) + l_num = len(languages) - for j in range((i + 1), p_num): - l2_id, _ = perspectives[j] + # Collecting languages pairs with their relation - if l2_id == l1_id: - for k in range(p_num): - # Get maximum values for found similar languages - # and assign to first found row and column (the matrix is triangular) - relation_array[i][k] = max(relation_array[[i, j], k]) - relation_array[k][i] = max(relation_array[k, [i, j]]) + relation_dict = {} + for i, l1_id in enumerate(languages): + for j in range((i + 1), l_num): + l2_id = languages[j] + relation_dict[(tuple(l1_id), tuple(l2_id))] = relation_array[i, j] - nums_to_delete.append(j) + # Getting balanced list - # Delete duplicates of languages from perspectives list and from data matrix - relation_array = ( - numpy.delete(numpy.delete(relation_array, nums_to_delete, 0), nums_to_delete, 1)) + union = set(relation_result) | set(relation_dict) + intersection = set(relation_result) & set(relation_dict) - languages = perspectives[:, 0] - languages = numpy.delete(languages, nums_to_delete) - l_num = len(languages) + for pair in union: + if pair in intersection: + relation_result[pair] = (relation_result[pair] + relation_dict[pair]) / 2 + elif pair in relation_dict: + relation_result[pair] = relation_dict[pair] - # Collect languages pairs with their relation - relation_dict = {} - for i, l1_id in enumerate(languages): - for j in range((i + 1), l_num): - l2_id = languages[j] - relation_dict[(tuple(l1_id), tuple(l2_id))] = relation_array[i, j] + # Getting result balanced matrix - union = set(relation_result) | set(relation_dict) - intersection = set(relation_result) & set(relation_dict) + language_result = [pair[0] for pair in relation_result] + l_num = len(language_result) + relation_result_matrix = numpy.full((l_num, l_num), 1, dtype='float') - for pair in union: - if pair in intersection: - relation_result[pair] = (relation_result[pair] + relation_dict[pair]) / 2 - elif pair in relation_dict: - relation_result[pair] = relation_dict[pair] + for (l1_id, l2_id), relation in relation_result.items(): + i = language_result.index(l1_id) + j = language_result.index(l2_id) + relation_result_matrix[i, j] = relation_result_matrix[j, i] = relation + return relation_result_matrix, pers_by_lang + @staticmethod + def export_html(distances, tree_path_list): + html_result = build_table(distances, 
'orange_light', width="300px", index=True) + return html_result + @staticmethod + def mutate( + self, + info, + base_language_id, + result_pool, + debug_flag = False): + # Registered user check + client_id = info.context.client_id + if not client_id: + return ResponseError('Only registered users can get balanced report.') + + user = Client.get_user_by_client_id(client_id) + + # Debug mode check + if debug_flag and user.id != 1: + return ResponseError('Only administrator can use debug mode.') + + try: locale_id = info.context.locale_id From 6500e2dbf08cdba7913b9fbfbcce0b7247f6cba1 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Tue, 1 Oct 2024 14:46:40 +0300 Subject: [PATCH 09/17] refactoring --- lingvodoc/schema/gql_cognate.py | 53 +++++++++++++++++---------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index c99e3a92..e5a67859 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -3890,7 +3890,7 @@ def perform_cognate_analysis( distance_header_array = None n1 = n2 = len(perspective_info_list) relation_data_array = numpy.full((n1, n2), 1, dtype='float') - distance_dict = {'__perspectives__': [], '__relation_array__': relation_data_array} + distance_dict = {'__perspectives__': [], '__relation_matrix__': relation_data_array} if distance_matrix_list is not None: @@ -3930,7 +3930,7 @@ def perform_cognate_analysis( for n2, diff in enumerate(p_diffs): relation = round(1 - int(diff) / max_diff, 2) - distance_dict['__relation_array__'][n1][n2] = relation + distance_dict['__relation_matrix__'][n1][n2] = relation with open(json_path, 'w') as json_file: json.dump(distance_dict, json_file) @@ -4878,7 +4878,7 @@ def split_lex(lex): # So length of this intersection is the similarity of corresponding perspectives # means_total is amount of Swadesh's lexemes met in the both perspectives bundles = set() - distance_dict = {'__perspectives__': [], '__relation_array__': relation_data_array} + distance_dict = {'__perspectives__': [], '__relation_matrix__': relation_data_array} # Calculate each-to-each distances, exclude self-to-self for n1, (pers1, means1) in enumerate(means.items()): pers_data = result_pool[pers1] @@ -4915,7 +4915,7 @@ def split_lex(lex): percent = means_linked * 100 // means_total if means_total > 0 else 0 distance_data_array[n1][n2] = round(distance, 2) complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)" - distance_dict['__relation_array__'][n1][n2] = c + distance_dict['__relation_matrix__'][n1][n2] = c result = SwadeshAnalysis.export_dataframe( result_pool, complex_data_array, bundles, SwadeshAnalysis.get_entry_text) @@ -5339,7 +5339,7 @@ def morph_cognate_statistics( bundles = set() n1 = n2 = len(perspective_info_list) relation_data_array = numpy.full((n1, n2), 1, dtype='float') - distance_dict = {'__perspectives__': [], '__relation_array__': relation_data_array} + distance_dict = {'__perspectives__': [], '__relation_matrix__': relation_data_array} # Calculate each-to-each distances, exclude self-to-self for n1, (pers1, meaning_to_links1) in enumerate(meaning_to_links.items()): pers_data = result_pool[pers1] @@ -5384,12 +5384,12 @@ def morph_cognate_statistics( # meanings_linked > 0 meanings that meanings_total > 0 even more so c = meanings_linked / meanings_total if meanings_total > 0 else 0 - distance = math.log(c) / -0.14 if c > 0 else 50 - #distance = math.sqrt(math.log(c) / -0.1 / math.sqrt(c)) if c > 0 else 25 + #distance = math.log(c) / 
-0.14 if c > 0 else 50 + distance = math.sqrt(math.log(c) / -0.1 / math.sqrt(c)) if c > 0 else 25 percent = meanings_linked * 100 // meanings_total if meanings_total > 0 else 0 distance_data_array[n1][n2] = round(distance, 2) complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)" - distance_dict['__relation_array__'][n1][n2] = c + distance_dict['__relation_matrix__'][n1][n2] = c result = SwadeshAnalysis.export_dataframe( result_pool, complex_data_array, bundles, MorphCognateAnalysis.get_entry_text) @@ -5591,19 +5591,19 @@ class Arguments: @staticmethod def get_balanced_matrix(result_pool): - pers_by_lang = collections.defaultdict(list) + pers_by_lang = collections.defaultdict(set) relation_result = {} # Reducing array size if there are languages duplicates for analysis in result_pool: perspectives = numpy.array(analysis.get('__perspectives__', [])) - relation_array = numpy.array(analysis.get('__relation_array__')) + relation_matrix = numpy.array(analysis.get('__relation_matrix__')) p_num = len(perspectives) nums_to_delete = [] for i, (l1_id, p1_id) in enumerate(perspectives): - pers_by_lang[tuple(l1_id)].append(tuple(p1_id)) + pers_by_lang[tuple(l1_id)].add(tuple(p1_id)) for j in range((i + 1), p_num): l2_id, _ = perspectives[j] @@ -5612,51 +5612,52 @@ def get_balanced_matrix(result_pool): for k in range(p_num): # Get maximum values for found similar languages # and assign to first found row and column (the matrix is triangular) - relation_array[i][k] = max(relation_array[[i, j], k]) - relation_array[k][i] = max(relation_array[k, [i, j]]) + relation_matrix[i][k] = max(relation_matrix[[i, j], k]) + relation_matrix[k][i] = max(relation_matrix[k, [i, j]]) nums_to_delete.append(j) # Delete duplicates of languages from perspectives list and from data matrix - relation_array = ( - numpy.delete(numpy.delete(relation_array, nums_to_delete, 0), nums_to_delete, 1)) + relation_matrix = ( + numpy.delete(numpy.delete(relation_matrix, nums_to_delete, 0), nums_to_delete, 1)) languages = perspectives[:, 0] languages = numpy.delete(languages, nums_to_delete) l_num = len(languages) - # Collecting languages pairs with their relation + # Collecting languages pairs with their relations - relation_dict = {} + relation_by_pair = {} for i, l1_id in enumerate(languages): for j in range((i + 1), l_num): l2_id = languages[j] - relation_dict[(tuple(l1_id), tuple(l2_id))] = relation_array[i, j] + relation_by_pair[(tuple(l1_id), tuple(l2_id))] = relation_matrix[i, j] # Getting balanced list - union = set(relation_result) | set(relation_dict) - intersection = set(relation_result) & set(relation_dict) + union = set(relation_result) | set(relation_by_pair) + intersection = set(relation_result) & set(relation_by_pair) for pair in union: if pair in intersection: - relation_result[pair] = (relation_result[pair] + relation_dict[pair]) / 2 - elif pair in relation_dict: - relation_result[pair] = relation_dict[pair] + relation_result[pair] = (relation_result[pair] + relation_by_pair[pair]) / 2 + elif pair in relation_by_pair: + relation_result[pair] = relation_by_pair[pair] # Getting result balanced matrix language_result = [pair[0] for pair in relation_result] l_num = len(language_result) - relation_result_matrix = numpy.full((l_num, l_num), 1, dtype='float') + distance_matrix = numpy.full((l_num, l_num), 1, dtype='float') for (l1_id, l2_id), relation in relation_result.items(): i = language_result.index(l1_id) j = language_result.index(l2_id) - relation_result_matrix[i, j] = relation_result_matrix[j, i] 
= relation + distance_matrix[i, j] = distance_matrix[j, i] = ( + math.sqrt(math.log(relation) / -0.1 / math.sqrt(relation)) if relation > 0 else 25) - return relation_result_matrix, pers_by_lang + return distance_matrix, pers_by_lang @staticmethod def export_html(distances, tree_path_list): From 543448828375bfb0b10c1705f2529937f1a471ee Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Tue, 1 Oct 2024 18:23:33 +0300 Subject: [PATCH 10/17] export html --- lingvodoc/schema/gql_cognate.py | 91 ++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 35 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index e5a67859..2c06d064 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -5647,22 +5647,17 @@ def get_balanced_matrix(result_pool): # Getting result balanced matrix - language_result = [pair[0] for pair in relation_result] - l_num = len(language_result) + language_list = [pair[0] for pair in relation_result] + l_num = len(language_list) distance_matrix = numpy.full((l_num, l_num), 1, dtype='float') for (l1_id, l2_id), relation in relation_result.items(): - i = language_result.index(l1_id) - j = language_result.index(l2_id) + i = language_list.index(l1_id) + j = language_list.index(l2_id) distance_matrix[i, j] = distance_matrix[j, i] = ( math.sqrt(math.log(relation) / -0.1 / math.sqrt(relation)) if relation > 0 else 25) - return distance_matrix, pers_by_lang - - @staticmethod - def export_html(distances, tree_path_list): - html_result = build_table(distances, 'orange_light', width="300px", index=True) - return html_result + return distance_matrix, language_list, pers_by_lang @staticmethod def mutate( @@ -5684,46 +5679,72 @@ def mutate( if debug_flag and user.id != 1: return ResponseError('Only administrator can use debug mode.') - try: + locale_id = info.context.locale_id - locale_id = info.context.locale_id + def get_language_str(language_id): + language_obj = DBSession.query(dbLanguage).filter_by( + client_id=language_id[0], object_id=language_id[1]).one() - base_language = DBSession.query(dbLanguage).filter_by( - client_id = base_language_id[0], object_id = base_language_id[1]).first() + return language_obj.get_translation(locale_id) - base_language_name = base_language.get_translation(locale_id) + def get_perspective_str(perspective_id): + perspective_obj = DBSession.query(dbPerspective).filter_by( + client_id=perspective_id[0], object_id=perspective_id[1]).one() - request = info.context.request - storage = request.registry.settings['storage'] + perspective_name = perspective_obj.get_translation(locale_id) + dictionary_name = perspective_obj.parent.get_translation(locale_id) - # Transforming client/object pair ids from lists to 2-tuples. 
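Continuing the toy data, the merged pairwise relations are laid out as a symmetric matrix of distances with the same sqrt-based formula as the per-analysis code; the language order is chosen arbitrarily here, and the diagonal keeps its initial fill value, as in the code above.

    import math
    import numpy

    language_list = [(1, 2), (3, 4), (5, 6)]
    relation_result = {((1, 2), (3, 4)): 0.7, ((1, 2), (5, 6)): 0.3, ((3, 4), (5, 6)): 0.5}

    distance_matrix = numpy.full((3, 3), 1, dtype='float')

    for (l1_id, l2_id), relation in relation_result.items():
        i = language_list.index(l1_id)
        j = language_list.index(l2_id)
        distance_matrix[i, j] = distance_matrix[j, i] = (
            math.sqrt(math.log(relation) / -0.1 / math.sqrt(relation)) if relation > 0 else 25)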
+ return f'{dictionary_name} - {perspective_name}' - source_perspective_id = tuple(source_perspective_id) - base_language_id = tuple(base_language_id) - group_field_id = tuple(group_field_id) + try: + distance_matrix, language_list, pers_by_lang = ( + BalancedReport.get_balanced_matrix(result_pool)) - perspective_info_list = [ + language_header = [get_language_str(lang_id) for lang_id in language_list] - (tuple(language_id), - tuple(perspective_id), - tuple(affix_field_id), - tuple(meaning_field_id), - None) + def export_html(): - for language_id, - perspective_id, - affix_field_id, - meaning_field_id, - _ in perspective_info_list] + distance_frame = pd.DataFrame(distance_matrix, columns=language_header) + html_result = build_table(distance_frame, 'yellow_dark', width="300px", index=True) + + for index1, lang, pers in enumerate(pers_by_lang.items()): + html_result += ( + f"

* The language {get_language_str(lang)} " + f"is represented by perspective(s): ") + for index2, per in enumerate(pers): + html_result += f" ** {index2 + 1}. {get_perspective_str(per)}
" + + return html_result + + language_str = f'language {base_language_id[0]}/{base_language_id[1]}' + base_language_name = get_language_str(base_language_id) + + _, mst_list, embedding_2d_pca, embedding_3d_pca = \ + CognateAnalysis.distance_graph( + language_str, + base_language_name, + distance_matrix, + language_header, + None, + None, + None, + analysis_str='balanced_report', + __debug_flag__=debug_flag, + __plot_flag__=False + ) result_dict = dict( - triumph=True, - result=build_table(distances, 'orange_light', width="300px", index=True) + triumph = True, + result = export_html(), + minimum_spanning_tree = mst_list, + embedding_2d = embedding_2d_pca, + embedding_3d = embedding_3d_pca, + language_name_list = language_header ) return BalancedReport(**result_dict) - # Exception occured while we tried to perform swadesh analysis. + # Exception occured while we tried to get balanced report except Exception as exception: traceback_string = ''.join( From 849c0a548fa25053ce2fb414db90411c32720010 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Tue, 1 Oct 2024 20:14:01 +0300 Subject: [PATCH 11/17] fixed json serialization --- lingvodoc/schema/gql_cognate.py | 42 ++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index 2c06d064..a23709d5 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -3890,7 +3890,7 @@ def perform_cognate_analysis( distance_header_array = None n1 = n2 = len(perspective_info_list) relation_data_array = numpy.full((n1, n2), 1, dtype='float') - distance_dict = {'__perspectives__': [], '__relation_matrix__': relation_data_array} + perspectives = [] if distance_matrix_list is not None: @@ -3925,12 +3925,14 @@ def perform_cognate_analysis( perspective_id_list, distance_data_list)): - distance_dict['__perspectives__'].append((l_id, p_id)) + perspectives.append((l_id, p_id)) for n2, diff in enumerate(p_diffs): relation = round(1 - int(diff) / max_diff, 2) - distance_dict['__relation_matrix__'][n1][n2] = relation + relation_data_array[n1, n2] = relation + + distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': list(relation_data_array)} with open(json_path, 'w') as json_file: json.dump(distance_dict, json_file) @@ -4873,23 +4875,23 @@ def split_lex(lex): relation_data_array = numpy.full((dictionary_count, dictionary_count), 1, dtype='float') complex_data_array = numpy.full((dictionary_count, dictionary_count), "n/a", dtype='object') distance_header_array = numpy.full(dictionary_count, "", dtype='object') + perspectives = [] # Calculate intersection between lists of linked meanings (Swadesh matching) # So length of this intersection is the similarity of corresponding perspectives # means_total is amount of Swadesh's lexemes met in the both perspectives bundles = set() - distance_dict = {'__perspectives__': [], '__relation_matrix__': relation_data_array} # Calculate each-to-each distances, exclude self-to-self for n1, (pers1, means1) in enumerate(means.items()): pers_data = result_pool[pers1] - distance_dict['__perspectives__'].append((pers_data['lang_id'], pers1)) + perspectives.append((pers_data['lang_id'], pers1)) # Numerate dictionaries pers_data['name'] = f"{n1 + 1}. 
{pers_data['name']}" distance_header_array[n1] = pers_data['name'] for n2, (pers2, means2) in enumerate(means.items()): if n1 == n2: - distance_data_array[n1][n2] = 0 - complex_data_array[n1][n2] = "n/a" + distance_data_array[n1, n2] = 0 + complex_data_array[n1, n2] = "n/a" else: # Common meanings of entries which have etymological links # but this links may be not mutual @@ -4913,9 +4915,9 @@ def split_lex(lex): c = means_linked / means_total if means_total > 0 else 0 distance = math.sqrt( math.log(c) / -0.1 / math.sqrt(c) ) if c > 0 else 25 percent = means_linked * 100 // means_total if means_total > 0 else 0 - distance_data_array[n1][n2] = round(distance, 2) - complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)" - distance_dict['__relation_matrix__'][n1][n2] = c + distance_data_array[n1, n2] = round(distance, 2) + complex_data_array[n1, n2] = f"{distance_data_array[n1, n2]:.2f} ({percent}%)" + relation_data_array[n1, n2] = c result = SwadeshAnalysis.export_dataframe( result_pool, complex_data_array, bundles, SwadeshAnalysis.get_entry_text) @@ -4923,6 +4925,8 @@ def split_lex(lex): # GC del result_pool + distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': list(relation_data_array)} + (xlsx_url, json_url) = SwadeshAnalysis.export_xlsx_json( result, distance_dict, base_language_name, 'glottochronology', storage) @@ -5339,11 +5343,11 @@ def morph_cognate_statistics( bundles = set() n1 = n2 = len(perspective_info_list) relation_data_array = numpy.full((n1, n2), 1, dtype='float') - distance_dict = {'__perspectives__': [], '__relation_matrix__': relation_data_array} + perspectives = [] # Calculate each-to-each distances, exclude self-to-self for n1, (pers1, meaning_to_links1) in enumerate(meaning_to_links.items()): pers_data = result_pool[pers1] - distance_dict['__perspectives__'].append((pers_data['lang_id'], pers1)) + perspectives.append((pers_data['lang_id'], pers1)) # Numerate dictionaries pers_data['name'] = f"{n1 + 1}. 
{pers_data['name']}" distance_header_array[n1] = pers_data['name'] @@ -5353,8 +5357,8 @@ def morph_cognate_statistics( for n2, (pers2, meaning_to_links2) in enumerate(meaning_to_links.items()): if n1 == n2: - distance_data_array[n1][n2] = 0 - complex_data_array[n1][n2] = "n/a" + distance_data_array[n1, n2] = 0 + complex_data_array[n1, n2] = "n/a" else: # Compile new meaning_to_links2 using canon_meanings instead of sub_meanings canon_meaning_to_links2 = collections.defaultdict(set) @@ -5387,9 +5391,11 @@ def morph_cognate_statistics( #distance = math.log(c) / -0.14 if c > 0 else 50 distance = math.sqrt(math.log(c) / -0.1 / math.sqrt(c)) if c > 0 else 25 percent = meanings_linked * 100 // meanings_total if meanings_total > 0 else 0 - distance_data_array[n1][n2] = round(distance, 2) - complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)" - distance_dict['__relation_matrix__'][n1][n2] = c + distance_data_array[n1, n2] = round(distance, 2) + complex_data_array[n1, n2] = f"{distance_data_array[n1, n2]:.2f} ({percent}%)" + relation_data_array[n1, n2] = c + + result = SwadeshAnalysis.export_dataframe( result_pool, complex_data_array, bundles, MorphCognateAnalysis.get_entry_text) @@ -5397,6 +5403,8 @@ def morph_cognate_statistics( # GC del result_pool + distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': list(relation_data_array)} + (xlsx_url, json_url) = SwadeshAnalysis.export_xlsx_json( result, distance_dict, base_language_name, 'morphology', storage) From 6522e9a81cde363247f257eb1eb95fcc97a531e0 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Wed, 2 Oct 2024 15:18:08 +0300 Subject: [PATCH 12/17] init frontend --- lingvodoc/schema/gql_cognate.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index a23709d5..bace9d4e 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -3932,7 +3932,7 @@ def perform_cognate_analysis( relation = round(1 - int(diff) / max_diff, 2) relation_data_array[n1, n2] = relation - distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': list(relation_data_array)} + distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': relation_data_array.tolist()} with open(json_path, 'w') as json_file: json.dump(distance_dict, json_file) @@ -4925,7 +4925,7 @@ def split_lex(lex): # GC del result_pool - distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': list(relation_data_array)} + distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': relation_data_array.tolist()} (xlsx_url, json_url) = SwadeshAnalysis.export_xlsx_json( result, distance_dict, base_language_name, 'glottochronology', storage) @@ -5396,14 +5396,13 @@ def morph_cognate_statistics( relation_data_array[n1, n2] = c - result = SwadeshAnalysis.export_dataframe( result_pool, complex_data_array, bundles, MorphCognateAnalysis.get_entry_text) # GC del result_pool - distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': list(relation_data_array)} + distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': relation_data_array.tolist()} (xlsx_url, json_url) = SwadeshAnalysis.export_xlsx_json( result, distance_dict, base_language_name, 'morphology', storage) @@ -5581,7 +5580,7 @@ def mutate( 'Exception:\n' + traceback_string) -class BalancedReport(graphene.Mutation): +class ComplexDistance(graphene.Mutation): class Arguments: 
base_language_id = LingvodocID(required = True) @@ -5597,7 +5596,7 @@ class Arguments: language_name_list = graphene.List(graphene.String) @staticmethod - def get_balanced_matrix(result_pool): + def get_complex_matrix(result_pool): pers_by_lang = collections.defaultdict(set) relation_result = {} @@ -5642,7 +5641,7 @@ def get_balanced_matrix(result_pool): l2_id = languages[j] relation_by_pair[(tuple(l1_id), tuple(l2_id))] = relation_matrix[i, j] - # Getting balanced list + # Getting complex list union = set(relation_result) | set(relation_by_pair) intersection = set(relation_result) & set(relation_by_pair) @@ -5653,7 +5652,7 @@ def get_balanced_matrix(result_pool): elif pair in relation_by_pair: relation_result[pair] = relation_by_pair[pair] - # Getting result balanced matrix + # Getting result complex matrix language_list = [pair[0] for pair in relation_result] l_num = len(language_list) @@ -5679,7 +5678,7 @@ def mutate( client_id = info.context.client_id if not client_id: - return ResponseError('Only registered users can get balanced report.') + return ResponseError('Only registered users can get complex report.') user = Client.get_user_by_client_id(client_id) @@ -5706,7 +5705,7 @@ def get_perspective_str(perspective_id): try: distance_matrix, language_list, pers_by_lang = ( - BalancedReport.get_balanced_matrix(result_pool)) + ComplexDistance.get_complex_matrix(result_pool)) language_header = [get_language_str(lang_id) for lang_id in language_list] @@ -5736,7 +5735,7 @@ def export_html(): None, None, None, - analysis_str='balanced_report', + analysis_str='complex_report', __debug_flag__=debug_flag, __plot_flag__=False ) @@ -5750,16 +5749,16 @@ def export_html(): language_name_list = language_header ) - return BalancedReport(**result_dict) + return ComplexDistance(**result_dict) - # Exception occured while we tried to get balanced report + # Exception occured while we tried to get complex report except Exception as exception: traceback_string = ''.join( traceback.format_exception( exception, exception, exception.__traceback__))[:-1] - log.warning(f'balanced_report {base_language_id}: exception') + log.warning(f'complex_report {base_language_id}: exception') log.warning(traceback_string) return ResponseError( From 5821741bd11b5093e875c4b0d18f28927d8bbd43 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Wed, 2 Oct 2024 22:53:34 +0300 Subject: [PATCH 13/17] enhanced frontend --- lingvodoc/schema/gql_cognate.py | 16 +++++++++++----- lingvodoc/schema/query.py | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index bace9d4e..9f6fcda2 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -3932,7 +3932,9 @@ def perform_cognate_analysis( relation = round(1 - int(diff) / max_diff, 2) relation_data_array[n1, n2] = relation - distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': relation_data_array.tolist()} + distance_dict = {'__base_language_id__': base_language_id, + '__perspectives__': perspectives, + '__relation_matrix__': relation_data_array.tolist()} with open(json_path, 'w') as json_file: json.dump(distance_dict, json_file) @@ -4925,7 +4927,9 @@ def split_lex(lex): # GC del result_pool - distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': relation_data_array.tolist()} + distance_dict = {'__base_language_id__': base_language_id, + '__perspectives__': perspectives, + '__relation_matrix__': relation_data_array.tolist()} (xlsx_url, json_url) 
= SwadeshAnalysis.export_xlsx_json( result, distance_dict, base_language_name, 'glottochronology', storage) @@ -5402,7 +5406,9 @@ def morph_cognate_statistics( # GC del result_pool - distance_dict = {'__perspectives__': perspectives, '__relation_matrix__': relation_data_array.tolist()} + distance_dict = {'__base_language_id__': base_language_id, + '__perspectives__': perspectives, + '__relation_matrix__': relation_data_array.tolist()} (xlsx_url, json_url) = SwadeshAnalysis.export_xlsx_json( result, distance_dict, base_language_name, 'morphology', storage) @@ -5583,13 +5589,13 @@ def mutate( class ComplexDistance(graphene.Mutation): class Arguments: - base_language_id = LingvodocID(required = True) result_pool = graphene.List(ObjectVal, required = True) debug_flag = graphene.Boolean() triumph = graphene.Boolean() result = graphene.String() + message = graphene.String() minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) embedding_2d = graphene.List(graphene.List(graphene.Float)) embedding_3d = graphene.List(graphene.List(graphene.Float)) @@ -5670,7 +5676,6 @@ def get_complex_matrix(result_pool): def mutate( self, info, - base_language_id, result_pool, debug_flag = False): @@ -5687,6 +5692,7 @@ def mutate( return ResponseError('Only administrator can use debug mode.') locale_id = info.context.locale_id + base_language_id = result_pool[0].get('__base_language_id__') def get_language_str(language_id): language_obj = DBSession.query(dbLanguage).filter_by( diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index a34804d0..325e9ef4 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -137,6 +137,7 @@ from lingvodoc.schema.gql_cognate import ( CognateAnalysis, + ComplexDistance, MorphCognateAnalysis, PhonemicAnalysis, SwadeshAnalysis, @@ -7737,6 +7738,7 @@ class MyMutations(graphene.ObjectType): set_valency_annotation = SetValencyAnnotation.Field() bidirectional_links = BidirectionalLinks.Field() cognates_summary = CognatesSummary.Field() + complex_distance = ComplexDistance.Field() schema = graphene.Schema( From 5740754967b4d06ed8937920b82a96eefaa5e364 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Thu, 3 Oct 2024 18:51:53 +0300 Subject: [PATCH 14/17] debugging --- lingvodoc/schema/gql_cognate.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index 9f6fcda2..9492ef99 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -5610,23 +5610,30 @@ def get_complex_matrix(result_pool): # Reducing array size if there are languages duplicates for analysis in result_pool: - perspectives = numpy.array(analysis.get('__perspectives__', [])) - relation_matrix = numpy.array(analysis.get('__relation_matrix__')) + perspectives = analysis.get('__perspectives__', []) p_num = len(perspectives) + + relation_matrix = numpy.array(analysis.get('__relation_matrix__', [])) + + if not p_num or len(relation_matrix) != p_num: + continue + + languages = [] nums_to_delete = [] for i, (l1_id, p1_id) in enumerate(perspectives): pers_by_lang[tuple(l1_id)].add(tuple(p1_id)) + languages.append(tuple(l1_id)) for j in range((i + 1), p_num): l2_id, _ = perspectives[j] - if l2_id == l1_id: + if tuple(l2_id) == tuple(l1_id) and j not in nums_to_delete: for k in range(p_num): # Get maximum values for found similar languages # and assign to first found row and column (the matrix is triangular) - relation_matrix[i][k] = max(relation_matrix[[i, j], 
k]) - relation_matrix[k][i] = max(relation_matrix[k, [i, j]]) + relation_matrix[i, k] = max(relation_matrix[[i, j], k]) + relation_matrix[k, i] = max(relation_matrix[k, [i, j]]) nums_to_delete.append(j) @@ -5635,8 +5642,7 @@ def get_complex_matrix(result_pool): relation_matrix = ( numpy.delete(numpy.delete(relation_matrix, nums_to_delete, 0), nums_to_delete, 1)) - languages = perspectives[:, 0] - languages = numpy.delete(languages, nums_to_delete) + languages = [lang for i, lang in enumerate(languages) if i not in nums_to_delete] l_num = len(languages) # Collecting languages pairs with their relations @@ -5659,8 +5665,7 @@ def get_complex_matrix(result_pool): relation_result[pair] = relation_by_pair[pair] # Getting result complex matrix - - language_list = [pair[0] for pair in relation_result] + language_list = list(pers_by_lang.keys()) l_num = len(language_list) distance_matrix = numpy.full((l_num, l_num), 1, dtype='float') @@ -5714,7 +5719,7 @@ def get_perspective_str(perspective_id): ComplexDistance.get_complex_matrix(result_pool)) language_header = [get_language_str(lang_id) for lang_id in language_list] - + A() def export_html(): distance_frame = pd.DataFrame(distance_matrix, columns=language_header) From 4847e11e4dbd46d7b8bdee7b484065ec3a834ca7 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Thu, 3 Oct 2024 19:34:39 +0300 Subject: [PATCH 15/17] first result --- lingvodoc/schema/gql_cognate.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index 9492ef99..de6f12e9 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -5675,7 +5675,7 @@ def get_complex_matrix(result_pool): distance_matrix[i, j] = distance_matrix[j, i] = ( math.sqrt(math.log(relation) / -0.1 / math.sqrt(relation)) if relation > 0 else 25) - return distance_matrix, language_list, pers_by_lang + return distance_matrix, pers_by_lang @staticmethod def mutate( @@ -5715,27 +5715,30 @@ def get_perspective_str(perspective_id): return f'{dictionary_name} - {perspective_name}' try: - distance_matrix, language_list, pers_by_lang = ( + distance_matrix, pers_by_lang = ( ComplexDistance.get_complex_matrix(result_pool)) - language_header = [get_language_str(lang_id) for lang_id in language_list] - A() + language_header = [f' {i+1}. {get_language_str(lang_id)}' for i, lang_id in enumerate(pers_by_lang)] + def export_html(): distance_frame = pd.DataFrame(distance_matrix, columns=language_header) + # Start index for distances from 1 to match with dictionaries numbers + distance_frame.index += 1 + html_result = build_table(distance_frame, 'yellow_dark', width="300px", index=True) - for index1, lang, pers in enumerate(pers_by_lang.items()): + for i1, (lang, pers) in enumerate(pers_by_lang.items()): html_result += ( - f"

<br><br> * The language {get_language_str(lang)} "
-                    f"is presented by perspective(s):<br><br>")
-                for index2, per in enumerate(pers):
-                    html_result += f"<br><br> ** {index2 + 1}. {get_perspective_str(per)}<br><br>"
+                    f"<br>\n{' ' * 2}{i1 + 1}. {get_language_str(lang)}<br>")
+                for i2, per in enumerate(pers):
+                    html_result += (
+                        f"<br>{' ' * 6}{i1 + 1}.{i2 + 1} {get_perspective_str(per)}<br>
") return html_result - language_str = f'language {base_language_id[0]}/{base_language_id[1]}' - base_language_name = get_language_str(base_language_id) + language_str = f'language {base_language_id[0]}/{base_language_id[1]}' if base_language_id else "" + base_language_name = get_language_str(base_language_id) if base_language_id else "" _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( From dbb483ab60816995ad48b06620bc7ff914b9075e Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Thu, 3 Oct 2024 21:04:04 +0300 Subject: [PATCH 16/17] beauty --- lingvodoc/schema/gql_cognate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index de6f12e9..a48e93b0 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -5726,7 +5726,7 @@ def export_html(): # Start index for distances from 1 to match with dictionaries numbers distance_frame.index += 1 - html_result = build_table(distance_frame, 'yellow_dark', width="300px", index=True) + html_result = build_table(distance_frame, 'orange_light', width="300px", index=True) for i1, (lang, pers) in enumerate(pers_by_lang.items()): html_result += ( From 074ca56abe764a1a567fba48aa479f65533c4381 Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Thu, 3 Oct 2024 22:25:46 +0300 Subject: [PATCH 17/17] fixed min and max distances --- lingvodoc/schema/gql_cognate.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index a48e93b0..1e25f319 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -4873,7 +4873,7 @@ def split_lex(lex): not_enough_count += (count < 2) dictionary_count = len(means) - distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float') + distance_data_array = numpy.full((dictionary_count, dictionary_count), 25, dtype='float') relation_data_array = numpy.full((dictionary_count, dictionary_count), 1, dtype='float') complex_data_array = numpy.full((dictionary_count, dictionary_count), "n/a", dtype='object') distance_header_array = numpy.full(dictionary_count, "", dtype='object') @@ -5340,7 +5340,7 @@ def morph_cognate_statistics( not_enough_count += (count < 2) dictionary_count = len(result_pool) - distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float') + distance_data_array = numpy.full((dictionary_count, dictionary_count), 25, dtype='float') complex_data_array = numpy.full((dictionary_count, dictionary_count), "n/a", dtype='object') distance_header_array = numpy.full(dictionary_count, "", dtype='object') @@ -5665,15 +5665,17 @@ def get_complex_matrix(result_pool): relation_result[pair] = relation_by_pair[pair] # Getting result complex matrix + max_distance = 25 language_list = list(pers_by_lang.keys()) l_num = len(language_list) - distance_matrix = numpy.full((l_num, l_num), 1, dtype='float') + distance_matrix = numpy.full((l_num, l_num), max_distance, dtype='float') for (l1_id, l2_id), relation in relation_result.items(): i = language_list.index(l1_id) j = language_list.index(l2_id) distance_matrix[i, j] = distance_matrix[j, i] = ( - math.sqrt(math.log(relation) / -0.1 / math.sqrt(relation)) if relation > 0 else 25) + math.sqrt(math.log(relation) / -0.1 / math.sqrt(relation)) if relation > 0 else max_distance) + distance_matrix[i, i] = distance_matrix[j, j] = 0 return distance_matrix, pers_by_lang