From 1181cfcedb05e164502048f1a8e2de132e65fc0b Mon Sep 17 00:00:00 2001 From: Bram Buitendijk Date: Wed, 6 Jul 2022 15:25:13 +0200 Subject: [PATCH] organize imports --- pagexml/analysis/layout_stats.py | 2 +- pagexml/analysis/text_stats.py | 4 ++-- pagexml/helper/pagexml_helper.py | 10 +++++----- pagexml/helper/text_helper.py | 14 +++++++------- pagexml/model/physical_document_model.py | 3 ++- pagexml/parser.py | 4 ++-- pagexml/plotting/plot_dist.py | 6 +++--- 7 files changed, 22 insertions(+), 21 deletions(-) diff --git a/pagexml/analysis/layout_stats.py b/pagexml/analysis/layout_stats.py index 1fa81ae..a558d4d 100644 --- a/pagexml/analysis/layout_stats.py +++ b/pagexml/analysis/layout_stats.py @@ -1,6 +1,6 @@ -from typing import Dict, Generator, List, Tuple, Union from collections import Counter from collections import defaultdict +from typing import Dict, Generator, List, Tuple, Union import numpy as np diff --git a/pagexml/analysis/text_stats.py b/pagexml/analysis/text_stats.py index 9b87c50..c503e83 100644 --- a/pagexml/analysis/text_stats.py +++ b/pagexml/analysis/text_stats.py @@ -1,7 +1,7 @@ -from typing import List, Set, Tuple, Union +import re from collections import Counter from collections import defaultdict -import re +from typing import List, Set, Tuple, Union import pagexml.helper.text_helper as text_helper diff --git a/pagexml/helper/pagexml_helper.py b/pagexml/helper/pagexml_helper.py index 1b194a9..ded2450 100644 --- a/pagexml/helper/pagexml_helper.py +++ b/pagexml/helper/pagexml_helper.py @@ -1,16 +1,16 @@ -from typing import Dict, Generator, List, Tuple, Union -from collections import Counter import copy -import re import gzip +import re import string +from collections import Counter +from typing import Dict, Generator, List, Tuple, Union import numpy as np -import pagexml.model.physical_document_model as pdm import pagexml.analysis.layout_stats as summarise import pagexml.analysis.text_stats as text_stats import pagexml.helper.text_helper as text_helper +import pagexml.model.physical_document_model as pdm def sort_regions_in_reading_order(doc: pdm.PageXMLDoc) -> List[pdm.PageXMLTextRegion]: @@ -319,7 +319,7 @@ def read_line_format_file(line_format_files: Union[str, List[str]]) -> Generator def make_line_text(line: pdm.PageXMLTextLine, do_merge: bool, end_word: str, merge_word: str, line_break_char: str = '-') -> str: line_text = line.text - if len(line_text) >= 2 and line_text.endswith(line_break_char*2): + if len(line_text) >= 2 and line_text.endswith(line_break_char * 2): # remove the redundant line break char line_text = line_text[:-1] if do_merge: diff --git a/pagexml/helper/text_helper.py b/pagexml/helper/text_helper.py index 6f0ed82..72f6cba 100644 --- a/pagexml/helper/text_helper.py +++ b/pagexml/helper/text_helper.py @@ -1,12 +1,12 @@ -from typing import Dict, List, Tuple, Union, Generator, Iterable -import re import gzip import math +import re from collections import Counter, defaultdict from itertools import combinations +from typing import Dict, List, Tuple, Union, Generator, Iterable -import pagexml.parser as parser import pagexml.model.physical_document_model as pdm +import pagexml.parser as parser class SkipGram: @@ -42,10 +42,10 @@ def text2skipgrams(text: str, ngram_size: int = 2, skip_size: int = 2) -> Genera :rtype: Generator[tuple]""" if ngram_size <= 0 or skip_size < 0: raise ValueError('ngram_size must be a positive integer, skip_size must be a positive integer or zero') - indexes = [i for i in range(0, ngram_size+skip_size)] - skipgram_combinations = [combination for combination in combinations(indexes[1:], ngram_size-1)] - for offset in range(0, len(text)-1): - window = text[offset:offset+ngram_size+skip_size] + indexes = [i for i in range(0, ngram_size + skip_size)] + skipgram_combinations = [combination for combination in combinations(indexes[1:], ngram_size - 1)] + for offset in range(0, len(text) - 1): + window = text[offset:offset + ngram_size + skip_size] for skipgram, skipgram_length in insert_skips(window, skipgram_combinations): yield SkipGram(skipgram, offset, skipgram_length) diff --git a/pagexml/model/physical_document_model.py b/pagexml/model/physical_document_model.py index 8c990a7..f42e546 100644 --- a/pagexml/model/physical_document_model.py +++ b/pagexml/model/physical_document_model.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import Dict, List, Set, Tuple, Union + from collections import defaultdict +from typing import Dict, List, Set, Tuple, Union import numpy as np from scipy.spatial import ConvexHull diff --git a/pagexml/parser.py b/pagexml/parser.py index 49f2f28..cfac40c 100644 --- a/pagexml/parser.py +++ b/pagexml/parser.py @@ -1,7 +1,7 @@ -from typing import Generator, List, Dict, Union -from datetime import datetime import glob import re +from datetime import datetime +from typing import Generator, List, Dict, Union import xmltodict from dateutil.parser import parse as date_parse diff --git a/pagexml/plotting/plot_dist.py b/pagexml/plotting/plot_dist.py index b7bce97..372d11c 100644 --- a/pagexml/plotting/plot_dist.py +++ b/pagexml/plotting/plot_dist.py @@ -1,9 +1,9 @@ -from typing import Dict from collections import Counter +from typing import Dict +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt import seaborn as sns sns.set_theme(style="darkgrid") @@ -26,5 +26,5 @@ def plot_dist_stats(stats: Dict[str, Dict[str, Counter]]) -> None: df = pd.DataFrame(data={field: points}) x, y = zip(*sorted(stats[doc_type][field].items())) sns.histplot(df, ax=axes[fi], x=field, binwidth=binwidth) - #sns.kdeplot(ax=axes[fi], x=x, y=y, cut=0) + # sns.kdeplot(ax=axes[fi], x=x, y=y, cut=0) axes[fi].set_title(field)