diff --git a/wagtail_wordpress_import/analysis.py b/wagtail_wordpress_import/analysis.py index f7d96289..59bf0a01 100644 --- a/wagtail_wordpress_import/analysis.py +++ b/wagtail_wordpress_import/analysis.py @@ -21,6 +21,11 @@ def __init__(self): self.classes_unique_pages = Counter() self.shortcodes_unique_pages = Counter() + self.tags_page_url = {} + self.attributes_page_url = {} + self.styles_page_url = {} + self.shortcodes_page_url = {} + @classmethod def find_all_tags(cls, dom): names = Counter() @@ -91,7 +96,7 @@ def find_all_shortcodes(cls, dom): return shortcodes - def analyze(self, html): + def analyze(self, html, page_url): self.total += 1 try: @@ -117,3 +122,15 @@ def analyze(self, html): self.styles_unique_pages.update(styles.keys()) self.classes_unique_pages.update(classes.keys()) self.shortcodes_unique_pages.update(shortcodes.keys()) + + for tag in tags.keys(): + self.tags_page_url[tag] = page_url + + for attribute in attributes.keys(): + self.attributes_page_url[attribute] = page_url + + for style in styles.keys(): + self.styles_page_url[style] = page_url + + for shortcode in shortcodes.keys(): + self.shortcodes_page_url[shortcode] = page_url diff --git a/wagtail_wordpress_import/importers/wordpress.py b/wagtail_wordpress_import/importers/wordpress.py index 8c056580..f8629b27 100644 --- a/wagtail_wordpress_import/importers/wordpress.py +++ b/wagtail_wordpress_import/importers/wordpress.py @@ -265,7 +265,8 @@ def analyze_html(self, html_analyzer, *, page_types, page_statuses): ): html_analyzer.analyze( - filter_linebreaks_wp(item.get("content:encoded")) + filter_linebreaks_wp(item.get("content:encoded")), + item.get("link"), ) def connect_richtext_page_links(self, imported_pages): diff --git a/wagtail_wordpress_import/management/commands/analyze_html_content.py b/wagtail_wordpress_import/management/commands/analyze_html_content.py index 314ab404..d40ceccf 100644 --- a/wagtail_wordpress_import/management/commands/analyze_html_content.py +++ b/wagtail_wordpress_import/management/commands/analyze_html_content.py @@ -58,9 +58,9 @@ def handle(self, **options): # Tags tags_table = PrettyTable() - tags_table.field_names = ["Tag", "Pages used on", "Total occurrences"] + tags_table.field_names = ["Tag", "Pages used on", "Total occurrences", "Last URL"] for tag, total_pages in analyzer.tags_unique_pages.most_common(): - tags_table.add_row([tag, total_pages, analyzer.tags_total[tag]]) + tags_table.add_row([tag, total_pages, analyzer.tags_total[tag], analyzer.tags_page_url[tag]]) self.stdout.write("Most commonly used HTML tags") self.stdout.write(str(tags_table)) @@ -72,6 +72,7 @@ def handle(self, **options): "Attribute", "Pages used on", "Total occurrences", + "Last URL", ] for ( tag, @@ -83,6 +84,7 @@ def handle(self, **options): attribute, total_pages, analyzer.attributes_total[(tag, attribute)], + analyzer.attributes_page_url[(tag, attribute)], ] ) @@ -97,10 +99,11 @@ def handle(self, **options): "Style", "Pages used on", "Total occurrences", + "Last URL", ] for (tag, style), total_pages in analyzer.styles_unique_pages.most_common(): styles_table.add_row( - [tag, style, total_pages, analyzer.styles_total[(tag, style)]] + [tag, style, total_pages, analyzer.styles_total[(tag, style)], analyzer.styles_page_url[(tag, style)]] ) self.stdout.write("") @@ -112,10 +115,11 @@ def handle(self, **options): "Shortcode", "Pages used on", "Total occurrences", + "Last URL", ] for shortcode, total_pages in analyzer.shortcodes_unique_pages.most_common(): shortcodes_table.add_row( - [shortcode, total_pages, analyzer.shortcodes_total[shortcode]] + [shortcode, total_pages, analyzer.shortcodes_total[shortcode], analyzer.shortcodes_page_url[shortcode]] ) self.stdout.write("Most commonly used shortcodes")