-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_to_markdown.py
More file actions
200 lines (163 loc) · 8.4 KB
/
html_to_markdown.py
File metadata and controls
200 lines (163 loc) · 8.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python3
import os
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
import re
from typing import List, Set, Dict
import html2text
class DocumentationConverter:
def __init__(self, input_path: str, output_path: str):
self.input_path = Path(input_path)
self.output_path = Path(output_path)
self.processed_links: Set[str] = set()
self.converter = html2text.HTML2Text()
self.converter.body_width = 0 # Don't wrap text
self.page_titles: Dict[str, str] = {} # Maps file paths to their titles
self.symbol_locations: Dict[str, str] = {} # Maps symbol names to their section titles
def _read_html_file(self, file_path: Path) -> str:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
def _convert_html_to_markdown(self, html_content: str, file_path: Path) -> str:
soup = BeautifulSoup(html_content, 'html.parser')
# First collect all section IDs and their titles
for header in soup.find_all(['h1', 'h2', 'h3', 'h4']):
if header.get('id'):
self.symbol_locations[header['id']] = header.get_text().strip()
# Update internal links to point to sections
for a in soup.find_all('a', href=True):
href = a['href']
if href.startswith('http') or href.startswith('mailto:'):
continue
# Handle fragment identifiers
file_part = href.split('#')[0] if '#' in href else href
fragment = href.split('#')[1] if '#' in href else ''
if file_part:
target_path = (file_path.parent / file_part).resolve()
if target_path.suffix.lower() in ['.html', '.htm']:
# Get the title for this file
if str(target_path) in self.page_titles:
section_title = self.page_titles[str(target_path)]
if fragment and fragment in self.symbol_locations:
# If we have a specific symbol, use its title
section_title = self.symbol_locations[fragment]
# Create a proper markdown link
new_href = f"#{section_title.lower().replace(' ', '-').replace('()', '')}"
a['href'] = new_href
elif fragment and fragment in self.symbol_locations:
# Internal page link
section_title = self.symbol_locations[fragment]
new_href = f"#{section_title.lower().replace(' ', '-').replace('()', '')}"
a['href'] = new_href
# Special handling for table of contents
toc = soup.find('div', class_='toctree-wrapper')
if toc:
output = []
for item in toc.find_all('li', class_='toctree-l1'):
module_name = item.get_text().strip()
if module_name in self.symbol_locations:
link_text = f"[{module_name}](#{module_name.lower().replace(' ', '-').replace('()', '')})"
output.append(f"* {link_text}")
# Handle nested items
for subitem in item.find_all('li', class_='toctree-l2'):
symbol = subitem.get_text().strip()
if symbol in self.symbol_locations:
link_text = f"[`{symbol}`](#{symbol.lower().replace(' ', '-').replace('()', '')})"
output.append(f" * {link_text}")
if output:
return "\n".join(output) + "\n\n" + self.converter.handle(str(soup))
return self.converter.handle(str(soup))
def _get_links(self, soup: BeautifulSoup, base_path: Path) -> List[Path]:
links = []
for a in soup.find_all('a', href=True):
href = a['href']
if href.startswith('http') or href.startswith('#') or href.startswith('mailto:'):
continue
# Get the file part of the href (ignore fragments)
file_part = href.split('#')[0]
if not file_part:
continue
# Convert relative path to absolute
full_path = (base_path / file_part).resolve()
if full_path.exists() and full_path.suffix.lower() in ['.html', '.htm']:
links.append(full_path)
return links
def _get_page_title(self, soup: BeautifulSoup, file_path: Path) -> str:
# Try to get title from first h1
title = soup.find('h1')
if not title:
# Try to get from title tag
title = soup.find('title')
if title:
# Clean up the title text
title_text = title.get_text().strip()
# Remove any special characters that might cause issues in markdown links
title_text = re.sub(r'[^\w\s-]', '', title_text)
# Store this title in the symbol locations
self.symbol_locations[title_text] = title_text
return title_text
return file_path.stem
def _process_page(self, file_path: Path, level: int = 1) -> str:
if file_path in self.processed_links:
return ""
self.processed_links.add(file_path)
content = self._read_html_file(file_path)
soup = BeautifulSoup(content, 'html.parser')
# Get and store the title
title = self._get_page_title(soup, file_path)
self.page_titles[str(file_path)] = title
# Convert content to markdown with updated links
markdown_content = self._convert_html_to_markdown(str(soup), file_path)
# Process all links in the page
links = self._get_links(soup, file_path.parent)
sub_content = ""
for link in links:
sub_content += self._process_page(link, level + 1)
return f"{'#' * level} {title}\n\n{markdown_content}\n\n{sub_content}"
def convert(self):
# Create output directory if it doesn't exist
self.output_path.mkdir(parents=True, exist_ok=True)
# Generate output filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"{str(self.input_path).replace('/', '_')}_{timestamp}.md"
output_file = self.output_path / output_filename
# Process the index file
index_file = self.input_path / 'index.html'
if not index_file.exists():
raise FileNotFoundError(f"Index file not found at {index_file}")
# First pass: collect all page titles and symbols
content = self._read_html_file(index_file)
soup = BeautifulSoup(content, 'html.parser')
links = self._get_links(soup, index_file.parent)
self.page_titles[str(index_file)] = self._get_page_title(soup, index_file)
# Collect all symbols from all pages
for link in links:
if str(link) not in self.page_titles:
content = self._read_html_file(link)
soup = BeautifulSoup(content, 'html.parser')
self.page_titles[str(link)] = self._get_page_title(soup, link)
# Collect symbols from headers
for header in soup.find_all(['h1', 'h2', 'h3', 'h4']):
if header.get('id'):
self.symbol_locations[header['id']] = header.get_text().strip()
self.symbol_locations[header.get_text().strip()] = header.get_text().strip()
# Reset processed links for the actual content generation
self.processed_links.clear()
# Second pass: generate content with proper links
content = self._process_page(index_file)
# Write the final markdown file
with open(output_file, 'w', encoding='utf-8') as f:
f.write(content)
return output_file
def main():
    """CLI entry point: expects an input directory and an output directory."""
    if len(sys.argv) != 3:
        print("Usage: python html_to_markdown.py <input_path> <output_path>")
        sys.exit(1)

    src, dst = sys.argv[1], sys.argv[2]
    result = DocumentationConverter(src, dst).convert()
    print(f"Documentation has been converted and saved to: {result}")


if __name__ == "__main__":
    main()