-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_to_markdown.py
More file actions
200 lines (163 loc) · 8.4 KB
/
html_to_markdown.py
File metadata and controls
200 lines (163 loc) · 8.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python3
import os
import sys
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
import re
from typing import List, Set, Dict
import html2text
class DocumentationConverter:
def __init__(self, input_path: str, output_path: str):
self.input_path = Path(input_path)
self.output_path = Path(output_path)
self.processed_links: Set[str] = set()
self.converter = html2text.HTML2Text()
self.converter.body_width = 0 # Don't wrap text
self.page_titles: Dict[str, str] = {} # Maps file paths to their titles
self.symbol_locations: Dict[str, str] = {} # Maps symbol names to their section titles
def _read_html_file(self, file_path: Path) -> str:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
def _convert_html_to_markdown(self, html_content: str, file_path: Path) -> str:
soup = BeautifulSoup(html_content, 'html.parser')
# First collect all section IDs and their titles
for header in soup.find_all(['h1', 'h2', 'h3', 'h4']):
if header.get('id'):
self.symbol_locations[header['id']] = header.get_text().strip()
# Update internal links to point to sections
for a in soup.find_all('a', href=True):
href = a['href']
if href.startswith('http') or href.startswith('mailto:'):
continue
# Handle fragment identifiers
file_part = href.split('#')[0] if '#' in href else href
fragment = href.split('#')[1] if '#' in href else ''
if file_part:
target_path = (file_path.parent / file_part).resolve()
if target_path.suffix.lower() in ['.html', '.htm']:
# Get the title for this file
if str(target_path) in self.page_titles:
section_title = self.page_titles[str(target_path)]
if fragment and fragment in self.symbol_locations:
# If we have a specific symbol, use its title
section_title = self.symbol_locations[fragment]
# Create a proper markdown link
new_href = f"#{section_title.lower().replace(' ', '-').replace('()', '')}"
a['href'] = new_href
elif fragment and fragment in self.symbol_locations:
# Internal page link
section_title = self.symbol_locations[fragment]
new_href = f"#{section_title.lower().replace(' ', '-').replace('()', '')}"
a['href'] = new_href
# Special handling for table of contents
toc = soup.find('div', class_='toctree-wrapper')
if toc:
output = []
for item in toc.find_all('li', class_='toctree-l1'):
module_name = item.get_text().strip()
if module_name in self.symbol_locations:
link_text = f"[{module_name}](#{module_name.lower().replace(' ', '-').replace('()', '')})"
output.append(f"* {link_text}")
# Handle nested items
for subitem in item.find_all('li', class_='toctree-l2'):
symbol = subitem.get_text().strip()
if symbol in self.symbol_locations:
link_text = f"[`{symbol}`](#{symbol.lower().replace(' ', '-').replace('()', '')})"
output.append(f" * {link_text}")
if output:
return "\n".join(output) + "\n\n" + self.converter.handle(str(soup))
return self.converter.handle(str(soup))
def _get_links(self, soup: BeautifulSoup, base_path: Path) -> List[Path]:
links = []
for a in soup.find_all('a', href=True):
href = a['href']
if href.startswith('http') or href.startswith('#') or href.startswith('mailto:'):
continue
# Get the file part of the href (ignore fragments)
file_part = href.split('#')[0]
if not file_part:
continue
# Convert relative path to absolute
full_path = (base_path / file_part).resolve()
if full_path.exists() and full_path.suffix.lower() in ['.html', '.htm']:
links.append(full_path)
return links
def _get_page_title(self, soup: BeautifulSoup, file_path: Path) -> str:
# Try to get title from first h1
title = soup.find('h1')
if not title:
# Try to get from title tag
title = soup.find('title')
if title:
# Clean up the title text
title_text = title.get_text().strip()
# Remove any special characters that might cause issues in markdown links
title_text = re.sub(r'[^\w\s-]', '', title_text)
# Store this title in the symbol locations
self.symbol_locations[title_text] = title_text
return title_text
return file_path.stem
def _process_page(self, file_path: Path, level: int = 1) -> str:
if file_path in self.processed_links:
return ""
self.processed_links.add(file_path)
content = self._read_html_file(file_path)
soup = BeautifulSoup(content, 'html.parser')
# Get and store the title
title = self._get_page_title(soup, file_path)
self.page_titles[str(file_path)] = title
# Convert content to markdown with updated links
markdown_content = self._convert_html_to_markdown(str(soup), file_path)
# Process all links in the page
links = self._get_links(soup, file_path.parent)
sub_content = ""
for link in links:
sub_content += self._process_page(link, level + 1)
return f"{'#' * level} {title}\n\n{markdown_content}\n\n{sub_content}"
def convert(self):
# Create output directory if it doesn't exist
self.output_path.mkdir(parents=True, exist_ok=True)
# Generate output filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"{str(self.input_path).replace('/', '_')}_{timestamp}.md"
output_file = self.output_path / output_filename
# Process the index file
index_file = self.input_path / 'index.html'
if not index_file.exists():
raise FileNotFoundError(f"Index file not found at {index_file}")
# First pass: collect all page titles and symbols
content = self._read_html_file(index_file)
soup = BeautifulSoup(content, 'html.parser')
links = self._get_links(soup, index_file.parent)
self.page_titles[str(index_file)] = self._get_page_title(soup, index_file)
# Collect all symbols from all pages
for link in links:
if str(link) not in self.page_titles:
content = self._read_html_file(link)
soup = BeautifulSoup(content, 'html.parser')
self.page_titles[str(link)] = self._get_page_title(soup, link)
# Collect symbols from headers
for header in soup.find_all(['h1', 'h2', 'h3', 'h4']):
if header.get('id'):
self.symbol_locations[header['id']] = header.get_text().strip()
self.symbol_locations[header.get_text().strip()] = header.get_text().strip()
# Reset processed links for the actual content generation
self.processed_links.clear()
# Second pass: generate content with proper links
content = self._process_page(index_file)
# Write the final markdown file
with open(output_file, 'w', encoding='utf-8') as f:
f.write(content)
return output_file
def main():
    """CLI entry point: expects an input directory and an output directory."""
    if len(sys.argv) != 3:
        print("Usage: python html_to_markdown.py <input_path> <output_path>")
        sys.exit(1)

    src, dst = sys.argv[1], sys.argv[2]
    result = DocumentationConverter(src, dst).convert()
    print(f"Documentation has been converted and saved to: {result}")


if __name__ == "__main__":
    main()