-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlinks2markdown.py
executable file
·132 lines (103 loc) · 3.66 KB
/
links2markdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
# SPDX-License-Identifier: WTFPL
# /// script
# dependencies = ["requests"]
# ///
import argparse
import locale
import os
import re
import signal
import sys
from html.parser import HTMLParser
import requests
# Match "naked" http(s) links, skipping links that are already in markdown:
# - "...](https://..." looks like an inline markdown link: skip it
# - "<https://..." looks like an autolink: skip it
# Link reference definitions are handled separately.
# The trailing lookbehind keeps sentence punctuation that directly follows a
# link ("see https://example.com/page.") out of the match, so the URL that is
# fetched and linked is the one the author meant.
# Note this is very crude, unlike a real markdown/commonmark parser, and will
# fail some cases. For example "foo](https://foo" will be interpreted as
# markdown though the link is incomplete (missing "[" and ")").
LINK_RE = re.compile(r"""(?<!\]\()(?<!<)https?://[^])'">\s]+(?<![.,;:!?])""")
class TitleFetchParser(HTMLParser):
    """HTML parser that extracts the text of the first ``<head>`` ``<title>``.

    Feed it a document with ``feed()``/``close()`` and read ``title``
    afterwards; it stays ``None`` when no title text was found.

    Attributes:
        path: stack of currently open tag names, innermost element first.
        title: raw title text collected so far, or ``None``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.path = []
        self.title = None
        # Set once the element holding the collected title has been closed,
        # so that later <title> elements cannot alter the result.
        self._title_done = False

    def handle_starttag(self, tag, attrs):
        """Push *tag* on top of the open-elements stack."""
        self.path.insert(0, tag)

    def handle_endtag(self, tag):
        """Pop *tag* and every element opened inside it from the stack.

        Elements that were never explicitly closed (void elements such as
        ``<meta>``, or sloppy markup) are discarded together with their
        closest matching ancestor. A closing tag with no matching open
        element is ignored.
        """
        try:
            idx = self.path.index(tag)
        except ValueError:
            # Stray closing tag: nothing to unwind.
            return
        if self.title and "title" in self.path[:idx + 1]:
            self._title_done = True
        del self.path[:idx + 1]

    def handle_data(self, data):
        """Collect text found directly inside a ``<title>`` within ``<head>``."""
        if self._title_done:
            return
        if self.path and self.path[0] == "title" and "head" in self.path:
            # The parser may deliver one text node in several chunks (for
            # example around a bare "<"), so accumulate instead of keeping
            # only the first chunk.
            self.title = (self.title or "") + data
def fetch_title(url):
    """Download *url* and return the text of its HTML ``<title>``.

    When stderr is a terminal, a one-line progress message is written to it.
    Errors are reported on stderr and never raised to the caller.

    Args:
        url: the http(s) URL to fetch.

    Returns:
        The title text as found in the page (whitespace is not normalised),
        or ``None`` if the request failed, the server answered with an HTTP
        error status, or no title could be extracted.
    """
    if sys.stderr.isatty():
        # fill info string with spaces till the end of line & rewind line to overwrite
        term_width = os.get_terminal_size(sys.stderr.fileno()).columns
        print(f"{f'Fetching {url}':{term_width}}", file=sys.stderr, end="\r")

    try:
        response = requests.get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0",
                "Accept": "text/html",
            },
            timeout=120,
        )
        # Error pages (404, 403, 5xx...) carry a <title> too, but it describes
        # the error rather than the linked document: treat them as failures.
        # HTTPError is a RequestException, so it is reported just below.
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print(f"error: failed to fetch {url}: {exc}", file=sys.stderr)
        return None

    parser = TitleFetchParser(convert_charrefs=True)
    try:
        parser.feed(response.text)
        parser.close()
    except AssertionError as exc:
        # Defensive: kept in case the parser's handlers signal inconsistent
        # markup this way.
        print(f"error: failed extracting title from {url}: {exc}", file=sys.stderr)
        return None
    else:
        return parser.title
def link_to_markdown(m):
    """Return the markdown replacement for one naked-URL regex match.

    Intended as the replacement callback of ``re.sub``.

    Args:
        m: match object whose whole match (``m[0]``) is the URL.

    Returns:
        - the URL unchanged when it looks like the destination of a link
          reference definition (``[label]: URL``);
        - ``<URL>`` (an autolink) when no usable title could be fetched;
        - ``[title](URL)`` otherwise, with whitespace runs in the title
          collapsed and markdown-significant characters escaped.
    """
    url = m[0]
    if m.start() > 3 and m.string[m.start() - 3:m.start()] == "]: ":
        # Looks like a "link reference definition".
        # Commonmark allows much more whitespace variations, that this crude
        # approach will not find. So it will not skip them, though it should.
        return url

    title = fetch_title(url)
    # Normalise before testing for emptiness: a whitespace-only title would
    # otherwise yield "[](url)", a link with no visible text.
    title = re.sub(r"\s+", " ", title.strip()) if title else ""
    if not title:
        # Create an autolink: the link title will be the URL. Setting a fixed
        # title like "error fetching title" would confuse the viewer more than
        # having the URL as title.
        return f"<{url}>"

    # Unbalanced brackets would end the link text early, and a backslash
    # could escape the character after it: backslash-escape all three.
    title = re.sub(r"([\\\[\]])", r"\\\1", title)
    return f"[{title}]({url})"
def main():
    """Command-line entry point.

    Reads the file named on the command line (or stdin when the argument is
    missing or ``-``), replaces every naked link on each line with its
    markdown form and writes the result to stdout.
    """
    locale.setlocale(locale.LC_ALL, "")

    # Restore default signal behaviour so Ctrl-C or a closed pipe (e.g.
    # "| head") ends the process quietly instead of with a traceback.
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    if hasattr(signal, "SIGPIPE"):
        # SIGPIPE is only defined on Unix; referencing it unconditionally
        # raises AttributeError on Windows before any work is done.
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    parser = argparse.ArgumentParser(
        epilog="Outputs processed text on stdout",
    )
    parser.add_argument(
        "file", default="-", nargs="?",
        help="Input file to process ('-' for stdin)",
    )
    args = parser.parse_args()

    if args.file == "-":
        fp = sys.stdin
    else:
        fp = open(args.file)

    with fp:
        for line in fp:
            line = LINK_RE.sub(link_to_markdown, line)
            # Lines keep their own newline, so do not add another one.
            print(line, end="")


if __name__ == "__main__":
    main()