# watch.rb
require "json"
require "logger"
require "optparse"
require "thread"
require "uri"

require "dotenv"
require "httparty"
require "nokogiri"
# Worker threads, one per processed file. Threads are used because the run is
# dominated by network requests.
threads = []

# load environment variables
Dotenv.load

# Log to a timestamped file. The directory is created first because the log
# file cannot be opened when "logs/" is missing.
current_date = Time.now.strftime("%Y-%m-%d-%H:%M:%S")
Dir.mkdir("logs") unless Dir.exist?("logs")
@logger = Logger.new("logs/#{current_date}.log")
@logger.level = Logger::INFO

# Parse the command-line arguments into a hash.
options = {}
OptionParser.new do |option|
  option.on("-f", "--file FILE", "Directory to process") { |f| options[:file] = f }
  option.on("-d", "--domain DOMAIN", "Domain to use") { |d| options[:domain] = d }
  option.on("-h", "--help", "Show this message") do
    puts option
    exit
  end
  # The webhook option carries the URL to post to; without an argument
  # placeholder OptionParser would only ever store `true` here.
  option.on("-w", "--webhook URL", "Send webhook when link program has completed") { |w| options[:webhook] = w }
  option.on("-a", "--webhook-auth KEY", "Key to be sent in an Authorization header to Webhook") { |a| options[:key] = a }
  # NOTE: the --include value is stored under the :exclude key.
  option.on("-e", "--include DIR", "Folders to process, separated by commas") { |e| options[:exclude] = e }
end.parse!

# Both the target directory and the site domain are required; exit non-zero so
# calling scripts can detect the usage error.
unless options.key?(:file)
  puts "Please specify a file to process."
  exit 1
end

unless options.key?(:domain)
  puts "Please specify a domain to use."
  exit 1
end

@domain = options[:domain]

puts "Welcome to the HTML / Markdown link rot substitution tool."
puts "This tool will take a file and replace all links to the site with links to the archive.org website."
puts "This program may take a while to run depending on how many outgoing links you have on your site."
puts "Logging output to logs/#{current_date}.log"

# Stored in an instance variable because get_archive_link reads @headers; a
# local variable is not visible inside the method.
@headers = {
  "User-Agent" => "link-rot-detector (https://github.com/capjamesg/markdown-html-link-rot)"
}

# Shared result lists filled in by get_archive_link and the worker threads.
@substitutions = []
@failed_substitutions = []

# add user specified directories to list of directories to include
# NOTE(review): nothing else in the visible script reads this list yet.
directories_to_include = []
directories_to_include.concat(options[:exclude].split(",")) if options.key?(:exclude)
# Checks whether a link is dead and, if so, looks up a Wayback Machine copy.
#
# Root-relative links ("/path") are resolved against @domain; any other link
# that does not start with "http" is ignored. A link counts as dead when the
# request returns 404. A request that raises is logged and the link is then
# handled as dead as well.
#
# Side effects: appends to @failed_substitutions when no replacement can be
# offered, writes to @logger, and asks the Wayback Machine to capture links it
# has no snapshot for.
#
# @param anchor [String] the link target as written in the page
# @return [Array(String, String), Array(nil, nil)] the checked URL and its
#   archived URL, or [nil, nil] when nothing should be substituted
def get_archive_link(anchor)
  site_link = !anchor.start_with?("http")
  if site_link
    # Only root-relative links can be resolved against the site domain.
    return nil, nil unless anchor.start_with?("/")

    anchor = "https://#{@domain}#{anchor.rstrip}"
  end

  begin
    response = HTTParty.get(anchor, headers: @headers, follow_redirects: true, timeout: 10)
    # don't run archive substitution on valid urls
    return nil, nil if response.code != 404
  rescue StandardError => e
    @logger.warn("Failed to get archive link for #{anchor}")
    @logger.warn(e)
  end

  # Broken links to the site itself are reported, never replaced.
  if site_link
    @failed_substitutions << anchor
    return nil, nil
  end

  # Encode the target so its own query string cannot leak into the lookup URL.
  lookup_url = "https://archive.org/wayback/available?url=#{URI.encode_www_form_component(anchor)}"
  begin
    lookup = HTTParty.get(lookup_url)
  rescue StandardError
    @failed_substitutions << anchor
    return nil, nil
  end

  if lookup.code != 200
    @logger.info("Error: #{anchor} could not be retrieved from the Wayback Machine")
    @failed_substitutions << anchor
    return nil, nil
  end

  # Treat an unparsable or unexpectedly shaped body like "no snapshot".
  parsed = begin
    JSON.parse(lookup.body)
  rescue JSON::ParserError, TypeError
    nil
  end
  snapshots = parsed["archived_snapshots"] if parsed.is_a?(Hash)
  closest = snapshots["closest"] if snapshots.is_a?(Hash)
  closest_link = closest["url"] if closest.is_a?(Hash)

  if closest_link.nil?
    @logger.info("Error: #{anchor} could not be retrieved from the Wayback Machine")
    @failed_substitutions << anchor
    submit_to_wayback(anchor)
    return nil, nil
  end

  @logger.info("#{anchor} -> #{closest_link}")
  return anchor, closest_link
end

# Asks the Wayback Machine to capture +anchor+ and logs the outcome.
def submit_to_wayback(anchor)
  @logger.info("Submitting #{anchor} for archiving on the Wayback Machine")
  # anchor already carries its scheme, so it is appended as-is.
  response = HTTParty.get("https://web.archive.org/save/#{anchor}")
  if response.code == 200
    @logger.info("Successfully submitted #{anchor} for archiving on the Wayback Machine")
  else
    @logger.info("Error: #{anchor} could not be submitted for archiving on the Wayback Machine")
  end
rescue StandardError
  @logger.info("Error: #{anchor} could not be submitted for archiving on the Wayback Machine")
end
# Every Markdown and HTML file below the target directory is processed.
markdown_files = Dir.glob("#{options[:file]}/**/*.md") + Dir.glob("#{options[:file]}/**/*.html")

markdown_files.each do |f|
  # One thread per file; each thread rewrites only its own file.
  worker = Thread.new(f) do |path|
    changed = false
    # File.read closes the handle, unlike a bare File.open.
    full_page = File.read(path)

    @logger.info("Processing #{path}")

    # Markdown links: [text](target). Each distinct target is checked once.
    markdown_targets = full_page.scan(/\[(.*?)\]\((.*?)\)/).map { |_text, target| target }.uniq
    markdown_targets.each do |target|
      anchor, closest_link = get_archive_link(target)
      next if anchor.nil? || closest_link.nil?

      # replace old link with new one
      # add (archived) message to indicate a link has been archived
      @substitutions << [anchor, closest_link]
      puts "Substituting #{anchor} with #{closest_link}"
      # Block form keeps backslashes in the archive URL literal.
      full_page.gsub!("(#{anchor})") { "(#{closest_link}) (archived)" }
      changed = true
    end

    # HTML links: collect the distinct href values of <a> tags.
    html_targets = full_page.scan(/<a\b[^>]*?href="([^"]*)"[^>]*>.*?<\/a>/).map(&:first).uniq
    html_targets.each do |target|
      anchor, closest_link = get_archive_link(target)
      next if anchor.nil? || closest_link.nil?

      # replace old link with new one
      # add (archived) message to indicate a link has been archived
      @substitutions << [anchor, closest_link]
      puts "Substituting #{anchor} with #{closest_link}"
      # Only rewrite tags that point at this broken target; a generic pattern
      # would redirect every link in the page to the same archive URL.
      broken_tag = /<a\b[^>]*?href="#{Regexp.escape(target)}"[^>]*>(.*?)<\/a>/
      full_page.gsub!(broken_tag) { "<a href=\"#{closest_link}\">#{Regexp.last_match(1)}</a> (archived)" }
      changed = true
    end

    if changed
      puts "Fixed a link on #{path}"
      File.write(path, full_page)
    end
  rescue StandardError => e
    # A failure in one file must not abort the whole run when threads are joined.
    @logger.error("Failed to process #{path}: #{e.message}")
  end

  threads << worker
end

# wait for all threads to finish
threads.each(&:join)
# Build the report: a list of the replacements made (or a note that none were
# needed), followed by the links that could not be archived.
if @substitutions.empty?
  to_send = "The link rot bot has identified no broken links."
else
  replaced_lines = @substitutions.map { |pair| "* #{pair[0]} -> #{pair[1]}" }.join("\n")
  # Leading and trailing empty entries reproduce the newline at both ends.
  to_send = [
    "",
    "The link rot bot has identified #{@substitutions.length} broken links. These links have been replaced with archived versions.",
    "See below for the changes made.",
    replaced_lines,
    ""
  ].join("\n")
end

unless @failed_substitutions.empty?
  failed_lines = @failed_substitutions.map { |link| "* #{link}" }.join("\n")
  to_send += "\n\n#{@failed_substitutions.length} links could not be archived. These are:\n\n"
  to_send += failed_lines
end
# Deliver the report: post it to the webhook when a URL was supplied,
# otherwise print it to standard output.
webhook_url = options[:webhook]

# A flag-style option yields `true` rather than a URL, so only a non-empty
# string is accepted as a webhook target.
if webhook_url.is_a?(String) && !webhook_url.empty?
  # Send the Authorization header only when a key was actually provided.
  headers = {}
  headers["Authorization"] = "Basic #{options[:key]}" if options[:key]

  message = {
    "message" => to_send
  }

  begin
    req = HTTParty.post(webhook_url, body: message, headers: headers)

    # Any 2xx status means the webhook accepted the report.
    if req.code.between?(200, 299)
      puts "Result sent to webhook"
    else
      puts "Error: #{req.code}"
    end
  rescue StandardError => e
    # Network failures are reported instead of crashing after the work is done.
    puts "Error: #{e.message}"
  end
else
  puts to_send
end