forked from jlelse/GoBlog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck.go
139 lines (133 loc) · 3.46 KB
/
check.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
package main
import (
"context"
"fmt"
"io"
"net/http"
"strings"
"sync/atomic"
"time"
"github.com/carlmjohnson/requests"
"github.com/samber/lo"
"github.com/sourcegraph/conc/pool"
"go.goblog.app/app/pkgs/bodylimit"
cpkg "go.goblog.app/app/pkgs/cache"
"go.goblog.app/app/pkgs/httpcachetransport"
)
// checkAllExternalLinks fetches every published post whose visibility is
// public or unlisted and runs the external link check over all of them.
//
// It returns the error from getPosts if the posts cannot be loaded, otherwise
// whatever checkLinks returns.
func (a *goBlog) checkAllExternalLinks() error {
	// Only published posts that are public or unlisted are considered.
	// NOTE(review): fetchWithoutParams presumably skips loading post
	// parameters; confirm against getPosts.
	request := &postsRequestConfig{
		status:             []postStatus{statusPublished},
		visibility:         []postVisibility{visibilityPublic, visibilityUnlisted},
		fetchWithoutParams: true,
	}
	published, err := a.getPosts(request)
	if err != nil {
		return err
	}
	return a.checkLinks(published...)
}
// checkLinks requests every external link found in the given posts and prints
// the ones that could not be fetched or that answered with a non-success
// status code.
//
// The links are collected with allLinksToCheck and requested by a pool of at
// most 10 goroutines, using an HTTP client with a 30 second timeout. Findings
// are written to standard output, one line per broken link; the only error
// returned is the one from collecting the links. A shutdown hook cancels the
// requests' context and makes checks that have not started yet return without
// a result.
func (a *goBlog) checkLinks(posts ...*post) error {
	// Get all links
	allLinks, err := a.allLinksToCheck(posts...)
	if err != nil {
		return err
	}
	// Print some info
	fmt.Println("Checking", len(allLinks), "links")
	// Cancel context
	cancelContext, cancelFunc := context.WithCancel(context.Background())
	// Release the context when the check finishes normally too, not only on
	// shutdown. Calling the cancel function more than once is harmless.
	defer cancelFunc()
	var done atomic.Bool
	a.shutdown.Add(func() {
		done.Store(true)
		cancelFunc()
		fmt.Println("Cancelled link check")
	})
	// Create HTTP client
	cache := cpkg.New[string, []byte](time.Minute, 5000)
	client := &http.Client{
		Timeout: 30 * time.Second,
		Transport: httpcachetransport.NewHttpCacheTransportNoBody(
			newHttpTransport(),
			cache, 60*time.Minute, 5*bodylimit.MB,
		),
	}
	// Process all links
	type checkresult struct {
		in, link string // in: where the link was found, link: the checked URL
		status   int    // HTTP status code, 0 if the request failed
		err      error  // request building or transport error, if any
	}
	p := pool.NewWithResults[*checkresult]().WithMaxGoroutines(10).WithContext(cancelContext)
	for _, link := range allLinks {
		link := link // per-iteration copy for the closure (needed before Go 1.22)
		p.Go(func(ctx context.Context) (*checkresult, error) {
			// Skip work that has not started yet once shutdown was requested.
			if done.Load() {
				return nil, nil
			}
			result := &checkresult{
				in:   link.First,
				link: link.Second,
			}
			// Build request
			// NOTE(review): the browser-like headers are presumably meant to
			// avoid being rejected as a bot; confirm.
			req, err := requests.URL(link.Second).
				UserAgent("Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0").
				Accept("text/html").
				Header("Accept-Language", "en-US,en;q=0.5").
				Request(ctx)
			if err != nil {
				result.err = err
				return result, nil
			}
			// Do request
			resp, err := client.Do(req)
			if err != nil {
				result.err = err
				return result, nil
			}
			// Save status code
			result.status = resp.StatusCode
			// Close request; only the status code is needed, so a close
			// error carries no useful information here.
			_ = resp.Body.Close()
			return result, nil
		})
	}
	// The tasks never return a non-nil error (failures are stored in the
	// result instead), so the error from Wait is not inspected.
	results, _ := p.Wait()
	for _, r := range results {
		if r == nil {
			continue
		}
		if r.err != nil {
			fmt.Printf("%s in %s: %s\n", r.link, r.in, r.err.Error())
		} else if !successStatus(r.status) {
			fmt.Printf("%s in %s: %d (%s)\n", r.link, r.in, r.status, http.StatusText(r.status))
		}
	}
	fmt.Println("Finished link check")
	return nil
}
// allLinksToCheck renders each given post to HTML, extracts its links and
// returns those that do not start with the configured public server address.
//
// Each returned pair is built from the post's full URL followed by the link
// target. One pool task is started per post; the flattened results are
// returned together with the error reported by the pool.
func (a *goBlog) allLinksToCheck(posts ...*post) ([]*stringPair, error) {
	collector := pool.NewWithResults[[]*stringPair]().WithErrors()
	for _, current := range posts {
		current := current // per-iteration copy for the closure (needed before Go 1.22)
		collector.Go(func() ([]*stringPair, error) {
			// Stream the rendered HTML through a pipe to the link parser.
			reader, writer := io.Pipe()
			go func() {
				a.postHtmlToWriter(writer, &postHtmlOptions{p: current, absolute: true})
				_ = writer.Close()
			}()
			found, parseErr := allLinksFromHTML(reader, a.fullPostURL(current))
			// Closing the read side makes any further writes by the rendering
			// goroutine fail instead of blocking.
			_ = reader.CloseWithError(parseErr)
			if parseErr != nil {
				return nil, parseErr
			}
			// Drop internal links and pair every remaining one with the post URL.
			external := make([]*stringPair, 0, len(found))
			for _, target := range found {
				if strings.HasPrefix(target, a.cfg.Server.PublicAddress) {
					continue
				}
				external = append(external, &stringPair{a.fullPostURL(current), target})
			}
			return external, nil
		})
	}
	perPost, err := collector.Wait()
	return lo.Flatten(perPost), err
}
// successStatus reports whether status lies in the range 200 to 399
// inclusive, i.e. the 2xx and 3xx HTTP status codes.
func successStatus(status int) bool {
	if status < 200 {
		return false
	}
	return status < 400
}