Skip to content

Commit 0fad9cc

Browse files
committed
feat: initial commit with docs and output structure
1 parent 40b4950 commit 0fad9cc

File tree

11 files changed

+430
-0
lines changed

11 files changed

+430
-0
lines changed

.github/workflows/update.yml

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
name: Update Feeds
2+
on:
3+
schedule:
4+
- cron: '0 */12 * * *'
5+
workflow_dispatch:
6+
jobs:
7+
update:
8+
runs-on: ubuntu-latest
9+
steps:
10+
- uses: actions/checkout@v3
11+
- name: Set up Go
12+
uses: actions/setup-go@v4
13+
with:
14+
go-version: '1.21'
15+
- name: Prep
16+
run: mkdir -p output
17+
- name: Build and Run
18+
run: |
19+
go build -o harvest ./src
20+
./harvest
21+
- name: Commit
22+
run: |
23+
git config --local user.email "[email protected]"
24+
git config --local user.name "GitHub Action"
25+
git add output/blog_posts.md
26+
git commit -m "action: update blog posts" || echo "no changes to commit"
27+
git push

README.md

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# harvest
2+
3+
RSS feed aggregator for UPL member blogs.
4+
5+
- [Add your blog](docs/CONTRIBUTING.md)
6+
- [Technical details](docs/TECHNICAL.md)
7+
8+
Updates every 12 hours. See [output/blog_posts.md](output/blog_posts.md) for latest posts.

docs/CONTRIBUTING.md

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Contributing to Harvest
2+
3+
## Adding Your Blog
4+
5+
1. Fork this repository
6+
2. Add your RSS feed URL to `whitelist.toml`
7+
3. Create a PR
8+
9+
Requirements:
10+
- Must be a personal blog RSS feed
11+
- No sitemaps
12+
- Feed must include title, link, and publication date
13+
- Posts should be tech-focused
14+
15+
## Development
16+
17+
### Setup
18+
```bash
19+
go mod tidy
20+
```
21+
22+
### Build
23+
```bash
24+
go build -o harvest ./src
25+
```
26+
27+
### Run
28+
```bash
29+
./harvest
30+
```
31+
32+
### Structure
33+
- `src/`: Go source code
34+
- `output/`: Generated blog post markdown
35+
- `whitelist.toml`: RSS feed list

docs/TECHNICAL.md

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Technical Details
2+
3+
## RSS Support
4+
5+
Handles a bunch of RSS formats because everyone implements them differently:
6+
7+
### Core Fields
8+
```go
9+
type Item struct {
10+
Title string `xml:"title"`
11+
Link string `xml:"link"`
12+
PubDate string `xml:"pubDate"` // regular rss
13+
Date string `xml:"date"` // some use this
14+
Published string `xml:"published"` // atom folks
15+
Updated string `xml:"updated"` // fallback
16+
}
17+
```
18+
19+
### Date Hell
20+
RSS feeds use whatever date format they feel like. We handle:
21+
```go
22+
var dateFormats = []string{
23+
time.RFC1123Z, // most RSS
24+
time.RFC3339, // atom's favorite
25+
"02 Jan 2006 15:04 -0700", // why do people use this
26+
"2006-01-02", // at least it's simple
27+
}
28+
```
29+
30+
### Content Cleanup
31+
- Strips HTML (nobody needs that in a feed)
32+
- Fixes entities (`&amp;` → `&`)
33+
- Handles missing descriptions (minimalist blogs)
34+
35+
## How It Works
36+
37+
1. Reads feeds from `whitelist.toml`
38+
2. Downloads them all at once (because waiting sucks)
39+
3. Parses XML, prays it's valid
40+
4. Cleans up the mess
41+
5. Dumps a nice markdown file in `output/`
42+
43+
The code's modular so we can add new formats when someone inevitably implements RSS wrong again. This has been so fun to troubleshoot.

go.mod

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
module github.com/UW-UPL/harvest
2+
3+
go 1.21
4+
5+
require github.com/pelletier/go-toml/v2 v2.1.1

go.sum

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
2+
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
3+
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
4+
github.com/pelletier/go-toml/v2 v2.1.1 h1:LWAJwfNvjQZCFIDKWYQaM62NcYeYViCmWIwmOStowAI=
5+
github.com/pelletier/go-toml/v2 v2.1.1/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc=
6+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
7+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
8+
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
9+
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
10+
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
11+
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
12+
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
13+
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
14+
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
15+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
16+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
17+
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
18+
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

src/feed/fetcher.go

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
package feed
2+
3+
import (
	"encoding/xml"
	"fmt"
	"html"
	"io"
	"log"
	"net/http"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
)
15+
16+
// dateFormats lists the layouts tried, in order, when parsing a feed item's
// date string. The first layout that parses wins, so new layouts are appended
// at the end to keep existing matches unchanged.
var dateFormats = []string{
	time.RFC1123Z,
	time.RFC1123,
	time.RFC3339,
	time.RFC3339Nano,
	"2006-01-02T15:04:05Z",
	"2006-01-02 15:04:05 -0700",
	"02 Jan 2006 15:04 -0700",
	"Mon, 02 Jan 2006 15:04:05 GMT",
	"02 Jan 2006 15:04 +0000",
	"2006-01-02",
	"January 2, 2006",
	// RFC 822 dates may carry a one-digit day of month ("Tue, 5 Mar 2024 ...").
	// The "02" verb in the RFC1123 layouts above demands two digits when
	// parsing, whereas "2" accepts either one or two.
	"Mon, 2 Jan 2006 15:04:05 -0700",
	"Mon, 2 Jan 2006 15:04:05 MST",
}
29+
30+
var (
	// htmlTagRe matches anything shaped like a tag: "<", any run of
	// characters other than ">", then ">".
	htmlTagRe = regexp.MustCompile("<[^>]*>")
	// whitespaceRe matches runs of whitespace so they can be collapsed.
	whitespaceRe = regexp.MustCompile(`\s+`)
)

// cleanHTML reduces an HTML fragment to plain text: tags are removed,
// character references are decoded, and whitespace runs are collapsed to a
// single space with leading and trailing whitespace trimmed.
func cleanHTML(input string) string {
	// Strip tags first so that escaped markup such as "&lt;b&gt;" survives
	// as literal text instead of being removed as a tag.
	cleaned := htmlTagRe.ReplaceAllString(input, "")

	// Decode every named and numeric character reference in a single pass.
	// A single pass avoids double-decoding ("&amp;lt;" must become "&lt;",
	// not "<").
	cleaned = html.UnescapeString(cleaned)

	// "&nbsp;" decodes to U+00A0, which `\s` does not match; map it to a
	// plain space so it is collapsed with the rest of the whitespace.
	cleaned = strings.ReplaceAll(cleaned, "\u00a0", " ")

	cleaned = whitespaceRe.ReplaceAllString(cleaned, " ")
	return strings.TrimSpace(cleaned)
}
48+
49+
func parseDate(item Item) time.Time {
50+
dateCandidates := []string{
51+
item.PubDate,
52+
item.Date,
53+
item.Published,
54+
item.Updated,
55+
}
56+
57+
for _, dateStr := range dateCandidates {
58+
if dateStr == "" {
59+
continue
60+
}
61+
62+
for _, format := range dateFormats {
63+
if t, err := time.Parse(format, dateStr); err == nil {
64+
return t
65+
}
66+
}
67+
}
68+
69+
log.Printf("warn: Could not parse any date from item %s", item.Title)
70+
return time.Now()
71+
}
72+
73+
func getDescription(item Item) string {
74+
candidates := []string{
75+
item.Description,
76+
item.Content,
77+
item.Encoded,
78+
}
79+
80+
for _, candidate := range candidates {
81+
if candidate != "" {
82+
return cleanHTML(candidate)
83+
}
84+
}
85+
86+
return "Visit post for details."
87+
}
88+
89+
func getAuthor(item Item, channelTitle string) string {
90+
if item.Author != "" {
91+
return item.Author
92+
}
93+
if item.Creator != "" {
94+
return item.Creator
95+
}
96+
return channelTitle
97+
}
98+
99+
func FetchFeed(url string) ([]BlogPost, error) {
100+
resp, err := http.Get(url)
101+
if err != nil {
102+
return nil, fmt.Errorf("fetching feed %s: %w", url, err)
103+
}
104+
defer resp.Body.Close()
105+
106+
body, err := io.ReadAll(resp.Body)
107+
if err != nil {
108+
return nil, fmt.Errorf("reading res from %s: %w", url, err)
109+
}
110+
111+
var feed Feed
112+
if err := xml.Unmarshal(body, &feed); err != nil {
113+
return nil, fmt.Errorf("parsing feed %s: %w", url, err)
114+
}
115+
116+
var posts []BlogPost
117+
118+
// we need to worry about both RSS *AND* Atom feeds
119+
items := feed.Channel.Items
120+
if len(items) == 0 {
121+
items = feed.Channel.Entries
122+
}
123+
if len(items) == 0 {
124+
items = feed.Entries
125+
}
126+
127+
for _, item := range items {
128+
post := BlogPost{
129+
Title: item.Title,
130+
Link: item.Link,
131+
Date: parseDate(item),
132+
Author: getAuthor(item, feed.Channel.Title),
133+
Summary: getDescription(item),
134+
}
135+
posts = append(posts, post)
136+
}
137+
138+
return posts, nil
139+
}
140+
141+
func FetchAllFeeds(feeds []string) []BlogPost {
142+
var (
143+
wg sync.WaitGroup
144+
mu sync.Mutex
145+
posts []BlogPost
146+
)
147+
148+
for _, feedURL := range feeds {
149+
wg.Add(1)
150+
go func(url string) {
151+
defer wg.Done()
152+
153+
feedPosts, err := FetchFeed(url)
154+
if err != nil {
155+
log.Printf("err fetching %s: %v", url, err)
156+
return
157+
}
158+
159+
mu.Lock()
160+
posts = append(posts, feedPosts...)
161+
mu.Unlock()
162+
}(feedURL)
163+
}
164+
165+
wg.Wait()
166+
167+
sort.Slice(posts, func(i, j int) bool {
168+
return posts[i].Date.After(posts[j].Date)
169+
})
170+
171+
return posts
172+
}

src/feed/types.go

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package feed
2+
3+
import "time"
4+
5+
// Config mirrors the TOML configuration document.
type Config struct {
	// Feeds is the string array stored under the "feeds" key
	// (presumably the feed URLs from whitelist.toml — confirm against the loader).
	Feeds []string `toml:"feeds"`
}
8+
9+
// Item is one feed entry as decoded from XML. The same struct is used for
// both `item` and `entry` elements, so it carries several alternative fields
// for the same piece of information; any of them may be empty.
type Item struct {
	// Headline and target URL of the post.
	Title string `xml:"title"`
	Link  string `xml:"link"`

	// Raw, unparsed timestamp strings, one per element name a feed may use.
	PubDate   string `xml:"pubDate"`
	Date      string `xml:"date"`
	Published string `xml:"published"`
	Updated   string `xml:"updated"`

	// Alternative sources for the author's name.
	Author  string `xml:"author"`
	Creator string `xml:"creator"`

	// Alternative sources for the post body or summary.
	Description string `xml:"description"`
	Content     string `xml:"content"`
	Encoded     string `xml:"encoded"`
}
22+
23+
type Channel struct {
24+
Title string `xml:"title"`
25+
Link string `xml:"link"`
26+
Description string `xml:"description"`
27+
Items []Item `xml:"item"`
28+
Entries []Item `xml:"entry"`
29+
}
30+
31+
type Feed struct {
32+
Channel Channel `xml:"channel"`
33+
Entries []Item `xml:"entry"`
34+
}
35+
36+
// BlogPost is the normalized form of a feed entry: every field is already
// resolved to a single value, with the publication time parsed.
type BlogPost struct {
	Title string
	Link  string

	// Date is the parsed publication time.
	Date time.Time

	Author  string
	Summary string
}

0 commit comments

Comments
 (0)