feat: initial commit with docs and output structure

nicosalm · nicosalm · commit 0fad9cc4f49e · 2025-01-26T14:10:40.000-06:00
diff --git a/.github/workflows/update.yml b/.github/workflows/update.yml
@@ -0,0 +1,27 @@
+# Builds and runs harvest on a schedule, then commits output/blog_posts.md
+# back to the repository when it changed.
+name: Update Feeds
+on:
+  schedule:
+    # Minute 0 of every 12th hour (GitHub evaluates cron schedules in UTC).
+    - cron: '0 */12 * * *'
+  # Also allow manual runs from the Actions tab.
+  workflow_dispatch:
+# The Commit step pushes to the repository. GITHUB_TOKEN may be read-only by
+# default, so request write access to repository contents explicitly.
+permissions:
+  contents: write
+jobs:
+  update:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Go
+        uses: actions/setup-go@v4
+        with:
+          go-version: '1.21'
+      - name: Prep
+        # Make sure the directory the generated markdown is committed from exists.
+        run: mkdir -p output
+      - name: Build and Run
+        run: |
+          go build -o harvest ./src
+          ./harvest
+      - name: Commit
+        # `git commit` exits non-zero when nothing changed; the `|| echo` keeps
+        # that case from failing the job.
+        run: |
+          git config --local user.email "action@github.com"
+          git config --local user.name "GitHub Action"
+          git add output/blog_posts.md
+          git commit -m "action: update blog posts" || echo "no changes to commit"
+          git push
diff --git a/README.md b/README.md
@@ -0,0 +1,8 @@
+# harvest
+
+RSS feed aggregator for UPL member blogs.
+
+- [Add your blog](docs/CONTRIBUTING.md)
+- [Technical details](docs/TECHNICAL.md)
+
+Updates every 12 hours. See [output/blog_posts.md](output/blog_posts.md) for latest posts.
diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
@@ -0,0 +1,35 @@
+# Contributing to Harvest
+
+## Adding Your Blog
+
+1. Fork this repository
+2. Add your RSS feed URL to `whitelist.toml`
+3. Create a PR
+
+Requirements:
+- Must be a personal blog RSS feed
+- No sitemaps
+- Feed must include title, link, and publication date
+- Posts should be tech-focused
+
+## Development
+
+### Setup
+```bash
+go mod tidy
+```
+
+### Build
+```bash
+go build -o harvest ./src
+```
+
+### Run
+```bash
+./harvest
+```
+
+### Structure
+- `src/`: Go source code
+- `output/`: Generated blog post markdown
+- `whitelist.toml`: RSS feed list
diff --git a/docs/TECHNICAL.md b/docs/TECHNICAL.md
@@ -0,0 +1,43 @@
+# Technical Details
+
+## RSS Support
+
+Handles a bunch of RSS formats because everyone implements them differently:
+
+### Core Fields
+```go
+type Item struct {
+    Title       string `xml:"title"`
+    Link        string `xml:"link"`
+    PubDate     string `xml:"pubDate"`      // regular rss
+    Date        string `xml:"date"`         // some use this
+    Published   string `xml:"published"`    // atom folks
+    Updated     string `xml:"updated"`      // fallback
+}
+```
+
+### Date Hell
+RSS feeds use whatever date format they feel like. We handle these, among others (the full list is `dateFormats` in `src/feed/fetcher.go`):
+```go
+var dateFormats = []string{
+    time.RFC1123Z,              // most RSS
+    time.RFC3339,               // atom's favorite
+    "02 Jan 2006 15:04 -0700",  // why do people use this
+    "2006-01-02",               // at least it's simple
+}
+```
+
+### Content Cleanup
+- Strips HTML (nobody needs that in a feed)
+- Fixes entities (`&amp;` → `&`)
+- Handles missing descriptions (minimalist blogs)
+
+## How It Works
+
+1. Reads feeds from `whitelist.toml`
+2. Downloads them all concurrently, one goroutine per feed (because waiting sucks)
+3. Parses XML, prays it's valid
+4. Cleans up the mess
+5. Dumps a nice markdown file in `output/`
+
+The code's modular so we can add new formats when someone inevitably implements RSS wrong again. This has been so fun to troubleshoot.
diff --git a/go.mod b/go.mod
@@ -0,0 +1,5 @@
+module github.com/UW-UPL/harvest
+
+go 1.21
+
+require github.com/pelletier/go-toml/v2 v2.1.1
diff --git a/go.sum b/go.sum
@@ -0,0 +1,18 @@
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pelletier/go-toml/v2 v2.1.1 h1:LWAJwfNvjQZCFIDKWYQaM62NcYeYViCmWIwmOStowAI=
+github.com/pelletier/go-toml/v2 v2.1.1/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/src/feed/fetcher.go b/src/feed/fetcher.go
@@ -0,0 +1,172 @@
+package feed
+
+import (
+	"encoding/xml"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"regexp"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+)
+
+// dateFormats lists the time layouts parseDate tries, in order, against each
+// candidate date string; the first layout that parses wins.
+//
+// New layouts are appended at the end so that any string an earlier layout
+// already accepted keeps parsing exactly as before.
+var dateFormats = []string{
+	time.RFC1123Z,
+	time.RFC1123,
+	time.RFC3339,
+	time.RFC3339Nano,
+	"2006-01-02T15:04:05Z",
+	"2006-01-02 15:04:05 -0700",
+	"02 Jan 2006 15:04 -0700",
+	"Mon, 02 Jan 2006 15:04:05 GMT",
+	"02 Jan 2006 15:04 +0000",
+	"2006-01-02",
+	"January 2, 2006",
+	// The "02" day element only matches a two-digit day, so feeds that write
+	// an unpadded day ("Tue, 7 Jan 2025 ...") need layouts using "2" instead.
+	"Mon, 2 Jan 2006 15:04:05 -0700",
+	"Mon, 2 Jan 2006 15:04:05 MST",
+}
+
+func cleanHTML(input string) string {
+	// first, remove HTML tags
+	tagRegex := regexp.MustCompile("<[^>]*>")
+	cleaned := tagRegex.ReplaceAllString(input, "")
+
+	// & convert HTML entities
+	cleaned = strings.ReplaceAll(cleaned, "&nbsp;", " ")
+	cleaned = strings.ReplaceAll(cleaned, "&amp;", "&")
+	cleaned = strings.ReplaceAll(cleaned, "&lt;", "<")
+	cleaned = strings.ReplaceAll(cleaned, "&gt;", ">")
+	cleaned = strings.ReplaceAll(cleaned, "&quot;", "\"")
+
+	// & normalize whitespace
+	wsRegex := regexp.MustCompile(`\s+`)
+	cleaned = wsRegex.ReplaceAllString(cleaned, " ")
+
+	return strings.TrimSpace(cleaned)
+}
+
+// parseDate returns the publication time of item.
+//
+// The item's PubDate, Date, Published and Updated fields are tried in that
+// order, each against every layout in dateFormats; the first successful parse
+// is returned. When nothing parses, a warning naming the item's title is
+// logged and the current time is returned.
+func parseDate(item Item) time.Time {
+	candidates := [...]string{item.PubDate, item.Date, item.Published, item.Updated}
+
+	for _, raw := range candidates {
+		// The fields hold raw XML character data, so a pretty-printed feed
+		// leaves newlines and indentation around the value; time.Parse
+		// rejects any such padding.
+		value := strings.TrimSpace(raw)
+		if value == "" {
+			continue
+		}
+
+		for _, layout := range dateFormats {
+			if t, err := time.Parse(layout, value); err == nil {
+				return t
+			}
+		}
+	}
+
+	log.Printf("warn: Could not parse any date from item %s", item.Title)
+	return time.Now()
+}
+
+// getDescription returns a plain-text summary for item.
+//
+// The item's Description, Content and Encoded fields are tried in that order.
+// Each non-empty one is passed through cleanHTML, and the first that still
+// has text afterwards is returned. A field made only of markup (for example
+// a lone <img> tag) cleans to "" and is skipped rather than returned as an
+// empty summary. If no field yields text, a fixed placeholder is returned.
+func getDescription(item Item) string {
+	for _, raw := range []string{item.Description, item.Content, item.Encoded} {
+		if raw == "" {
+			continue
+		}
+		if text := cleanHTML(raw); text != "" {
+			return text
+		}
+	}
+
+	return "Visit post for details."
+}
+
+// getAuthor picks the name to credit for item.
+//
+// It returns the item's Author if set, otherwise its Creator, otherwise
+// channelTitle. Surrounding whitespace is trimmed first, so a field holding
+// only whitespace counts as unset. Such a value arises when the XML element
+// contains nested elements but no text of its own.
+func getAuthor(item Item, channelTitle string) string {
+	for _, name := range []string{item.Author, item.Creator} {
+		if name = strings.TrimSpace(name); name != "" {
+			return name
+		}
+	}
+	return channelTitle
+}
+
+// fetchTimeout bounds one complete feed download: connecting, any redirects,
+// and reading the response body.
+const fetchTimeout = 30 * time.Second
+
+// feedClient is shared by every FetchFeed call. http.Get goes through
+// http.DefaultClient, which has no timeout, so a single stalled server could
+// block the whole run indefinitely.
+var feedClient = &http.Client{Timeout: fetchTimeout}
+
+// FetchFeed downloads the feed at url and converts its items to BlogPosts.
+//
+// It returns an error if the request fails, the server answers with a
+// non-2xx status, the body cannot be read, or the body is not well-formed
+// XML. A feed that parses but contains no items yields (nil, nil).
+func FetchFeed(url string) ([]BlogPost, error) {
+	resp, err := feedClient.Get(url)
+	if err != nil {
+		return nil, fmt.Errorf("fetching feed %s: %w", url, err)
+	}
+	defer resp.Body.Close()
+
+	// Reject error responses up front instead of handing an error page to
+	// the XML parser.
+	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
+		return nil, fmt.Errorf("fetching feed %s: unexpected status %s", url, resp.Status)
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("reading res from %s: %w", url, err)
+	}
+
+	var feed Feed
+	if err := xml.Unmarshal(body, &feed); err != nil {
+		return nil, fmt.Errorf("parsing feed %s: %w", url, err)
+	}
+
+	// we need to worry about both RSS *AND* Atom feeds: prefer <item>s under
+	// <channel>, then <entry>s under <channel>, then <entry>s under the root.
+	items := feed.Channel.Items
+	if len(items) == 0 {
+		items = feed.Channel.Entries
+	}
+	if len(items) == 0 {
+		items = feed.Entries
+	}
+	if len(items) == 0 {
+		return nil, nil
+	}
+
+	posts := make([]BlogPost, 0, len(items))
+	for _, item := range items {
+		posts = append(posts, BlogPost{
+			Title:   item.Title,
+			Link:    item.Link,
+			Date:    parseDate(item),
+			Author:  getAuthor(item, feed.Channel.Title),
+			Summary: getDescription(item),
+		})
+	}
+
+	return posts, nil
+}
+
+// FetchAllFeeds fetches every URL in feeds concurrently and returns all posts
+// sorted newest first.
+//
+// A feed that fails to fetch or parse is logged and skipped; it does not
+// affect the others. Posts with equal dates keep the order of feeds and,
+// within one feed, the order FetchFeed returned them in, so the result does
+// not depend on which download happened to finish first.
+func FetchAllFeeds(feeds []string) []BlogPost {
+	// One slot per feed. Each goroutine writes only its own index, so no
+	// lock is needed, and the slots preserve the input order.
+	results := make([][]BlogPost, len(feeds))
+
+	var wg sync.WaitGroup
+	for i, feedURL := range feeds {
+		wg.Add(1)
+		go func(slot int, url string) {
+			defer wg.Done()
+
+			feedPosts, err := FetchFeed(url)
+			if err != nil {
+				log.Printf("err fetching %s: %v", url, err)
+				return
+			}
+			results[slot] = feedPosts
+		}(i, feedURL)
+	}
+
+	wg.Wait()
+
+	var posts []BlogPost
+	for _, feedPosts := range results {
+		posts = append(posts, feedPosts...)
+	}
+
+	// Stable, so ties keep the deterministic order built above.
+	sort.SliceStable(posts, func(i, j int) bool {
+		return posts[i].Date.After(posts[j].Date)
+	})
+
+	return posts
+}
diff --git a/src/feed/types.go b/src/feed/types.go
@@ -0,0 +1,42 @@
+package feed
+
+import "time"
+
+// Config is the TOML-decoded feed list.
+//
+// Feeds receives the values of the `feeds` key. Presumably these are the feed
+// URLs listed in whitelist.toml — TODO confirm against the loader in main.go.
+type Config struct {
+	Feeds []string `toml:"feeds"`
+}
+
+// Item is one post as decoded from feed XML.
+//
+// The same struct is used for <item> and <entry> elements, so it carries the
+// union of the child elements the fetcher looks at; an element that is absent
+// from a feed leaves its field empty.
+//
+// parseDate tries PubDate, Date, Published and Updated in that order;
+// getAuthor tries Author then Creator; getDescription tries Description,
+// Content and Encoded.
+//
+// NOTE(review): every field is a plain string, so it receives only the
+// element's own character data. Atom normally carries the URL in
+// <link href="..."/> and the author in a nested <name>, which would leave
+// Link empty and Author whitespace-only — confirm against a real Atom feed.
+type Item struct {
+	Title       string `xml:"title"`
+	Link        string `xml:"link"`
+	PubDate     string `xml:"pubDate"`
+	Date        string `xml:"date"`
+	Published   string `xml:"published"`
+	Updated     string `xml:"updated"`
+	Author      string `xml:"author"`
+	Creator     string `xml:"creator"`
+	Description string `xml:"description"`
+	Content     string `xml:"content"`
+	Encoded     string `xml:"encoded"`
+}
+
+// Channel mirrors the <channel> element of a feed document.
+//
+// Items collects its <item> children and Entries its <entry> children;
+// FetchFeed reads Items first and falls back to Entries when there are none.
+// Title is also what FetchFeed passes to getAuthor as the fallback author.
+type Channel struct {
+	Title       string `xml:"title"`
+	Link        string `xml:"link"`
+	Description string `xml:"description"`
+	Items       []Item `xml:"item"`
+	Entries     []Item `xml:"entry"`
+}
+
+// Feed is the root value FetchFeed unmarshals a feed document into.
+//
+// Channel is filled from a <channel> child of the root element. Entries is
+// filled from <entry> elements directly under the root, which FetchFeed uses
+// only when the channel supplied no items or entries.
+type Feed struct {
+	Channel Channel `xml:"channel"`
+	Entries []Item  `xml:"entry"`
+}
+
+// BlogPost is the normalized form of a feed item that FetchFeed returns.
+type BlogPost struct {
+	// Title and Link are copied verbatim from the decoded item.
+	Title   string
+	Link    string
+	// Date comes from parseDate; it is the time of the fetch when no date in
+	// the item could be parsed.
+	Date    time.Time
+	// Author comes from getAuthor, which falls back to the channel title.
+	Author  string
+	// Summary comes from getDescription: plain text, or a fixed placeholder.
+	Summary string
+}
diff --git a/src/main.go b/src/main.go
diff --git a/src/markdown/generator.go b/src/markdown/generator.go
diff --git a/whitelist.toml b/whitelist.toml