Skip to content

Commit

Permalink
A new way of sanitizing HTML, updated deps. (#143)
Browse files Browse the repository at this point in the history
* Updated dependencies
* Convert to and from Markdown to sanitise HTML
* De-dupe the final image list against images embedded in the content
  • Loading branch information
TheMightyGit authored Nov 10, 2024
1 parent 5717e6e commit 6399333
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 193 deletions.
2 changes: 1 addition & 1 deletion badge.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 11 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
module github.com/TheMightyGit/rssole

go 1.22
go 1.23

toolchain go1.23.2

require (
github.com/JohannesKaufmann/html-to-markdown/v2 v2.1.0
github.com/NYTimes/gziphandler v1.1.1
github.com/andybalholm/cascadia v1.3.2
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81
github.com/k3a/html2text v1.2.1
github.com/mmcdole/gofeed v1.3.0
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948
golang.org/x/net v0.28.0
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f
golang.org/x/net v0.31.0
)

require (
github.com/PuerkitoBio/goquery v1.9.2 // indirect
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 // indirect
github.com/PuerkitoBio/goquery v1.10.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mmcdole/goxpp v1.1.1 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
golang.org/x/text v0.17.0 // indirect
golang.org/x/text v0.20.0 // indirect
)
47 changes: 22 additions & 25 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 h1:TDlO/A2QqlNhdvH+hDnu8cv1rouhfHgLwhGzJeHGgFQ=
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364/go.mod h1:U+fBZLZTYiZCOwQUT04V3J4I+0TxyLNnj0R8nBlO4fk=
github.com/JohannesKaufmann/html-to-markdown/v2 v2.1.0 h1:k6vBBqTmQOqLnaYkELgCU/F9xVPt3xhO1754hvlP/HM=
github.com/JohannesKaufmann/html-to-markdown/v2 v2.1.0/go.mod h1:djCj8ehU80KpSAepQciLcNzrp8hwZ1vQFnYKRo4/Cio=
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81 h1:5lyLWsV+qCkoYqsKUDuycESh9DEIPVKN6iCFeL7ag50=
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
Expand All @@ -19,21 +22,23 @@ github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY=
github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA=
github.com/mmcdole/gofeed v1.2.1 h1:tPbFN+mfOLcM1kDF1x2c/N68ChbdBatkppdzf/vDe1s=
github.com/mmcdole/gofeed v1.2.1/go.mod h1:2wVInNpgmC85q16QTTuwbuKxtKkHLCDDtf0dCmnrNr4=
github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4=
github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE=
github.com/mmcdole/goxpp v1.1.0 h1:WwslZNF7KNAXTFuzRtn/OKZxFLJAAyOA9w82mDz2ZGI=
github.com/mmcdole/goxpp v1.1.0/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
github.com/mmcdole/goxpp v1.1.1 h1:RGIX+D6iQRIunGHrKqnA2+700XMCnNv0bAOOv5MUhx8=
github.com/mmcdole/goxpp v1.1.1/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de h1:D5x39vF5KCwKQaw+OC9ZPiLVHXz3UFw2+psEX+gYcto=
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de/go.mod h1:kJun4WP5gFuHZgRjZUWWuH1DTxCtxbHDOIJsudS8jzY=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sebdah/goldie/v2 v2.5.5 h1:rx1mwF95RxZ3/83sdS4Yp7t2C5TCokvWP4TBRbAyEWY=
github.com/sebdah/goldie/v2 v2.5.5/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
Expand All @@ -43,32 +48,27 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.7.8 h1:iERMLn0/QJeHFhxSt3p6PeN9mGnvIKSpG9YYorDMnic=
github.com/yuin/goldmark v1.7.8/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/exp v0.0.0-20240213143201-ec583247a57a h1:HinSgX1tJRX3KsL//Gxynpw5CTOAIPhgL4W8PNiIpVE=
golang.org/x/exp v0.0.0-20240213143201-ec583247a57a/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc=
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 h1:kx6Ds3MlpiUHKj7syVnbp57++8WpuKPcR5yjLBjvLEA=
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ=
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo=
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
Expand All @@ -80,14 +80,11 @@ golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc=
golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
Expand Down
173 changes: 52 additions & 121 deletions internal/rssole/item.go
Original file line number Diff line number Diff line change
@@ -1,30 +1,31 @@
package rssole

import (
"bytes"
"crypto/md5"
"encoding/hex"
"log/slog"
"net/url"
"regexp"
"strings"
"sync"

htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
"github.com/gomarkdown/markdown"
"github.com/gomarkdown/markdown/html"
"github.com/gomarkdown/markdown/parser"
"github.com/k3a/html2text"
"github.com/mmcdole/gofeed"
"golang.org/x/exp/slog"
"golang.org/x/net/html"
"github.com/mpvl/unique"
)

type wrappedItem struct {
IsUnread bool
Feed *feed
*gofeed.Item

summary *string
description *string
descriptionImagesForDedupe *[]string
images *[]string
onceDescription sync.Once
summary *string
description *string
images *[]string
onceDescription sync.Once
}

func (w *wrappedItem) MarkReadID() string {
Expand All @@ -46,14 +47,9 @@ func (w *wrappedItem) Images() []string {

images := []string{}

// NOTE: we exclude images that already appear in the description (gibiz)

// standard supplied image
if w.Item.Image != nil {
if !w.isDescriptionImage(w.Item.Image.URL) {
// fmt.Println(w.Item.Image.URL)
images = append(images, w.Item.Image.URL)
}
images = append(images, w.Item.Image.URL)
}

// mastodon/gibiz images
Expand All @@ -62,11 +58,7 @@ func (w *wrappedItem) Images() []string {
for _, v := range content {
if v.Attrs["medium"] == "image" {
imageURL := v.Attrs["url"]
if !w.isDescriptionImage(imageURL) {
// fmt.Println(w.Description())
// fmt.Printf("%v = %+v\n", k, imageUrl)
images = append(images, imageURL)
}
images = append(images, imageURL)
}
}
}
Expand All @@ -91,34 +83,28 @@ func (w *wrappedItem) Images() []string {
}
}

w.images = &images

return *w.images
}
// Now... remove any meta images that are embedded in the description.
// Ignore any query string args.

func (w *wrappedItem) isDescriptionImage(src string) bool {
// strip anything after ? to get rid of query string part
srcNoQueryString := strings.Split(src, "?")[0]
dedupedImages := []string{}

if w.descriptionImagesForDedupe == nil {
// force lazy load if it hasn't already
_ = w.Description()
}

for _, v := range *w.descriptionImagesForDedupe {
// fmt.Println(v, "==", src)
if v == srcNoQueryString {
return true
// Remove any image sources already within the description...
for _, img := range images {
srcNoQueryString := strings.Split(img, "?")[0]
if !strings.Contains(w.Description(), srcNoQueryString) {
dedupedImages = append(dedupedImages, img)
} else {
slog.Info("dedeuped meta image as already found in content", "src", img)
}
}

return false
}
// Remove any internal duplicates within the list...
unique.Strings(&dedupedImages)

var (
tagsToRemoveRe = regexp.MustCompile("script|style|link|meta|iframe|form")
attrsToRemoveRe = regexp.MustCompile("style|class|hx-.*|data-.*|srcset|width|height|sizes|loading|decoding|target")
)
w.images = &dedupedImages

return *w.images
}

func (w *wrappedItem) Description() string {
w.onceDescription.Do(func() {
Expand Down Expand Up @@ -156,89 +142,32 @@ func (w *wrappedItem) Description() string {
}
}

// try and sanitise any html
doc, err := html.Parse(strings.NewReader(*desc))
if err != nil {
// failed to sanitise, so just return as is...
slog.Warn("html.Parse failed, returning unsanitised content", "error", err)
// Now simplify the (potential) HTML by converting
// it to and from markdown.

w.description = desc
} else {
w.descriptionImagesForDedupe = &[]string{}
toDelete := []*html.Node{}

var f func(*html.Node)
f = func(n *html.Node) {
// fmt.Println(n)
if n.Type == html.ElementNode {
// fmt.Println(n.Data)
if tagsToRemoveRe.MatchString(n.Data) {
// fmt.Println("removing", n.Data, "tag")
toDelete = append(toDelete, n)

return
}

allowedAttrs := []html.Attribute{}

for i := range n.Attr {
if !attrsToRemoveRe.MatchString(n.Attr[i].Key) {
allowedAttrs = append(allowedAttrs, n.Attr[i])
}
}

n.Attr = allowedAttrs

if n.Data == "a" {
// fmt.Println("making", n.Data, "tag target new tab")
n.Attr = append(n.Attr, html.Attribute{
Namespace: "",
Key: "target",
Val: "_new",
})
// disable href if it starts with #
for i := range n.Attr {
if n.Attr[i].Key == "href" && n.Attr[i].Val[0] == '#' {
n.Attr[i].Key = "xxxhref" // easier than removing the attr

break
}
}
}

if n.Data == "img" || n.Data == "svg" {
// fmt.Println("making", n.Data, "tag style max-width 60%")
n.Attr = append(n.Attr, html.Attribute{
Namespace: "",
Key: "style",
Val: "max-width: 60%;",
})
// keep a note of images so we can de-dupe attached
// images that also appear in the content.
for _, a := range n.Attr {
if a.Key == "src" {
// strip anything after ? to get rid of query string part
bits := strings.Split(a.Val, "?")
*w.descriptionImagesForDedupe = append(*w.descriptionImagesForDedupe, bits[0])
}
}
}
}
// First, convert the arbitrary feed HTML to Markdown.
doc, err := htmltomarkdown.ConvertString(*desc)

for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
switch {
case err != nil:
slog.Warn("htmltomarkdown.ConvertString failed, returning unsanitised content", "error", err)

for _, n := range toDelete {
n.Parent.RemoveChild(n)
}
w.description = desc
case doc == "":
slog.Warn("htmltomarkdown.ConvertString result blank, using original.")

renderBuf := bytes.NewBufferString("")
_ = html.Render(renderBuf, doc)
desc := renderBuf.String()
w.description = &desc
w.description = desc
default:
// parse markdown
p := parser.NewWithExtensions(parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock)
md := p.Parse([]byte(doc))

// Render to HTML. No image-skipping flag is set below, so images embedded in the
// content stay in the output; Images() de-dupes metadata images against this HTML.
renderer := html.NewRenderer(html.RendererOptions{
Flags: html.CommonFlags | html.HrefTargetBlank,
})
mdHTML := string(markdown.Render(md, renderer))
w.description = &mdHTML
}
})

Expand All @@ -257,6 +186,8 @@ func (w *wrappedItem) Summary() string {
plainDesc = plainDesc[:maxDescriptionLength]
}

plainDesc = strings.TrimSpace(plainDesc)

// if summary is identical to title return nothing
if plainDesc == w.Title {
plainDesc = ""
Expand Down
Loading

0 comments on commit 6399333

Please sign in to comment.