Skip to content

Commit

Permalink
Follow up to #150: Added support for JSON feed (#154)
Browse files Browse the repository at this point in the history
* Added the structure for JSON Feed

* Updated package name of json and added parser and test wrapper

* Added tests for sample json feed parser

* Added detector for feed type json

* Removed unwanted dependencies

* Added parser and empty translator for json feeds

* Added translator functions for json feed

* Added test for json translator

* Added tests for content text and banner image

* Added tests for invalid feed and string output

* Added tests for json in parser

* Updated README

* Fixed README formatting

* check for xml first

* update dependencies

Co-authored-by: Sudhanshu Raheja <[email protected]>
  • Loading branch information
nkanaev and sudhanshuraheja authored Sep 6, 2020
1 parent 193d002 commit a9ae673
Show file tree
Hide file tree
Showing 19 changed files with 896 additions and 90 deletions.
151 changes: 83 additions & 68 deletions README.md

Large diffs are not rendered by default.

60 changes: 45 additions & 15 deletions detector.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package gofeed

import (
"bytes"
"io"
"strings"

jsoniter "github.com/json-iterator/go"
"github.com/mmcdole/gofeed/internal/shared"
xpp "github.com/mmcdole/goxpp"
)
Expand All @@ -20,29 +22,57 @@ const (
FeedTypeAtom
// FeedTypeRSS represents an RSS feed
FeedTypeRSS
// FeedTypeJSON represents a JSON feed
FeedTypeJSON
)

// DetectFeedType attempts to determine the type of feed
// by looking for specific xml elements unique to the
// various feed types.
func DetectFeedType(feed io.Reader) FeedType {
p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
buffer := new(bytes.Buffer)
buffer.ReadFrom(feed)

xmlBase := shared.XMLBase{}
_, err := xmlBase.FindRoot(p)
if err != nil {
return FeedTypeUnknown
// remove leading whitespace (if exists)
var firstChar byte
for {
ch, err := buffer.ReadByte()
if err != nil {
return FeedTypeUnknown
}
if ch != ' ' && ch != '\t' {
firstChar = ch
buffer.UnreadByte()
break
}
}

name := strings.ToLower(p.Name)
switch name {
case "rdf":
return FeedTypeRSS
case "rss":
return FeedTypeRSS
case "feed":
return FeedTypeAtom
default:
return FeedTypeUnknown
if firstChar == '<' {
// Check if it's an XML based feed
p := xpp.NewXMLPullParser(bytes.NewReader(buffer.Bytes()), false, shared.NewReaderLabel)

xmlBase := shared.XMLBase{}
_, err := xmlBase.FindRoot(p)
if err != nil {
return FeedTypeUnknown
}

name := strings.ToLower(p.Name)
switch name {
case "rdf":
return FeedTypeRSS
case "rss":
return FeedTypeRSS
case "feed":
return FeedTypeAtom
default:
return FeedTypeUnknown
}
} else if firstChar == '{' {
// Check if document is valid JSON
if jsoniter.Valid(buffer.Bytes()) {
return FeedTypeJSON
}
}
return FeedTypeUnknown
}
1 change: 1 addition & 0 deletions detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ func TestDetectFeedType(t *testing.T) {
{"rdf_feed.xml", gofeed.FeedTypeRSS},
{"unknown_feed.xml", gofeed.FeedTypeUnknown},
{"empty_feed.xml", gofeed.FeedTypeUnknown},
{"json_feed.json", gofeed.FeedTypeJSON},
}

for _, test := range feedTypeTests {
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ go 1.14
require (
github.com/PuerkitoBio/goquery v1.5.1
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/json-iterator/go v1.1.10
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf
github.com/stretchr/testify v1.2.2
github.com/stretchr/testify v1.3.0
github.com/urfave/cli v1.22.3
golang.org/x/net v0.0.0-20200301022130-244492dfa37a
golang.org/x/text v0.3.2
Expand Down
11 changes: 11 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,29 @@ github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5z
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68=
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf h1:sWGE2v+hO0Nd4yFU/S/mDBM5plIU8v/Qhfz41hkDIAI=
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf/go.mod h1:pasqhqstspkosTneA62Nc+2p9SOBBYAPbnmRRWPQ0V8=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742 h1:Esafd1046DLDQ0W1YjYsBW+p8U2u7vzgW2SQVmlNazg=
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/urfave/cli v1.22.3 h1:FpNT6zq26xNpHZy08emi755QwzLPs6Pukqjlc7RfOMU=
github.com/urfave/cli v1.22.3/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
Expand Down
62 changes: 62 additions & 0 deletions json/feed.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package json

import "encoding/json"

// Feed describes the structure for JSON Feed v1.0.
// https://www.jsonfeed.org/version/1/
//
// Field order is the order in which keys are emitted when the feed is
// marshalled. The field comments paraphrase the specification.
type Feed struct {
	// version (required, string) is the URL of the version of the format the feed uses.
	Version string `json:"version"`
	// title (required, string) is the name of the feed.
	// NOTE(review): tagged omitempty although the spec marks it required, so an
	// empty title is dropped on output — confirm this is intended.
	Title string `json:"title,omitempty"`
	// home_page_url (optional but strongly recommended, string) is the URL of the
	// resource that the feed describes. This resource should be an HTML page.
	HomePageURL string `json:"home_page_url,omitempty"`
	// feed_url (optional but strongly recommended, string) is the URL of the feed,
	// and serves as the unique identifier for the feed.
	FeedURL string `json:"feed_url,omitempty"`
	// description (optional, string).
	Description string `json:"description,omitempty"`
	// user_comment (optional, string) is a description of the purpose of the feed.
	// It is meant for people looking at the raw JSON and should be ignored by
	// feed readers.
	UserComment string `json:"user_comment,omitempty"`
	// next_url (optional, string) is the URL of a feed that provides the next n
	// items. This allows for pagination.
	NextURL string `json:"next_url,omitempty"`
	// icon (optional, string) is the URL of an image for the feed suitable to be
	// used in a timeline. It should be square and relatively large, such as 512 x 512.
	Icon string `json:"icon,omitempty"`
	// favicon (optional, string) is the URL of an image for the feed suitable to
	// be used in a source list. It should be square and relatively small, but not
	// smaller than 64 x 64.
	Favicon string `json:"favicon,omitempty"`
	// author (optional, object) specifies the feed author. All members of the
	// author object are optional, but if an author object is provided at least
	// one member is required.
	Author *Author `json:"author,omitempty"`
	// expired (optional, boolean) says whether or not the feed is finished, that
	// is, whether or not it will ever update again.
	Expired bool `json:"expired,omitempty"`
	// items is an array, and is required.
	Items []*Item `json:"items"`
	// TODO Hubs: hubs (very optional, array of objects) describes endpoints that
	// can be used to subscribe to real-time notifications from the publisher of
	// this feed. Each object has a type and url, both of which are required. See
	// the spec section "Subscribing to Real-time Notifications" for details.
	// TODO Extensions
}

// String renders the feed as indented JSON. If marshalling fails the
// error is discarded and the empty string is returned.
func (f Feed) String() string {
	encoded, err := json.MarshalIndent(f, "", " ")
	if err != nil {
		return ""
	}
	return string(encoded)
}

// Item defines an item in the feed.
//
// Field order is the order in which keys are emitted when an item is
// marshalled. The field comments paraphrase the JSON Feed v1 specification.
type Item struct {
	// id (required, string) is unique for that item for that feed over time.
	// Ideally it is the full URL of the resource described by the item, since
	// URLs make great unique identifiers. The spec says a reader must coerce an
	// id presented as a number or other type to a string.
	// NOTE(review): this field is a plain string with no custom decoding visible
	// here, so a non-string id is presumably not coerced — confirm.
	ID string `json:"id,omitempty"`
	// url (optional, string) is the URL of the resource described by the item.
	// It's the permalink.
	URL string `json:"url,omitempty"`
	// external_url (very optional, string) is the URL of a page elsewhere. This
	// is especially useful for linkblogs.
	ExternalURL string `json:"external_url,omitempty"`
	// title (optional, string) is plain text. Microblog items in particular may
	// omit titles.
	Title string `json:"title,omitempty"`
	// content_html and content_text are each optional strings, but one or both
	// must be present. This is the HTML or plain text of the item. The only place
	// HTML is allowed in this format is content_html. A Twitter-like service
	// might use content_text, while a blog might use content_html; the choice
	// does not have to be the same for each item in a feed.
	ContentHTML string `json:"content_html,omitempty"`
	// See ContentHTML.
	ContentText string `json:"content_text,omitempty"`
	// summary (optional, string) is a plain text sentence or two describing the item.
	Summary string `json:"summary,omitempty"`
	// image (optional, string) is the URL of the main image for the item. This
	// image may also appear in the content_html.
	Image string `json:"image,omitempty"`
	// banner_image (optional, string) is the URL of an image to use as a banner.
	BannerImage string `json:"banner_image,omitempty"`
	// date_published (optional, string) specifies the date in RFC 3339 format
	// (example: 2010-02-07T14:04:00-05:00). Stored here as the raw string.
	DatePublished string `json:"date_published,omitempty"`
	// date_modified (optional, string) specifies the modification date in
	// RFC 3339 format. Stored here as the raw string.
	DateModified string `json:"date_modified,omitempty"`
	// author (optional, object) has the same structure as the top-level author.
	// Per the spec, if not specified in an item then the top-level author, if
	// present, is the author of the item.
	Author *Author `json:"author,omitempty"`
	// tags (optional, array of strings) can have any plain text values. Tags tend
	// to be just one word, but they may be anything.
	Tags []string `json:"tags,omitempty"`
	// attachments (optional, array) lists related resources. Podcasts, for
	// instance, would include an attachment that's an audio or video file. An
	// individual item may have one or more attachments. The pointer is nil when
	// the key is absent.
	Attachments *[]Attachments `json:"attachments,omitempty"`
	// TODO Extensions
}

// Author defines the feed author structure. All members are optional, but
// the specification requires at least one of them when an author object is
// provided.
type Author struct {
	// name (optional, string) is the author's name.
	Name string `json:"name,omitempty"`
	// url (optional, string) is the URL of a site owned by the author.
	URL string `json:"url,omitempty"`
	// avatar (optional, string) is the URL for an image for the author. It
	// should be square and relatively large, such as 512 x 512.
	Avatar string `json:"avatar,omitempty"`
}

// Attachments defines the structure for a single related resource of an
// item. Podcasts, for instance, would include an attachment that's an audio
// or video file.
type Attachments struct {
	// url (required, string) specifies the location of the attachment.
	URL string `json:"url,omitempty"`
	// mime_type (required, string) specifies the type of the attachment, such
	// as "audio/mpeg".
	MimeType string `json:"mime_type,omitempty"`
	// title (optional, string) is a name for the attachment. Per the spec, if
	// there are multiple attachments and two or more have the exact same title,
	// they are considered alternate representations of the same thing (e.g. one
	// audio recording offered in different formats).
	Title string `json:"title,omitempty"`
	// size_in_bytes (optional, number) specifies how large the file is.
	SizeInBytes int64 `json:"size_in_bytes,omitempty"`
	// duration_in_seconds (optional, number) specifies how long it takes to
	// listen to or watch, when played at normal speed.
	DurationInSeconds int64 `json:"duration_in_seconds,omitempty"`
}
29 changes: 29 additions & 0 deletions json/parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package json

import (
"bytes"
"io"

jsoniter "github.com/json-iterator/go"
)

var (
	// j is the json-iterator API used by Parser.Parse to decode feeds. It is
	// jsoniter's ConfigCompatibleWithStandardLibrary preset.
	j = jsoniter.ConfigCompatibleWithStandardLibrary
)

// Parser is a JSON Feed parser. It carries no state.
type Parser struct{}

// Parse reads feed to the end and decodes its content as a JSON Feed
// document.
//
// It returns the decoded Feed on success. If reading from feed fails, or the
// content cannot be unmarshalled, it returns a nil Feed and the underlying
// error unchanged.
func (p *Parser) Parse(feed io.Reader) (*Feed, error) {
	var buf bytes.Buffer
	// Surface read failures instead of decoding a truncated document, which
	// would otherwise show up as a misleading JSON syntax error.
	if _, err := buf.ReadFrom(feed); err != nil {
		return nil, err
	}

	jsonFeed := &Feed{}
	if err := j.Unmarshal(buf.Bytes(), jsonFeed); err != nil {
		return nil, err
	}
	return jsonFeed, nil
}
78 changes: 78 additions & 0 deletions json/parser_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package json_test

import (
"bytes"
"fmt"
"io/ioutil"
"testing"

"github.com/mmcdole/gofeed/json"
"github.com/stretchr/testify/assert"
)

// Tests

// TestParser_Parse checks that malformed JSON is rejected and that every
// field of the sample feed is decoded into the matching struct field.
//
// Fixture and parse failures abort the test with a message instead of
// panicking on a nil error or nil feed.
func TestParser_Parse(t *testing.T) {
	fp := &json.Parser{}

	// load reads a fixture from the shared testdata directory and stops the
	// test right away when it cannot be read.
	load := func(name string) []byte {
		path := fmt.Sprintf("../testdata/parser/json/%s.json", name)
		data, err := ioutil.ReadFile(path)
		if err != nil {
			t.Fatalf("reading fixture %s: %v", path, err)
		}
		return data
	}

	// Malformed input must be reported as an error.
	_, err := fp.Parse(bytes.NewReader(load("invalid")))
	if err == nil {
		t.Fatal("parsing invalid.json: expected an error, got nil")
	}
	assert.Contains(t, err.Error(), "expect }")

	// The sample feed populates every supported field.
	actual, err := fp.Parse(bytes.NewReader(load("sample")))
	if err != nil {
		t.Fatalf("parsing sample.json: %v", err)
	}

	assert.Equal(t, "1.0", actual.Version)
	assert.Equal(t, "title", actual.Title)
	assert.Equal(t, "https://sample-json-feed.com", actual.HomePageURL)
	assert.Equal(t, "https://sample-json-feed.com/feed.json", actual.FeedURL)
	assert.Equal(t, "description", actual.Description)
	assert.Equal(t, "user_comment", actual.UserComment)
	assert.Equal(t, "https://sample-json-feed.com/feed.json?next=500", actual.NextURL)
	assert.Equal(t, "https://sample-json-feed.com/icon.png", actual.Icon)
	assert.Equal(t, "https://sample-json-feed.com/favicon.png", actual.Favicon)
	assert.Equal(t, false, actual.Expired)

	if actual.Author == nil {
		t.Fatal("sample.json: feed author was not decoded")
	}
	assert.Equal(t, "author_name", actual.Author.Name)
	assert.Equal(t, "https://sample-feed-author.com", actual.Author.URL)
	assert.Equal(t, "https://sample-feed-author.com/me.png", actual.Author.Avatar)

	if len(actual.Items) == 0 || actual.Items[0] == nil {
		t.Fatal("sample.json: no items were decoded")
	}
	item := actual.Items[0]
	assert.Equal(t, "id", item.ID)
	assert.Equal(t, "https://sample-json-feed.com/id", item.URL)
	assert.Equal(t, "https://sample-json-feed.com/external", item.ExternalURL)
	assert.Equal(t, "title", item.Title)
	assert.Contains(t, item.ContentHTML, "content_html")
	assert.Equal(t, "content_text", item.ContentText)
	assert.Equal(t, "summary", item.Summary)
	assert.Equal(t, "https://sample-json-feed.com/image.png", item.Image)
	assert.Equal(t, "https://sample-json-feed.com/banner_image.png", item.BannerImage)
	assert.Equal(t, "2019-10-12T07:20:50.52Z", item.DatePublished)
	assert.Equal(t, "2019-10-12T07:20:50.52Z", item.DateModified)

	if item.Author == nil {
		t.Fatal("sample.json: item author was not decoded")
	}
	assert.Equal(t, "author_name", item.Author.Name)
	assert.Equal(t, "https://sample-feed-author.com", item.Author.URL)
	assert.Equal(t, "https://sample-feed-author.com/me.png", item.Author.Avatar)

	if len(item.Tags) < 2 {
		t.Fatalf("sample.json: expected at least 2 tags, got %d", len(item.Tags))
	}
	assert.Equal(t, "tag1", item.Tags[0])
	assert.Equal(t, "tag2", item.Tags[1])

	if item.Attachments == nil || len(*item.Attachments) == 0 {
		t.Fatal("sample.json: no attachments were decoded")
	}
	attachment := (*item.Attachments)[0]
	assert.Equal(t, "https://sample-json-feed.com/attachment", attachment.URL)
	assert.Equal(t, "audio/mpeg", attachment.MimeType)
	assert.Equal(t, "title", attachment.Title)
	assert.Equal(t, int64(100), attachment.SizeInBytes)
	assert.Equal(t, int64(100), attachment.DurationInSeconds)

	assert.Contains(t, actual.String(), "https://sample-json-feed.com/attachment")
}

// TODO: Examples
26 changes: 24 additions & 2 deletions parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"strings"

"github.com/mmcdole/gofeed/atom"
"github.com/mmcdole/gofeed/json"
"github.com/mmcdole/gofeed/rss"
)

Expand All @@ -33,23 +34,26 @@ func (err HTTPError) Error() string {
type Parser struct {
	// Optional translators, one per feed format. jsonTrans installs a
	// DefaultJSONTranslator the first time it is called with JSONTranslator nil.
	AtomTranslator Translator
	RSSTranslator Translator
	JSONTranslator Translator
	// Client, when non-nil, is the HTTP client returned by httpClient.
	Client *http.Client
	// Format-specific parsers; NewParser populates all three.
	rp *rss.Parser
	ap *atom.Parser
	jp *json.Parser
}

// NewParser creates a universal feed parser with its RSS, Atom and JSON
// sub-parsers already set up. Translators and the HTTP client are left unset.
func NewParser() *Parser {
	return &Parser{
		rp: &rss.Parser{},
		ap: &atom.Parser{},
		jp: &json.Parser{},
	}
}

// Parse parses a RSS or Atom feed into
// Parse parses a RSS or Atom or JSON feed into
// the universal gofeed.Feed. It takes an
// io.Reader which should return the xml content.
// io.Reader which should return the xml/json content.
func (f *Parser) Parse(feed io.Reader) (*Feed, error) {
// Wrap the feed io.Reader in a io.TeeReader
// so we can capture all the bytes read by the
Expand All @@ -69,6 +73,8 @@ func (f *Parser) Parse(feed io.Reader) (*Feed, error) {
return f.parseAtomFeed(r)
case FeedTypeRSS:
return f.parseRSSFeed(r)
case FeedTypeJSON:
return f.parseJSONFeed(r)
}

return nil, ErrFeedTypeNotDetected
Expand Down Expand Up @@ -140,6 +146,14 @@ func (f *Parser) parseRSSFeed(feed io.Reader) (*Feed, error) {
return f.rssTrans().Translate(rf)
}

// parseJSONFeed decodes feed with the JSON sub-parser and hands the result
// to the JSON translator. A parse error is returned as-is with a nil Feed.
func (f *Parser) parseJSONFeed(feed io.Reader) (*Feed, error) {
	parsed, err := f.jp.Parse(feed)
	if err != nil {
		return nil, err
	}
	// The translator is only resolved after a successful parse.
	translator := f.jsonTrans()
	return translator.Translate(parsed)
}

func (f *Parser) atomTrans() Translator {
if f.AtomTranslator != nil {
return f.AtomTranslator
Expand All @@ -156,6 +170,14 @@ func (f *Parser) rssTrans() Translator {
return f.RSSTranslator
}

// jsonTrans returns the translator used for JSON feeds. When none has been
// configured it stores a DefaultJSONTranslator on the parser and returns it.
func (f *Parser) jsonTrans() Translator {
	if f.JSONTranslator == nil {
		f.JSONTranslator = &DefaultJSONTranslator{}
	}
	return f.JSONTranslator
}

func (f *Parser) httpClient() *http.Client {
if f.Client != nil {
return f.Client
Expand Down
Loading

0 comments on commit a9ae673

Please sign in to comment.