Merge pull request #3 from its-my-data/dev
Supporting the drama type and sorting URLs to make resuming deterministic
MewX authored Dec 18, 2023
2 parents e2ec1dc + fa087c0 commit 71db27a
Showing 4 changed files with 81 additions and 19 deletions.
29 changes: 16 additions & 13 deletions proto/category.pb.go

Generated file; the diff is not rendered by default.

3 changes: 2 additions & 1 deletion proto/category.proto
@@ -13,12 +13,12 @@ option go_package = "github.com/its-my-data/doubak/proto";
// - 游戏 game
// - 移动应用 app
// - 评论 review
+// - 舞台剧 drama
// - 小组 group (not supported)
// - 日记 note (not supported)
// - 图片 album (not supported)
// - 小站 site (not supported)
// - 同城活动 activity (not supported)
-// - 舞台剧 drama (not supported)
// - 豆品 thing (not supported)
enum Category {
  broadcast = 0;
@@ -28,4 +28,5 @@ enum Category {
  music = 4;
  app = 5;
  review = 6;
+  drama = 7;
}
63 changes: 60 additions & 3 deletions task/collector.go
@@ -11,6 +11,7 @@ import (
"log"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
@@ -27,6 +28,7 @@ const PeopleURL = DoubanURL + "people/"
const MoviePeopleURL = MovieURL + "people/"
const BookPeopleURL = BookURL + "people/"
const MusicPeopleURL = MusicURL + "people/"
+const LocationPeopleURL = DoubanURL + "location/people/"

const startingPage = 1
const startingItemId = 0
@@ -101,6 +103,9 @@ func (task *Collector) Execute() error {
    case proto.Category_music.String():
        task.crawlMusicListDispatcher()
        task.crawlItemDetails(proto.Category_music, "div.item > div.info > ul > li.title > a:nth-child(1)")
+    case proto.Category_drama.String():
+        task.crawlDramaListDispatcher()
+        task.crawlItemDetails(proto.Category_drama, "div.item > div.info > ul > li.title > a:nth-child(1)")
    default:
        return errors.New("Category not implemented " + c)
    }
@@ -409,6 +414,50 @@ func (task *Collector) crawlMusicLists(totalItems int, tag string, urlAction str
    return task.crawlItemLists(proto.Category_music, totalItems, pageStep, tag, urlTemplate)
}

+func (task *Collector) crawlDramaListDispatcher() error {
+    // The drama page does not have an entry (https://www.douban.com/location/people/<user_name>/drama/).
+    // However, each page contains the following parts:
+    // - To-watch dramas.
+    // - Watched dramas.
+
+    // The drama list starts at item ID 0, with 15 items per page. Example:
+    // https://www.douban.com/location/people/mewcatcher/drama/collect?sort=time&start=0&filter=all&mode=grid&tags_sort=count
+    nToWatch := 0
+    nWatched := 0
+    c := util.NewColly()
+    c.OnHTML("div.article > div.mod > h2", func(e *colly.HTMLElement) {
+        secText := e.Text
+        re := regexp.MustCompile("[0-9]+")
+        nParsed, _ := strconv.Atoi(re.FindString(secText))
+
+        switch {
+        case strings.Contains(secText, "想看"):
+            nToWatch = nParsed
+            log.Println("Found to-watch dramas:", nToWatch)
+        case strings.Contains(secText, "看过"):
+            nWatched = nParsed
+            log.Println("Found watched dramas:", nWatched)
+        default:
+            log.Println("Ignoring:", util.MergeSpaces(&secText))
+        }
+    })
+    c.Visit(LocationPeopleURL + task.user + "/drama/")
+
+    if err := task.crawlDramaLists(nWatched, "watched", "collect"); err != nil {
+        return err
+    }
+    if err := task.crawlDramaLists(nToWatch, "towatch", "wish"); err != nil {
+        return err
+    }
+    return nil
+}
+
+func (task *Collector) crawlDramaLists(totalItems int, tag string, urlAction string) error {
+    const pageStep = 15
+    urlTemplate := fmt.Sprintf("https://www.douban.com/location/people/%s/drama/%s?sort=time&start=%%d&filter=all&mode=grid&tags_sort=count", task.user, urlAction)
+    return task.crawlItemLists(proto.Category_drama, totalItems, pageStep, tag, urlTemplate)
+}
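For reference, a minimal standalone sketch of how the escaped %%d and pageStep play out when crawlItemLists (not shown in this diff) expands the template; the user name and item count below are illustrative, not taken from the repo:

    package main

    import "fmt"

    func main() {
        // Hypothetical inputs: a user name and a watched count parsed upstream.
        user := "mewcatcher"
        totalItems := 33
        const pageStep = 15

        // The %%d escape survives the first Sprintf as a literal %d,
        // leaving a placeholder for the page offset.
        urlTemplate := fmt.Sprintf(
            "https://www.douban.com/location/people/%s/drama/collect?sort=time&start=%%d&filter=all&mode=grid&tags_sort=count",
            user)

        // Page offsets step by 15: start=0, 15, 30, ...
        for start := 0; start < totalItems; start += pageStep {
            fmt.Println(fmt.Sprintf(urlTemplate, start))
        }
    }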

// TODO: implement more crawlers.

// crawlItemLists downloads an item list universally.
@@ -452,7 +501,8 @@ func (task *Collector) crawlItemLists(cat proto.Category, totalItems int, pageSt
}

func (task *Collector) crawlItemDetails(cat proto.Category, selector string) error {
-    var urls []string
+    // Count URL occurrences in a map, which also deduplicates them.
+    urlCounter := make(map[string]int)
    inputFileNamePattern := fmt.Sprintf("*_%s_*.html", cat)
    files := util.GetFilePathListWithPattern(task.outputDir, inputFileNamePattern)
    for _, fn := range files {
@@ -466,10 +516,17 @@ func (task *Collector) crawlItemDetails(cat proto.Category, selector string) err
            if !exists {
                log.Fatal("Found item without link", sel.Text())
            }
-            urls = append(urls, url)
+            urlCounter[url]++
        })
    }

+    // Convert the map to a sorted slice to make resuming deterministic.
+    var urls []string
+    for url := range urlCounter {
+        urls = append(urls, url)
+    }
+    sort.Strings(urls)
+
    // Hack to continue progress. Set to the last downloaded progress count (1-based, 0 by default).
    // This hack continues with the next URL in the queue.
    const iResume = 0
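The sort is what makes the iResume offset trustworthy: Go deliberately randomizes map iteration order, so without sort.Strings the position-based resume point would land on a different URL each run. A minimal sketch of the failure mode it prevents; the URLs and resume value are made up:

    package main

    import (
        "fmt"
        "sort"
    )

    func main() {
        // Pretend these were scraped from saved list pages; the map both
        // deduplicates and counts repeated URLs.
        urlCounter := map[string]int{
            "https://www.douban.com/location/drama/10003/": 1,
            "https://www.douban.com/location/drama/10001/": 2,
            "https://www.douban.com/location/drama/10002/": 1,
        }

        // Map iteration order differs between runs...
        var urls []string
        for url := range urlCounter {
            urls = append(urls, url)
        }
        // ...so pin every URL to a stable index before resuming by position.
        sort.Strings(urls)

        const iResume = 1 // pretend one URL was already downloaded
        for i, url := range urls {
            if i < iResume {
                continue // skip what a previous run already fetched
            }
            fmt.Println("would fetch:", url)
        }
    }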
@@ -542,7 +599,7 @@ func (task *Collector) getItemMatcherPattern(cat proto.Category) string {
    switch cat {
    case proto.Category_book:
        return "class=\"subject-item\""
-    case proto.Category_movie, proto.Category_music:
+    case proto.Category_movie, proto.Category_music, proto.Category_drama:
        return "class=\"item\""
    case proto.Category_game:
        return "class=\"common-item\""
5 changes: 3 additions & 2 deletions util/crawlers.go
@@ -19,7 +19,7 @@ const RequestInterval = 1 * time.Second
// TODO: add a rate limiter.
func NewQueue() *queue.Queue {
    q, err := queue.New(
-        1, // Number of consumer threads
+        1,                                           // Number of consumer threads
        &queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage
    )
    if err != nil {
@@ -42,7 +42,7 @@ func NewColly() *colly.Collector {

    c := colly.NewCollector(
        colly.MaxDepth(1),
-        colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"),
+        colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61"),
    )

    c.OnError(func(r *colly.Response, err error) {
@@ -64,6 +64,7 @@ func NewColly() *colly.Collector {
            r.Headers.Set("Cookie", cookies)
        }

+        r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
        r.Headers.Set("Referer", "https://www.douban.com/")
        r.Headers.Set("Host", "https://www.douban.com/")
    })
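On the rate-limiter TODO in this file: colly's built-in LimitRule could satisfy it. A hedged sketch of what that wiring might look like, assuming the gocolly/colly import path the repo already uses; the domain glob and delay are illustrative:

    package main

    import (
        "time"

        "github.com/gocolly/colly"
    )

    func main() {
        c := colly.NewCollector(colly.MaxDepth(1))

        // Serialize requests to douban hosts and space them at least one
        // second apart, mirroring the RequestInterval constant above.
        if err := c.Limit(&colly.LimitRule{
            DomainGlob:  "*douban.*",
            Parallelism: 1,
            Delay:       1 * time.Second,
        }); err != nil {
            panic(err)
        }

        _ = c.Visit("https://www.douban.com/")
    }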
