Merge pull request #3 from its-my-data/dev
Supporting the drama type and sorting URLs to make resuming deterministic
MewX authored Dec 18, 2023
2 parents e2ec1dc + fa087c0 commit 71db27a
Showing 4 changed files with 81 additions and 19 deletions.
29 changes: 16 additions & 13 deletions proto/category.pb.go

Generated file; the diff is not rendered by default.

3 changes: 2 additions & 1 deletion proto/category.proto
@@ -13,12 +13,12 @@ option go_package = "github.com/its-my-data/doubak/proto";
// - 游戏 game
// - 移动应用 app
// - 评论 review
+// - 舞台剧 drama
// - 小组 group (not supported)
// - 日记 note (not supported)
// - 图片 album (not supported)
// - 小站 site (not supported)
// - 同城活动 activity (not supported)
-// - 舞台剧 drama (not supported)
// - 豆品 thing (not supported)
enum Category {
  broadcast = 0;
@@ -28,4 +28,5 @@ enum Category {
  music = 4;
  app = 5;
  review = 6;
+  drama = 7;
}
63 changes: 60 additions & 3 deletions task/collector.go
@@ -11,6 +11,7 @@ import (
"log"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
@@ -27,6 +28,7 @@ const PeopleURL = DoubanURL + "people/"
const MoviePeopleURL = MovieURL + "people/"
const BookPeopleURL = BookURL + "people/"
const MusicPeopleURL = MusicURL + "people/"
+const LocationPeopleURL = DoubanURL + "location/people/"

const startingPage = 1
const startingItemId = 0
@@ -101,6 +103,9 @@ func (task *Collector) Execute() error {
    case proto.Category_music.String():
        task.crawlMusicListDispatcher()
        task.crawlItemDetails(proto.Category_music, "div.item > div.info > ul > li.title > a:nth-child(1)")
+    case proto.Category_drama.String():
+        task.crawlDramaListDispatcher()
+        task.crawlItemDetails(proto.Category_drama, "div.item > div.info > ul > li.title > a:nth-child(1)")
    default:
        return errors.New("Category not implemented " + c)
    }
@@ -409,6 +414,50 @@ func (task *Collector) crawlMusicLists(totalItems int, tag string, urlAction str
    return task.crawlItemLists(proto.Category_music, totalItems, pageStep, tag, urlTemplate)
}

+func (task *Collector) crawlDramaListDispatcher() error {
+    // The drama page does not have an entry (https://www.douban.com/location/people/<user_name>/drama/).
+    // However, each page contains the following parts:
+    // - To-watch dramas.
+    // - Watched dramas.
+
+    // The drama list starts at item ID 0, with 15 items per page. Example:
+    // https://www.douban.com/location/people/mewcatcher/drama/collect?sort=time&start=0&filter=all&mode=grid&tags_sort=count
+    nToWatch := 0
+    nWatched := 0
+    c := util.NewColly()
+    c.OnHTML("div.article > div.mod > h2", func(e *colly.HTMLElement) {
+        secText := e.Text
+        re := regexp.MustCompile("[0-9]+")
+        nParsed, _ := strconv.Atoi(re.FindString(secText))
+
+        switch {
+        case strings.Contains(secText, "想看"):
+            nToWatch = nParsed
+            log.Println("Found to-watch dramas:", nToWatch)
+        case strings.Contains(secText, "看过"):
+            nWatched = nParsed
+            log.Println("Found watched dramas:", nWatched)
+        default:
+            log.Println("Ignoring:", util.MergeSpaces(&secText))
+        }
+    })
+    c.Visit(LocationPeopleURL + task.user + "/drama/")
+
+    if err := task.crawlDramaLists(nWatched, "watched", "collect"); err != nil {
+        return err
+    }
+    if err := task.crawlDramaLists(nToWatch, "towatch", "wish"); err != nil {
+        return err
+    }
+    return nil
+}
+
+func (task *Collector) crawlDramaLists(totalItems int, tag string, urlAction string) error {
+    const pageStep = 15
+    urlTemplate := fmt.Sprintf("https://www.douban.com/location/people/%s/drama/%s?sort=time&start=%%d&filter=all&mode=grid&tags_sort=count", task.user, urlAction)
+    return task.crawlItemLists(proto.Category_drama, totalItems, pageStep, tag, urlTemplate)
+}
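For reference, a minimal standalone sketch of how the escaped %%d and pageStep play out when crawlItemLists (not shown in this diff) expands the template; the user name and item count below are illustrative, not taken from the repo:

    package main

    import "fmt"

    func main() {
        // Hypothetical inputs: a user name and a watched count parsed upstream.
        user := "mewcatcher"
        totalItems := 33
        const pageStep = 15

        // The %%d escape survives the first Sprintf as a literal %d,
        // leaving a placeholder for the page offset.
        urlTemplate := fmt.Sprintf(
            "https://www.douban.com/location/people/%s/drama/collect?sort=time&start=%%d&filter=all&mode=grid&tags_sort=count",
            user)

        // Page offsets step by 15: start=0, 15, 30, ...
        for start := 0; start < totalItems; start += pageStep {
            fmt.Println(fmt.Sprintf(urlTemplate, start))
        }
    }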

// TODO: implement more crawlers.

// crawlItemLists downloads an item list universally.
@@ -452,7 +501,8 @@ func (task *Collector) crawlItemLists(cat proto.Category, totalItems int, pageSt
}

func (task *Collector) crawlItemDetails(cat proto.Category, selector string) error {
-    var urls []string
+    // Count URL occurrences in a map, which also deduplicates them.
+    urlCounter := make(map[string]int)
    inputFileNamePattern := fmt.Sprintf("*_%s_*.html", cat)
    files := util.GetFilePathListWithPattern(task.outputDir, inputFileNamePattern)
    for _, fn := range files {
@@ -466,10 +516,17 @@ func (task *Collector) crawlItemDetails(cat proto.Category, selector string) err
            if !exists {
                log.Fatal("Found item without link", sel.Text())
            }
-            urls = append(urls, url)
+            urlCounter[url]++
        })
    }

+    // Convert the map to a sorted slice to make resuming deterministic.
+    var urls []string
+    for url := range urlCounter {
+        urls = append(urls, url)
+    }
+    sort.Strings(urls)
+
    // Hack to continue progress. Set to the last downloaded progress count (1-based, 0 by default).
    // This hack continues with the next URL in the queue.
    const iResume = 0
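The sort is what makes the iResume offset trustworthy: Go deliberately randomizes map iteration order, so without sort.Strings the position-based resume point would land on a different URL each run. A minimal sketch of the failure mode it prevents; the URLs and resume value are made up:

    package main

    import (
        "fmt"
        "sort"
    )

    func main() {
        // Pretend these were scraped from saved list pages; the map both
        // deduplicates and counts repeated URLs.
        urlCounter := map[string]int{
            "https://www.douban.com/location/drama/10003/": 1,
            "https://www.douban.com/location/drama/10001/": 2,
            "https://www.douban.com/location/drama/10002/": 1,
        }

        // Map iteration order differs between runs...
        var urls []string
        for url := range urlCounter {
            urls = append(urls, url)
        }
        // ...so pin every URL to a stable index before resuming by position.
        sort.Strings(urls)

        const iResume = 1 // pretend one URL was already downloaded
        for i, url := range urls {
            if i < iResume {
                continue // skip what a previous run already fetched
            }
            fmt.Println("would fetch:", url)
        }
    }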
@@ -542,7 +599,7 @@ func (task *Collector) getItemMatcherPattern(cat proto.Category) string {
    switch cat {
    case proto.Category_book:
        return "class=\"subject-item\""
-    case proto.Category_movie, proto.Category_music:
+    case proto.Category_movie, proto.Category_music, proto.Category_drama:
        return "class=\"item\""
    case proto.Category_game:
        return "class=\"common-item\""
5 changes: 3 additions & 2 deletions util/crawlers.go
@@ -19,7 +19,7 @@ const RequestInterval = 1 * time.Second
// TODO: add a rate limiter.
func NewQueue() *queue.Queue {
    q, err := queue.New(
-        1, // Number of consumer threads
+        1,                                           // Number of consumer threads
        &queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage
    )
    if err != nil {
@@ -42,7 +42,7 @@ func NewColly() *colly.Collector {

    c := colly.NewCollector(
        colly.MaxDepth(1),
-        colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"),
+        colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61"),
    )

    c.OnError(func(r *colly.Response, err error) {
@@ -64,6 +64,7 @@ func NewColly() *colly.Collector {
            r.Headers.Set("Cookie", cookies)
        }

+        r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
        r.Headers.Set("Referer", "https://www.douban.com/")
        r.Headers.Set("Host", "https://www.douban.com/")
    })
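On the rate-limiter TODO in this file: colly's built-in LimitRule could satisfy it. A hedged sketch of what that wiring might look like, assuming the gocolly/colly import path the repo already uses; the domain glob and delay are illustrative:

    package main

    import (
        "time"

        "github.com/gocolly/colly"
    )

    func main() {
        c := colly.NewCollector(colly.MaxDepth(1))

        // Serialize requests to douban hosts and space them at least one
        // second apart, mirroring the RequestInterval constant above.
        if err := c.Limit(&colly.LimitRule{
            DomainGlob:  "*douban.*",
            Parallelism: 1,
            Delay:       1 * time.Second,
        }); err != nil {
            panic(err)
        }

        _ = c.Visit("https://www.douban.com/")
    }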
