Skip to content

Commit

Permalink
Added drama support
Browse files Browse the repository at this point in the history
  • Loading branch information
MewX committed Feb 4, 2023
1 parent 39542d4 commit fa087c0
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 17 deletions.
29 changes: 16 additions & 13 deletions proto/category.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion proto/category.proto
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ option go_package = "github.com/its-my-data/doubak/proto";
// - 游戏 game
// - 移动应用 app
// - 评论 review
// - 舞台剧 drama
// - 小组 group (not supported)
// - 日记 note (not supported)
// - 图片 album (not supported)
// - 小站 site (not supported)
// - 同城活动 activity (not supported)
// - 舞台剧 drama (not supported)
// - 豆品 thing (not supported)
enum Category {
broadcast = 0;
Expand All @@ -28,4 +28,5 @@ enum Category {
music = 4;
app = 5;
review = 6;
drama = 7;
}
50 changes: 49 additions & 1 deletion task/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const PeopleURL = DoubanURL + "people/"
const MoviePeopleURL = MovieURL + "people/"
const BookPeopleURL = BookURL + "people/"
const MusicPeopleURL = MusicURL + "people/"
const LocationPeopleURL = DoubanURL + "location/people/"

const startingPage = 1
const startingItemId = 0
Expand Down Expand Up @@ -102,6 +103,9 @@ func (task *Collector) Execute() error {
case proto.Category_music.String():
task.crawlMusicListDispatcher()
task.crawlItemDetails(proto.Category_music, "div.item > div.info > ul > li.title > a:nth-child(1)")
case proto.Category_drama.String():
task.crawlDramaListDispatcher()
task.crawlItemDetails(proto.Category_drama, "div.item > div.info > ul > li.title > a:nth-child(1)")
default:
return errors.New("Category not implemented " + c)
}
Expand Down Expand Up @@ -410,6 +414,50 @@ func (task *Collector) crawlMusicLists(totalItems int, tag string, urlAction str
return task.crawlItemLists(proto.Category_music, totalItems, pageStep, tag, urlTemplate)
}

// crawlDramaListDispatcher reads the user's drama landing page to find out how
// many dramas are listed in the to-watch and watched sections, then crawls the
// watched list followed by the to-watch list.
//
// It returns an error when the landing page cannot be visited or when crawling
// either list fails.
func (task *Collector) crawlDramaListDispatcher() error {
	// The drama page does not have an entry page of its own
	// (https://www.douban.com/location/people/<user_name>/drama/).
	// However, each page contains the following parts:
	// - To-watch dramas.
	// - Watched dramas.

	// Drama list starts with item ID (which is 0). Each page has 15 items. Example:
	// https://www.douban.com/location/people/mewcatcher/drama/collect?sort=time&start=0&filter=all&mode=grid&tags_sort=count
	nToWatch := 0
	nWatched := 0

	// Compiled once here instead of on every matched heading.
	countPattern := regexp.MustCompile("[0-9]+")

	c := util.NewColly()
	c.OnHTML("div.article > div.mod > h2", func(e *colly.HTMLElement) {
		secText := e.Text

		// Decide which counter this section heading belongs to.
		var counter *int
		var foundMsg string
		switch {
		case strings.Contains(secText, "想看"):
			counter, foundMsg = &nToWatch, "Found to-watch dramas:"
		case strings.Contains(secText, "看过"):
			counter, foundMsg = &nWatched, "Found watched dramas:"
		default:
			log.Println("Ignoring:", util.MergeSpaces(&secText))
			return
		}

		// A heading without a parsable number must not be reported as a
		// successfully found count; leave the counter untouched instead.
		nParsed, err := strconv.Atoi(countPattern.FindString(secText))
		if err != nil {
			log.Printf("Cannot parse drama count from heading %q: %v", secText, err)
			return
		}
		*counter = nParsed
		log.Println(foundMsg, nParsed)
	})

	// Surface a failed visit instead of silently crawling zero items.
	landingURL := LocationPeopleURL + task.user + "/drama/"
	if err := c.Visit(landingURL); err != nil {
		return fmt.Errorf("visiting drama landing page %s: %w", landingURL, err)
	}

	if err := task.crawlDramaLists(nWatched, "watched", "collect"); err != nil {
		return err
	}
	return task.crawlDramaLists(nToWatch, "towatch", "wish")
}

// crawlDramaLists crawls one of the user's drama lists through the shared
// crawlItemLists routine.
//
// totalItems is the number of items expected in the list, tag is the label
// passed through to crawlItemLists, and urlAction is the URL path segment that
// selects the list (the dispatcher uses "collect" and "wish").
func (task *Collector) crawlDramaLists(totalItems int, tag string, urlAction string) error {
	const (
		// Each drama list page has 15 items, so the start offset advances by 15.
		itemsPerPage = 15
		// The escaped %%d survives the first Sprintf as a %d placeholder for
		// the start offset.
		listURLFormat = "https://www.douban.com/location/people/%s/drama/%s?sort=time&start=%%d&filter=all&mode=grid&tags_sort=count"
	)

	pageURLTemplate := fmt.Sprintf(listURLFormat, task.user, urlAction)
	return task.crawlItemLists(
		proto.Category_drama,
		totalItems,
		itemsPerPage,
		tag,
		pageURLTemplate,
	)
}

// TODO: implement more crawlers.

// crawlItemLists downloads an item list universally.
Expand Down Expand Up @@ -551,7 +599,7 @@ func (task *Collector) getItemMatcherPattern(cat proto.Category) string {
switch cat {
case proto.Category_book:
return "class=\"subject-item\""
case proto.Category_movie, proto.Category_music:
case proto.Category_movie, proto.Category_music, proto.Category_drama:
return "class=\"item\""
case proto.Category_game:
return "class=\"common-item\""
Expand Down
5 changes: 3 additions & 2 deletions util/crawlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const RequestInterval = 1 * time.Second
// TODO: add a rate limiter.
func NewQueue() *queue.Queue {
q, err := queue.New(
1, // Number of consumer threads
1, // Number of consumer threads
&queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage
)
if err != nil {
Expand All @@ -42,7 +42,7 @@ func NewColly() *colly.Collector {

c := colly.NewCollector(
colly.MaxDepth(1),
colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"),
colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61"),
)

c.OnError(func(r *colly.Response, err error) {
Expand All @@ -64,6 +64,7 @@ func NewColly() *colly.Collector {
r.Headers.Set("Cookie", cookies)
}

r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
r.Headers.Set("Referer", "https://www.douban.com/")
r.Headers.Set("Host", "https://www.douban.com/")
})
Expand Down

0 comments on commit fa087c0

Please sign in to comment.