moviev2+bookv2 update and merge
JimSunJing committed Sep 5, 2020
1 parent 3e34833 commit 3bcb685
Showing 4 changed files with 176 additions and 82 deletions.
108 changes: 70 additions & 38 deletions code/bookv2.py
@@ -1,7 +1,6 @@
import requests
import requests, traceback
from bs4 import BeautifulSoup
import re
from time import sleep,perf_counter
from time import sleep
from random import uniform,choice
from doubanUtils import *

@@ -15,9 +14,10 @@ def __init__(self,doubanid):
self.id=doubanid
#wish dict format: {bookid:[书名,作者,译者,原作名,出版社,出版年,页数,ISBN,评分,评分人数]}
self.wish_dict={}
self.Keys=['书名','作者','译者','原作名',\
'出版社','出版年','页数','ISBN','评分','评分人数']
self.itemKeys=['subjectId','书名','封面','作者','译者','原作名','丛书',\
'出版社','出版年','页数','ISBN','评分','评分人数','标记日期','短评们']
#saw dict format: {bookid:[书名,作者,译者,出版社,出版年,页数,ISBN,评分,评分人数,用户评分,评论,标记日期]}
self.sawKeys = self.itemKeys + ['用户标签','用户评分','短评']
self.saw_dict={}
self.head='https://book.douban.com/subject/'

@@ -32,18 +32,19 @@ def wish_get(self,item):
bid = url.split('/')[-2]
return date,name,url,bid

def wish_store(self,wishes):
def wish_store(self,wishes,lastBid):
for item in wishes:
date,name,url,bid = self.wish_get(item)
self.wish_dict[bid]={'书名':name,'封面':'','豆瓣链接':url,\
'标记日期':date,'作者':'','译者':'','原作名':'','出版社':'',\
'出版年':'','页数':'','ISBN':'','评分':'','评分人数':''}
if (lastBid == str(bid)):
return -1
self.wish_dict[bid]={'书名':name,'豆瓣链接':url,\
'标记日期':date,'subjectId':bid}

def Wish(self):
# work around the Douban Books anti-crawler mechanism
homepage='https://book.douban.com/people/'+self.id
self.s.get(homepage)
self.s.get(homepage+'wish')
self.s.get(homepage+'/wish')

print('\n开始爬取'+self.id+'的想读列表')
beg,end = pageControl(10)
@@ -54,17 +55,23 @@ def Wish(self):
soup, status = self.get_soup(firstpage)
print(f'第{page}页',status)

lastBid = getLastBackUpItem(self.id,"想读")

#get book name and id
self.wish_store(soup.find_all(class_='item'))
if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1):
self.feature_helper(self.wish_dict)
return self.wish_dict
next_ = hasNextPage(soup)

#get all wish list
while (next_!=False) and (page < end):
NextPage = 'https://book.douban.com'+next
NextPage = 'https://book.douban.com'+next_
soup, status = self.get_soup(NextPage)
page += 1
print(f'第{page}页',status)
self.wish_store(soup.find_all(class_='item'))
if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1):
self.feature_helper(self.wish_dict)
return self.wish_dict
next_ = hasNextPage(soup)

#add feature for every book
@@ -106,9 +113,13 @@ def get_feature(self,bid,dic):
if ':' in i :
i=i.replace(' ','')
key,value=i.split(':',1)
if key in self.Keys:
dic[bid][key]=value
dic[mid]['封面']=soup2.find('img').get('src')
dic[bid][key]=value
dic[bid]['封面']=soup2.find('img').get('src')
dic[bid]['出版年']=getYear(dic[bid]['出版年'])
try:
dic[bid]['短评们']=getShortComments(soup2.findAll(class_="comment"))
except:
dic[bid]['短评们']='...'
try:
dic[bid]['评分']=soup2.find(property=re.compile('average')).text.strip(' ')
except:
@@ -117,18 +128,20 @@ def get_feature(self,bid,dic):
dic[bid]['评分人数']=soup2.find(class_="rating_people").span.text.strip(' ')
except:
dic[bid]['评分人数']='0'
except:
except Exception as e:
print('\r打开书籍页失败,失败的书籍链接:'+head+bid)
print(e)
self.switch_header()
return bid

def saw_store(self,saw):
def saw_store(self,saw,lastBid):
for item in saw:
date,star,comment,owntag,name,bid=self.saw_get(item)
if (lastBid == str(bid)):
return -1
self.saw_dict[bid]={'书名':name,'封面':'','豆瓣链接':self.head+bid,\
'标记日期':date,'作者':'','译者':'','原作名':'','出版社':'',\
'出版年':'','页数':'','ISBN':'','评分':'','评分人数':'',\
'用户评分':star,'短评':comment,'用户标签':owntag}
'标记日期':date,'用户评分':star,'短评':comment,\
'用户标签':owntag,'subjectId':bid}

def saw_get(self,saw):
date=saw(class_=re.compile('date'))[0].get_text(strip=True)
@@ -164,8 +177,12 @@ def Saw(self):
soup, status = self.get_soup(Sfirstpage)
print(f'第{page}页',status)

lastBid = getLastBackUpItem(self.id,"读过")

#get book name and id
self.saw_store(soup.find_all(class_='item'))
if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1):
self.feature_helper(self.saw_dict)
return self.saw_dict
next_ = hasNextPage(soup)

#get all saw list
@@ -175,21 +192,23 @@ def Saw(self):
soup, status = self.get_soup(NextPage)
page += 1
print(f'第{page}页',status)
self.saw_store(soup.find_all(class_='item'))
if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1):
self.feature_helper(self.saw_dict)
return self.saw_dict
next_ = hasNextPage(soup)

#add feature for every book
self.feature_helper(self.saw_dict)

return self.saw_dict

def save_helper(self, dic, save_type):
fw = open(fn(self.id+'-'+getFormatTime()+save_type+'plus.csv'),\
'a',endcoding='utf-8_sig')
fw.write(','.join(list(dic[list(dic.keys())[0]].keys()))+'\n')
for bid in dic.keys():
fw.write(','.join(list(map(noco, dic[bid].values())))+'\n')
fw.close()
def save_helper(self, dic, Type):
with open(fn(self.id+'-'+getFormatTime()+Type+'plus.csv'),\
'a',encoding='utf-8_sig') as f:
fieldNames = self.sawKeys if Type == '读过' else self.itemKeys
writer = csv.DictWriter(f, fieldnames=fieldNames, restval="...", extrasaction='ignore')
writer.writeheader()
for bid in dic.keys():
writer.writerow(dic[bid])

def save_as_csv(self,choice):
if choice in ['a','c']:
@@ -201,12 +220,17 @@ def switch_header(self):
headers0['User-Agent']=choice(user_agent_list)
self.s.headers.update(headers0)

def add_cookies(self,raw_cookies):
cookies=getCookie(raw_cookies)
self.s.cookies.update(cookies)


def main(self):
print('''
以下为选项
A:想读列表
B:读过列表
C:想读+读过''')
以下为选项
A:想读列表
B:读过列表
C:想读+读过''')
ans2=input('请输入你需要爬取的内容:')
ans2=ans2.lower()
if ans2=='a':
@@ -229,11 +253,19 @@ def main():
if ans1=='yes':
Douid=input('请输入你的豆瓣id: ')
clawer=Douban_Book(doubanid=Douid)
# book.douban.com has anti-crawler protection; cookies are required
print("由于豆瓣图书的防爬虫机制,需要你提供cookies")
raw_cookies=input('输入cookies: ')
clawer.add_cookies(raw_cookies)
clawer.main()
print('\n问题反馈:[email protected] | https://github.com/JimSunJing/douban_clawer')


if __name__ == '__main__':
main()
sleep(10)
over=input('按任意键退出')
try:
main()
except Exception as e:
traceback.print_exc()
finally:
sleep(10)
over=input('按任意键退出')
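
With this change bookv2.py only works with a logged-in session, so the crawler now asks for a raw cookie string up front and feeds it through add_cookies/getCookie. A minimal usage sketch, assuming bookv2.py is run from the code/ directory; the douban id and cookie values below are placeholders, not real credentials:

# hypothetical usage sketch; the id and cookie string are placeholders
from bookv2 import Douban_Book

crawler = Douban_Book(doubanid='your-douban-id')
# raw cookies are copied from the browser; getCookie() splits them on ';' and '='
raw_cookies = 'bid=abc123;dbcl2="12345678:deadbeef";ck=XYZ'
crawler.add_cookies(raw_cookies)
crawler.main()   # prompts for 想读 / 读过 / both, then writes a timestamped *plus.csv backup
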
61 changes: 60 additions & 1 deletion code/doubanUtils.py
@@ -1,5 +1,6 @@
import requests
import re
import csv, os, os.path, re
from functools import reduce
from bs4 import BeautifulSoup
from time import localtime,strftime,perf_counter,strptime

@@ -57,3 +58,61 @@ def getFormatTime():

def string2Time(s):
return strptime(s, '%Y-%m-%d %H-%M-%S')

def fileTimeCompare(fn1, fn2):
fn1 = fn1.replace(".csv","").split('-',1)[1][:-6]
fn2 = fn2.replace(".csv","").split('-',1)[1][:-6]
return string2Time(fn1) > string2Time(fn2)

def getLastBackUpItem(douId,Type):
# locate the previous backup file
matchFiles = []
# expected backup file name pattern
fnMatch = r"iiid-\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}tttypeplus.csv"\
.replace('iiid',douId).replace('tttype',Type)
for _, _, files in os.walk("."):
for file in files:
# print(file)
if re.match(fnMatch,file):
matchFiles.append(file)
## pick the most recent backup file
if len(matchFiles) != 0:
latest = reduce(lambda x,y: x if fileTimeCompare(x,y) else y,\
matchFiles)
with open(latest, 'r', encoding='utf-8_sig') as f:
reader = csv.DictReader(f)
# read the subject id from the first row
try:
row = reader.__next__()
return row['subjectId']
except:
return None
else:
return None

def getCookie(raw_cookies):
cookies={}
for line in raw_cookies.split(';'):
key,value=line.split('=',1)
cookies[key]=value
return cookies

def getYear(raw):
yearRex = r'([1|2][9|0]\d{2})'
res = re.match(yearRex,raw)
try:
return res.group(1)
except:
return ''

def getShortComments(comments):
res = ''
for com in comments:
# first grab the commenter's user name
user = com.find(class_="comment-info").get_text(strip=True).replace('\xa0','').replace('\n','')
res += user
res += ':'
short = com.find(class_="short").get_text(strip=True).replace('\xa0','').replace('\n','')
res += short
res += '; | '
return res.replace("看过"," ")
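
The new helpers in doubanUtils.py are what make the early exit in wish_store/saw_store work: getLastBackUpItem matches the newest '<doubanid>-<timestamp><type>plus.csv' backup in the working directory and returns the subjectId of its first row, and the store methods stop as soon as that id reappears. A rough sketch of the same stop condition, assuming a pre-parsed items list rather than the real BeautifulSoup tags:

# illustrative only; `items` stands in for the parsed entries of one list page
from doubanUtils import getLastBackUpItem

last_bid = getLastBackUpItem('your-douban-id', '想读')   # None when no earlier backup exists

def store_until_known(items, last_bid, wish_dict):
    # mirrors wish_store: stop before adding the first already-backed-up subject
    for date, name, url, bid in items:
        if last_bid == str(bid):
            return -1   # caller then fetches features for what it has and returns
        wish_dict[bid] = {'书名': name, '豆瓣链接': url, '标记日期': date, 'subjectId': bid}

In effect, a rerun only crawls items marked since the last backup instead of walking the whole list again.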