From 3bcb685d37bfdc7b58144ce07cd1961d576490fc Mon Sep 17 00:00:00 2001
From: JimSunJing
Date: Sat, 5 Sep 2020 09:13:51 +0800
Subject: [PATCH] moviev2+bookv2 update and merge

---
 code/bookv2.py          | 108 ++++++++++++++++++++++++++--------------
 code/doubanUtils.py     |  61 ++++++++++++++++++++++-
 code/moviev2.py         |  74 ++++++++++++---------------
 code/personalCrawler.py |  15 ++++++
 4 files changed, 176 insertions(+), 82 deletions(-)
 create mode 100644 code/personalCrawler.py

diff --git a/code/bookv2.py b/code/bookv2.py
index c47d15f..8e0113d 100644
--- a/code/bookv2.py
+++ b/code/bookv2.py
@@ -1,7 +1,6 @@
-import requests
+import requests, traceback
 from bs4 import BeautifulSoup
-import re
-from time import sleep,perf_counter
+from time import sleep
 from random import uniform,choice
 from doubanUtils import *
 
@@ -15,9 +14,10 @@ def __init__(self,doubanid):
         self.id=doubanid
         #wish dict format: {bookid:[书名,作者,译者,原作名,出版社,出版年,页数,ISBN,评分,评分人数]}
         self.wish_dict={}
-        self.Keys=['书名','作者','译者','原作名',\
-            '出版社','出版年','页数','ISBN','评分','评分人数']
+        self.itemKeys=['subjectId','书名','封面','作者','译者','原作名','丛书',\
+            '出版社','出版年','页数','ISBN','评分','评分人数','标记日期','短评们']
         #saw dict format: {bookid:[书名,作者,译者,出版社,出版年,页数,ISBN,评分,评分人数,用户评分,评论,标记日期]}
+        self.sawKeys = self.itemKeys + ['用户标签','用户评分','短评']
         self.saw_dict={}
         self.head='https://book.douban.com/subject/'
 
@@ -32,18 +32,19 @@ def wish_get(self,item):
         bid = url.split('/')[-2]
         return date,name,url,bid
 
-    def wish_store(self,wishes):
+    def wish_store(self,wishes,lastBid):
         for item in wishes:
             date,name,url,bid = self.wish_get(item)
-            self.wish_dict[bid]={'书名':name,'封面':'','豆瓣链接':url,\
-                '标记日期':date,'作者':'','译者':'','原作名':'','出版社':'',\
-                '出版年':'','页数':'','ISBN':'','评分':'','评分人数':''}
+            if (lastBid == str(bid)):
+                return -1
+            self.wish_dict[bid]={'书名':name,'豆瓣链接':url,\
+                '标记日期':date,'subjectId':bid}
 
     def Wish(self):
         # 豆瓣图书反爬机制
         homepage='https://book.douban.com/people/'+self.id
         self.s.get(homepage)
-        self.s.get(homepage+'wish')
+        self.s.get(homepage+'/wish')
         print('\n开始爬取'+self.id+'的想读列表')
         beg,end = pageControl(10)
 
@@ -54,17 +55,23 @@ def Wish(self):
         soup, status = self.get_soup(firstpage)
         print(f'第{page}页',status)
 
+        lastBid = getLastBackUpItem(self.id,"想读")
+
         #get book name and id
-        self.wish_store(soup.find_all(class_='item'))
+        if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1):
+            self.feature_helper(self.wish_dict)
+            return self.wish_dict
         next_ = hasNextPage(soup)
 
         #get all wish list
         while (next_!=False) and (page < end):
-            NextPage = 'https://book.douban.com'+next
+            NextPage = 'https://book.douban.com'+next_
            soup, status = self.get_soup(NextPage)
             page += 1
             print(f'第{page}页',status)
-            self.wish_store(soup.find_all(class_='item'))
+            if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1):
+                self.feature_helper(self.wish_dict)
+                return self.wish_dict
             next_ = hasNextPage(soup)
 
         #add feature for every book
@@ -106,9 +113,13 @@ def get_feature(self,bid,dic):
                 if ':' in i :
                     i=i.replace(' ','')
                     key,value=i.split(':',1)
-                    if key in self.Keys:
-                        dic[bid][key]=value
-            dic[mid]['封面']=soup2.find('img').get('src')
+                    dic[bid][key]=value
+            dic[bid]['封面']=soup2.find('img').get('src')
+            dic[bid]['出版年']=getYear(dic[bid]['出版年'])
+            try:
+                dic[bid]['短评们']=getShortComments(soup2.findAll(class_="comment"))
+            except:
+                dic[bid]['短评们']='...'
             try:
                 dic[bid]['评分']=soup2.find(property=re.compile('average')).text.strip(' ')
             except:
@@ -117,18 +128,20 @@ def get_feature(self,bid,dic):
                 dic[bid]['评分人数']=soup2.find(class_="rating_people").span.text.strip(' ')
             except:
                 dic[bid]['评分人数']='0'
-        except:
+        except Exception as e:
             print('\r打开书籍页失败,失败的书籍链接:'+head+bid)
+            print(e)
             self.switch_header()
             return bid
 
-    def saw_store(self,saw):
+    def saw_store(self,saw,lastBid):
         for item in saw:
             date,star,comment,owntag,name,bid=self.saw_get(item)
+            if (lastBid == str(bid)):
+                return -1
             self.saw_dict[bid]={'书名':name,'封面':'','豆瓣链接':self.head+bid,\
-                '标记日期':date,'作者':'','译者':'','原作名':'','出版社':'',\
-                '出版年':'','页数':'','ISBN':'','评分':'','评分人数':'',\
-                '用户评分':star,'短评':comment,'用户标签':owntag}
+                '标记日期':date,'用户评分':star,'短评':comment,\
+                '用户标签':owntag,'subjectId':bid}
 
     def saw_get(self,saw):
         date=saw(class_=re.compile('date'))[0].get_text(strip=True)
@@ -164,8 +177,12 @@ def Saw(self):
         soup, status = self.get_soup(Sfirstpage)
         print(f'第{page}页',status)
 
+        lastBid = getLastBackUpItem(self.id,"读过")
+
         #get book name and id
-        self.saw_store(soup.find_all(class_='item'))
+        if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1):
+            self.feature_helper(self.saw_dict)
+            return self.saw_dict
         next_ = hasNextPage(soup)
 
         #get all saw list
@@ -175,21 +192,23 @@ def Saw(self):
             soup, status = self.get_soup(NextPage)
             page += 1
             print(f'第{page}页',status)
-            self.saw_store(soup.find_all(class_='item'))
+            if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1):
+                self.feature_helper(self.saw_dict)
+                return self.saw_dict
             next_ = hasNextPage(soup)
 
         #add feature for every book
         self.feature_helper(self.saw_dict)
-        return self.saw_dict
 
-    def save_helper(self, dic, save_type):
-        fw = open(fn(self.id+'-'+getFormatTime()+save_type+'plus.csv'),\
-            'a',endcoding='utf-8_sig')
-        fw.write(','.join(list(dic[list(dic.keys())[0]].keys()))+'\n')
-        for bid in dic.keys():
-            fw.write(','.join(list(map(noco, dic[bid].values())))+'\n')
-        fw.close()
+    def save_helper(self, dic, Type):
+        with open(fn(self.id+'-'+getFormatTime()+Type+'plus.csv'),\
+            'a',encoding='utf-8_sig') as f:
+            fieldNames = self.sawKeys if Type == '读过' else self.itemKeys
+            writer = csv.DictWriter(f, fieldnames=fieldNames, restval="...", extrasaction='ignore')
+            writer.writeheader()
+            for bid in dic.keys():
+                writer.writerow(dic[bid])
 
     def save_as_csv(self,choice):
         if choice in ['a','c']:
@@ -201,12 +220,17 @@ def switch_header(self):
         headers0['User-Agent']=choice(user_agent_list)
         self.s.headers.update(headers0)
 
+    def add_cookies(self,raw_cookies):
+        cookies=getCookie(raw_cookies)
+        self.s.cookies.update(cookies)
+
+
     def main(self):
         print('''
-以下为选项
- A:想读列表
- B:读过列表
- C:想读+读过''')
+        以下为选项
+        A:想读列表
+        B:读过列表
+        C:想读+读过''')
         ans2=input('请输入你需要爬取的内容:')
         ans2=ans2.lower()
         if ans2=='a':
@@ -229,11 +253,19 @@ def main():
     if ans1=='yes':
         Douid=input('请输入你的豆瓣id: ')
         clawer=Douban_Book(doubanid=Douid)
+        # book.douban.com 有反爬,需要cookies
+        print("由于豆瓣图书的防爬虫机制,需要你提供cookies")
+        raw_cookies=input('输入cookies: ')
+        clawer.add_cookies(raw_cookies)
         clawer.main()
     print('\n问题反馈:jimsun6428@gmail.com | https://github.com/JimSunJing/douban_clawer')
 
 if __name__ == '__main__':
-    main()
-    sleep(10)
-    over=input('按任意键退出')
\ No newline at end of file
+    try:
+        main()
+    except Exception as e:
+        traceback.print_exc()
+    finally:
+        sleep(10)
+        over=input('按任意键退出')
\ No newline at end of file
diff --git a/code/doubanUtils.py b/code/doubanUtils.py
index ebfbd18..8b560f9 100644
--- a/code/doubanUtils.py
+++ b/code/doubanUtils.py
@@ -1,5 +1,6 @@
 import requests
-import re
+import csv, os, os.path, re
+from functools import reduce
 from bs4 import BeautifulSoup
 from time import localtime,strftime,perf_counter,strptime
 
@@ -57,3 +58,61 @@ def getFormatTime():
 
 def string2Time(s):
     return strptime(s, '%Y-%m-%d %H-%M-%S')
+
+def fileTimeCompare(fn1, fn2):
+    fn1 = fn1.replace(".csv","").split('-',1)[1][:-6]
+    fn2 = fn2.replace(".csv","").split('-',1)[1][:-6]
+    return string2Time(fn1) > string2Time(fn2)
+
+def getLastBackUpItem(douId,Type):
+    # 获取上次文件
+    matchFiles = []
+    # 文件名
+    fnMatch = r"iiid-\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}tttypeplus.csv"\
+        .replace('iiid',douId).replace('tttype',Type)
+    for _, _, files in os.walk("."):
+        for file in files:
+            # print(file)
+            if re.match(fnMatch,file):
+                matchFiles.append(file)
+    ## 得到最新的电影名
+    if len(matchFiles) != 0:
+        latest = reduce(lambda x,y: x if fileTimeCompare(x,y) else y,\
+            matchFiles)
+        with open(latest, 'r', encoding='utf-8_sig') as f:
+            reader = csv.DictReader(f)
+            # 获取第一行电影的id
+            try:
+                row = reader.__next__()
+                return row['subjectId']
+            except:
+                return None
+    else:
+        return None
+
+def getCookie(raw_cookies):
+    cookies={}
+    for line in raw_cookies.split(';'):
+        key,value=line.split('=',1)
+        cookies[key]=value
+    return cookies
+
+def getYear(raw):
+    yearRex = r'([1|2][9|0]\d{2})'
+    res = re.match(yearRex,raw)
+    try:
+        return res.group(1)
+    except:
+        return ''
+
+def getShortComments(comments):
+    res = ''
+    for com in comments:
+        # 先得到评价用户名
+        user = com.find(class_="comment-info").get_text(strip=True).replace('\xa0','').replace('\n','')
+        res += user
+        res += ':'
+        short = com.find(class_="short").get_text(strip=True).replace('\xa0','').replace('\n','')
+        res += short
+        res += '; | '
+    return res.replace("看过"," ")
\ No newline at end of file
diff --git a/code/moviev2.py b/code/moviev2.py
index e023963..a850c7b 100644
--- a/code/moviev2.py
+++ b/code/moviev2.py
@@ -1,10 +1,8 @@
 import requests, traceback
-import csv, os, os.path, re
 from bs4 import BeautifulSoup
 from time import sleep
 from random import uniform,choice
 from doubanUtils import *
-from functools import reduce
 
 headers0 = {'User-Agent':getAgent()}
 
@@ -19,12 +17,16 @@ def __init__(self,doubanid):
         self.id=doubanid
         #wish dict format: {movieid:[电影名,上映日期,导演,编剧,主演,制片国家/地区,片长,评分,评分人数,标记日期,豆瓣链接]}
         self.wish_dict={}
-        self.itemKeys=['subjectId','电影名','豆瓣链接','封面','上映日期','导演','编剧',\
+        self.itemKeys=['subjectId','电影名','年份','豆瓣链接','封面','上映日期','导演','编剧',\
             '主演','制片国家/地区','片长','豆瓣评分','评分人数','标记日期','IMDb链接',\
-            '语言','又名','类型']
+            '语言','又名','类型','短评们']
         self.sawKeys = self.itemKeys + ['用户标签','用户评分','短评']
         #saw dict format: {movieid:[电影名,上映日期,导演,编剧,主演,制片国家/地区,片长,评分,评分人数,用户评分,评论,标记日期,豆瓣链接]}
         self.saw_dict={}
+        self.proxies = {
+            'https': "http://95.179.219.61:8080",
+            'http': "http://61.7.138.240:8080"
+        }
 
     def get_soup(self,url):
         req = self.s.get(url)
@@ -57,7 +59,7 @@ def Wish(self):
 
         # 添加新特性,可以根据上次爬取历史中断重复爬取
         ## 要求上次爬取文件在当前脚本目录中
-        lastMid = self.getLastBackUpItem(Type='想看')
+        lastMid = getLastBackUpItem(self.id,'想看')
 
         # get movie name and id
         if (self.wish_store(soup.find_all(class_=['item']), lastMid) == -1):
@@ -92,13 +94,13 @@ def feature_helper(self, dic):
             count+=1
             if count%50==0:
                 sleep(15)
-            sleep(uniform(1,2.5))
+            sleep(uniform(1,2))
             timebar(30,st,count/total)
             fail.append(self.get_feature(mid,dic))
         print('\n再次尝试打开失败的电影页')
         for fmid in fail:
             if fmid!=None:
-                sleep(2)
+                sleep(1.5)
                 print()
                 self.get_feature(fmid,dic)
@@ -123,6 +125,14 @@ def get_feature(self,mid,dic):
                 dic[mid]['评分人数']=soup2.find(class_="rating_people").span.text
             except:
                 dic[mid]['评分人数']='0'
+            try:
+                dic[mid]['年份']=getYear(dic[mid]['上映日期'])
+            except:
+                try:
+                    dic[mid]['年份']=getYear(dic[mid]['首播'])
+                except:
+                    dic[mid]['年份']='...'
+            dic[mid]['短评们']=getShortComments(soup2.findAll(class_="comment"))
         except:
             print('\r打开电影页失败,失败的电影链接:'+subject_head+mid)
             self.switch_header()
@@ -167,7 +177,7 @@ def Saw(self):
 
         # 添加新特性,可以根据上次爬取历史中断重复爬取
         ## 要求上次爬取文件在当前脚本目录中
-        lastMid = self.getLastBackUpItem(Type='看过')
+        lastMid = getLastBackUpItem(self.id,'看过')
 
         #get movie name and id
         if (self.saw_store(soup.find_all(class_=['item']), lastMid) == -1):
@@ -197,10 +207,11 @@ def save_helper(self, dic, Type):
         with open(fn(self.id+'-'+getFormatTime()+Type+'plus.csv'),\
             'a',encoding='utf-8_sig') as f:
             fieldNames = self.sawKeys if Type == '看过' else self.itemKeys
-            writer = csv.DictWriter(f, fieldnames=fieldNames, restval="restval", extrasaction='ignore')
+            writer = csv.DictWriter(f, fieldnames=fieldNames, restval="...", extrasaction='ignore')
             writer.writeheader()
             for mid in dic.keys():
                 writer.writerow(dic[mid])
+        dic = {}
 
     def save_as_csv(self,choice):
         if choice in ['a','c']:
@@ -214,36 +225,9 @@ def switch_header(self):
         headers0['User-Agent']=choice(user_agent_list)
         self.s.headers.update(headers0)
 
-    def getLastBackUpItem(self,Type="想看"):
-        # 获取上次文件
-        matchFiles = []
-        # 文件名
-        fnMatch = r"iiid-\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}tttypeplus.csv"\
-            .replace('iiid',self.id).replace('tttype',Type)
-        for _, _, files in os.walk("."):
-            for file in files:
-                # print(file)
-                if re.match(fnMatch,file):
-                    matchFiles.append(file)
-        ## 得到最新的电影名
-        if len(matchFiles) != 0:
-            latest = reduce(lambda x,y: x if self.fileTimeCompare(x,y) else y,\
-                matchFiles)
-            with open(latest, 'r', encoding='utf-8_sig') as f:
-                reader = csv.DictReader(f)
-                # 获取第一行电影的id
-                try:
-                    row = reader.__next__()
-                    return row['subjectId']
-                except:
-                    return None
-        else:
-            return None
-
-    def fileTimeCompare(self, fn1, fn2):
-        fn1 = fn1.replace(".csv","").split('-',1)[1][:-6]
-        fn2 = fn2.replace(".csv","").split('-',1)[1][:-6]
-        return string2Time(fn1) > string2Time(fn2)
+    def add_cookies(self,raw_cookies):
+        cookies=getCookie(raw_cookies)
+        self.s.cookies.update(cookies)
 
 def movieMain():
     print('嘿,据说你想要备份你的豆瓣电影记录?')
@@ -256,11 +240,15 @@ def movieMain():
     if ans1=='yes':
         Douid=input('请输入你的豆瓣id: ')
         clawer=Douban_Movie(doubanid=Douid)
+        # 想要加cookies
+        if (input('想要添加cookies(爬取豆瓣隐藏条目)可以添加cookie,输入c: ').lower()=='c'):
+            raw_cookies = input("请输入cookies: ")
+            clawer.add_cookies(raw_cookies)
         print('''
-以下为选项
- A:想看列表
- B:看过列表
- C:想看+看过''')
+        以下为选项
+        A:想看列表
+        B:看过列表
+        C:想看+看过''')
         ans2=input('请输入你需要爬取的内容:')
         ans2=ans2.lower()
         if ans2=='a':
diff --git a/code/personalCrawler.py b/code/personalCrawler.py
new file mode 100644
index 0000000..b725ece
--- /dev/null
+++ b/code/personalCrawler.py
@@ -0,0 +1,15 @@
+import bookv2, moviev2, traceback
+from time import sleep
+
+if __name__ == '__main__':
+    try:
+        choice = input("图书备份请输入[b],电影备份输入[m]: ")
+        if (choice.lower() == 'b'):
+            bookv2.main()
+        elif (choice.lower() == 'm'):
+            moviev2.movieMain()
+    except Exception as e:
+        traceback.print_exc()
+        sleep(10)
+    finally:
+        over=input('按任意键退出')
\ No newline at end of file