From 3bcb685d37bfdc7b58144ce07cd1961d576490fc Mon Sep 17 00:00:00 2001
From: JimSunJing
Date: Sat, 5 Sep 2020 09:13:51 +0800
Subject: [PATCH] moviev2+bookv2 update and merge

---
 code/bookv2.py          | 108 ++++++++++++++++++++++++++--------------
 code/doubanUtils.py     |  61 ++++++++++++++++++++++-
 code/moviev2.py         |  74 ++++++++++++---------------
 code/personalCrawler.py |  15 ++++++
 4 files changed, 176 insertions(+), 82 deletions(-)
 create mode 100644 code/personalCrawler.py

diff --git a/code/bookv2.py b/code/bookv2.py
index c47d15f..8e0113d 100644
--- a/code/bookv2.py
+++ b/code/bookv2.py
@@ -1,7 +1,6 @@
-import requests
+import requests, traceback
 from bs4 import BeautifulSoup
-import re
-from time import sleep,perf_counter
+from time import sleep
 from random import uniform,choice
 from doubanUtils import *
 
@@ -15,9 +14,10 @@ def __init__(self,doubanid):
         self.id=doubanid
         #wish dict format: {bookid:[书名,作者,译者,原作名,出版社,出版年,页数,ISBN,评分,评分人数]}
         self.wish_dict={}
-        self.Keys=['书名','作者','译者','原作名',\
-            '出版社','出版年','页数','ISBN','评分','评分人数']
+        self.itemKeys=['subjectId','书名','封面','作者','译者','原作名','丛书',\
+            '出版社','出版年','页数','ISBN','评分','评分人数','标记日期','短评们']
         #saw dict format: {bookid:[书名,作者,译者,出版社,出版年,页数,ISBN,评分,评分人数,用户评分,评论,标记日期]}
+        self.sawKeys = self.itemKeys + ['用户标签','用户评分','短评']
         self.saw_dict={}
         self.head='https://book.douban.com/subject/'
 
@@ -32,18 +32,19 @@ def wish_get(self,item):
         bid = url.split('/')[-2]
         return date,name,url,bid
 
-    def wish_store(self,wishes):
+    def wish_store(self,wishes,lastBid):
         for item in wishes:
             date,name,url,bid = self.wish_get(item)
-            self.wish_dict[bid]={'书名':name,'封面':'','豆瓣链接':url,\
-                '标记日期':date,'作者':'','译者':'','原作名':'','出版社':'',\
-                '出版年':'','页数':'','ISBN':'','评分':'','评分人数':''}
+            if (lastBid == str(bid)):
+                return -1
+            self.wish_dict[bid]={'书名':name,'豆瓣链接':url,\
+                '标记日期':date,'subjectId':bid}
 
     def Wish(self):
         # 豆瓣图书反爬机制
         homepage='https://book.douban.com/people/'+self.id
         self.s.get(homepage)
-        self.s.get(homepage+'wish')
+        self.s.get(homepage+'/wish')
         print('\n开始爬取'+self.id+'的想读列表')
         beg,end = pageControl(10)
 
@@ -54,17 +55,23 @@ def Wish(self):
         soup, status = self.get_soup(firstpage)
         print(f'第{page}页',status)
 
+        lastBid = getLastBackUpItem(self.id,"想读")
+
         #get book name and id
-        self.wish_store(soup.find_all(class_='item'))
+        if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1):
+            self.feature_helper(self.wish_dict)
+            return self.wish_dict
         next_ = hasNextPage(soup)
 
         #get all wish list
         while (next_!=False) and (page < end):
-            NextPage = 'https://book.douban.com'+next
+            NextPage = 'https://book.douban.com'+next_
            soup, status = self.get_soup(NextPage)
             page += 1
             print(f'第{page}页',status)
-            self.wish_store(soup.find_all(class_='item'))
+            if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1):
+                self.feature_helper(self.wish_dict)
+                return self.wish_dict
             next_ = hasNextPage(soup)
 
         #add feature for every book
@@ -106,9 +113,13 @@ def get_feature(self,bid,dic):
                 if ':' in i :
                     i=i.replace(' ','')
                     key,value=i.split(':',1)
-                    if key in self.Keys:
-                        dic[bid][key]=value
-            dic[mid]['封面']=soup2.find('img').get('src')
+                    dic[bid][key]=value
+            dic[bid]['封面']=soup2.find('img').get('src')
+            dic[bid]['出版年']=getYear(dic[bid]['出版年'])
+            try:
+                dic[bid]['短评们']=getShortComments(soup2.findAll(class_="comment"))
+            except:
+                dic[bid]['短评们']='...'
             try:
                 dic[bid]['评分']=soup2.find(property=re.compile('average')).text.strip(' ')
             except:
@@ -117,18 +128,20 @@ def get_feature(self,bid,dic):
                 dic[bid]['评分人数']=soup2.find(class_="rating_people").span.text.strip(' ')
             except:
                 dic[bid]['评分人数']='0'
-        except:
+        except Exception as e:
             print('\r打开书籍页失败,失败的书籍链接:'+head+bid)
+            print(e)
             self.switch_header()
             return bid
 
-    def saw_store(self,saw):
+    def saw_store(self,saw,lastBid):
         for item in saw:
             date,star,comment,owntag,name,bid=self.saw_get(item)
+            if (lastBid == str(bid)):
+                return -1
             self.saw_dict[bid]={'书名':name,'封面':'','豆瓣链接':self.head+bid,\
-                '标记日期':date,'作者':'','译者':'','原作名':'','出版社':'',\
-                '出版年':'','页数':'','ISBN':'','评分':'','评分人数':'',\
-                '用户评分':star,'短评':comment,'用户标签':owntag}
+                '标记日期':date,'用户评分':star,'短评':comment,\
+                '用户标签':owntag,'subjectId':bid}
 
     def saw_get(self,saw):
         date=saw(class_=re.compile('date'))[0].get_text(strip=True)
@@ -164,8 +177,12 @@ def Saw(self):
         soup, status = self.get_soup(Sfirstpage)
         print(f'第{page}页',status)
 
+        lastBid = getLastBackUpItem(self.id,"读过")
+
         #get book name and id
-        self.saw_store(soup.find_all(class_='item'))
+        if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1):
+            self.feature_helper(self.saw_dict)
+            return self.saw_dict
         next_ = hasNextPage(soup)
 
         #get all saw list
@@ -175,21 +192,23 @@ def Saw(self):
             soup, status = self.get_soup(NextPage)
             page += 1
             print(f'第{page}页',status)
-            self.saw_store(soup.find_all(class_='item'))
+            if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1):
+                self.feature_helper(self.saw_dict)
+                return self.saw_dict
             next_ = hasNextPage(soup)
 
         #add feature for every book
         self.feature_helper(self.saw_dict)
-        return self.saw_dict
 
-    def save_helper(self, dic, save_type):
-        fw = open(fn(self.id+'-'+getFormatTime()+save_type+'plus.csv'),\
-            'a',endcoding='utf-8_sig')
-        fw.write(','.join(list(dic[list(dic.keys())[0]].keys()))+'\n')
-        for bid in dic.keys():
-            fw.write(','.join(list(map(noco, dic[bid].values())))+'\n')
-        fw.close()
+    def save_helper(self, dic, Type):
+        with open(fn(self.id+'-'+getFormatTime()+Type+'plus.csv'),\
+            'a',encoding='utf-8_sig') as f:
+            fieldNames = self.sawKeys if Type == '读过' else self.itemKeys
+            writer = csv.DictWriter(f, fieldnames=fieldNames, restval="...", extrasaction='ignore')
+            writer.writeheader()
+            for bid in dic.keys():
+                writer.writerow(dic[bid])
 
     def save_as_csv(self,choice):
         if choice in ['a','c']:
@@ -201,12 +220,17 @@ def switch_header(self):
         headers0['User-Agent']=choice(user_agent_list)
         self.s.headers.update(headers0)
 
+    def add_cookies(self,raw_cookies):
+        cookies=getCookie(raw_cookies)
+        self.s.cookies.update(cookies)
+
+
     def main(self):
         print('''
-以下为选项
- A:想读列表
- B:读过列表
- C:想读+读过''')
+        以下为选项
+        A:想读列表
+        B:读过列表
+        C:想读+读过''')
         ans2=input('请输入你需要爬取的内容:')
         ans2=ans2.lower()
         if ans2=='a':
@@ -229,11 +253,19 @@ def main():
     if ans1=='yes':
         Douid=input('请输入你的豆瓣id: ')
         clawer=Douban_Book(doubanid=Douid)
+        # book.douban.com 有反爬,需要cookies
+        print("由于豆瓣图书的防爬虫机制,需要你提供cookies")
+        raw_cookies=input('输入cookies: ')
+        clawer.add_cookies(raw_cookies)
         clawer.main()
     print('\n问题反馈:jimsun6428@gmail.com | https://github.com/JimSunJing/douban_clawer')
 
 if __name__ == '__main__':
-    main()
-    sleep(10)
-    over=input('按任意键退出')
\ No newline at end of file
+    try:
+        main()
+    except Exception as e:
+        traceback.print_exc()
+    finally:
+        sleep(10)
+        over=input('按任意键退出')
\ No newline at end of file
diff --git a/code/doubanUtils.py b/code/doubanUtils.py
index ebfbd18..8b560f9 100644
--- a/code/doubanUtils.py
+++ b/code/doubanUtils.py
@@ -1,5 +1,6 @@
 import requests
-import re
+import csv, os, os.path, re
+from functools import reduce
 from bs4 import BeautifulSoup
 from time import localtime,strftime,perf_counter,strptime
 
@@ -57,3 +58,61 @@ def getFormatTime():
 
 def string2Time(s):
     return strptime(s, '%Y-%m-%d %H-%M-%S')
+
+def fileTimeCompare(fn1, fn2):
+    fn1 = fn1.replace(".csv","").split('-',1)[1][:-6]
+    fn2 = fn2.replace(".csv","").split('-',1)[1][:-6]
+    return string2Time(fn1) > string2Time(fn2)
+
+def getLastBackUpItem(douId,Type):
+    # 获取上次文件
+    matchFiles = []
+    # 文件名
+    fnMatch = r"iiid-\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}tttypeplus.csv"\
+        .replace('iiid',douId).replace('tttype',Type)
+    for _, _, files in os.walk("."):
+        for file in files:
+            # print(file)
+            if re.match(fnMatch,file):
+                matchFiles.append(file)
+    ## 得到最新的电影名
+    if len(matchFiles) != 0:
+        latest = reduce(lambda x,y: x if fileTimeCompare(x,y) else y,\
+            matchFiles)
+        with open(latest, 'r', encoding='utf-8_sig') as f:
+            reader = csv.DictReader(f)
+            # 获取第一行电影的id
+            try:
+                row = reader.__next__()
+                return row['subjectId']
+            except:
+                return None
+    else:
+        return None
+
+def getCookie(raw_cookies):
+    cookies={}
+    for line in raw_cookies.split(';'):
+        key,value=line.split('=',1)
+        cookies[key]=value
+    return cookies
+
+def getYear(raw):
+    yearRex = r'([1|2][9|0]\d{2})'
+    res = re.match(yearRex,raw)
+    try:
+        return res.group(1)
+    except:
+        return ''
+
+def getShortComments(comments):
+    res = ''
+    for com in comments:
+        # 先得到评价用户名
+        user = com.find(class_="comment-info").get_text(strip=True).replace('\xa0','').replace('\n','')
+        res += user
+        res += ':'
+        short = com.find(class_="short").get_text(strip=True).replace('\xa0','').replace('\n','')
+        res += short
+        res += '; | '
+    return res.replace("看过"," ")
\ No newline at end of file
diff --git a/code/moviev2.py b/code/moviev2.py
index e023963..a850c7b 100644
--- a/code/moviev2.py
+++ b/code/moviev2.py
@@ -1,10 +1,8 @@
 import requests, traceback
-import csv, os, os.path, re
 from bs4 import BeautifulSoup
 from time import sleep
 from random import uniform,choice
 from doubanUtils import *
-from functools import reduce
 
 headers0 = {'User-Agent':getAgent()}
 
@@ -19,12 +17,16 @@ def __init__(self,doubanid):
         self.id=doubanid
         #wish dict format: {movieid:[电影名,上映日期,导演,编剧,主演,制片国家/地区,片长,评分,评分人数,标记日期,豆瓣链接]}
         self.wish_dict={}
-        self.itemKeys=['subjectId','电影名','豆瓣链接','封面','上映日期','导演','编剧',\
+        self.itemKeys=['subjectId','电影名','年份','豆瓣链接','封面','上映日期','导演','编剧',\
             '主演','制片国家/地区','片长','豆瓣评分','评分人数','标记日期','IMDb链接',\
-            '语言','又名','类型']
+            '语言','又名','类型','短评们']
         self.sawKeys = self.itemKeys + ['用户标签','用户评分','短评']
         #saw dict format: {movieid:[电影名,上映日期,导演,编剧,主演,制片国家/地区,片长,评分,评分人数,用户评分,评论,标记日期,豆瓣链接]}
         self.saw_dict={}
+        self.proxies = {
+            'https': "http://95.179.219.61:8080",
+            'http': "http://61.7.138.240:8080"
+        }
 
     def get_soup(self,url):
         req = self.s.get(url)
@@ -57,7 +59,7 @@ def Wish(self):
 
         # 添加新特性,可以根据上次爬取历史中断重复爬取
         ## 要求上次爬取文件在当前脚本目录中
-        lastMid = self.getLastBackUpItem(Type='想看')
+        lastMid = getLastBackUpItem(self.id,'想看')
 
         # get movie name and id
         if (self.wish_store(soup.find_all(class_=['item']), lastMid) == -1):
@@ -92,13 +94,13 @@ def feature_helper(self, dic):
             count+=1
             if count%50==0:
                 sleep(15)
-            sleep(uniform(1,2.5))
+            sleep(uniform(1,2))
             timebar(30,st,count/total)
             fail.append(self.get_feature(mid,dic))
         print('\n再次尝试打开失败的电影页')
         for fmid in fail:
             if fmid!=None:
-                sleep(2)
+                sleep(1.5)
                 print()
                 self.get_feature(fmid,dic)
@@ -123,6 +125,14 @@ def get_feature(self,mid,dic):
                 dic[mid]['评分人数']=soup2.find(class_="rating_people").span.text
             except:
                 dic[mid]['评分人数']='0'
+            try:
+                dic[mid]['年份']=getYear(dic[mid]['上映日期'])
+            except:
+                try:
+                    dic[mid]['年份']=getYear(dic[mid]['首播'])
+                except:
+                    dic[mid]['年份']='...'
+            dic[mid]['短评们']=getShortComments(soup2.findAll(class_="comment"))
         except:
             print('\r打开电影页失败,失败的电影链接:'+subject_head+mid)
             self.switch_header()
@@ -167,7 +177,7 @@ def Saw(self):
 
         # 添加新特性,可以根据上次爬取历史中断重复爬取
         ## 要求上次爬取文件在当前脚本目录中
-        lastMid = self.getLastBackUpItem(Type='看过')
+        lastMid = getLastBackUpItem(self.id,'看过')
 
         #get movie name and id
         if (self.saw_store(soup.find_all(class_=['item']), lastMid) == -1):
@@ -197,10 +207,11 @@ def save_helper(self, dic, Type):
         with open(fn(self.id+'-'+getFormatTime()+Type+'plus.csv'),\
             'a',encoding='utf-8_sig') as f:
             fieldNames = self.sawKeys if Type == '看过' else self.itemKeys
-            writer = csv.DictWriter(f, fieldnames=fieldNames, restval="restval", extrasaction='ignore')
+            writer = csv.DictWriter(f, fieldnames=fieldNames, restval="...", extrasaction='ignore')
             writer.writeheader()
             for mid in dic.keys():
                 writer.writerow(dic[mid])
+        dic = {}
 
     def save_as_csv(self,choice):
         if choice in ['a','c']:
@@ -214,36 +225,9 @@ def switch_header(self):
         headers0['User-Agent']=choice(user_agent_list)
         self.s.headers.update(headers0)
 
-    def getLastBackUpItem(self,Type="想看"):
-        # 获取上次文件
-        matchFiles = []
-        # 文件名
-        fnMatch = r"iiid-\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}tttypeplus.csv"\
-            .replace('iiid',self.id).replace('tttype',Type)
-        for _, _, files in os.walk("."):
-            for file in files:
-                # print(file)
-                if re.match(fnMatch,file):
-                    matchFiles.append(file)
-        ## 得到最新的电影名
-        if len(matchFiles) != 0:
-            latest = reduce(lambda x,y: x if self.fileTimeCompare(x,y) else y,\
-                matchFiles)
-            with open(latest, 'r', encoding='utf-8_sig') as f:
-                reader = csv.DictReader(f)
-                # 获取第一行电影的id
-                try:
-                    row = reader.__next__()
-                    return row['subjectId']
-                except:
-                    return None
-        else:
-            return None
-
-    def fileTimeCompare(self, fn1, fn2):
-        fn1 = fn1.replace(".csv","").split('-',1)[1][:-6]
-        fn2 = fn2.replace(".csv","").split('-',1)[1][:-6]
-        return string2Time(fn1) > string2Time(fn2)
+    def add_cookies(self,raw_cookies):
+        cookies=getCookie(raw_cookies)
+        self.s.cookies.update(cookies)
 
 def movieMain():
     print('嘿,据说你想要备份你的豆瓣电影记录?')
@@ -256,11 +240,15 @@ def movieMain():
     if ans1=='yes':
         Douid=input('请输入你的豆瓣id: ')
         clawer=Douban_Movie(doubanid=Douid)
+        # 想要加cookies
+        if (input('想要添加cookies(爬取豆瓣隐藏条目)可以添加cookie,输入c: ').lower()=='c'):
+            raw_cookies = input("请输入cookies: ")
+            clawer.add_cookies(raw_cookies)
         print('''
-以下为选项
- A:想看列表
- B:看过列表
- C:想看+看过''')
+        以下为选项
+        A:想看列表
+        B:看过列表
+        C:想看+看过''')
         ans2=input('请输入你需要爬取的内容:')
         ans2=ans2.lower()
         if ans2=='a':
diff --git a/code/personalCrawler.py b/code/personalCrawler.py
new file mode 100644
index 0000000..b725ece
--- /dev/null
+++ b/code/personalCrawler.py
@@ -0,0 +1,15 @@
+import bookv2, moviev2, traceback
+from time import sleep
+
+if __name__ == '__main__':
+    try:
+        choice = input("图书备份请输入[b],电影备份输入[m]: ")
+        if (choice.lower() == 'b'):
+            bookv2.main()
+        elif (choice.lower() == 'm'):
+            moviev2.movieMain()
+    except Exception as e:
+        traceback.print_exc()
+        sleep(10)
+    finally:
+        over=input('按任意键退出')
\ No newline at end of file