moviev2+bookv2 update and merge
JimSunJing committed Sep 5, 2020
1 parent 3e34833 commit 3bcb685
Showing 4 changed files with 176 additions and 82 deletions.
108 changes: 70 additions & 38 deletions code/bookv2.py
@@ -1,7 +1,6 @@
import requests
import requests, traceback
from bs4 import BeautifulSoup
import re
from time import sleep,perf_counter
from time import sleep
from random import uniform,choice
from doubanUtils import *

@@ -15,9 +14,10 @@ def __init__(self,doubanid):
self.id=doubanid
#wish dict format: {bookid:[书名,作者,译者,原作名,出版社,出版年,页数,ISBN,评分,评分人数]}
self.wish_dict={}
self.Keys=['书名','作者','译者','原作名',\
'出版社','出版年','页数','ISBN','评分','评分人数']
self.itemKeys=['subjectId','书名','封面','作者','译者','原作名','丛书',\
'出版社','出版年','页数','ISBN','评分','评分人数','标记日期','短评们']
#saw dict format: {bookid:[书名,作者,译者,出版社,出版年,页数,ISBN,评分,评分人数,用户评分,评论,标记日期]}
self.sawKeys = self.itemKeys + ['用户标签','用户评分','短评']
self.saw_dict={}
self.head='https://book.douban.com/subject/'

@@ -32,18 +32,19 @@ def wish_get(self,item):
bid = url.split('/')[-2]
return date,name,url,bid

def wish_store(self,wishes):
def wish_store(self,wishes,lastBid):
for item in wishes:
date,name,url,bid = self.wish_get(item)
self.wish_dict[bid]={'书名':name,'封面':'','豆瓣链接':url,\
'标记日期':date,'作者':'','译者':'','原作名':'','出版社':'',\
'出版年':'','页数':'','ISBN':'','评分':'','评分人数':''}
if (lastBid == str(bid)):
return -1
self.wish_dict[bid]={'书名':name,'豆瓣链接':url,\
'标记日期':date,'subjectId':bid}

def Wish(self):
# work around the Douban Books anti-crawler mechanism
homepage='https://book.douban.com/people/'+self.id
self.s.get(homepage)
self.s.get(homepage+'wish')
self.s.get(homepage+'/wish')

print('\n开始爬取'+self.id+'的想读列表')
beg,end = pageControl(10)
@@ -54,17 +55,23 @@ def Wish(self):
soup, status = self.get_soup(firstpage)
print(f'第{page}页',status)

lastBid = getLastBackUpItem(self.id,"想读")

#get book name and id
self.wish_store(soup.find_all(class_='item'))
if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1):
self.feature_helper(self.wish_dict)
return self.wish_dict
next_ = hasNextPage(soup)

#get all wish list
while (next_!=False) and (page < end):
NextPage = 'https://book.douban.com'+next
NextPage = 'https://book.douban.com'+next_
soup, status = self.get_soup(NextPage)
page += 1
print(f'第{page}页',status)
self.wish_store(soup.find_all(class_='item'))
if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1):
self.feature_helper(self.wish_dict)
return self.wish_dict
next_ = hasNextPage(soup)

#add feature for every book
@@ -106,9 +113,13 @@ def get_feature(self,bid,dic):
if ':' in i :
i=i.replace(' ','')
key,value=i.split(':',1)
if key in self.Keys:
dic[bid][key]=value
dic[mid]['封面']=soup2.find('img').get('src')
dic[bid][key]=value
dic[bid]['封面']=soup2.find('img').get('src')
dic[bid]['出版年']=getYear(dic[bid]['出版年'])
try:
dic[bid]['短评们']=getShortComments(soup2.findAll(class_="comment"))
except:
dic[bid]['短评们']='...'
try:
dic[bid]['评分']=soup2.find(property=re.compile('average')).text.strip(' ')
except:
@@ -117,18 +128,20 @@ def get_feature(self,bid,dic):
dic[bid]['评分人数']=soup2.find(class_="rating_people").span.text.strip(' ')
except:
dic[bid]['评分人数']='0'
except:
except Exception as e:
print('\r打开书籍页失败,失败的书籍链接:'+head+bid)
print(e)
self.switch_header()
return bid

def saw_store(self,saw):
def saw_store(self,saw,lastBid):
for item in saw:
date,star,comment,owntag,name,bid=self.saw_get(item)
if (lastBid == str(bid)):
return -1
self.saw_dict[bid]={'书名':name,'封面':'','豆瓣链接':self.head+bid,\
'标记日期':date,'作者':'','译者':'','原作名':'','出版社':'',\
'出版年':'','页数':'','ISBN':'','评分':'','评分人数':'',\
'用户评分':star,'短评':comment,'用户标签':owntag}
'标记日期':date,'用户评分':star,'短评':comment,\
'用户标签':owntag,'subjectId':bid}

def saw_get(self,saw):
date=saw(class_=re.compile('date'))[0].get_text(strip=True)
@@ -164,8 +177,12 @@ def Saw(self):
soup, status = self.get_soup(Sfirstpage)
print(f'第{page}页',status)

lastBid = getLastBackUpItem(self.id,"读过")

#get book name and id
self.saw_store(soup.find_all(class_='item'))
if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1):
self.feature_helper(self.saw_dict)
return self.saw_dict
next_ = hasNextPage(soup)

#get all saw list
@@ -175,21 +192,23 @@ def Saw(self):
soup, status = self.get_soup(NextPage)
page += 1
print(f'第{page}页',status)
self.saw_store(soup.find_all(class_='item'))
if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1):
self.feature_helper(self.saw_dict)
return self.saw_dict
next_ = hasNextPage(soup)

#add feature for every book
self.feature_helper(self.saw_dict)

return self.saw_dict

def save_helper(self, dic, save_type):
fw = open(fn(self.id+'-'+getFormatTime()+save_type+'plus.csv'),\
'a',endcoding='utf-8_sig')
fw.write(','.join(list(dic[list(dic.keys())[0]].keys()))+'\n')
for bid in dic.keys():
fw.write(','.join(list(map(noco, dic[bid].values())))+'\n')
fw.close()
def save_helper(self, dic, Type):
with open(fn(self.id+'-'+getFormatTime()+Type+'plus.csv'),\
'a',encoding='utf-8_sig') as f:
fieldNames = self.sawKeys if Type == '读过' else self.itemKeys
writer = csv.DictWriter(f, fieldnames=fieldNames, restval="...", extrasaction='ignore')
writer.writeheader()
for bid in dic.keys():
writer.writerow(dic[bid])

def save_as_csv(self,choice):
if choice in ['a','c']:
@@ -201,12 +220,17 @@ def switch_header(self):
headers0['User-Agent']=choice(user_agent_list)
self.s.headers.update(headers0)

def add_cookies(self,raw_cookies):
cookies=getCookie(raw_cookies)
self.s.cookies.update(cookies)


def main(self):
print('''
以下为选项
A:想读列表
B:读过列表
C:想读+读过''')
以下为选项
A:想读列表
B:读过列表
C:想读+读过''')
ans2=input('请输入你需要爬取的内容:')
ans2=ans2.lower()
if ans2=='a':
@@ -229,11 +253,19 @@ def main():
if ans1=='yes':
Douid=input('请输入你的豆瓣id: ')
clawer=Douban_Book(doubanid=Douid)
# book.douban.com has anti-crawler protection; cookies are required
print("由于豆瓣图书的防爬虫机制,需要你提供cookies")
raw_cookies=input('输入cookies: ')
clawer.add_cookies(raw_cookies)
clawer.main()
print('\n问题反馈:[email protected] | https://github.com/JimSunJing/douban_clawer')


if __name__ == '__main__':
main()
sleep(10)
over=input('按任意键退出')
try:
main()
except Exception as e:
traceback.print_exc()
finally:
sleep(10)
over=input('按任意键退出')
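
With this change bookv2.py only works with a logged-in session, so the crawler now asks for a raw cookie string up front and feeds it through add_cookies/getCookie. A minimal usage sketch, assuming bookv2.py is run from the code/ directory; the douban id and cookie values below are placeholders, not real credentials:

# hypothetical usage sketch; the id and cookie string are placeholders
from bookv2 import Douban_Book

crawler = Douban_Book(doubanid='your-douban-id')
# raw cookies are copied from the browser; getCookie() splits them on ';' and '='
raw_cookies = 'bid=abc123;dbcl2="12345678:deadbeef";ck=XYZ'
crawler.add_cookies(raw_cookies)
crawler.main()   # prompts for 想读 / 读过 / both, then writes a timestamped *plus.csv backup
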
61 changes: 60 additions & 1 deletion code/doubanUtils.py
@@ -1,5 +1,6 @@
import requests
import re
import csv, os, os.path, re
from functools import reduce
from bs4 import BeautifulSoup
from time import localtime,strftime,perf_counter,strptime

@@ -57,3 +58,61 @@ def getFormatTime():

def string2Time(s):
return strptime(s, '%Y-%m-%d %H-%M-%S')

def fileTimeCompare(fn1, fn2):
fn1 = fn1.replace(".csv","").split('-',1)[1][:-6]
fn2 = fn2.replace(".csv","").split('-',1)[1][:-6]
return string2Time(fn1) > string2Time(fn2)

def getLastBackUpItem(douId,Type):
# locate the previous backup file
matchFiles = []
# expected backup file name pattern
fnMatch = r"iiid-\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}tttypeplus.csv"\
.replace('iiid',douId).replace('tttype',Type)
for _, _, files in os.walk("."):
for file in files:
# print(file)
if re.match(fnMatch,file):
matchFiles.append(file)
## pick the most recent backup file
if len(matchFiles) != 0:
latest = reduce(lambda x,y: x if fileTimeCompare(x,y) else y,\
matchFiles)
with open(latest, 'r', encoding='utf-8_sig') as f:
reader = csv.DictReader(f)
# read the subject id from the first row
try:
row = reader.__next__()
return row['subjectId']
except:
return None
else:
return None

def getCookie(raw_cookies):
cookies={}
for line in raw_cookies.split(';'):
key,value=line.split('=',1)
cookies[key]=value
return cookies

def getYear(raw):
yearRex = r'([1|2][9|0]\d{2})'
res = re.match(yearRex,raw)
try:
return res.group(1)
except:
return ''

def getShortComments(comments):
res = ''
for com in comments:
# first grab the commenter's user name
user = com.find(class_="comment-info").get_text(strip=True).replace('\xa0','').replace('\n','')
res += user
res += ':'
short = com.find(class_="short").get_text(strip=True).replace('\xa0','').replace('\n','')
res += short
res += '; | '
return res.replace("看过"," ")
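
The new helpers in doubanUtils.py are what make the early exit in wish_store/saw_store work: getLastBackUpItem matches the newest '<doubanid>-<timestamp><type>plus.csv' backup in the working directory and returns the subjectId of its first row, and the store methods stop as soon as that id reappears. A rough sketch of the same stop condition, assuming a pre-parsed items list rather than the real BeautifulSoup tags:

# illustrative only; `items` stands in for the parsed entries of one list page
from doubanUtils import getLastBackUpItem

last_bid = getLastBackUpItem('your-douban-id', '想读')   # None when no earlier backup exists

def store_until_known(items, last_bid, wish_dict):
    # mirrors wish_store: stop before adding the first already-backed-up subject
    for date, name, url, bid in items:
        if last_bid == str(bid):
            return -1   # caller then fetches features for what it has and returns
        wish_dict[bid] = {'书名': name, '豆瓣链接': url, '标记日期': date, 'subjectId': bid}

In effect, a rerun only crawls items marked since the last backup instead of walking the whole list again.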