
Commit

fix some bugs; add read and music crawlers
JimSunJing committed Jun 18, 2019
1 parent 0cbe8f7 commit d6723b5
Showing 6 changed files with 250 additions and 2 deletions.
Binary file added dist/douban_music_backup.zip
Binary file added dist/douban_read_backup.zip
136 changes: 136 additions & 0 deletions doubanbook.py
@@ -0,0 +1,136 @@
import urllib.request
from bs4 import BeautifulSoup
import re
import time
from selenium import webdriver

# Douban rejects urllib's default User-Agent, so send a browser-like one.
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

def BWappend(BWdict, Items):
    # Parse each wishlist entry into title -> [author, translator, publisher].
    for item in Items:
        title = item(href=re.compile('subject'))[0].get_text(strip=True)
        try:
            # Typical '/'-separated intro: author / translator(s) / publisher / year / price
            intro = item(class_='intro')[0].get_text(strip=True).split('/')
            author = intro[0]
            publisher = intro[-3]
            translater = '/'.join(intro[1:-3])
        except IndexError:
            # Some intros are ';'-separated instead
            intro = item(class_='intro')[0].get_text(strip=True).split(';')
            author = intro[0]
            publisher = intro[-1]
            translater = '/'.join(intro[1:-1])
        BWdict[title] = [author, translater, publisher]

def bookwish(doubanid):
    firstpage = 'https://book.douban.com/people/' + doubanid + '/wish?sort=time&start=0&filter=all&mode=list&tags_sort=count'
    request = urllib.request.urlopen(urllib.request.Request(url=firstpage, headers=headers))
    soup = BeautifulSoup(request.read(), 'html.parser')
    page = 1
    print(f'Page {page}', request.reason)
    bookwishdict = {}
    items = soup.find_all(class_='item')
    BWappend(BWdict=bookwishdict, Items=items)
    while True:
        try:
            Nextpage = 'https://book.douban.com' + soup.find(class_='next').link.get('href')
        except AttributeError:
            print('Reached the last page')
            break
        request = urllib.request.urlopen(urllib.request.Request(url=Nextpage, headers=headers))
        soup = BeautifulSoup(request.read(), 'html.parser')
        page += 1
        print(f'Page {page}', request.reason)
        BWappend(BWdict=bookwishdict, Items=soup.find_all(class_='item'))
        time.sleep(1)  # be polite to the server
    with open(doubanid + '_TOread_List.csv', 'w', encoding='utf-8-sig') as fw:
        fw.write('Title,Author,Translator,Publisher\n')
        for title, (author, translater, publisher) in bookwishdict.items():
            # Swap ASCII and full-width commas so fields stay in one CSV column
            fw.write(title.replace(',', '、').replace(',', '、') + ',' + author + ',' + translater + ',' +
                     publisher.replace(',', '、').replace(',', '、') + '\n')

def BRappend(BRdict, Items):
    # Parse each read-list entry into title -> [author, translator, publisher, stars, date, comment].
    for item in Items:
        title = item('a')[0].get_text(strip=True)
        date = item(class_=re.compile('date'))[0].get_text(strip=True)
        try:
            intro = item(class_=re.compile('intro'))[0].get_text(strip=True).split('/')
            author = intro[0]
            publisher = intro[-3]
            translater = '/'.join(intro[1:-3])
        except IndexError:
            intro = item(class_=re.compile('intro'))[0].get_text(strip=True).replace(';', '/').split('/')
            author = intro[0]
            publisher = intro[-1]
            translater = '/'.join(intro[1:-1])
        try:
            comment = item(class_=re.compile('comm'))[0].get_text(strip=True).replace('\n', '-')
        except IndexError:
            comment = 'Nah'
        try:
            # e.g. class 'rating4-t': the character at index 6 is the star count
            stars = item(class_=re.compile('rat'))[0]['class'][0][6]
        except (IndexError, KeyError):
            stars = 'Nah'
        BRdict[title] = [author, translater, publisher, stars, date, comment]

def ReadBookList(doubanid):
    mainpage = 'https://book.douban.com/people/' + doubanid
    firstpage = mainpage + '/collect?sort=time&start=0&filter=all&mode=list&tags_sort=count'
    browser = webdriver.Chrome()
    browser.get(mainpage)  # visit the profile first to establish a session
    browser.get(firstpage)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    items = soup.find_all(class_=re.compile('item'), id=re.compile('li'))
    read_book = {}
    BRappend(BRdict=read_book, Items=items)
    page = 1
    print(f'Browser processing page {page}')
    while True:
        time.sleep(2)
        try:
            NextPage = 'https://book.douban.com' + soup.find(class_='next').link.get('href')
        except AttributeError:
            print('Reached the last page')
            break
        browser.get(NextPage)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        items = soup.find_all(class_=re.compile('item'), id=re.compile('li'))
        page += 1
        print(f'Browser processing page {page}')
        BRappend(BRdict=read_book, Items=items)
    with open(doubanid + '_READ_List.csv', 'w', encoding='utf-8-sig') as fw:
        fw.write('Title,Author,Translator,Publisher,Rating,Date,Comment\n')
        for title, (author, translater, publisher, stars, date, comment) in read_book.items():
            fw.write(title.replace(',', '、').replace(',', '、') + ',' + author + ',' + translater + ',' +
                     publisher.replace(',', '、').replace(',', '、') + ',' + stars + ',' + date + ',' +
                     comment.replace(',', '、').replace(',', '、') + '\n')
    browser.quit()
    return read_book


def main():
    print('Note: this script drives Chrome automatically to fetch your Douban "read" list')
    print('''Chrome must be installed; the first run may trigger a firewall
prompt and fail, in which case allow the connection and run the script again''')
    choice = input('Confirm you want to run this script (yes/no): ')
    if choice == 'yes':
        douid = input('Enter the Douban id to back up: ')
        print('Backing up the "want to read" list')
        bookwish(doubanid=douid)
        time.sleep(2)
        print('Backing up the "read" list')
        ReadBookList(doubanid=douid)
        print('Done; the files are saved in the same directory as this exe')
        print('Feedback: [email protected] | https://github.com/JimSunJing/douban_clawer')
        input('Press Enter to exit')
    else:
        print('bye')

main()
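A note on the stars parsing in BRappend: it slices character 6 out of the rating element's first class name. A minimal sketch of the same idea, assuming Douban's list-mode rating classes look like 'rating4-t' (a regex is a touch more robust than a fixed index):

import re

# Hypothetical class attribute, as BeautifulSoup would return it.
classes = ['rating4-t']
match = re.match(r'rating(\d)', classes[0])
stars = match.group(1) if match else 'Nah'
print(stars)  # -> 4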
5 changes: 3 additions & 2 deletions doubanmovie.py
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ def TCappend(TC,titandcom):
         except:
             star='Nah'
         try:
-            comment=titandcom[i](class_=re.compile('comment'))[0].text
+            comment=titandcom[i](class_=re.compile('comment'))[0].text.replace('\n','-')
         except:
             comment='Nah'
         TC[title]=[date,star,comment]
@@ -77,6 +77,7 @@ def main():
         getWishList(doubanid=douid)
         print('Downloading movie ratings and short comments, saved as '+douid+'_Watched_List.csv')
         getSawList(doubanid=douid)
-        print('Done; report issues to: <[email protected]>')
+        print('Done; report issues to: <[email protected]> | https://github.com/JimSunJing/douban_clawer')
         end=input('Press Enter to exit')

main()
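The doubanmovie.py change mirrors the book and music scripts: flattening newlines keeps a multi-line comment on a single CSV row. A quick illustration:

comment = 'great film\nwould watch again'
print(comment.replace('\n', '-'))  # prints: great film-would watch again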
111 changes: 111 additions & 0 deletions doubanmusic.py
@@ -0,0 +1,111 @@
import urllib.request
from bs4 import BeautifulSoup
import re
import time

# Douban rejects urllib's default User-Agent, so send a browser-like one.
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

def Musappend(Mdict, Items):
    # Parse each "listened" entry into title -> [intro, date, stars, comment].
    for it in Items:
        title = it('a')[0].get_text(strip=True)
        date = it(class_=re.compile('date'))[0].get_text(strip=True)
        try:
            # e.g. class 'rating4-t': the character at index 6 is the star count
            stars = it(class_=re.compile('rat'))[0]['class'][0][6]
        except (IndexError, KeyError):
            stars = 'Nah'
        try:
            comment = it(class_=re.compile('comm'))[0].get_text(strip=True).replace('\n', '-')
        except IndexError:
            comment = 'Nah'
        intro = it(class_='intro')[0].get_text(strip=True)
        Mdict[title] = [intro, date, stars, comment]

def HeardList(doubanid):
    firstpage = 'https://music.douban.com/people/' + doubanid + '/collect?sort=time&start=0&filter=all&mode=list&tags_sort=count'
    request = urllib.request.urlopen(urllib.request.Request(url=firstpage, headers=headers))
    soup = BeautifulSoup(request.read(), 'html.parser')
    items = soup.find_all(class_=re.compile('item'), id=re.compile('li'))
    heard_dic = {}
    Musappend(Mdict=heard_dic, Items=items)
    page = 1
    print(f'Page {page}', request.reason)
    while True:
        time.sleep(1)  # be polite to the server
        try:
            # The "next" link is relative, so prefix the site root (as the book script does)
            NextPage = 'https://music.douban.com' + soup.find(class_='next').link.get('href')
        except AttributeError:
            print('Reached the last page')
            break
        request = urllib.request.urlopen(urllib.request.Request(url=NextPage, headers=headers))
        soup = BeautifulSoup(request.read(), 'html.parser')
        items = soup.find_all(class_=re.compile('item'), id=re.compile('li'))
        Musappend(Mdict=heard_dic, Items=items)
        page += 1
        print(f'Page {page}', request.reason)
    with open(doubanid + '_Heard_List.csv', 'w', encoding='utf-8-sig') as fw:
        fw.write('Album/Single,Intro,Date,Rating,Comment\n')
        for title, (intro, date, stars, comment) in heard_dic.items():
            fw.write(title.replace(',', '、').replace(',', '、') + ',' + intro.replace(',', '、').replace(',', '、') +
                     ',' + date + ',' + stars + ',' + comment.replace(',', '、').replace(',', '、') + '\n')

def WMusappend(Mdict, Items):
    # Parse each wishlist entry into title -> [intro, date, comment]; wish items carry no rating.
    for it in Items:
        title = it('a')[0].get_text(strip=True)
        date = it(class_=re.compile('date'))[0].get_text(strip=True)
        try:
            comment = it(class_=re.compile('comm'))[0].get_text(strip=True).replace('\n', '-')
        except IndexError:
            comment = 'Nah'
        intro = it(class_='intro')[0].get_text(strip=True)
        Mdict[title] = [intro, date, comment]


def WHeardList(doubanid):
    firstpage = 'https://music.douban.com/people/' + doubanid + '/wish?sort=time&start=0&filter=all&mode=list&tags_sort=count'
    request = urllib.request.urlopen(urllib.request.Request(url=firstpage, headers=headers))
    soup = BeautifulSoup(request.read(), 'html.parser')
    items = soup.find_all(class_=re.compile('item'), id=re.compile('li'))
    whear_dic = {}
    WMusappend(Mdict=whear_dic, Items=items)
    page = 1
    print(f'Page {page}', request.reason)
    while True:
        time.sleep(1)
        try:
            NextPage = 'https://music.douban.com' + soup.find(class_='next').link.get('href')
        except AttributeError:
            print('Reached the last page')
            break
        request = urllib.request.urlopen(urllib.request.Request(url=NextPage, headers=headers))
        soup = BeautifulSoup(request.read(), 'html.parser')
        items = soup.find_all(class_=re.compile('item'), id=re.compile('li'))
        WMusappend(Mdict=whear_dic, Items=items)  # wish entries have no rating column
        page += 1
        print(f'Page {page}', request.reason)
    with open(doubanid + '_MusicWish_List.csv', 'w', encoding='utf-8-sig') as fw:
        fw.write('Album/Single,Intro,Date,Comment\n')
        for title, (intro, date, comment) in whear_dic.items():
            fw.write(title.replace(',', '、').replace(',', '、') + ',' + intro.replace(',', '、').replace(',', '、') +
                     ',' + date + ',' + comment.replace(',', '、').replace(',', '、') + '\n')


def main():
    print("This program backs up a user's Douban music lists")
    choice = input('Confirm you want to back up (yes/no): ')
    if choice == 'yes':
        douid = input('Enter your Douban ID: ')
        print('Backing up the "listened" list')
        HeardList(doubanid=douid)
        time.sleep(2)
        print('Backing up the "want to listen" list')
        WHeardList(doubanid=douid)
        print('The backups are saved in the directory of this exe (if nothing went wrong)')
        print('Feedback: [email protected] | https://github.com/JimSunJing/douban_clawer')
        input('Press Enter to exit')
    else:
        print('bye')

main()
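All three scripts repeat the same pair of replace calls to keep commas out of CSV fields. A hypothetical shared helper (csv_safe is not in the repo) that would express the idea once:

def csv_safe(field):
    # Swap ASCII and full-width commas for the enumeration mark so a field stays in one CSV column.
    return field.replace(',', '、').replace(',', '、')

# e.g. csv_safe('Bach, J.S.') -> 'Bach、 J.S.'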
