Commit
fix some bugs and add the read-list and music crawlers
1 parent 0cbe8f7 · commit d6723b5
Showing 6 changed files with 250 additions and 2 deletions.
3 binary files not shown.
@@ -0,0 +1,136 @@
import urllib.request
from bs4 import BeautifulSoup
import re
import time
import random
from selenium import webdriver

headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

def BWappend(BWdict,Items):
    # Collect title, author, translator and publisher for every wish-list item.
    for i in range(len(Items)):
        try:
            title=Items[i](href=re.compile('subject'))[0].get_text(strip=True)
            intro=Items[i](class_='intro')[0].get_text(strip=True).split('/')
            author=intro[0]
            publisher=intro[-3]
            translater='/'.join(intro[1:-3])
            BWdict[title]=[author,translater,publisher]
        except:
            # Some entries separate the intro fields with ';' instead of '/'.
            title=Items[i](href=re.compile('subject'))[0].get_text(strip=True)
            intro=Items[i](class_='intro')[0].get_text(strip=True).split(';')
            author=intro[0]
            publisher=intro[-1]
            translater='/'.join(intro[1:-1])
            BWdict[title]=[author,translater,publisher]

def bookwish(doubanid):
    # Back up the "to read" (wish) list into <doubanid>_TOread_List.csv.
    firstpage='https://book.douban.com/people/'+doubanid+'/wish?sort=time&start=0&filter=all&mode=list&tags_sort=count'
    request=urllib.request.urlopen(urllib.request.Request(url=firstpage,headers=headers))  # send the User-Agent on the first request as well
    soup=BeautifulSoup(request.read(),'html.parser')
    page=1
    print(f'page {page}',request.reason)
    bookwishdict={}
    items=soup.find_all(class_='item')
    BWappend(BWdict=bookwishdict,Items=items)
    while 1:
        try:
            Nextpage='https://book.douban.com'+soup.find(class_='next').link.get('href')
        except:
            print('Reached the last page')
            break
        else:
            req=urllib.request.Request(url=Nextpage,headers=headers)
            request=urllib.request.urlopen(req)
            soup=BeautifulSoup(request.read(),'html.parser')
            page+=1
            print(f'page {page}',request.reason)
            items2=soup.find_all(class_='item')
            BWappend(BWdict=bookwishdict,Items=items2)
            time.sleep(1)
    fw=open(doubanid+'_TOread_List.csv','w',encoding='utf-8_sig')
    fw.write('Title,Author,Translator,Publisher\n')
    for title in bookwishdict.keys():
        # Swap full-width and ASCII commas for '、' so the fields do not break the CSV columns.
        fw.write(title.replace('，','、').replace(',','、')+','+bookwishdict[title][0]+\
            ','+bookwishdict[title][1]+','+bookwishdict[title][2].replace('，','、').replace(',','、')+'\n')
    fw.close()


def BRappend(BRdict,Items):
    # Collect title, author, translator, publisher, rating, date and comment for every "read" item.
    for i in range(len(Items)):
        title=Items[i]('a')[0].get_text(strip=True)
        date=Items[i](class_=re.compile('date'))[0].get_text(strip=True)
        try:
            intro=Items[i](class_=re.compile('intro'))[0].get_text(strip=True).split('/')
            author=intro[0]
            publisher=intro[-3]
            translater='/'.join(intro[1:-3])
        except:
            intro=Items[i](class_=re.compile('intro'))[0].get_text(strip=True).replace(';','/').split('/')
            author=intro[0]
            publisher=intro[-1]
            translater='/'.join(intro[1:-1])
        try:
            comment=Items[i](class_=re.compile('comm'))[0].get_text(strip=True).replace('\n','-')
        except:
            comment='Nah'
        try:
            # A rated item carries a class such as 'rating4-t'; character 6 is the star count.
            stars=Items[i](class_=re.compile('rat'))[0]['class'][0][6]
        except:
            stars='Nah'
        BRdict[title]=[author,translater,publisher,stars,date,comment]

def ReadBookList(doubanid):
    # The "read" list is fetched through a real Chrome window driven by Selenium.
    mainpage='https://book.douban.com/people/'+doubanid
    firstpage='https://book.douban.com/people/'+doubanid+'/collect?sort=time&start=0&filter=all&mode=list&tags_sort=count'
    browser = webdriver.Chrome()
    browser.get(mainpage)
    browser.get(firstpage)
    soup = BeautifulSoup(browser.page_source, "html.parser")
    items=soup.find_all(class_=re.compile('item'),id=re.compile('li'))
    read_book={}
    BRappend(BRdict=read_book,Items=items)
    page=1
    print(f"browser is processing page {page}")
    while 1:
        time.sleep(2)
        try:
            NextPage='https://book.douban.com'+soup.find(class_='next').link.get('href')
        except:
            print('Reached the last page')
            break
        else:
            browser.get(NextPage)
            soup=BeautifulSoup(browser.page_source,"html.parser")
            items=soup.find_all(class_=re.compile('item'),id=re.compile('li'))
            page+=1
            print(f"browser is processing page {page}")
            BRappend(BRdict=read_book,Items=items)
    fw=open(doubanid+'_READ_List.csv','w',encoding='utf-8_sig')
    fw.write('Title,Author,Translator,Publisher,Rating,Date,Comment\n')
    for title in read_book.keys():
        fw.write(title.replace('，','、').replace(',','、')+','+read_book[title][0]+\
            ','+read_book[title][1]+','+read_book[title][2].replace('，','、').replace(',','、')+\
            ','+read_book[title][3]+','+read_book[title][4]+','+read_book[title][5].replace('，','、').replace(',','、')+'\n')
    fw.close()
    return read_book

def main():
    print('Note: this script opens a Chrome window automatically to fetch the Douban "read" list')
    print('''Chrome must be installed on your computer.
The first run may fail because of a firewall prompt; choose to allow it and run the script again.''')
    choice=input('Are you sure you want to run this script (yes/no): ')
    if choice=='yes':
        douid=input('Enter the Douban id to back up: ')
        print('Backing up the "to read" list')
        bookwish(doubanid=douid)
        time.sleep(2)
        print('Backing up the "read" list')
        ReadBookList(doubanid=douid)
        print('Done. The files are saved in the directory of this exe')
        print('Feedback: [email protected] | https://github.com/JimSunJing/douban_clawer')
        end=input('Press any key to exit')
    else:
        print('bye')


main()
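A side note on the module-level main() call above: importing this file would start the interactive prompt immediately. If that call were guarded with the standard `if __name__ == '__main__':` idiom, the two new entry points could be reused from another script. A minimal sketch under that assumption, where douban_read.py and 'example_id' are placeholder names not taken from this commit:

# Sketch only: assumes the book crawler above is saved as douban_read.py and that
# its trailing main() call is wrapped in an `if __name__ == '__main__':` guard.
from douban_read import bookwish, ReadBookList

bookwish(doubanid='example_id')              # writes example_id_TOread_List.csv via urllib
books = ReadBookList(doubanid='example_id')  # drives Chrome via Selenium, writes example_id_READ_List.csv
print(len(books), 'read books backed up')    # ReadBookList also returns the parsed dict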
@@ -38,7 +38,7 @@ def TCappend(TC,titandcom):
        except:
            star='Nah'
        try:
            comment=titandcom[i](class_=re.compile('comment'))[0].text
            comment=titandcom[i](class_=re.compile('comment'))[0].text.replace('\n','-')
        except:
            comment='Nah'
        TC[title]=[date,star,comment]
@@ -77,6 +77,7 @@ def main():
        getWishList(doubanid=douid)
        print('Downloading movie ratings and comments, saved as '+douid+'_Watched_List.csv')
        getSawList(doubanid=douid)
        print('Done. For problems, email: <[email protected]>')
        print('Done. For problems, email: <[email protected]> | https://github.com/JimSunJing/douban_clawer')
        end=input('Press any key to exit')

main()
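The newline-to-dash change in TCappend above matters because the comment ends up as one field of a comma-separated row; a raw newline would split that row across two lines of the CSV. A small standalone illustration, not part of the commit:

comment = 'first line\nsecond line'                # a short review spanning two lines
row = 'Some Movie,5,' + comment.replace('\n', '-') # 'Some Movie' and '5' are placeholder values
print(row)                                         # Some Movie,5,first line-second line  (stays on one CSV row)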
@@ -0,0 +1,111 @@
import urllib.request
from bs4 import BeautifulSoup
import re
import time

def Musappend(Mdict,Items):
    # Collect intro, date, rating and comment for every "listened" item.
    for it in Items:
        title=it('a')[0].get_text(strip=True)
        date=it(class_=re.compile('date'))[0].get_text(strip=True)
        try:
            # A rated item carries a class such as 'rating4-t'; character 6 is the star count.
            stars=it(class_=re.compile('rat'))[0]['class'][0][6]
        except:
            stars='Nah'
        try:
            comment=it(class_=re.compile('comm'))[0].get_text(strip=True).replace('\n','-')
        except:
            comment='Nah'
        intro=it(class_='intro')[0].get_text(strip=True)
        Mdict[title]=[intro,date,stars,comment]

def HeardList(doubanid):
    # Back up the "listened" list into <doubanid>_Heard_List.csv.
    firstpage='https://music.douban.com/people/'+doubanid+'/collect?sort=time&start=0&filter=all&mode=list&tags_sort=count'
    request=urllib.request.urlopen(url=firstpage)
    soup=BeautifulSoup(request.read(),'html.parser')
    items=soup.find_all(class_=re.compile('item'),id=re.compile('li'))
    heard_dic={}
    Musappend(Mdict=heard_dic,Items=items)
    page=1
    print(f'page {page}',request.reason)
    while 1:
        time.sleep(1)
        try:
            # The 'next' link is relative, so prepend the site root (as the book crawler does).
            NextPage='https://music.douban.com'+soup.find(class_='next').link.get('href')
        except:
            print('Reached the last page')
            break
        else:
            request=urllib.request.urlopen(url=NextPage)
            soup=BeautifulSoup(request.read(),'html.parser')
            items=soup.find_all(class_=re.compile('item'),id=re.compile('li'))
            Musappend(Mdict=heard_dic,Items=items)
            page+=1
            print(f'page {page}',request.reason)
    fw=open(doubanid+'_Heard_List.csv','w',encoding='utf-8_sig')
    fw.write('Album/Single,Intro,Date,Rating,Comment\n')
    for title in heard_dic.keys():
        fw.write(title.replace('，','、').replace(',','、')+','+heard_dic[title][0].replace('，','、').replace(',','、')+\
            ','+heard_dic[title][1]+','+heard_dic[title][2]+\
            ','+heard_dic[title][3].replace('，','、').replace(',','、')+'\n')
    fw.close()

def WMusappend(Mdict,Items):
    # Collect intro, date and comment for every "want to listen" item (no rating yet).
    for it in Items:
        title=it('a')[0].get_text(strip=True)
        date=it(class_=re.compile('date'))[0].get_text(strip=True)
        try:
            comment=it(class_=re.compile('comm'))[0].get_text(strip=True).replace('\n','-')
        except:
            comment='Nah'
        intro=it(class_='intro')[0].get_text(strip=True)
        Mdict[title]=[intro,date,comment]


def WHeardList(doubanid):
    # Back up the "want to listen" (wish) list into <doubanid>_MusicWish_List.csv.
    firstpage='https://music.douban.com/people/'+doubanid+'/wish?sort=time&start=0&filter=all&mode=list&tags_sort=count'
    request=urllib.request.urlopen(url=firstpage)
    soup=BeautifulSoup(request.read(),'html.parser')
    items=soup.find_all(class_=re.compile('item'),id=re.compile('li'))
    whear_dic={}
    WMusappend(Mdict=whear_dic,Items=items)
    page=1
    print(f'page {page}',request.reason)
    while 1:
        time.sleep(1)
        try:
            NextPage='https://music.douban.com'+soup.find(class_='next').link.get('href')
        except:
            print('Reached the last page')
            break
        else:
            request=urllib.request.urlopen(url=NextPage)
            soup=BeautifulSoup(request.read(),'html.parser')
            items=soup.find_all(class_=re.compile('item'),id=re.compile('li'))
            # Use the wish-list parser here; Musappend would store a four-field row
            # and shift the CSV columns written below.
            WMusappend(Mdict=whear_dic,Items=items)
            page+=1
            print(f'page {page}',request.reason)
    fw=open(doubanid+'_MusicWish_List.csv','w',encoding='utf-8_sig')
    fw.write('Album/Single,Intro,Date,Comment\n')
    for title in whear_dic.keys():
        fw.write(title.replace('，','、').replace(',','、')+','+whear_dic[title][0].replace('，','、').replace(',','、')+\
            ','+whear_dic[title][1]+','+whear_dic[title][2].replace('，','、').replace(',','、')+'\n')
    fw.close()


def main():
    print('This program backs up your Douban music records')
    choice=input('Are you sure you want to back up (yes/no): ')
    if choice == 'yes':
        id=input('Enter your Douban ID: ')
        print('Backing up the "listened" list')
        HeardList(doubanid=id)
        time.sleep(2)
        print('Backing up the "want to listen" list')
        WHeardList(doubanid=id)
        print('The backup is saved in the directory of this exe (if nothing went wrong)')
        print('Feedback: [email protected] | https://github.com/JimSunJing/douban_clawer')
        end=input('Press any key to exit')
    else:
        print('bye')


main()
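The stars field in both new crawlers reads the rating digit out of a CSS class, relying on Douban marking a rated item with a class such as 'rating4-t' so that character 6 is the star count. A standalone illustration of that indexing, using hypothetical markup rather than a real page:

import re
from bs4 import BeautifulSoup

html = '<li class="item"><span class="rating4-t"></span></li>'   # hypothetical markup
tag = BeautifulSoup(html, 'html.parser').find(class_=re.compile('rat'))
print(tag['class'][0][6])   # prints '4' -- character 6 of 'rating4-t'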