-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmybbllib.py
170 lines (139 loc) · 7.98 KB
/
mybbllib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
class bbllib:
    '''
    Helpers that extract book metadata (authors, rating, comments, cover,
    publisher/ISBN/date, tags) from a parsed book page.

    The ``soup`` arguments are used through ``select_one()`` / ``select()``
    with CSS selectors, i.e. they are expected to behave like BeautifulSoup
    objects.

    This class defines no ``__init__``. The instance methods read the
    following attributes, which must be set by whoever builds the instance:
    ``log`` (object with an ``info(*args)`` method), ``who`` (string prefix
    for log lines), ``debug`` (flag enabling verbose logging), ``dbg_lvl``
    and ``br`` (both forwarded untouched to ``ret_soup``), and
    ``tag_genre`` / ``tag_theme`` / ``tag_lieu`` / ``tag_quand`` (how many
    tag groups to keep per category, falsy meaning none).

    The names ``datetime``, ``ret_soup``, ``check_isbn`` and ``fixcase`` are
    not defined in this class and must be available at module level.
    '''

    @staticmethod
    def parse_authors(soup):
        '''
        Return the list of author names found inside the ".livre_con" element.

        Declared as a static method because it needs no instance state; it
        can be called either on the class or on an instance.

        Each name has its internal whitespace collapsed to single spaces.
        Raises AttributeError if ".livre_con", or the name span of an author,
        is missing (select_one() returns None).
        '''
        sub_soup = soup.select_one(".livre_con")
        bbl_authors = []
        for author_soup in sub_soup.select('span[itemprop="author"]'):
            name_soup = author_soup.select_one('span[itemprop="name"]')
            # split() + join() collapses newlines and runs of blanks
            bbl_authors.append(" ".join(name_soup.text.split()))
        return bbl_authors

    def parse_rating(self, soup):
        '''
        Return (rating, rating_count) read from the "aggregateRating" span.

        rating is a float taken from the "ratingValue" span, rating_count an
        int taken from the "ratingCount" span.
        Raises AttributeError if one of the spans is missing and ValueError
        if their text is not a number.
        '''
        # look the container up once, both values live inside it
        aggregate_soup = soup.select_one('span[itemprop="aggregateRating"]')
        rating_soup = aggregate_soup.select_one('span[itemprop="ratingValue"]')
        bbl_rating = float(rating_soup.text.strip())
        rating_cnt_soup = aggregate_soup.select_one('span[itemprop="ratingCount"]')
        bbl_rating_cnt = int(rating_cnt_soup.text.strip())
        if self.debug:
            self.log.info(self.who,"parse_rating() returns bbl_rating : {}, bbl_rating_cnt : {}".format(bbl_rating, bbl_rating_cnt))
        return bbl_rating, bbl_rating_cnt

    def parse_comments(self, soup):
        '''
        Return the element holding the book summary.

        The ".livre_resume" element is returned as is, unless it contains a
        link carrying an "onclick" attribute. In that case the 2nd and 3rd
        arguments of the onclick call are posted as "type" and "id_obj" to
        the "aj_voir_plus_a.php" page through ret_soup(), and the first item
        of what ret_soup() returns is returned instead.

        Raises AttributeError if ".livre_resume" is missing and IndexError if
        the onclick call has fewer than three arguments.
        '''
        self.log.info("\n"+self.who,"in parse_comments(self, soup)")
        comments_soup = soup.select_one('.livre_resume')
        more_link = comments_soup.select_one('a[onclick]')
        if more_link:
            onclick = more_link['onclick']
            if self.debug:
                self.log.info(self.who,"onclick : ",onclick)
            # keep what sits between the last "(" and the following ")"
            onclick_args = onclick.split("(")[-1].split(")")[0].split(",")
            rkt = {"type":onclick_args[1],"id_obj":onclick_args[2]}
            url = "https://www.babelio.com/aj_voir_plus_a.php"
            if self.debug:
                self.log.info(self.who,"calling ret_soup(log, dbg_lvl, br, url, rkt=rkt, who=self.who")
                self.log.info(self.who,"url : ",url)
                self.log.info(self.who,"rkt : ",rkt)
            comments_soup = ret_soup(self.log, self.dbg_lvl, self.br, url, rkt=rkt, who=self.who)[0]
        return comments_soup

    def parse_cover(self, soup):
        '''
        Return the cover address, i.e. the "href" of the
        'link[rel="image_src"]' element.

        Raises TypeError if that element is missing (select_one() returns
        None) and KeyError if it has no "href".
        '''
        self.log.info("\n"+self.who,"in parse_cover(self, soup)")
        cover_soup = soup.select_one('link[rel="image_src"]')
        bbl_cover = cover_soup['href']
        if self.debug:
            self.log.info(self.who,'parse_cover() returns bbl_cover : ', bbl_cover)
        return bbl_cover

    @staticmethod
    def _parse_pubdate(text):
        '''
        Return the datetime for a "day/month/year" text, or None.

        Parentheses are dropped before parsing. None is returned when the
        text is not made of three "/" separated parts, when the year is not
        greater than 1700, or when it is not a valid calendar date.
        '''
        cleaned = text.strip().replace("(","").replace(")","")
        parts = cleaned.split("/")
        if len(parts) != 3:
            return None
        try:
            # reject year -1, assumes no book with date < 1700
            if int(parts[2]) <= 1700:
                return None
            # %d is the day of the month; %j (day of the year) would make
            # strptime ignore the month and always answer a date in January
            return datetime.datetime.strptime(cleaned,"%d/%m/%Y")
        except ValueError:
            # some text that merely contains "/" : not a date
            return None

    def parse_meta(self, soup):
        '''
        Return (isbn, publisher, pubdate) read from the
        ".livre_refs.grey_light" element. Each item is None when not found.

        publisher is the text of the first link whose href starts with
        "/editeur". isbn is what check_isbn() returns for the last word of a
        text fragment containing "EAN". pubdate is a naive datetime parsed
        from a text fragment formatted as day/month/year; fragments holding
        a "/" that are not a valid date are ignored.

        Raises AttributeError if ".livre_refs.grey_light" is missing.
        '''
        self.log.info("\n"+self.who,"in parse_meta(self, soup)")
        # note: when a class name contains white characters use a dot instead of the space
        # (blank means 2 subsequent classes for css selector)
        meta_soup = soup.select_one(".livre_refs.grey_light")

        bbl_publisher = None
        publisher_soup = meta_soup.select_one('a[href^="/editeur"]')
        if publisher_soup:
            bbl_publisher = publisher_soup.text.strip()
            if self.debug:
                self.log.info(self.who,"bbl_publisher processed : ", bbl_publisher)

        bbl_isbn, bbl_pubdate = None, None
        for mta in meta_soup.stripped_strings:
            if "EAN" in mta:
                bbl_isbn = check_isbn(mta.split()[-1])
                if self.debug:
                    self.log.info(self.who,"bbl_isbn processed : ", bbl_isbn)
            elif "/" in mta:
                pubdate = self._parse_pubdate(mta)
                if pubdate is not None:
                    bbl_pubdate = pubdate
                    if self.debug:
                        self.log.info(self.who,"bbl_pubdate processed : ", bbl_pubdate)

        if self.debug:
            self.log.info(self.who,'parse_meta() returns bbl_isbn, bbl_publisher, bbl_pubdate : '
                , bbl_isbn, bbl_publisher, bbl_pubdate)
        return bbl_isbn, bbl_publisher, bbl_pubdate

    def parse_tags(self, soup):
        '''
        Return the list of tags found in the ".tags" element, filtered by
        category and relevance, each one passed through fixcase().

        Every link of ".tags" carries two CSS classes. The last character of
        the second one is read as a category index (0 to 3, matched in that
        order with tag_genre, tag_theme, tag_lieu, tag_quand). The first one
        is used as a grouping key inside the category, presumably a relevance
        level (to be confirmed against the page markup).
        For each category the groups are sorted by key and only the last
        ``self.tag_<category>`` groups are kept; a falsy value drops the
        whole category.

        Raises AttributeError if ".tags" is missing, IndexError if a link has
        fewer than two classes and ValueError if the category index is not a
        digit.
        '''
        self.log.info("\n"+self.who,"in parse_tags(self, soup)")
        # one {group key: [tag, ...]} dictionary per category
        tags_by_category = [{}, {}, {}, {}]
        for tag_link in soup.select_one('.tags').select('a'):
            tag_classes = tag_link['class']
            group_key, category_class = tag_classes[0], tag_classes[1]
            category = int(category_class[-1])
            # links pointing to an unknown category are ignored
            if 0 <= category < len(tags_by_category):
                tags_by_category[category].setdefault(group_key, []).append(tag_link.text.strip())

        wanted = (self.tag_genre, self.tag_theme, self.tag_lieu, self.tag_quand)
        bbl_tags = []
        for groups, keep in zip(tags_by_category, wanted):
            if not keep:
                continue
            # keys are unique, so the items are sorted on the group key only
            for _, tag_values in sorted(groups.items())[-keep:]:
                bbl_tags.extend(tag_values)

        bbl_tags = list(map(fixcase, bbl_tags))
        if self.debug:
            self.log.info(self.who,"parse_tags() return bbl_tags", bbl_tags)
        return bbl_tags