sensitive_words = f'({f.read()})'.replace('\n', '|').replace('||', '')
sensitive_words = re.sub(r'\|\|$', '', sensitive_words)

+# Contains a sensitive word: return False
+async def check_sensitive(input):
+    if re.search(f'(.*?){sensitive_words}(.*?)(作者|送达者|$)', input):
+        return False
+    else:
+        return True
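+# Note: both (.*?) groups may be empty, so any blacklisted word that appears
+# before 作者, 送达者, or the end of the text is enough to reject the input.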


# Strip characters that are illegal in filenames
async def del_illegal_words(string):
@@ -37,33 +43,29 @@ async def del_illegal_words(string):

# Strip web-page elements from the body text
async def remove_html(text):
-    text = re.sub(r'<font(.*?)font>', '', text)
+    text = re.sub(r'<font([^\t]*)font>', '', text)
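+    # the greedy [^\t]* now spans to the last closing font> tag (tabs excluded),
+    # where the old non-greedy (.*?) stopped at the first one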
    # text = re.sub(r'<center(.*?)center>', '', text)
    text = re.sub(r'<br/>', '\n', text)
-    text = re.sub(r'<p>', '\n', text)
+    text = re.sub(r'<p(.*?)>', '\n', text)
    text = re.sub(r'</p>', '\n', text)
    text = re.sub(r'<b>(.*?)</b>', '\n', text)
    text = text.replace("<pre>", '')
+    text = text.replace("<i>", '')
    text = text.replace("</pre>", '')
    return text


-# Contains a sensitive word: return False
-async def check_sensitive(input):
-    if re.search(f'(.*?){sensitive_words}(.*?)(作者|送达者|$)', input):
-        return False
-    else:
-        return True


# Build a "copy" name for an existing file
async def copy_name(filename):
    num = 0
    while os.path.exists(filename):
-        copy_num = re.search('(.*?)((.*?)).txt', filename)
+        copy_num = re.search('(.*?) - 副本((.*?)).txt', filename)
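+        # only names already ending in " - 副本(N).txt" are treated as existing copies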
        num += 1
        if copy_num:
            if copy_num.group(2).isnumeric():
+                num = int(copy_num.group(2)) + 1
            filename = copy_num.group(1) + ' - 副本(' + str(num) + ').txt'
        else:
            filename = filename[:-4] + ' - 副本(' + str(num) + ').txt'
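    # e.g. 'foo.txt' -> 'foo - 副本(1).txt' -> 'foo - 副本(2).txt' while the name is still taken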
@@ -87,7 +89,7 @@ async def write_file(filename, content):
    if not similarity_ratio >= 95:
        newname = await copy_name(filename)
        # await loop.run_in_executor(None, os.rename, filename, newname)
-        async with aiofiles.open(newname, 'w', errors='ignore') as file:
+        async with aiofiles.open(newname, 'w', encoding='utf-8', errors='ignore') as file:
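+        # explicit utf-8 avoids the platform-default codec (e.g. GBK on Chinese Windows)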
            await file.write(content)
        return True
    else:
@@ -118,10 +120,10 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
    link = Currect_tid
    tid_num = int(re.search(r'=\d+', link).group(0)[1:])
    first_pre_article_content = await remove_html(str(content[0])) + '\n'
-    article_name = html.select('td.show_content>center>font')[0].text
+    article_name = await del_illegal_words(html.select('td.show_content>center>font')[0].text)
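+    # sanitizing at the source keeps every later use of article_name filesystem-safe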
    # Check whether the article title contains sensitive words
    if await check_sensitive(article_name):
-        article_filename = new_dir_name + '\\' + (await del_illegal_words(article_name)) + '.txt'
+        article_filename = new_dir_name + '\\' + article_name + '.txt'
        valid = False
        if len(first_pre_article_content) > 0:
            valid = True
@@ -130,7 +132,7 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
        for li in follow_up:  # walk the replies, recording the ones with more text
            new_a = li.select_one('a')
            # Check the article size
-            if int(byte.search(new_a.next_sibling).group(0)) > 10000:
+            if int(byte.search(new_a.next_sibling).group(1)) > 10000:
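+                # byte now captures just the digit run of "(NNN bytes)", hence group(1)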
                valid = True
                if await check_sensitive(str(new_a.text)):
                    new_ = []
@@ -142,7 +144,7 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
        except:
            pass
        else:
-            if len(first_pre_article_content) < 1500:  # too few characters, skip
+            if len(first_pre_article_content) < 2000:  # too few characters, skip
                valid = False

    # The page has plenty of text: save it as a txt file
@@ -153,9 +155,9 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
    if follow_up_links:
        follow_up_links = follow_up_links[::-1]
        # Rename the article: the merged novel's filename should not carry markers like (1)
-        bracket_exist = del_bracket.search(article_filename)
-        if bracket_exist:
-            article_filename = bracket_exist.group(1) + bracket_exist.group(5) + '.txt'
+        # bracket_exist = del_bracket.search(article_filename)
+        # if bracket_exist:
+        #     article_filename = bracket_exist.group(1) + bracket_exist.group(5) + '.txt'
        # Open each reply and append its content to the list
        for article_link in follow_up_links:
            follow_up_file = str(re.search(r'\d+', article_link[1]).group(0)) + '.html'
@@ -188,16 +190,14 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
        file = await write_file(article_filename, text)

        if not file:
-            print(' ' * 100, end='\r', flush=True)
            print(
                f'【{str(datetime.datetime.now())[:16]}】:tid={tid_num} 保存过了 '
-                f'无需保存 {article_name}', end='\r', flush=True
+                f'无需保存 {article_name}'
            )
        else:
-            print(' ' * 100, end='\r', flush=True)
            print(
                f'【{str(datetime.datetime.now())[:16]}】:tid={tid_num} 已保存 '
-                f'已保存 {article_name}', end='\r', flush=True
+                f' {article_name}'
            )
        # os.exit(0)
    except:
@@ -238,7 +238,7 @@ async def consumer(file_name,new_dir_name,semaphore,byte,del_bracket,index):


async def consumers(concurrency_num: int, new_dir_name):
-    byte = re.compile(r'\d+')
+    byte = re.compile(r'\((\d+) bytes\)')
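+    # matches size markers like "(12345 bytes)" next to each reply link; the digits are group(1)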
    del_bracket = re.compile(r'(.*?)(\(|()(.*?)(\)|))(.*?)$')
    index = 'https://www.cool18.com/bbs4/'
@@ -259,8 +259,8 @@ async def consumers(concurrency_num: int,new_dir_name):


async def main():
-    # Cap the number of concurrent requests
-    concurrency_num = 3000
+    # Cap the number of concurrent requests at 1000
+    concurrency_num = 1000

    new_dir_name = f'禁忌书屋小说{str(datetime.datetime.now())[:10]}'
    if not os.path.exists(new_dir_name):
@@ -275,6 +275,24 @@ async def main():
    print(f'【{str(datetime.datetime.now())[:16]}】:目录已创建')
    await consumers(concurrency_num, new_dir_name)

+async def main1():
+    # Cap the number of concurrent requests at 1000
+    concurrency_num = 1000
+
+    new_dir_name = f'禁忌书屋小说{str(datetime.datetime.now())[:10]}'
+    if not os.path.exists(new_dir_name):
+        os.mkdir(new_dir_name)
+    target_path = '禁忌书屋'
+    temp_path = r'C:\Users\li\PycharmProjects\禁忌书屋'
+    temp_new_dir = r'C:\Users\li\Downloads\禁忌书屋'
+    if os.path.exists(temp_path):
+        os.chdir(temp_path)
+    else:
+        print('禁忌书屋目录并不存在!请确保你已完整保存禁忌书屋的全部帖子')
+        sys.exit()
+
+    print(f'【{str(datetime.datetime.now())[:16]}】:目录已创建')
+    await consumers(concurrency_num, temp_new_dir)


if __name__ == '__main__':