Commit 6e0d659

Update and rename cool18/processing_file.py to cool18  酷18/processing_file.py
Make sure the folder holding the "禁忌书屋" source pages exists on the script's path
1 parent 22da41c commit 6e0d659

1 file changed (+42 -24 lines)

cool18/processing_file.py renamed to cool18  酷18/processing_file.py (+42 -24)
@@ -16,6 +16,12 @@
 sensitive_words = f'({f.read()})'.replace('\n', '|').replace('||', '')
 sensitive_words = re.sub('\|\|$', '', sensitive_words)
 
+# 含有敏感词,返回False
+async def check_sensitive(input):
+    if re.search(f'(.*?){sensitive_words}(.*?)(作者|送达者|$)', input):
+        return False
+    else:
+        return True
 
 # 去除非法字符
 async def del_illegal_words(string):
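For reference, a minimal standalone sketch of how the added check behaves. The word list below is a placeholder (the script builds sensitive_words from a file at startup), and the coroutine has to be awaited, e.g. via asyncio.run:

import asyncio
import re

# Placeholder word list for illustration only; the real script reads it from a file.
sensitive_words = '(forbidden|banned)'

async def check_sensitive(input):
    # Returns False when any listed word appears before 作者/送达者/end of text.
    if re.search(f'(.*?){sensitive_words}(.*?)(作者|送达者|$)', input):
        return False
    else:
        return True

print(asyncio.run(check_sensitive('a perfectly ordinary title')))  # True
print(asyncio.run(check_sensitive('title with a banned word')))    # False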
@@ -37,33 +43,29 @@ async def del_illegal_words(string):
 
 # 正文去掉网页元素
 async def remove_html(text):
-    text = re.sub(r'<font(.*?)font>', '', text)
+    text = re.sub(r'<font([^\t]*)font>', '', text)
     # text = re.sub(r'<center(.*?)center>', '', text)
     text = re.sub(r'<br/>', '\n', text)
-    text = re.sub(r'<p>', '\n', text)
+    text = re.sub(r'<p(.*?)>', '\n', text)
     text = re.sub(r'</p>', '\n', text)
    text = re.sub(r'<b>(.*?)</b>', '\n', text)
     text = text.replace("<pre>", '')
+    text = text.replace("<i>", '')
     text = text.replace("</pre>", '')
     return text
 
 
-# 含有敏感词,返回False
-async def check_sensitive(input):
-    if re.search(f'(.*?){sensitive_words}(.*?)(作者|送达者|$)', input):
-        return False
-    else:
-        return True
 
 
 # 返回文件副本名
 async def copy_name(filename):
     num = 0
     while os.path.exists(filename):
-        copy_num = re.search('(.*?)((.*?)).txt', filename)
+        copy_num = re.search('(.*?) - 副本((.*?)).txt', filename)
         num += 1
         if copy_num:
             if copy_num.group(2).isnumeric():
+                num=int(copy_num.group(2))+1
                 filename = copy_num.group(1) + ' - 副本(' + str(num) + ').txt'
             else:
                 filename = filename[:-4] + ' - 副本(' + str(num) + ').txt'
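A quick, self-contained illustration of why the paragraph pattern was widened: the old r'<p>' only matched bare tags, while r'<p(.*?)>' also strips opening tags that carry attributes. The sample HTML is made up:

import re

sample = '<p style="color:red">第一段</p><br/>第二段'  # made-up snippet
sample = re.sub(r'<p(.*?)>', '\n', sample)              # now handles <p ...> with attributes
sample = re.sub(r'</p>', '\n', sample)
sample = re.sub(r'<br/>', '\n', sample)
print(sample)  # '\n第一段\n\n第二段'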
@@ -87,7 +89,7 @@ async def write_file(filename, content):
     if not similarity_ratio >= 95:
         newname = await copy_name(filename)
         # await loop.run_in_executor(None, os.rename, filename, newname)
-        async with aiofiles.open(newname, 'w', errors='ignore') as file:
+        async with aiofiles.open(newname, 'w', encoding='utf-8', errors='ignore') as file:
             await file.write(content)
         return True
     else:
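The only change in this hunk pins the output encoding. A small sketch of the same call follows (the path and content are placeholders); with no encoding argument, the underlying open() falls back to the locale's preferred encoding, which on a Chinese-locale Windows machine is typically GBK rather than UTF-8:

import asyncio
import aiofiles

async def demo():
    # 'demo.txt' and the text below are placeholders, not part of the script
    async with aiofiles.open('demo.txt', 'w', encoding='utf-8', errors='ignore') as file:
        await file.write('示例正文')

asyncio.run(demo())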
@@ -118,10 +120,10 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
         link = Currect_tid
         tid_num=int(re.search('=\d+',link).group(0)[1:])
         first_pre_article_content = await remove_html(str(content[0])) + '\n'
-        article_name = html.select('td.show_content>center>font')[0].text
+        article_name = await del_illegal_words(html.select('td.show_content>center>font')[0].text)
         # 检测文章名是否包含敏感词
         if await check_sensitive(article_name):
-            article_filename = new_dir_name+'\\'+(await del_illegal_words(article_name)) + '.txt'
+            article_filename = new_dir_name+'\\'+article_name + '.txt'
             valid = False
             if len(first_pre_article_content) > 0:
                 valid = True
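del_illegal_words itself is outside this diff, so the stand-in below is hypothetical; it only illustrates why the hunk sanitizes article_name up front, before the name is reused both for the Windows path (new_dir_name + '\\' + article_name + '.txt') and for the log messages further down:

import re

# Hypothetical stand-in for del_illegal_words (not the script's implementation):
# drop characters that Windows rejects in file names.
async def del_illegal_words_sketch(string):
    return re.sub(r'[\\/:*?"<>|\r\n]', '', string)

# e.g. '标题: a/b?' -> '标题 ab', safe to embed in new_dir_name + '\\' + name + '.txt'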
@@ -130,7 +132,7 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
             for li in follow_up: # 遍历回帖,记录较多字符的回帖
                 new_a = li.select_one('a')
                 # 检查文章大小
-                if int(byte.search(new_a.next_sibling).group(0)) > 10000:
+                if int(byte.search(new_a.next_sibling).group(1)) > 10000:
                     valid = True
                 if await check_sensitive(str(new_a.text)):
                     new_ = []
@@ -142,7 +144,7 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
                     except:
                         pass
         else:
-            if len(first_pre_article_content) < 1500: # 字数少,跳过
+            if len(first_pre_article_content) < 2000: # 字数少,跳过
                 valid = False
 
         # 该页面字数较多,存为txt文件
@@ -153,9 +155,9 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
         if follow_up_links:
             follow_up_links = follow_up_links[::-1]
             # 重命名文章名,整合文章的小说名总不能带(1)之类的字
-            bracket_exist = del_bracket.search(article_filename)
-            if bracket_exist:
-                article_filename = bracket_exist.group(1) + bracket_exist.group(5) + '.txt'
+            # bracket_exist = del_bracket.search(article_filename)
+            # if bracket_exist:
+            # article_filename = bracket_exist.group(1) + bracket_exist.group(5) + '.txt'
             # 打开回帖,将文章内容加到列表中
             for article_link in follow_up_links:
                 follow_up_file=str(re.search('\d+',article_link[1]).group(0))+'.html'
@@ -188,16 +190,14 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
             file = await write_file(article_filename, text)
 
             if not file:
-                print(' '*100,end='\r',flush=True)
                 print(
                     f'【{str(datetime.datetime.now())[:16]}】:tid={tid_num} 保存过了 '
-                    f'无需保存 {article_name}',end='\r',flush=True
+                    f'无需保存 {article_name}'
                 )
             else:
-                print(' '*100,end='\r',flush=True)
                 print(
                     f'【{str(datetime.datetime.now())[:16]}】:tid={tid_num} 已保存 '
-                    f'已保存 {article_name}',end='\r',flush=True
+                    f' {article_name}'
                 )
         # os.exit(0)
     except:
@@ -238,7 +238,7 @@ async def consumer(file_name,new_dir_name,semaphore,byte,del_bracket,index):
 
 
 async def consumers(concurrency_num: int,new_dir_name):
-    byte = re.compile('\d+')
+    byte = re.compile('\((\d+) bytes\)')
     del_bracket = re.compile('(.*?)(\(|()(.*?)(\)|))(.*?)$')
     index = 'https://www.cool18.com/bbs4/'
 
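The new byte pattern pairs with the earlier .group(0) to .group(1) change: group(0) is the whole '(... bytes)' match and cannot be passed to int(), so the size check reads the captured digits instead. The sibling string below is illustrative:

import re

byte = re.compile('\((\d+) bytes\)')
sibling = ' - 某个回帖标题 (12345 bytes)'  # made-up example of new_a.next_sibling
match = byte.search(sibling)
print(match.group(0))       # '(12345 bytes)'
print(int(match.group(1)))  # 12345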
@@ -259,8 +259,8 @@ async def consumers(concurrency_num: int,new_dir_name):
 
 
 async def main():
-    # 限制并行访问量
-    concurrency_num = 3000
+    # 限制并行访问量为100
+    concurrency_num = 1000
 
     new_dir_name=f'禁忌书屋小说 {str(datetime.datetime.now())[:10]}'
     if not os.path.exists(new_dir_name):
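consumer() already takes a semaphore argument (see the hunk above), so concurrency_num presumably caps in-flight requests through an asyncio.Semaphore. The snippet below is a generic illustration of that pattern, not the script's exact code:

import asyncio

async def fetch_one(i, semaphore):
    async with semaphore:             # at most concurrency_num tasks run here at once
        await asyncio.sleep(0.01)     # stand-in for one network request
        return i

async def run_all(concurrency_num=1000):
    semaphore = asyncio.Semaphore(concurrency_num)
    return await asyncio.gather(*(fetch_one(i, semaphore) for i in range(50)))

print(len(asyncio.run(run_all())))    # 50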
@@ -275,6 +275,24 @@ async def main():
     print(f'【{str(datetime.datetime.now())[:16]}】:目录已创建')
     await consumers(concurrency_num,new_dir_name)
 
+async def main1():
+    # 限制并行访问量为100
+    concurrency_num = 1000
+
+    new_dir_name=f'禁忌书屋小说 {str(datetime.datetime.now())[:10]}'
+    if not os.path.exists(new_dir_name):
+        os.mkdir(new_dir_name)
+    target_path='禁忌书屋'
+    temp_path=r'C:\Users\li\PycharmProjects\禁忌书屋'
+    temp_new_dir=r'C:\Users\li\Downloads\禁忌书屋'
+    if os.path.exists(temp_path):
+        os.chdir(temp_path)
+    else:
+        print('禁忌书屋目录并不存在!请确保你已完整保存禁忌书屋的全部帖子')
+        sys.exit()
+
+    print(f'【{str(datetime.datetime.now())[:16]}】:目录已创建')
+    await consumers(concurrency_num,temp_new_dir)
 
 
 if __name__ == '__main__':
