-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsangtacvietToTxt.py
82 lines (65 loc) · 2.45 KB
/
sangtacvietToTxt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# coding=utf-8
import re
import sys
import json
from Lib.Epub import Epub
def remove_html_tags(text):
"匹配并剔除除了 <img> 标签之外的所有 HTML 标签"
pattern = r'<(?!img\b)[^>]*>'
clean_text = re.sub(pattern, '', text)
return clean_text
def remove_all_html_tags(text):
clean = re.compile('<.*?>')
return re.sub(clean, '', text).replace(" ", "")
def find_src(k):
try:
return re.findall(r'''<img referrerpolicy="no-referrer" src="([\s\S]+?)">''', k)[0]
except:
print(f"[WARNING]:\t failed using Default regular expression \t {k}")
try:
return re.findall(r'''<img alt="([\s\S]+?)" src="([\s\S]+?)"''', k)[0][1]
except:
print(f"[WARNING]:\t failed using Senior regular expression \t {k}")
try:
tmp = re.findall(r'''src="([\s\S]+?)"''', k)[0]
print(f"[WARNING]:\t using Unsafety regular expression \t {k}")
return tmp
except:
print(
f"[Critical]:\t can not find the src with all regular expression \t {k}")
if len(sys.argv) != 2:
print("缺失json文件")
else:
file = sys.argv[1]
with open(file, "r", encoding="utf-8") as f:
raw = json.load(f)
def write_to_txt(filename, content):
with open(filename, 'w', encoding='utf-8') as f:
for line in content:
# 去掉行首的 'p:' 前缀
if line.startswith('p: '):
line = line[3:] # 去掉 'p: ' 的前缀
f.write(f"{line}\n")
# 假设 raw 是你的 JSON 数据
e = [] # 用于存储所有章节的内容
i = 1
while i < len(raw["chapterList"]):
Chaptername = raw["chapterList"][i]["name"]
for j in raw["chapterList"][i]["lists"]:
chap = raw["chapterList"][i]["lists"][j]
content = remove_html_tags(chap["content"]).split("\n")
# 将章节信息添加到 e 列表
e.append(f"{Chaptername} - {chap['name']}\n")
for k in content:
k = k.replace('\u3000', " ")
if "<img" not in k:
e.append(f"p: {k}") # 添加段落文本
else:
e.append(f"img: {find_src(k)}") # 添加图片链接
k = remove_all_html_tags(k) # 处理跟在图片后的文本
if k != "":
e.append(f"p: {k}") # 添加后续文本
i += 1
# 使用 raw["bookname"] 作为文件名
book_filename = f"{raw['bookname']}.txt" # 确保文件名有效
write_to_txt(book_filename, e)