-
Notifications
You must be signed in to change notification settings - Fork 0
/
notes.py
132 lines (107 loc) · 5.09 KB
/
notes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import html
import json
import os
import urllib.parse

import requests
import yaml
from bs4 import BeautifulSoup
# Load the account-specific request settings from config.yml.
# The explicit encoding keeps the read independent of the platform default.
with open('config.yml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Notes listing endpoint; the query string asks for offset=0 and limit=20,
# ordered by content_updated_at.
url = "https://www.yuque.com/api/modules/note/notes/NoteController/index?offset=0&q=&filter_type=all&status=0&merge_dynamic_data=0&order=content_updated_at&with_pinned_notes=true&limit=20"

# Request headers; Cookie, X-CSRF-Token and X-Login come from the config file.
headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Cookie": config['headers']['Cookie'],
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
    "X-CSRF-Token": config['headers']['X-CSRF-Token'],
    "X-Login": config['headers']['X-Login'],
}

# Send the GET request. Without a timeout, requests waits indefinitely on an
# unresponsive server, so bound the wait explicitly.
response = requests.get(url, headers=headers, timeout=30)
def parse_card(value):
    """Decode the ``value`` attribute of a card element.

    The value is percent-encoded JSON, normally prefixed with ``data:``.
    The prefix is removed only when it is actually present, so a value
    without it is decoded as-is instead of losing its first five characters.

    Args:
        value: The raw ``value`` attribute string of the card.

    Returns:
        The object produced by parsing the decoded JSON payload.

    Raises:
        TypeError: If ``value`` is ``None`` (the attribute is missing).
        json.JSONDecodeError: If the decoded payload is not valid JSON.
    """
    prefix = "data:"
    # Slicing (rather than str.startswith) keeps the TypeError for a None value.
    if value[:len(prefix)] == prefix:
        payload = value[len(prefix):]
    else:
        payload = value
    return json.loads(urllib.parse.unquote(payload))
def _table_to_markdown(table):
    """Return the Markdown lines for a ``<table>`` element ([] if it has no rows)."""
    rows = table.find_all('tr')
    if not rows:
        return []
    # The first row is treated as the header row.
    header_cells = rows[0].find_all(['td', 'th'])
    lines = [
        '| ' + ' | '.join(cell.get_text(strip=True) for cell in header_cells) + ' |',
        '| ' + ' | '.join('---' for _ in header_cells) + ' |',
    ]
    for row in rows[1:]:
        cells = row.find_all(['td', 'th'])
        lines.append('| ' + ' | '.join(cell.get_text(strip=True) for cell in cells) + ' |')
    # Empty entry so the joined output has a line break after the table.
    lines.append('')
    return lines


def convert_to_html_and_md(abstract):
    """Parse a note's ``abstract`` markup and render it as HTML and Markdown.

    Every descendant of the parsed document is visited; the following
    elements produce output:

    * ``<span>`` directly inside a top-level ``<p>`` -> paragraph
    * ``<table>`` -> original table markup / Markdown pipe table
    * ``<blockquote>`` -> quote, or an alert when it carries the
      ``lake-alert-tips`` class
    * ``<card>`` named ``file``/``image`` (link), ``hr`` (rule) or
      ``codeblock`` (fenced code)

    Text and card fields inserted into the HTML output are escaped, because
    ``get_text()`` and the decoded card JSON yield raw characters (``<``,
    ``&``, quotes) that would otherwise corrupt the generated markup.

    Args:
        abstract: The note content as an HTML-like string.

    Returns:
        A ``(html, markdown)`` tuple of strings, each built by joining the
        collected fragments with newlines.
    """
    soup = BeautifulSoup(abstract, 'html.parser')
    html_output = []
    md_output = []
    for element in soup.descendants:
        # Only spans of the form <p><span>...</span></p> at the document root.
        if element.name == 'span' and element.parent.name == 'p' and element.parent.parent is soup:
            text = element.get_text()
            html_output.append(f"<p>{html.escape(text)}</p>")
            md_output.append(text + "\n\n")
        elif element.name == 'table':
            # str(element) is serialized by BeautifulSoup itself; no extra escaping.
            html_output.append(str(element))
            md_output.extend(_table_to_markdown(element))
        elif element.name == 'blockquote':
            text = element.get_text()
            safe_text = html.escape(text)
            if 'lake-alert-tips' in element.get('class', []):
                html_output.append(f"<blockquote class='lake-alert lake-alert-tips'>{safe_text}</blockquote>")
                md_output.append(f"!!! {text}\n\n")
            else:
                html_output.append(f"<blockquote>{safe_text}</blockquote>")
                md_output.append(f"> {text}\n\n")
        elif element.name == 'card':
            card_type = element.get('name')
            value = element.get('value')
            if card_type in ['file', 'image']:
                card_data = parse_card(value)
                # str() keeps non-string JSON values printable, as the f-string did.
                src = html.escape(str(card_data["src"]))
                name = html.escape(str(card_data["name"]))
                html_output.append(f'<a href="{src}" download>{name}</a>')
                md_output.append(f"[{card_data['name']}]({card_data['src']})\n\n")
            elif card_type == 'hr':
                html_output.append("<hr>")
                md_output.append("---\n\n")
            elif card_type == 'codeblock':
                card_data = parse_card(value)
                mode = html.escape(str(card_data['mode']))
                code = html.escape(str(card_data['code']))
                html_output.append(f"<pre><code class='{mode}'>{code}</code></pre>")
                md_output.append(f"```{card_data['mode']}\n{card_data['code']}\n```\n\n")
    return '\n'.join(html_output), '\n'.join(md_output)
# Export the notes only when the request succeeded.
if response.status_code == 200:
    data = response.json()
    # Create the output directories for both formats.
    os.makedirs("notes/html", exist_ok=True)
    os.makedirs("notes/md", exist_ok=True)
    # Write one HTML file and one Markdown file per note.
    for note in data.get("notes", []):
        abstract = note['content']['abstract']
        created_at = note['created_at']
        # ``or []`` also covers a note whose "tags" field is present but null.
        tags = [tag['name'] for tag in note.get('tags') or []]
        # Render the note body in both formats.
        html_content, md_content = convert_to_html_and_md(abstract)
        # Prefix each file with the creation time and the tag list.
        tags_str = ', '.join(tags)
        # Tag names and the timestamp go into markup, so escape them for the
        # HTML header; the Markdown header keeps the raw text.
        html_header = f"<div><strong>Created At:</strong> {html.escape(str(created_at))}<br><strong>Tags:</strong> {html.escape(tags_str)}</div><hr>"
        md_header = f"---\n\nCreated At: {created_at}\n\nTags: {tags_str}\n\n---\n\n"
        # File names are derived from the note id.
        html_file_name = f"notes/html/note_{note['id']}.html"
        md_file_name = f"notes/md/note_{note['id']}.md"
        with open(html_file_name, 'w', encoding='utf-8') as f:
            f.write(html_header + html_content)
        with open(md_file_name, 'w', encoding='utf-8') as f:
            f.write(md_header + md_content)
        print(f'Saved: {html_file_name} and {md_file_name}')
else:
    print(f"Failed to retrieve notes. Status code: {response.status_code}")