# zhihu_crawler.py - Zhihu search/answer crawler (Selenium + Edge) with a Tkinter GUI and Oracle persistence.
from selenium import webdriver # Selenium 的 webdriver 模块,用于启动和控制edge
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
import tkinter as tk #GUI 库 Tkinter
from tkinter import ttk, scrolledtext #导入 ttk(主题化小部件)和 scrolledtext(带滚动条的文本区域)
import threading #导入线程库,用于把耗时操作放到后台线程,避免阻塞 GUI 主线程
import time
import random
import os
import webbrowser # 导入系统默认浏览器打开 URL 的模块,GUI 中用来点击链接打开真实浏览器
import cx_Oracle # 导入 Oracle 数据库的 Python 驱动
from selenium.webdriver.support import expected_conditions as EC
class Zhihu:
    """Selenium-driven Zhihu crawler that stores questions and answers in Oracle.

    The instance owns one Edge browser session (``self.browser``) and one
    optional Oracle connection (``self.db_connection``, ``None`` when every
    connection attempt failed). All crawling methods are best-effort: they
    print a diagnostic and return an empty/placeholder value instead of
    raising.
    """

    def __init__(self):
        """Start the Edge browser and connect to the database."""
        # Edge driver binary expected next to the script.
        msedgedriver_path = './msedgedriver.exe'
        service = Service(executable_path=msedgedriver_path)
        # Options must exist before setup_anti_detection() fills them in.
        self.edge_options = Options()
        self.setup_anti_detection()
        self.browser = webdriver.Edge(service=service, options=self.edge_options)
        self.wait = WebDriverWait(self.browser, 15)  # explicit waits: 15 s max
        self.db_connection = None
        self.setup_database()

    # ------------------------------------------------------------------
    # Database
    # ------------------------------------------------------------------
    def setup_database(self):
        """Connect to the first reachable Oracle server and ensure the schema.

        Leaves ``self.db_connection`` as ``None`` when no server accepts the
        connection; ``save_to_database`` then becomes a no-op.
        """
        database_servers = [
            "localhost:1521/ORCL",
        ]
        for ds in database_servers:
            try:
                self.db_connection = cx_Oracle.connect(
                    user="你的数据库名称",
                    password="你的数据库密码",
                    dsn=ds,
                    mode=cx_Oracle.SYSDBA,
                )
            except Exception as e:
                print(f"连接失败 {ds}: {e}")
                continue
            print(f"数据库连接成功: {ds}")
            self.create_tables()
            return

    @staticmethod
    def _oracle_error_code(exc):
        """Return the ORA- error number carried by a cx_Oracle exception, or None."""
        details = exc.args[0] if exc.args else None
        return getattr(details, "code", None)

    def create_tables(self):
        """Create the sequence and the two tables if they do not exist yet.

        Existing objects are kept (ORA-00955 is ignored) so that restarting
        the program neither resets ``answer_id_seq`` -- which would collide
        with ids already stored -- nor aborts on the existing tables.
        """
        if not self.db_connection:
            return
        ddl_statements = (
            "CREATE SEQUENCE answer_id_seq START WITH 1 INCREMENT BY 1",
            """
            CREATE TABLE ZHIHU_QUESTIONS (
                question_id VARCHAR2(100) PRIMARY KEY,
                title VARCHAR2(500),
                url VARCHAR2(500),
                crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
            """,
            """
            CREATE TABLE ZHIHU_ANSWERS (
                answer_id NUMBER PRIMARY KEY,
                question_id VARCHAR2(100),
                author VARCHAR2(100),
                publish_time VARCHAR2(100),
                content CLOB,
                crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (question_id) REFERENCES ZHIHU_QUESTIONS(question_id)
            )
            """,
        )
        cursor = None
        try:
            cursor = self.db_connection.cursor()
            for ddl in ddl_statements:
                try:
                    cursor.execute(ddl)  # no trailing semicolon for cx_Oracle
                except cx_Oracle.DatabaseError as e:
                    # ORA-00955: name is already used by an existing object.
                    if self._oracle_error_code(e) != 955:
                        raise
            self.db_connection.commit()
            print("数据表创建成功")
        except Exception as e:
            print(f"数据表创建失败: {e}")
            import traceback
            traceback.print_exc()
        finally:
            if cursor is not None:
                cursor.close()

    def save_to_database(self, question_data):
        """Insert one crawled question and its answers in a single transaction.

        Args:
            question_data: dict with ``question_id``, ``title``, ``url`` and
                ``answers`` (a list of dicts with ``author``, ``publish_time``
                and ``content``).

        Any failure is printed and the transaction is rolled back; nothing
        is raised to the caller.
        """
        if not self.db_connection:
            return
        cursor = None
        try:
            cursor = self.db_connection.cursor()
            # The question row must exist first because of the foreign key.
            cursor.execute(
                """
                INSERT INTO ZHIHU_QUESTIONS (question_id, title, url)
                VALUES (:1, :2, :3)
                """,
                (
                    question_data['question_id'],
                    question_data['title'],
                    question_data['url'],
                ),
            )
            for answer in question_data['answers']:
                # Draw the id from the sequence inside the INSERT itself.
                cursor.execute(
                    """
                    INSERT INTO ZHIHU_ANSWERS (answer_id, question_id, author, publish_time, content)
                    VALUES (answer_id_seq.NEXTVAL, :1, :2, :3, :4)
                    """,
                    (
                        question_data['question_id'],
                        answer['author'],
                        answer['publish_time'],
                        answer['content'],
                    ),
                )
            self.db_connection.commit()
            print("数据已保存到数据库")
        except Exception as e:
            print(f"保存到数据库失败: {e}")
            self.db_connection.rollback()
        finally:
            if cursor is not None:
                cursor.close()

    # ------------------------------------------------------------------
    # Browser configuration / helpers
    # ------------------------------------------------------------------
    def setup_anti_detection(self):
        """Populate ``self.edge_options`` with the anti-automation-detection flags."""
        arguments = (
            # Persistent profile directory so the login session survives restarts.
            f"--user-data-dir={os.path.abspath('./edge_user_data')}",
            # Hide the Blink feature that exposes automation to page scripts.
            '--disable-blink-features=AutomationControlled',
            '--disable-web-security',
            '--no-sandbox',
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        )
        experimental_options = {
            "excludeSwitches": ["enable-automation", "enable-logging"],
            'useAutomationExtension': False,
        }
        for argument in arguments:
            self.edge_options.add_argument(argument)
        for option_name, option_value in experimental_options.items():
            self.edge_options.add_experimental_option(option_name, option_value)

    def human_like_delay(self, min_time=1, max_time=3):
        """Sleep for a random duration between ``min_time`` and ``max_time`` seconds."""
        time.sleep(random.uniform(min_time, max_time))

    # ------------------------------------------------------------------
    # Login
    # ------------------------------------------------------------------
    def check_login_status(self):
        """Open the Zhihu home page and report whether a logged-in marker is present.

        Returns:
            True if any of the known logged-in elements is found, else False
            (also False when the page could not be loaded).
        """
        try:
            self.browser.get('https://www.zhihu.com/')
            self.human_like_delay(2, 4)
            # Several alternative markers; any single hit counts as logged in.
            selectors = [
                '//div[contains(@class, "AppHeader-profile")]',
                '//a[contains(@href, "/people/")]',
                '//div[contains(text(), "写回答")]',
                '//span[contains(text(), "我的")]',
                '//button[contains(text(), "提问")]',
            ]
            for selector in selectors:
                # find_elements returns [] instead of raising when nothing matches.
                if self.browser.find_elements(By.XPATH, selector):
                    print("已登录")
                    return True
            print("未登录")
            return False
        except Exception as e:
            print(f"检查登录状态出错: {e}")
            return False

    def manual_login(self, timeout=300):
        """Open the sign-in page and wait for the user to log in by hand.

        The crawler owns no Tk widgets, so this method is GUI-free and safe to
        run on a worker thread: it blocks until the browser leaves the
        ``/signin`` page (or ``timeout`` seconds pass) and then verifies the
        session with ``check_login_status``.

        Args:
            timeout: maximum number of seconds to wait for the login.

        Returns:
            True when the login was verified, otherwise False.
        """
        try:
            self.browser.get('https://www.zhihu.com/signin')
            WebDriverWait(self.browser, timeout, poll_frequency=1).until(
                lambda driver: '/signin' not in driver.current_url
            )
        except Exception as e:
            print(f"等待登录超时或出错: {e}")
            return False
        if self.check_login_status():
            self.save_login_status()
            return True
        return False

    def save_login_status(self):
        """Record a ``logged_in=true`` marker in ``login_status.cfg`` (best effort)."""
        try:
            with open('login_status.cfg', 'w') as f:
                f.write('logged_in=true\n')
        except OSError as e:
            # The marker is optional; report the problem and carry on.
            print(f"保存登录状态失败: {e}")

    # ------------------------------------------------------------------
    # Search and question details
    # ------------------------------------------------------------------
    @staticmethod
    def _question_id_from_url(url):
        """Return the numeric question id contained in a Zhihu question URL.

        ``.../question/123/answer/456?x=1`` yields ``"123"``. When no numeric
        id is present, the text after ``/question/`` (minus any query string)
        is returned instead.
        """
        import re
        match = re.search(r"/question/(\d+)", url)
        if match:
            return match.group(1)
        return url.split('/question/')[-1].split('?')[0]

    def search_topics(self, keyword):
        """Search Zhihu for ``keyword`` and return the questions found.

        Returns:
            A list of ``{'id', 'title', 'url'}`` dicts, one per distinct
            question; an empty list when the search fails.
        """
        from urllib.parse import quote
        try:
            # Percent-encode the keyword so spaces, '&', '#', ... stay inside q=.
            search_url = f"https://www.zhihu.com/search?type=content&q={quote(keyword, safe='')}"
            print(f"正在搜索: {keyword}")
            self.browser.get(search_url)
            time.sleep(3)
            topics = []
            seen_ids = set()
            question_links = self.browser.find_elements(
                By.XPATH, '//a[contains(@href, "/question/")]'
            )
            for link in question_links:
                try:
                    url = link.get_attribute('href')
                    title = link.text.strip()
                except Exception:
                    # The element may have gone stale while the page re-rendered.
                    continue
                if not url or '/question/' not in url:
                    continue
                if not title or len(title) <= 5:
                    continue
                question_id = self._question_id_from_url(url)
                # Question and answer links of one question share the same id.
                if question_id in seen_ids:
                    continue
                seen_ids.add(question_id)
                topics.append({
                    'id': question_id,
                    'title': title,
                    'url': url,
                })
            print(f"找到 {len(topics)} 个话题")
            return topics
        except Exception as e:
            print(f"搜索失败: {e}")
            return []

    def get_topic_detail(self, question_id):
        """Load a question page, collect its answers and store them.

        Returns:
            A dict with ``question_id``, ``title``, ``url`` and ``answers``,
            or None when the page could not be processed.
        """
        try:
            url = f"https://www.zhihu.com/question/{question_id}"
            self.browser.get(url)
            time.sleep(3)
            question_title = "未知标题"
            try:
                title_elems = self.browser.find_elements(
                    By.XPATH, '//h1[contains(@class, "QuestionHeader-title")]'
                )
                if title_elems:
                    question_title = title_elems[0].text
            except Exception:
                pass  # keep the placeholder title
            result = {
                'question_id': question_id,
                'title': question_title,
                'url': url,
                'answers': self.get_answers(),
            }
            self.save_to_database(result)
            return result
        except Exception as e:
            print(f"获取话题详情失败: {e}")
            return None

    def get_answers(self, max_answers=5):
        """Collect up to ``max_answers`` answers from the currently open question page.

        Args:
            max_answers: number of answer cards to process (default 5).

        Returns:
            A list of dicts with ``author``, ``publish_time``, ``content`` and
            ``comments`` (a list of comment strings). Cards without usable
            text are skipped.
        """
        answers = []
        try:
            # 1. Switch to chronological order when the button is available.
            try:
                sort_btn = WebDriverWait(self.browser, 3).until(
                    EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "时间排序")]'))
                )
                sort_btn.click()
                time.sleep(2)
            except Exception:
                pass  # sorting is optional; continue with the default order
            # 2. Scroll a few times to trigger lazy loading.
            for _ in range(8):
                self.browser.execute_script("window.scrollBy(0, 1000)")
                time.sleep(0.8)
            # 3. Locate the answer cards.
            answer_cards = self.browser.find_elements(
                By.XPATH,
                '//div[contains(@class, "AnswerItem")]'
            )
            print(f"找到 {len(answer_cards)} 个回答")
            if not answer_cards:
                return []
            # 4. Process each card.
            for i, card in enumerate(answer_cards[:max_answers], start=1):
                print(f"\n正在处理第 {i} 个回答...")
                # Extract the body first: this expands the card, and it runs
                # before the comment section is opened so comment text cannot
                # leak into the answer content.
                content = self.extract_content(card, i)
                # Cards without real text are usually images, ads or blanks.
                if not content or len(content.strip()) < 5:
                    print("跳过无内容回答")
                    continue
                author_name = self.extract_author(card, i)
                publish_time = self.extract_publish_time(card, i)
                comments = self.get_comments(card)
                answers.append({
                    "author": author_name,
                    "publish_time": publish_time,
                    "content": content,
                    "comments": comments,
                })
                print(f"第 {i} 个回答处理完成: {author_name}")
        except Exception as e:
            print("获取回答列表失败:", e)
        print(f"总共成功提取 {len(answers)} 个回答")
        return answers

    # ------------------------------------------------------------------
    # Per-answer extraction
    # ------------------------------------------------------------------
    def extract_author(self, card, index):
        """Return the author name of an answer card, or ``"未知作者"``."""
        author_name = "未知作者"
        try:
            # One union XPath covering all known author markups.
            xpaths = (
                './/a[contains(@class, "UserLink-link")] | '
                './/span[contains(@class, "UserLink")] | '
                './/div[contains(@class, "AuthorInfo")]//a | '
                './/meta[@itemprop="author"]'
            )
            for el in card.find_elements(By.XPATH, xpaths):
                # <meta> carries the name in @content, the others in their text.
                name = (el.get_attribute("content") or el.text or "").strip()
                if name and name != "匿名用户" and len(name) > 1:
                    author_name = name
                    print(f"第{index}个回答作者: {author_name}")
                    return author_name
        except Exception as e:
            print(f"第{index}个回答提取作者失败: {e}")
        return author_name

    @staticmethod
    def _normalize_publish_time(raw_text):
        """Reduce a raw timestamp text to a date string.

        Returns the first ``yyyy-mm-dd`` found in ``raw_text``; otherwise the
        text with its "发布于"/"编辑于" prefix removed; otherwise None when the
        text is not a recognisable timestamp.
        """
        import re
        text = (raw_text or "").strip()
        if not text:
            return None
        date_match = re.search(r"\d{4}-\d{1,2}-\d{1,2}", text)
        if date_match:
            # Covers ISO values (2022-05-26T01:53:59.000Z) and "发布于 2022-05-26 10:30".
            return date_match.group(0)
        for prefix in ("发布于", "编辑于"):
            if prefix in text:
                # Relative dates such as "昨天 10:30" are kept as displayed.
                cleaned = text.replace(prefix, "").strip()
                if cleaned:
                    return cleaned
        return None

    def extract_publish_time(self, card, index):
        """Return the publish date of an answer card (date only), or ``"未知时间"``."""
        publish_time = "未知时间"
        try:
            xpaths = (
                './/meta[@itemprop="dateCreated"] | '
                './/meta[@itemprop="dateModified"] | '
                './/span[contains(text(), "发布于")] | '
                './/span[contains(text(), "编辑于")] | '
                './/a[contains(@class, "ContentItem-time")]'
            )
            for el in card.find_elements(By.XPATH, xpaths):
                normalized = self._normalize_publish_time(
                    el.get_attribute("content") or el.text
                )
                if normalized:
                    publish_time = normalized
                    print(f"第{index}个回答时间: {publish_time}")
                    return publish_time
        except Exception as e:
            print(f"第{index}个回答提取时间失败: {e}")
        return publish_time

    def _expand_card(self, card, index):
        """Click the first "expand" button found inside ``card``; return True if clicked."""
        expand_buttons = (
            './/button[contains(text(), "展开阅读全文")]',
            './/button[contains(text(), "显示全部")]',
            './/button[contains(text(), "展开")]',
        )
        for button_selector in expand_buttons:
            try:
                expand_btn = card.find_element(By.XPATH, button_selector)
                # JS click avoids "element not interactable" on overlapped buttons.
                self.browser.execute_script("arguments[0].click();", expand_btn)
            except Exception:
                continue
            time.sleep(1)
            print(f"第{index}个回答已点击展开按钮")
            return True
        return False

    def extract_content(self, card, index):
        """Return the full text of an answer card, or ``"无内容"``.

        The card is expanded first. Selectors are tried in order and the
        first one that yields text wins, so overlapping containers do not
        duplicate the answer body.
        """
        content = "无内容"
        try:
            self._expand_card(card, index)
            # All selectors are relative to the card ('.//'); an absolute
            # '//' path would pull text from every answer on the page.
            content_selectors = [
                './/span[contains(@class, "RichText")]',
                './/div[@itemprop="text"]',
                './/div[contains(@class, "CopyrightRichText")]',
            ]
            for content_selector in content_selectors:
                pieces = []
                try:
                    for content_elem in card.find_elements(By.XPATH, content_selector):
                        text = content_elem.text.strip()
                        if len(text) > 10 and text not in pieces:
                            pieces.append(text)
                except Exception:
                    continue
                if pieces:
                    content = "\n".join(pieces)
                    print(f"第{index}个回答内容长度: {len(content)} 字符")
                    break
        except Exception as e:
            print(f"第{index}个回答提取内容失败: {e}")
        return content

    def get_comments(self, card):
        """Return the visible comment texts of an answer card as a list of strings."""
        comments = []
        try:
            # Open the comment section if it has a toggle button.
            try:
                expand_btn = card.find_element(
                    By.XPATH, './/button[contains(@class, "Button") and contains(text(), "评论")]'
                )
                self.browser.execute_script("arguments[0].click();", expand_btn)
                time.sleep(1)
            except Exception:
                pass  # no button: the section may already be open
            # NOTE(review): "css-jp4314" looks like a generated class name and
            # may change when the site is redeployed -- confirm periodically.
            comment_blocks = card.find_elements(
                By.XPATH, './/div[contains(@class, "css-jp4314")]'
            )
            for block in comment_blocks:
                try:
                    text = block.text.strip()
                except Exception:
                    continue
                if text:
                    comments.append(text)
        except Exception as e:
            print("获取评论失败:", e)
        return comments

    # ------------------------------------------------------------------
    # Shutdown
    # ------------------------------------------------------------------
    def close(self):
        """Quit the browser and close the database connection.

        Each resource is released independently so a failure in one does not
        leak the other.
        """
        browser = getattr(self, "browser", None)
        if browser:
            try:
                browser.quit()
            except Exception as e:
                print(f"关闭浏览器失败: {e}")
        connection = getattr(self, "db_connection", None)
        if connection:
            try:
                connection.close()
            except Exception as e:
                print(f"关闭数据库连接失败: {e}")
class ZhihuGUI:
    """Tkinter front end for the ``Zhihu`` crawler.

    The window has three states tracked in ``self.current_page``:
    ``"search"`` (initial), ``"topics"`` (search results listed) and
    ``"detail"`` (one question with its answers). All crawler calls run on
    daemon threads and hand their results back to the Tk main loop through
    ``root.after``.
    """

    def __init__(self, root):
        """Build the window, start the crawler and kick off the login check.

        Args:
            root: the ``tk.Tk`` root window that hosts the interface.
        """
        self.root = root
        self.root.title("查看知乎内容-界面")
        self.root.geometry("650x600")
        self.crawler = Zhihu()
        self.current_page = "search"
        self.topics_data = []    # last search result: list of {'id', 'title', 'url'}
        self.detail_data = None  # last loaded question detail dict
        self.setup_ui()
        self.check_login()

    def update_status_display(self, message, color="blue"):
        """Show ``message`` in the prominent status label and in the bottom status bar."""
        self.status_display.config(text=message, foreground=color)
        self.status_var.set(message)

    def setup_ui(self):
        """Create all widgets; the detail view and the back button start hidden."""
        # --- search row ---
        self.search_frame = ttk.Frame(self.root)
        self.search_frame.pack(pady=10, padx=20, fill="x")
        ttk.Label(self.search_frame, text="搜索关键词:").pack(side="left")
        self.search_entry = ttk.Entry(self.search_frame, width=30)
        self.search_entry.pack(side="left", padx=5)
        self.search_entry.bind("<Return>", lambda e: self.search_topics())
        self.search_btn = ttk.Button(self.search_frame, text="搜索", command=self.search_topics)
        self.search_btn.pack(side="left", padx=5)
        # Back button: created here, packed only while the detail view is shown.
        self.back_btn = ttk.Button(self.search_frame, text="返回", command=self.go_back)

        # --- prominent status label ---
        self.status_display_frame = ttk.Frame(self.root)
        self.status_display_frame.pack(pady=5, padx=20, fill="x")
        self.status_display = ttk.Label(
            self.status_display_frame,
            text="正在检查登录状态……",
            font=("Arial", 10, "bold"),
            foreground="blue",
            background="#f0f0f0",
            relief="solid",
            borderwidth=1,
            padding=(10, 5),
        )
        self.status_display.pack(fill="x")

        # --- result area: topic list (visible) + scrollable detail view (hidden) ---
        self.result_frame = ttk.Frame(self.root)
        self.result_frame.pack(pady=10, padx=20, fill="both", expand=True)
        self.topics_listbox = tk.Listbox(self.result_frame, height=15)
        self.topics_listbox.pack(fill="both", expand=True)
        self.topics_listbox.bind("<<ListboxSelect>>", self.on_topic_select)
        # A Frame embedded in a Canvas gives a vertically scrollable container.
        self.detail_canvas = tk.Canvas(self.result_frame)
        self.scrollbar = ttk.Scrollbar(
            self.result_frame, orient="vertical", command=self.detail_canvas.yview
        )
        self.scrollable_frame = ttk.Frame(self.detail_canvas)
        self.scrollable_frame.bind(
            "<Configure>",
            lambda e: self.detail_canvas.configure(scrollregion=self.detail_canvas.bbox("all"))
        )
        self.detail_canvas.create_window((0, 0), window=self.scrollable_frame, anchor="nw")
        self.detail_canvas.configure(yscrollcommand=self.scrollbar.set)

        # --- bottom status bar ---
        self.status_var = tk.StringVar()
        self.status_var.set("正在检查登录状态...")
        self.status_bar = ttk.Label(self.root, textvariable=self.status_var, relief="sunken")
        self.status_bar.pack(side="bottom", fill="x")

    def create_clickable_link(self, parent, text, url):
        """Pack a blue label into ``parent`` that opens ``url`` in the system browser.

        Returns:
            The created ``ttk.Label``.
        """
        link_label = ttk.Label(parent, text=text, foreground="blue",
                               cursor="hand2", font=("Arial", 9))
        link_label.pack(pady=2, padx=10, anchor="w")
        link_label.bind("<Button-1>", lambda e: webbrowser.open(url))
        return link_label

    def open_link(self, event):
        """Open the URL of a clicked "链接:" line in a ``detail_text`` widget.

        The current layout never creates ``self.detail_text``; in that case
        the handler does nothing.
        """
        detail_text = getattr(self, "detail_text", None)
        if detail_text is None:
            return
        try:
            index = detail_text.index(f"@{event.x},{event.y}")
            if "link" not in detail_text.tag_names(index):
                return
            line_text = detail_text.get(
                detail_text.index(f"{index} linestart"),
                detail_text.index(f"{index} lineend"),
            )
            if "链接:" in line_text:
                webbrowser.open(line_text.split("链接:")[1].strip())
        except Exception as e:
            print(f"打开链接失败: {e}")

    # ------------------------------------------------------------------
    # Login
    # ------------------------------------------------------------------
    def check_login(self):
        """Check the login state on a worker thread and update the UI with the result."""
        def login_thread():
            if self.crawler.check_login_status():
                self.root.after(0, lambda: self.update_status_display("就绪 - 已登录", "blue"))
            else:
                self.root.after(0, self.show_login_dialog)

        threading.Thread(target=login_thread, daemon=True).start()

    def show_login_dialog(self):
        """Tell the user they are logged out and offer to start the manual login."""
        from tkinter import messagebox
        self.update_status_display("未登录状态,请手动登录", "orange")
        if messagebox.askyesno("登录提示", "检测到未登录状态,是否现在登录?"):
            self.manual_login()

    def manual_login(self):
        """Run the crawler's manual login on a worker thread and report the outcome."""
        def login_thread():
            try:
                logged_in = bool(self.crawler.manual_login())
            except Exception as e:
                # Without this the worker would die silently and the status
                # would never change.
                print(f"手动登录出错: {e}")
                logged_in = False
            if logged_in:
                self.root.after(0, lambda: self.update_status_display("就绪 - 已登录", "blue"))
            else:
                self.root.after(0, lambda: self.update_status_display("登录失败,请重试", "red"))

        threading.Thread(target=login_thread, daemon=True).start()

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------
    def search_topics(self):
        """Start a background search for the keyword in the entry field."""
        keyword = self.search_entry.get().strip()
        if not keyword:
            return
        self.update_status_display("搜索中...", "blue")
        # Re-enabled by _search_thread when the search finishes.
        self.search_btn.config(state="disabled")
        threading.Thread(target=self._search_thread, args=(keyword,), daemon=True).start()

    def _search_thread(self, keyword):
        """Worker: run the search and schedule the list refresh on the Tk thread."""
        try:
            self.topics_data = self.crawler.search_topics(keyword)
            self.root.after(0, self._update_topics_list)
        except Exception as e:
            error_msg = f"搜索失败: {e}"
            # Bind the message as a default so the lambda does not depend on `e`.
            self.root.after(0, lambda msg=error_msg: self.update_status_display(msg, "red"))
        finally:
            self.root.after(0, lambda: self.search_btn.config(state="normal"))

    def _update_topics_list(self):
        """Fill the list box from ``self.topics_data``."""
        self.topics_listbox.delete(0, tk.END)
        if not self.topics_data:
            self.topics_listbox.insert(tk.END, "未找到相关话题")
            self.update_status_display("搜索完成 - 无结果", "orange")
            return
        for i, topic in enumerate(self.topics_data, 1):
            self.topics_listbox.insert(tk.END, f"{i}. {topic['title']}")
        self.update_status_display(f"搜索完成 - 找到 {len(self.topics_data)} 个话题", "blue")
        self.current_page = "topics"

    def on_topic_select(self, event):
        """List box selection handler: load the selected question."""
        selection = self.topics_listbox.curselection()
        if not selection:
            return
        index = selection[0]
        # The "no results" placeholder row has no matching topics_data entry.
        if index < len(self.topics_data):
            topic = self.topics_data[index]
            self.show_topic_detail(topic['id'], topic['title'])

    # ------------------------------------------------------------------
    # Detail view
    # ------------------------------------------------------------------
    def show_topic_detail(self, question_id, topic_title=None):
        """Start loading the detail page of ``question_id`` in the background.

        Args:
            question_id: Zhihu question id to load.
            topic_title: title from the search result; when given it replaces
                the title scraped from the question page.
        """
        self.update_status_display("加载话题详情中... (全力加速)", "blue")
        threading.Thread(
            target=self._load_detail_thread,
            args=(question_id, topic_title),
            daemon=True,
        ).start()

    def _load_detail_thread(self, question_id, topic_title=None):
        """Worker: fetch the question detail and schedule the display update."""
        try:
            detail_data = self.crawler.get_topic_detail(question_id)
            if detail_data and topic_title:
                # Prefer the title seen in the search result.
                detail_data['title'] = topic_title
                print(f"使用搜索标题: {topic_title}")
            self.detail_data = detail_data
            self.root.after(0, self._update_detail_display)
        except Exception as e:
            error_msg = f"加载详情失败: {e}"
            self.root.after(0, lambda msg=error_msg: self.update_status_display(msg, "red"))

    @staticmethod
    def _format_comment(index, comment):
        """Return the display line for one comment.

        The crawler delivers comments as plain strings; dicts with
        ``author``/``content`` keys are also accepted.
        """
        if isinstance(comment, dict):
            author = comment.get('author', '')
            body = comment.get('content', '')
            text = f"{author}: {body}" if author else f"{body}"
        else:
            text = str(comment)
        return f" {index}. {text}"

    def _update_detail_display(self):
        """Replace the topic list with the detail view of ``self.detail_data``."""
        if not self.detail_data:
            self.update_status_display("加载详情失败", "red")
            return
        detail = self.detail_data
        question_url = detail['url']

        # Swap the list box for the scrollable detail area and show "back".
        self.topics_listbox.pack_forget()
        self.detail_canvas.pack(side="left", fill="both", expand=True)
        self.scrollbar.pack(side="right", fill="y")
        self.back_btn.pack(side="left", padx=5)

        # Drop whatever a previous question left in the view.
        for widget in self.scrollable_frame.winfo_children():
            widget.destroy()

        # Title and link both open the question in the system browser.
        title_label = ttk.Label(self.scrollable_frame,
                                text=f"问题: {detail['title']}",
                                font=("Arial", 14, "bold"),
                                cursor="hand2")
        title_label.pack(pady=10, padx=10, anchor="w")
        title_label.bind("<Button-1>", lambda e: webbrowser.open(question_url))
        url_label = ttk.Label(self.scrollable_frame,
                              text=f"链接: {question_url}",
                              foreground="blue", cursor="hand2")
        url_label.pack(pady=5, padx=10, anchor="w")
        url_label.bind("<Button-1>", lambda e: webbrowser.open(question_url))
        ttk.Separator(self.scrollable_frame, orient="horizontal").pack(fill="x", pady=10, padx=10)

        for i, answer in enumerate(detail['answers'], 1):
            ttk.Label(self.scrollable_frame,
                      text=f"回答 {i}:",
                      font=("Arial", 12, "bold")).pack(pady=5, padx=10, anchor="w")
            ttk.Label(self.scrollable_frame,
                      text=f"作者: {answer['author']} | 时间: {answer['publish_time']}"
                      ).pack(pady=2, padx=10, anchor="w")

            if answer['content'] != "无内容":
                ttk.Label(self.scrollable_frame,
                          text="内容:",
                          font=("Arial", 10, "bold")).pack(pady=5, padx=10, anchor="w")
                content_text = scrolledtext.ScrolledText(self.scrollable_frame,
                                                         height=8,
                                                         wrap=tk.WORD,
                                                         font=("Arial", 9))
                content_text.insert(1.0, answer['content'])
                content_text.config(state="disabled")  # read-only
                content_text.pack(pady=5, padx=10, fill="x")

            comments = answer.get('comments') or []
            if comments:
                ttk.Label(self.scrollable_frame,
                          text=f"评论 ({len(comments)} 条):",
                          font=("Arial", 10, "bold")).pack(pady=10, padx=10, anchor="w")
                for j, comment in enumerate(comments, 1):
                    ttk.Label(self.scrollable_frame,
                              text=self._format_comment(j, comment),
                              wraplength=800,
                              justify="left").pack(pady=2, padx=20, anchor="w")

            ttk.Separator(self.scrollable_frame, orient="horizontal").pack(fill="x", pady=15, padx=10)

        self.current_page = "detail"
        self.update_status_display("详情加载完成!", "blue")
        # Recompute the scroll region now that all children are laid out.
        self.scrollable_frame.update_idletasks()
        self.detail_canvas.configure(scrollregion=self.detail_canvas.bbox("all"))

    def go_back(self):
        """Step back one page: detail -> topics -> search."""
        if self.current_page == "detail":
            self.detail_canvas.pack_forget()
            self.scrollbar.pack_forget()
            self.topics_listbox.pack(fill="both", expand=True)
            self.back_btn.pack_forget()
            self.current_page = "topics"
            self.update_status_display("返回到话题列表", "blue")
        elif self.current_page == "topics":
            self.topics_listbox.delete(0, tk.END)
            self.current_page = "search"
            self.update_status_display("正在检查登录状态……", "blue")

    def on_closing(self):
        """Window-close handler: release crawler resources, then destroy the window."""
        try:
            self.crawler.close()
        finally:
            # Always close the window, even if the crawler shutdown raised.
            self.root.destroy()
def main():
    """Program entry point: build the Tk window, attach the GUI and run the event loop."""
    window = tk.Tk()
    gui = ZhihuGUI(window)
    # Route the window-manager close button through the GUI's cleanup handler.
    window.protocol("WM_DELETE_WINDOW", gui.on_closing)
    window.mainloop()


if __name__ == "__main__":
    main()