From 990362f8d4f05bc4a5b8d1a4ea8b8c67aef323e1 Mon Sep 17 00:00:00 2001
From: yym68686
Date: Wed, 13 Dec 2023 21:46:57 +0800
Subject: [PATCH] Modify the web crawler filtering mechanism, extract table
 contents to web crawler result

---
 test/test_Web_crawler.py | 25 +++++++++++++++++++------
 utils/agent.py           | 20 +++++++++++++++-----
 utils/chatgpt2api.py     |  1 +
 3 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/test/test_Web_crawler.py b/test/test_Web_crawler.py
index b3a48d70..252decfb 100644
--- a/test/test_Web_crawler.py
+++ b/test/test_Web_crawler.py
@@ -63,7 +63,7 @@
 # print('\033[0m')
 # return result
 
-def Web_crawler(url: str) -> str:
+def Web_crawler(url: str, isSearch=False) -> str:
     """返回链接网址url正文内容,必须是合法的网址"""
     headers = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
@@ -71,22 +71,33 @@ def Web_crawler(url: str) -> str:
     result = ''
     try:
         requests.packages.urllib3.disable_warnings()
-        response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
+        response = requests.get(url, headers=headers, verify=False, timeout=3, stream=True)
         if response.status_code == 404:
             print("Page not found:", url)
-            return ""
+            return "抱歉,网页不存在,目前无法访问该网页。@Trash@"
         content_length = int(response.headers.get('Content-Length', 0))
         if content_length > 5000000:
             print("Skipping large file:", url)
             return result
         soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
+
+        table_contents = ""
+        tables = soup.find_all('table')
+        for table in tables:
+            table_contents += table.get_text()
+            table.decompose()
         body = "".join(soup.find('body').get_text().split('\n'))
-        result = body
+        result = table_contents + body
+        if result == '' and not isSearch:
+            result = "抱歉,可能反爬虫策略,目前无法访问该网页。@Trash@"
+        if result.count("\"") > 1000:
+            result = ""
     except Exception as e:
         print('\033[31m')
         print("error url", url)
         print("error", e)
         print('\033[0m')
+    print("url content", result + "\n\n")
     return result
 
 # def Web_crawler(url: str) -> str:
@@ -121,14 +132,16 @@
 # TODO 没办法访问
 # for url in ['https://s.weibo.com/top/summary?cate=realtimehot']:
 # for url in ['https://tophub.today/n/KqndgxeLl9']:
-for url in ['https://support.apple.com/zh-cn/HT213931']:
+# for url in ['https://support.apple.com/zh-cn/HT213931']:
+for url in ['https://developer.aliyun.com/article/721836']:
+# for url in ['https://cn.aliyun.com/page-source/price/detail/machinelearning_price']:
 # for url in ['https://mp.weixin.qq.com/s/Itad7Y-QBcr991JkF3SrIg']:
 # for url in ['https://zhidao.baidu.com/question/317577832.html']:
 # for url in ['https://www.cnn.com/2023/09/06/tech/huawei-mate-60-pro-phone/index.html']:
 # for url in ['https://www.reddit.com/r/China_irl/comments/15qojkh/46%E6%9C%88%E5%A4%96%E8%B5%84%E5%AF%B9%E4%B8%AD%E5%9B%BD%E7%9B%B4%E6%8E%A5%E6%8A%95%E8%B5%84%E5%87%8F87/', 'https://www.apple.com.cn/job-creation/Apple_China_CSR_Report_2020.pdf', 'https://hdr.undp.org/system/files/documents/hdr2013chpdf.pdf']:
 # for url in ['https://www.airuniversity.af.edu/JIPA/Display/Article/3111127/the-uschina-trade-war-vietnam-emerges-as-the-greatest-winner/']:
 # for url in ['https://zhuanlan.zhihu.com/p/646786536', 'https://zh.wikipedia.org/wiki/%E4%BF%84%E7%BE%85%E6%96%AF%E5%85%A5%E4%BE%B5%E7%83%8F%E5%85%8B%E8%98%AD', 'https://stock.finance.sina.com.cn/usstock/quotes/aapl.html']:
-    print(Web_crawler(url))
+    Web_crawler(url)
     print('-----------------------------')
 end_time = time.time()
 run_time = end_time - start_time
diff --git a/utils/agent.py b/utils/agent.py
index 45a4fa69..f667783d 100644
--- a/utils/agent.py
+++ b/utils/agent.py
@@ -264,7 +264,7 @@ def join(self):
         super().join()
         return self._return
 
-def Web_crawler(url: str) -> str:
+def Web_crawler(url: str, isSearch=False) -> str:
     """返回链接网址url正文内容,必须是合法的网址"""
     headers = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
@@ -272,7 +272,7 @@ def Web_crawler(url: str) -> str:
     result = ''
     try:
         requests.packages.urllib3.disable_warnings()
-        response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
+        response = requests.get(url, headers=headers, verify=False, timeout=3, stream=True)
         if response.status_code == 404:
             print("Page not found:", url)
             return "抱歉,网页不存在,目前无法访问该网页。@Trash@"
@@ -281,10 +281,18 @@ def Web_crawler(url: str) -> str:
             print("Skipping large file:", url)
             return result
         soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
+
+        table_contents = ""
+        tables = soup.find_all('table')
+        for table in tables:
+            table_contents += table.get_text()
+            table.decompose()
         body = "".join(soup.find('body').get_text().split('\n'))
-        result = body
-        if result == '':
+        result = table_contents + body
+        if result == '' and not isSearch:
             result = "抱歉,可能反爬虫策略,目前无法访问该网页。@Trash@"
+        if result.count("\"") > 1000:
+            result = ""
     except Exception as e:
         print('\033[31m')
         print("error url", url)
@@ -451,7 +459,7 @@ def get_search_results(prompt: str, context_max_tokens: int):
 
     threads = []
     for url in url_set_list:
-        url_search_thread = ThreadWithReturnValue(target=Web_crawler, args=(url,))
+        url_search_thread = ThreadWithReturnValue(target=Web_crawler, args=(url,True,))
         url_search_thread.start()
         threads.append(url_search_thread)
 
@@ -494,6 +502,8 @@ def get_search_results(prompt: str, context_max_tokens: int):
 
     # # 搜索
     # for i in search_web_and_summary("今天的微博热搜有哪些?"):
+    # for i in search_web_and_summary("阿里云24核96G的云主机价格是多少"):
+    # for i in search_web_and_summary("话说葬送的芙莉莲动漫是半年番还是季番?完结没?"):
    # for i in search_web_and_summary("周海媚事件进展"):
     # for i in search_web_and_summary("macos 13.6 有什么新功能"):
     # for i in search_web_and_summary("用python写个网络爬虫给我"):
diff --git a/utils/chatgpt2api.py b/utils/chatgpt2api.py
index 29c75d51..b66dffa3 100644
--- a/utils/chatgpt2api.py
+++ b/utils/chatgpt2api.py
@@ -617,6 +617,7 @@ def ask_stream(
                 if self.conversation[convo_id][-1 - index]["role"] == "user":
                     self.conversation[convo_id][-1 - index]["content"] = self.conversation[convo_id][-1 - index]["content"].replace("search: ", "")
                     prompt = self.conversation[convo_id][-1 - index]["content"]
+                    prompt = " ".join([prompt, json.loads(full_response)["prompt"]])
                     print("\n\nprompt", prompt)
                     break
             # prompt = self.conversation[convo_id][-1]["content"]
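
Note: the following is a minimal, self-contained sketch of the crawling behaviour this patch introduces, intended as a usage reference rather than the repository code itself. It assumes only the requests, beautifulsoup4, and lxml packages; the function name web_crawler_sketch and the English fallback strings are illustrative stand-ins for the Chinese "@Trash@"-tagged messages used in the patch.

import requests
from bs4 import BeautifulSoup

def web_crawler_sketch(url: str, isSearch: bool = False) -> str:
    """Return table text plus body text of a page, with simple junk filtering."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    result = ""
    try:
        requests.packages.urllib3.disable_warnings()
        response = requests.get(url, headers=headers, verify=False, timeout=3, stream=True)
        if response.status_code == 404:
            # The patch returns an apology string tagged "@Trash@" here; an
            # English placeholder is used in this sketch.
            return "Sorry, the page does not exist. @Trash@"
        if int(response.headers.get("Content-Length", 0)) > 5000000:
            return result  # skip very large files
        soup = BeautifulSoup(response.text, "lxml")
        # Extract table text first, then drop the tables from the DOM so the
        # body pass below does not repeat their contents.
        table_contents = ""
        for table in soup.find_all("table"):
            table_contents += table.get_text()
            table.decompose()
        body = "".join(soup.find("body").get_text().split("\n"))
        result = table_contents + body
        # Only direct calls get the anti-crawler fallback message; calls from
        # the search pipeline (isSearch=True) return "" so empty pages are dropped.
        if result == "" and not isSearch:
            result = "Sorry, the page could not be read (possible anti-crawler policy). @Trash@"
        # Filtering heuristic from the patch: an unusually high count of quote
        # characters suggests raw JSON or markup noise, so the result is discarded.
        if result.count('"') > 1000:
            result = ""
    except Exception as e:
        print("error url", url)
        print("error", e)
    return result

if __name__ == "__main__":
    print(web_crawler_sketch("https://developer.aliyun.com/article/721836")[:500])

Decomposing each table after its text is captured keeps table rows from being duplicated by the body pass, and passing isSearch=True (as get_search_results now does) lets the search pipeline silently discard pages that yield nothing, keeping the apology text out of the summary context.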