diff --git a/requirements.txt b/requirements.txt
index 31a3e886..37d85751 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,7 @@ python-telegram-bot[webhooks,rate-limiter]==20.6
 fake_useragent
 openai==0.28.1
 google-api-python-client
-duckduckgo-search==3.9.6
+duckduckgo-search==4.1.0
 langchain==0.0.271
 oauth2client==3.0.0
 pdfminer.six
diff --git a/test/test_ddg.py b/test/test_ddg.py
index 31ae94b9..547a1974 100644
--- a/test/test_ddg.py
+++ b/test/test_ddg.py
@@ -1,17 +1,67 @@
 import re
 import time
 import requests
+import json
 import os
 from bs4 import BeautifulSoup
 from langchain.tools import DuckDuckGoSearchResults
+from duckduckgo_search import DDGS
 
+def getddgsearchurl1(result, numresults=3):
+    requrl = f"https://html.duckduckgo.com/html?q={result}&kl=us-en&s=0&dc=0"
+    try:
+        response = requests.get(requrl)
+        soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
+        print(soup)
+        urls = []
+        for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
+            urls.append(link.get('href'))
+        urls = urls[:numresults]
+    except Exception as e:
+        print('\033[31m')
+        print("duckduckgo error", e)
+        print('\033[0m')
+        urls = []
+    return urls
+
+def search_duckduckgo(query):
+    url = 'https://duckduckgo.com/html/'
+    params = {
+        'q': query,
+        'ia': 'web'
+    }
+
+    response = requests.get(url, params=params)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    print(soup)
+    for link in soup.find_all('a', class_='result__url'):
+        print(link.get('href'))
+
+
+
+from duckduckgo_search import DDGS
+
+def getddg(result, numresults=3):
+    with DDGS(timeout=2) as ddgs:
+        results = [r["href"] for r in ddgs.text(result, max_results=numresults)]
+        # print(json.dumps(results, ensure_ascii=False, indent=4))
+    return results
+
 def getddgsearchurl(result, numresults=3):
-    search = DuckDuckGoSearchResults(num_results=numresults)
-    webresult = search.run(result)
-    urls = re.findall(r"(https?://\S+)\]", webresult, re.MULTILINE)
+    try:
+        # webresult = getddg(result, numresults)
+        search = DuckDuckGoSearchResults(num_results=numresults)
+        webresult = search.run(result)
+        print(webresult)
+        if webresult is None:
+            return []
+        urls = re.findall(r"(https?://\S+)\]", webresult, re.MULTILINE)
+    except Exception as e:
+        print('\033[31m')
+        print("duckduckgo error", e)
+        print('\033[0m')
+        urls = []
     return urls
 
-urls = getddgsearchurl("你知道今天有什么热点新闻吗")
-print(urls)
 def Web_crawler(url: str) -> str:
     """返回链接网址url正文内容,必须是合法的网址"""
@@ -39,12 +89,21 @@ def Web_crawler(url: str) -> str:
     print('\033[0m')
     return result
 
-start_time = time.time()
+if __name__ == '__main__':
+    start_time = time.time()
 
-for url in urls:
-    print(Web_crawler(url))
-    print('-----------------------------')
-end_time = time.time()
-run_time = end_time - start_time
-# 打印运行时间
-print(f"程序运行时间:{run_time}秒")
\ No newline at end of file
+    # search_duckduckgo('python programming')
+    # print(getddg("尊嘟假嘟 含义"))
+    # urls = getddgsearchurl("python programming")
+    # urls = getddgsearchurl1("test")
+    # urls = getddgsearchurl("你知道今天有什么热点新闻吗")
+    # urls = getddg("尊嘟假嘟 含义")
+    urls = getddgsearchurl("它会返回一个包含搜索结果的列表")
+    print(urls)
+    # for url in urls:
+    #     print(Web_crawler(url))
+    #     print('-----------------------------')
+    end_time = time.time()
+    run_time = end_time - start_time
+    # 打印运行时间
+    print(f"程序运行时间:{run_time}秒")
\ No newline at end of file
diff --git a/test/test_tikitoken.py b/test/test_tikitoken.py
index 84e85d95..38d3f5ec 100644
--- a/test/test_tikitoken.py
+++ b/test/test_tikitoken.py
@@ -1,4 +1,19 @@
 import tiktoken
-# tiktoken.get_encoding("cl100k_base")
+tiktoken.get_encoding("cl100k_base")
 tiktoken.model.MODEL_TO_ENCODING["claude-2.1"] = "cl100k_base"
-encoding = tiktoken.encoding_for_model("claude-2.1")
\ No newline at end of file
+tiktoken.get_encoding("cl100k_base")
+encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")
+# encoding = tiktoken.encoding_for_model("claude-2.1")
+encode_web_text_list = []
+if encode_web_text_list == []:
+    encode_web_text_list = encoding.encode("Hello, my dog is cute")
+    print("len", len(encode_web_text_list))
+function_response = encoding.decode(encode_web_text_list[:2])
+print(function_response)
+encode_web_text_list = encode_web_text_list[2:]
+print(encode_web_text_list)
+encode_web_text_list = [856, 5679, 374, 19369]
+tiktoken.get_encoding("cl100k_base")
+encoding1 = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")
+function_response = encoding1.decode(encode_web_text_list[:2])
+print(function_response)
\ No newline at end of file
diff --git a/utils/agent.py b/utils/agent.py
index 79759b92..ca980b3d 100644
--- a/utils/agent.py
+++ b/utils/agent.py
@@ -302,7 +302,7 @@ def Web_crawler(url: str, isSearch=False) -> str:
     # print("url content", result + "\n\n")
     return result
 
-def getddgsearchurl(result, numresults=3):
+def getddgsearchurl(result, numresults=4):
     try:
         search = DuckDuckGoSearchResults(num_results=numresults)
         webresult = search.run(result)
@@ -314,6 +314,7 @@ def getddgsearchurl(result, numresults=3):
         print("duckduckgo error", e)
         print('\033[0m')
         urls = []
+    # print("ddg urls", urls)
     return urls
 
 def getgooglesearchurl(result, numresults=3):
@@ -332,6 +333,7 @@ def getgooglesearchurl(result, numresults=3):
         if "rateLimitExceeded" in str(e):
             print("Google API 每日调用频率已达上限,请明日再试!")
             config.USE_GOOGLE = False
+    # print("google urls", urls)
     return urls
 
 def get_search_url(prompt, chainllm):
@@ -390,28 +392,32 @@ def get_search_url(prompt, chainllm):
     search_threads = []
     urls_set = []
     if len(keywords) == 3:
-        search_url_num = 8
+        search_url_num = 4
     if len(keywords) == 2:
-        search_url_num = 12
+        search_url_num = 6
     if len(keywords) == 1:
-        search_url_num = 24
+        search_url_num = 12
+    # print(keywords)
     if config.USE_GOOGLE:
         search_thread = ThreadWithReturnValue(target=getgooglesearchurl, args=(keywords[0],search_url_num,))
         search_thread.start()
         search_threads.append(search_thread)
-        keywords = keywords.pop(0)
-
+        keywords.pop(0)
+    # print(keywords)
     for keyword in keywords:
         search_thread = ThreadWithReturnValue(target=getddgsearchurl, args=(keyword,search_url_num,))
         search_thread.start()
         search_threads.append(search_thread)
+    # exit(0)
 
     for t in search_threads:
         tmp = t.join()
         urls_set += tmp
     url_set_list = sorted(set(urls_set), key=lambda x: urls_set.index(x))
+    # cut_num = int(len(url_set_list) * 2 / 3)
     url_pdf_set_list = [item for item in url_set_list if item.endswith(".pdf")]
     url_set_list = [item for item in url_set_list if not item.endswith(".pdf")]
+    # return url_set_list[:cut_num], url_pdf_set_list
     return url_set_list, url_pdf_set_list
 
 def concat_url(threads):
diff --git a/utils/chatgpt2api.py b/utils/chatgpt2api.py
index 8c7b0b9f..67d0952e 100644
--- a/utils/chatgpt2api.py
+++ b/utils/chatgpt2api.py
@@ -615,7 +615,8 @@ def ask_stream(
                     if self.conversation[convo_id][-1 - index]["role"] == "user":
                         self.conversation[convo_id][-1 - index]["content"] = self.conversation[convo_id][-1 - index]["content"].replace("search: ", "")
                         prompt = self.conversation[convo_id][-1 - index]["content"]
-                        prompt = " ".join([prompt, json.loads(full_response)["prompt"].strip()]).strip()
+                        if json.loads(full_response)["prompt"].strip() != prompt:
+                            prompt = " ".join([prompt, json.loads(full_response)["prompt"].strip()]).strip()
                         print("\n\nprompt", prompt)
                         break
                 tiktoken.get_encoding("cl100k_base")