From 47e1731844ae4d687ca202d2159f046beaa11e89 Mon Sep 17 00:00:00 2001
From: yym68686
Date: Sat, 9 Dec 2023 17:08:01 +0800
Subject: [PATCH] 1. Modify the error prompt shown when DALL·E 3 image
 generation fails. 2. Optimize the keyword extraction prompt. 3. Fix the
 issue where the gpt-4-1106-preview function call for the search API may
 return Unicode escape sequences instead of Unicode characters. 4. Remove
 the Wikipedia dependency.

---
 bot.py               |  2 +-
 requirements.txt     |  3 ++-
 test/test_jieba.py   | 32 ++++++++++++++++++++++++++++++++
 utils/agent.py       | 30 +++++++++++++++++++++++-------
 utils/chatgpt2api.py |  9 ++++++++-
 5 files changed, 66 insertions(+), 10 deletions(-)
 create mode 100644 test/test_jieba.py

diff --git a/bot.py b/bot.py
index c6f060fc..76acf1ab 100644
--- a/bot.py
+++ b/bot.py
@@ -244,7 +244,7 @@ async def image(update, context):
         start_messageid = ''
         config.API = ''
         if "content_policy_violation" in str(e):
-            await context.bot.edit_message_text(chat_id=chatid, message_id=start_messageid, text="当前 prompt 未能成功生成图片,可能涉及版权等违规内容😣,换句话试试吧~", parse_mode='MarkdownV2', disable_web_page_preview=True)
+            await context.bot.edit_message_text(chat_id=chatid, message_id=start_messageid, text="当前 prompt 未能成功生成图片,可能因为版权,政治,色情,暴力,种族歧视等违反 OpenAI 的内容政策😣,换句话试试吧~", parse_mode='MarkdownV2', disable_web_page_preview=True)
         if "server is busy" in str(e):
             await context.bot.edit_message_text(chat_id=chatid, message_id=start_messageid, text="当前服务器繁忙,请稍后再试~", parse_mode='MarkdownV2', disable_web_page_preview=True)
         result += f"`出错啦!{e}`"
diff --git a/requirements.txt b/requirements.txt
index 2d5fd8eb..d48fe09a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 --index-url https://pypi.python.org/simple/
 tiktoken
 requests
+# jieba
 python-dotenv
 python-telegram-bot[webhooks,rate-limiter]==20.6

@@ -8,7 +9,7 @@ python-telegram-bot[webhooks,rate-limiter]==20.6
 # chromadb
 # unstructured[md,pdf]
 # unstructured[md,pdf]
-wikipedia
+# wikipedia
 fake_useragent
 openai==0.28.1
 google-api-python-client
diff --git a/test/test_jieba.py b/test/test_jieba.py
new file mode 100644
index 00000000..ee29e1d3
--- /dev/null
+++ b/test/test_jieba.py
@@ -0,0 +1,32 @@
+import jieba
+import jieba.analyse
+
+# Load the sample text
+# text = "话说葬送的芙莉莲动漫是半年番还是季番?完结没?"
+# text = "民进党当初为什么支持柯文哲选台北市长?"
+text = "今天的微博热搜有哪些?"
+# text = "How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?"
+
+# Extract keywords with the TF-IDF algorithm
+keywords_tfidf = jieba.analyse.extract_tags(text, topK=10, withWeight=False, allowPOS=())
+
+# Extract keywords with the TextRank algorithm
+keywords_textrank = jieba.analyse.textrank(text, topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
+
+print("TF-IDF算法提取的关键词:", keywords_tfidf)
+print("TextRank算法提取的关键词:", keywords_textrank)
+
+
+seg_list = jieba.cut(text, cut_all=True)
+print("Full Mode: " + " ".join(seg_list))  # full mode
+
+seg_list = jieba.cut(text, cut_all=False)
+print("Default Mode: " + " ".join(seg_list))  # accurate mode
+
+seg_list = jieba.cut(text)  # accurate mode is the default
+print(" ".join(seg_list))
+
+seg_list = jieba.cut_for_search(text)  # search-engine mode
+result = " ".join(seg_list)
+
+print([result] * 3)
\ No newline at end of file
diff --git a/utils/agent.py b/utils/agent.py
index 557cfefe..fee8c4bf 100644
--- a/utils/agent.py
+++ b/utils/agent.py
@@ -4,6 +4,7 @@
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import config
+# import jieba

 import asyncio
 import tiktoken
@@ -296,7 +297,7 @@ def getddgsearchurl(result, numresults=3):
         urls = re.findall(r"(https?://\S+)\]", webresult, re.MULTILINE)
     except Exception as e:
         print('\033[31m')
-        print("error", e)
+        print("duckduckgo error", e)
         print('\033[0m')
         urls = []
     return urls
@@ -307,7 +308,7 @@ def getgooglesearchurl(result, numresults=3):
     try:
         googleresult = google_search.results(result, numresults)
         for i in googleresult:
-            if "No good Google Search Result was found" in i:
+            if "No good Google Search Result was found" in i or "google.com" in i["link"]:
                 continue
             urls.append(i["link"])
     except Exception as e:
@@ -336,23 +337,32 @@ def get_search_url(prompt, chainllm):
     keyword_prompt = PromptTemplate(
         input_variables=["source"],
         template=(
-            "根据我的问题,总结最少的关键词概括,给出三行不同的关键词组合,每行的关键词用空格连接,至少有一行关键词里面有中文,至少有一行关键词里面有英文。只要直接给出这三行关键词,不需要其他任何解释,不要出现其他符号。"
-            "下面是示例:"
-            "问题1:How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?"
+            "根据我的问题,总结最少的关键词概括问题,输出要求如下:"
+            "1. 给出三行不同的关键词组合,每行的关键词用空格连接。"
+            "2. 至少有一行关键词里面有中文,至少有一行关键词里面有英文。"
+            "3. 只要直接给出这三行关键词,不需要其他任何解释,不要出现其他符号和内容。"
+            "4. 如果问题有关于日漫,至少有一行关键词里面有日文。"
+            "下面是一些根据问题提取关键词的示例:"
+            "问题 1:How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?"
             "三行关键词是:"
             "zeabur price"
             "zeabur documentation"
             "zeabur 价格"
-            "问题2:pplx API 怎么使用?"
+            "问题 2:pplx API 怎么使用?"
             "三行关键词是:"
             "pplx API demo"
             "pplx API"
             "pplx API 使用方法"
-            "问题3:以色列哈马斯的最新情况"
+            "问题 3:以色列哈马斯的最新情况"
             "三行关键词是:"
             "以色列 哈马斯 最新情况"
             "Israel Hamas situation"
             "哈马斯 以色列 冲突"
+            "问题 4:话说葬送的芙莉莲动漫是半年番还是季番?完结没?"
+ "三行关键词是:" + "葬送的芙莉莲" + "葬送のフリーレン" + "Frieren: Beyond Journey's End" "这是我的问题:{source}" ), ) @@ -364,6 +374,12 @@ def get_search_url(prompt, chainllm): keywords = [item.replace("三行关键词是:", "") for item in keywords if "\\x" not in item] print("select keywords", keywords) + # # seg_list = jieba.cut_for_search(prompt) # 搜索引擎模式 + # seg_list = jieba.cut(prompt, cut_all=True) + # result = " ".join(seg_list) + # keywords = [result] * 3 + # print("keywords", keywords) + search_threads = [] urls_set = [] if config.USE_GOOGLE: diff --git a/utils/chatgpt2api.py b/utils/chatgpt2api.py index 01478bb1..70d7a343 100644 --- a/utils/chatgpt2api.py +++ b/utils/chatgpt2api.py @@ -508,7 +508,14 @@ def ask_stream( max_context_tokens = self.truncate_limit - self.get_token_count(convo_id) - 500 response_role = "function" if function_call_name == "get_search_results": - prompt = json.loads(full_response)["prompt"] + # g4t 提取的 prompt 有问题 + # prompt = json.loads(full_response)["prompt"] + for index in range(len(self.conversation[convo_id])): + if self.conversation[convo_id][-1 - index]["role"] == "user": + prompt = self.conversation[convo_id][-1 - index]["content"] + print("prompt", prompt) + break + # prompt = self.conversation[convo_id][-1]["content"] # print(self.truncate_limit, self.get_token_count(convo_id), max_context_tokens) function_response = eval(function_call_name)(prompt, max_context_tokens) function_response = "web search results: \n" + function_response