From 47e1731844ae4d687ca202d2159f046beaa11e89 Mon Sep 17 00:00:00 2001
From: yym68686
Date: Sat, 9 Dec 2023 17:08:01 +0800
Subject: [PATCH] 1. Modify the error prompt shown when DALL·E 3 image
 generation fails. 2. Optimize the keyword extraction prompt. 3. Fix the
 issue where the gpt-4-1106-preview function call for the search API may
 return Unicode escape sequences instead of Unicode characters. 4. Remove
 the Wikipedia dependency.

---
 bot.py               |  2 +-
 requirements.txt     |  3 ++-
 test/test_jieba.py   | 32 ++++++++++++++++++++++++++++++++
 utils/agent.py       | 30 +++++++++++++++++++++++-------
 utils/chatgpt2api.py |  9 ++++++++-
 5 files changed, 66 insertions(+), 10 deletions(-)
 create mode 100644 test/test_jieba.py

diff --git a/bot.py b/bot.py
index c6f060fc..76acf1ab 100644
--- a/bot.py
+++ b/bot.py
@@ -244,7 +244,7 @@ async def image(update, context):
         start_messageid = ''
         config.API = ''
         if "content_policy_violation" in str(e):
-            await context.bot.edit_message_text(chat_id=chatid, message_id=start_messageid, text="当前 prompt 未能成功生成图片,可能涉及版权等违规内容😣,换句话试试吧~", parse_mode='MarkdownV2', disable_web_page_preview=True)
+            await context.bot.edit_message_text(chat_id=chatid, message_id=start_messageid, text="当前 prompt 未能成功生成图片,可能因为版权,政治,色情,暴力,种族歧视等违反 OpenAI 的内容政策😣,换句话试试吧~", parse_mode='MarkdownV2', disable_web_page_preview=True)
         if "server is busy" in str(e):
             await context.bot.edit_message_text(chat_id=chatid, message_id=start_messageid, text="当前服务器繁忙,请稍后再试~", parse_mode='MarkdownV2', disable_web_page_preview=True)
         result += f"`出错啦!{e}`"
diff --git a/requirements.txt b/requirements.txt
index 2d5fd8eb..d48fe09a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 --index-url https://pypi.python.org/simple/
 tiktoken
 requests
+# jieba
 python-dotenv
 python-telegram-bot[webhooks,rate-limiter]==20.6

@@ -8,7 +9,7 @@ python-telegram-bot[webhooks,rate-limiter]==20.6
 # chromadb
 # unstructured[md,pdf]
 # unstructured[md,pdf]
-wikipedia
+# wikipedia
 fake_useragent
 openai==0.28.1
 google-api-python-client
diff --git a/test/test_jieba.py b/test/test_jieba.py
new file mode 100644
index 00000000..ee29e1d3
--- /dev/null
+++ b/test/test_jieba.py
@@ -0,0 +1,32 @@
+import jieba
+import jieba.analyse
+
+# Load the sample text
+# text = "话说葬送的芙莉莲动漫是半年番还是季番?完结没?"
+# text = "民进党当初为什么支持柯文哲选台北市长?"
+text = "今天的微博热搜有哪些?"
+# text = "How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?"
+
+# Extract keywords with the TF-IDF algorithm
+keywords_tfidf = jieba.analyse.extract_tags(text, topK=10, withWeight=False, allowPOS=())
+
+# Extract keywords with the TextRank algorithm
+keywords_textrank = jieba.analyse.textrank(text, topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
+
+print("TF-IDF算法提取的关键词:", keywords_tfidf)
+print("TextRank算法提取的关键词:", keywords_textrank)
+
+
+seg_list = jieba.cut(text, cut_all=True)
+print("Full Mode: " + " ".join(seg_list))  # full mode
+
+seg_list = jieba.cut(text, cut_all=False)
+print("Default Mode: " + " ".join(seg_list))  # accurate mode
+
+seg_list = jieba.cut(text)  # accurate mode is the default
+print(" ".join(seg_list))
+
+seg_list = jieba.cut_for_search(text)  # search-engine mode
+result = " ".join(seg_list)
+
+print([result] * 3)
\ No newline at end of file
diff --git a/utils/agent.py b/utils/agent.py
index 557cfefe..fee8c4bf 100644
--- a/utils/agent.py
+++ b/utils/agent.py
@@ -4,6 +4,7 @@
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import config
+# import jieba

 import asyncio
 import tiktoken
@@ -296,7 +297,7 @@ def getddgsearchurl(result, numresults=3):
         urls = re.findall(r"(https?://\S+)\]", webresult, re.MULTILINE)
     except Exception as e:
         print('\033[31m')
-        print("error", e)
+        print("duckduckgo error", e)
         print('\033[0m')
         urls = []
     return urls
@@ -307,7 +308,7 @@ def getgooglesearchurl(result, numresults=3):
     try:
         googleresult = google_search.results(result, numresults)
         for i in googleresult:
-            if "No good Google Search Result was found" in i:
+            if "No good Google Search Result was found" in i or "google.com" in i["link"]:
                 continue
             urls.append(i["link"])
     except Exception as e:
@@ -336,23 +337,32 @@ def get_search_url(prompt, chainllm):
     keyword_prompt = PromptTemplate(
         input_variables=["source"],
         template=(
-            "根据我的问题,总结最少的关键词概括,给出三行不同的关键词组合,每行的关键词用空格连接,至少有一行关键词里面有中文,至少有一行关键词里面有英文。只要直接给出这三行关键词,不需要其他任何解释,不要出现其他符号。"
-            "下面是示例:"
-            "问题1:How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?"
+            "根据我的问题,总结最少的关键词概括问题,输出要求如下:"
+            "1. 给出三行不同的关键词组合,每行的关键词用空格连接。"
+            "2. 至少有一行关键词里面有中文,至少有一行关键词里面有英文。"
+            "3. 只要直接给出这三行关键词,不需要其他任何解释,不要出现其他符号和内容。"
+            "4. 如果问题有关于日漫,至少有一行关键词里面有日文。"
+            "下面是一些根据问题提取关键词的示例:"
+            "问题 1:How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?"
             "三行关键词是:"
             "zeabur price"
             "zeabur documentation"
             "zeabur 价格"
-            "问题2:pplx API 怎么使用?"
+            "问题 2:pplx API 怎么使用?"
             "三行关键词是:"
             "pplx API demo"
             "pplx API"
             "pplx API 使用方法"
-            "问题3:以色列哈马斯的最新情况"
+            "问题 3:以色列哈马斯的最新情况"
             "三行关键词是:"
             "以色列 哈马斯 最新情况"
             "Israel Hamas situation"
             "哈马斯 以色列 冲突"
+            "问题 4:话说葬送的芙莉莲动漫是半年番还是季番?完结没?"
+ "三行关键词是:" + "葬送的芙莉莲" + "葬送のフリーレン" + "Frieren: Beyond Journey's End" "这是我的问题:{source}" ), ) @@ -364,6 +374,12 @@ def get_search_url(prompt, chainllm): keywords = [item.replace("三行关键词是:", "") for item in keywords if "\\x" not in item] print("select keywords", keywords) + # # seg_list = jieba.cut_for_search(prompt) # 搜索引擎模式 + # seg_list = jieba.cut(prompt, cut_all=True) + # result = " ".join(seg_list) + # keywords = [result] * 3 + # print("keywords", keywords) + search_threads = [] urls_set = [] if config.USE_GOOGLE: diff --git a/utils/chatgpt2api.py b/utils/chatgpt2api.py index 01478bb1..70d7a343 100644 --- a/utils/chatgpt2api.py +++ b/utils/chatgpt2api.py @@ -508,7 +508,14 @@ def ask_stream( max_context_tokens = self.truncate_limit - self.get_token_count(convo_id) - 500 response_role = "function" if function_call_name == "get_search_results": - prompt = json.loads(full_response)["prompt"] + # g4t 提取的 prompt 有问题 + # prompt = json.loads(full_response)["prompt"] + for index in range(len(self.conversation[convo_id])): + if self.conversation[convo_id][-1 - index]["role"] == "user": + prompt = self.conversation[convo_id][-1 - index]["content"] + print("prompt", prompt) + break + # prompt = self.conversation[convo_id][-1]["content"] # print(self.truncate_limit, self.get_token_count(convo_id), max_context_tokens) function_response = eval(function_call_name)(prompt, max_context_tokens) function_response = "web search results: \n" + function_response