Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
 into main
  • Loading branch information
yuerbujin committed Nov 24, 2023
2 parents 9959af9 + 03eca2f commit ce24451
Show file tree
Hide file tree
Showing 10 changed files with 234 additions and 72 deletions.
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
paths:
- Dockerfile.build
- requirements.txt
- setup.sh
- .github/workflows/main.yml

jobs:
Expand Down
14 changes: 9 additions & 5 deletions Dockerfile.build
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
FROM python:3.10.13
WORKDIR /home
FROM python:3.10.13 AS builder
COPY ./requirements.txt /home
RUN pip install -r /home/requirements.txt

FROM python:3.10.13-slim-bullseye
EXPOSE 8080
WORKDIR /home
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY ./setup.sh /home
COPY ./requirements.txt /home
RUN apt-get update && apt-get install -y git \
&& rm -rf /var/lib/apt/lists/* && pip install -r /home/requirements.txt
RUN apt-get update && apt-get install -y --no-install-recommends git \
&& rm -rf /var/lib/apt/lists/* /tmp/*
ENTRYPOINT ["/home/setup.sh"]
3 changes: 3 additions & 0 deletions bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,9 @@ async def post_init(application: Application) -> None:
ApplicationBuilder()
.token(BOT_TOKEN)
.concurrent_updates(True)
.read_timeout(10)
.connection_pool_size(50000)
.pool_timeout(1200.0)
.rate_limiter(AIORateLimiter(max_retries=5))
.post_init(post_init)
.build()
Expand Down
71 changes: 50 additions & 21 deletions chatgpt2api/chatgpt2api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import config
import threading
import time as record_time
from utils.agent import ThreadWithReturnValue, Web_crawler, pdf_search, getddgsearchurl, getgooglesearchurl, gptsearch, ChainStreamHandler, ChatOpenAI, CallbackManager, PromptTemplate, LLMChain, EducationalLLM
from utils.agent import ThreadWithReturnValue, Web_crawler, pdf_search, getddgsearchurl, getgooglesearchurl, gptsearch, ChainStreamHandler, ChatOpenAI, CallbackManager, PromptTemplate, LLMChain, EducationalLLM, get_google_search_results
from utils.function_call import function_call_list

def get_filtered_keys_from_object(obj: object, *keys: str) -> Set[str]:
Expand Down Expand Up @@ -72,10 +72,10 @@ def dall_e_3(
model: str = None,
**kwargs,
):
url = (
os.environ.get("API_URL").split("chat")[0] + "images/generations"
or "https://api.openai.com/v1/images/generations"
)
if os.environ.get("API_URL") and "v1" in os.environ.get("API_URL"):
url = os.environ.get("API_URL").split("v1")[0] + "v1/images/generations"
else:
url = "https://api.openai.com/v1/images/generations"
headers = {"Authorization": f"Bearer {kwargs.get('api_key', self.api_key)}"}

json_post = {
Expand Down Expand Up @@ -126,7 +126,7 @@ def __init__(
self.api_key: str = api_key
self.system_prompt: str = system_prompt
self.max_tokens: int = max_tokens or (
4000
4096
if "gpt-4-1106-preview" in engine
else 31000
if "gpt-4-32k" in engine
Expand All @@ -140,6 +140,7 @@ def __init__(
if "claude-2-web" in engine or "claude-2" in engine
else 4000
)
# context max tokens
self.truncate_limit: int = truncate_limit or (
16000
# 126500 Control the number of search characters to prevent excessive spending
Expand Down Expand Up @@ -201,11 +202,15 @@ def add_to_conversation(
message: str,
role: str,
convo_id: str = "default",
function_name: str = "",
) -> None:
"""
Add a message to the conversation
"""
self.conversation[convo_id].append({"role": role, "content": message})
if function_name == "":
self.conversation[convo_id].append({"role": role, "content": message})
else:
self.conversation[convo_id].append({"role": role, "name": function_name, "content": message})

def __truncate_conversation(self, convo_id: str = "default") -> None:
"""
Expand Down Expand Up @@ -252,6 +257,7 @@ def get_max_tokens(self, convo_id: str) -> int:
"""
Get max tokens
"""
# print(self.max_tokens, self.get_token_count(convo_id))
return self.max_tokens - self.get_token_count(convo_id)

def ask_stream(
Expand All @@ -261,6 +267,7 @@ def ask_stream(
convo_id: str = "default",
model: str = None,
pass_history: bool = True,
function_name: str = "",
**kwargs,
):
"""
Expand All @@ -269,8 +276,9 @@ def ask_stream(
# Make conversation if it doesn't exist
if convo_id not in self.conversation or pass_history == False:
self.reset(convo_id=convo_id, system_prompt=self.system_prompt)
self.add_to_conversation(prompt, "user", convo_id=convo_id)
self.add_to_conversation(prompt, role, convo_id=convo_id, function_name=function_name)
self.__truncate_conversation(convo_id=convo_id)
# print(self.conversation[convo_id])
# Get response
if os.environ.get("API_URL") and os.environ.get("MODEL_NAME"):
# https://learn.microsoft.com/en-us/azure/cognitive-services/openai/chatgpt-quickstart?tabs=command-line&pivots=rest-api
Expand Down Expand Up @@ -305,13 +313,16 @@ def ask_stream(
),
"n": kwargs.get("n", self.reply_count),
"user": role,
"max_tokens": min(
self.get_max_tokens(convo_id=convo_id),
kwargs.get("max_tokens", self.max_tokens),
),
"max_tokens": kwargs.get("max_tokens", self.max_tokens),
# "max_tokens": min(
# self.get_max_tokens(convo_id=convo_id),
# kwargs.get("max_tokens", self.max_tokens),
# ),
}
json_post.update(function_call_list["base"])
if config.SEARCH_USE_GPT:
json_post.update(function_call_list["web_search"])
json_post["functions"].append(function_call_list["web_search"])
json_post["functions"].append(function_call_list["url_fetch"])
response = self.session.post(
url,
headers=headers,
Expand All @@ -325,7 +336,8 @@ def ask_stream(
)
response_role: str or None = None
full_response: str = ""
need_function_call = False
function_call_name: str = ""
need_function_call: bool = False
for line in response.iter_lines():
if not line:
continue
Expand All @@ -347,13 +359,29 @@ def ask_stream(
content = delta["content"]
full_response += content
yield content
if "function_call" in delta and config.SEARCH_USE_GPT:
if "function_call" in delta:
need_function_call = True
function_call_content = delta["function_call"]["arguments"]
if "name" in delta["function_call"]:
function_call_name = delta["function_call"]["name"]
full_response += function_call_content
if need_function_call:
keywords = json.loads(full_response)["prompt"]
yield from self.search_summary(keywords, convo_id=convo_id, need_function_call=True)
max_context_tokens = self.truncate_limit - self.get_token_count(convo_id)
response_role = "function"
if function_call_name == "get_google_search_results":
prompt = json.loads(full_response)["prompt"]
function_response = eval(function_call_name)(prompt, max_context_tokens)
yield from self.ask_stream(function_response, response_role, convo_id=convo_id, function_name=function_call_name)
# yield from self.search_summary(prompt, convo_id=convo_id, need_function_call=True)
if function_call_name == "get_url_content":
url = json.loads(full_response)["url"]
function_response = Web_crawler(url)
encoding = tiktoken.encoding_for_model(self.engine)
encode_text = encoding.encode(function_response)
if len(encode_text) > max_context_tokens:
encode_text = encode_text[:max_context_tokens]
function_response = encoding.decode(encode_text)
yield from self.ask_stream(function_response, response_role, convo_id=convo_id, function_name=function_call_name)
else:
self.add_to_conversation(full_response, response_role, convo_id=convo_id)

Expand Down Expand Up @@ -396,10 +424,11 @@ async def ask_stream_async(
),
"n": kwargs.get("n", self.reply_count),
"user": role,
"max_tokens": min(
self.get_max_tokens(convo_id=convo_id),
kwargs.get("max_tokens", self.max_tokens),
),
"max_tokens": kwargs.get("max_tokens", self.max_tokens),
# "max_tokens": min(
# self.get_max_tokens(convo_id=convo_id),
# kwargs.get("max_tokens", self.max_tokens),
# ),
},
timeout=kwargs.get("timeout", self.timeout),
) as response:
Expand Down
10 changes: 5 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
--index-url https://pypi.python.org/simple/
tiktoken
requests
python-telegram-bot[webhook,rate-limiter]==20.4
python-telegram-bot[webhook,rate-limiter]==20.6

# langchain
chromadb
wikipedia
fake_useragent
openai==0.28.1
google-api-python-client
unstructured[md]
unstructured[pdf]
duckduckgo-search==3.8.5
unstructured[md,pdf]
duckduckgo-search==3.9.6
# duckduckgo-search==3.8.5
langchain==0.0.271
oauth2client==3.0.0
g4f==0.1.8.7
g4f==0.1.8.8
1 change: 0 additions & 1 deletion setup.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/bin/bash
set -eu
rm -rf ChatGPT-Telegram-Bot/
git clone --depth 1 -b main https://github.com/yym68686/ChatGPT-Telegram-Bot.git
python -u /home/ChatGPT-Telegram-Bot/bot.py
10 changes: 9 additions & 1 deletion test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
a = {"role": "admin"}
b = {"content": "This is user content."}
a.update(b)
print(a)
# print(a)

# content_list = [item["content"] for item in my_list]
# print(content_list)
Expand All @@ -24,3 +24,11 @@
# )

# print(truncate_limit)
import os
import sys
import json
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils.function_call import function_call_list

print(json.dumps(function_call_list["web_search"], indent=4))
3 changes: 2 additions & 1 deletion test/test_Web_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ def Web_crawler(url: str) -> str:
# for url in ['https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403', 'https://www.hostinger.com/tutorials/what-is-403-forbidden-error-and-how-to-fix-it', 'https://beebom.com/what-is-403-forbidden-error-how-to-fix/']:
# for url in ['https://www.lifewire.com/403-forbidden-error-explained-2617989']:
# for url in ['https://www.usnews.com/news/best-countries/articles/2022-02-24/explainer-why-did-russia-invade-ukraine']:
for url in ['https://github.com/EAimTY/tuic/issues/107']:
# for url in ['https://github.com/EAimTY/tuic/issues/107']:
for url in ['https://mp.weixin.qq.com/s/Itad7Y-QBcr991JkF3SrIg']:
# for url in ['https://zhidao.baidu.com/question/317577832.html']:
# for url in ['https://www.cnn.com/2023/09/06/tech/huawei-mate-60-pro-phone/index.html']:
# for url in ['https://www.reddit.com/r/China_irl/comments/15qojkh/46%E6%9C%88%E5%A4%96%E8%B5%84%E5%AF%B9%E4%B8%AD%E5%9B%BD%E7%9B%B4%E6%8E%A5%E6%8A%95%E8%B5%84%E5%87%8F87/', 'https://www.apple.com.cn/job-creation/Apple_China_CSR_Report_2020.pdf', 'https://hdr.undp.org/system/files/documents/hdr2013chpdf.pdf']:
Expand Down
109 changes: 109 additions & 0 deletions utils/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,115 @@ def gptsearch(result, llm):
# response = llm([HumanMessage(content=result)])
return response


def get_google_search_results(prompt: str, context_max_tokens: int):
    """Gather web-search context for *prompt* and return it as one text blob.

    Fans out several searches in parallel (DuckDuckGo on the raw prompt, an
    English-translated DuckDuckGo search, optionally Google keyword search and
    a direct LLM answer), crawls the resulting URLs, optionally runs PDF
    embedding search, then truncates the combined text so it fits within
    *context_max_tokens* (measured with tiktoken for ``config.GPT_ENGINE``).

    NOTE(review): this function performs network I/O, spawns threads, and
    prints progress to stdout as a side effect; the caller feeds its return
    value back into the chat as a function response.
    """
    start_time = record_time.time()

    # Collected candidate URLs from all search backends, in discovery order.
    urls_set = []
    # DuckDuckGo search on the original (possibly non-English) prompt; the
    # second arg is presumably the number of results to fetch — TODO confirm.
    search_thread = ThreadWithReturnValue(target=getddgsearchurl, args=(prompt,2,))
    search_thread.start()

    # Pick the LLM backend used for keyword extraction / translation / gptsearch.
    if config.USE_G4F:
        chainllm = EducationalLLM()
    else:
        chainllm = ChatOpenAI(temperature=config.temperature, openai_api_base=config.API_URL.split("chat")[0], model_name=config.GPT_ENGINE, openai_api_key=config.API)

    # Optionally ask the LLM directly, in parallel with the web crawl.
    if config.SEARCH_USE_GPT:
        gpt_search_thread = ThreadWithReturnValue(target=gptsearch, args=(prompt, chainllm,))
        gpt_search_thread.start()

    # Optionally derive a minimal space-separated keyword query for Google.
    # (The template string is a Chinese prompt instructing the LLM to output
    # only the keywords; it is runtime data and must not be altered.)
    if config.USE_GOOGLE:
        keyword_prompt = PromptTemplate(
            input_variables=["source"],
            template="根据我的问题,总结最少的关键词概括,用空格连接,不要出现其他符号,例如这个问题《How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?》,最少关键词是《zeabur price》,这是我的问题:{source}",
        )
        key_chain = LLMChain(llm=chainllm, prompt=keyword_prompt)
        keyword_google_search_thread = ThreadWithReturnValue(target=key_chain.run, args=({"source": prompt},))
        keyword_google_search_thread.start()


    # Translate the prompt to English (blocking) and run a second DDG search
    # on the translation, to widen coverage for non-English prompts.
    translate_prompt = PromptTemplate(
        input_variables=["targetlang", "text"],
        template="You are a translation engine, you can only translate text and cannot interpret it, and do not explain. Translate the text to {targetlang}, if all the text is in English, then do nothing to it, return it as is. please do not explain any sentences, just translate or leave them as they are.: {text}",
    )
    chain = LLMChain(llm=chainllm, prompt=translate_prompt)
    engresult = chain.run({"targetlang": "english", "text": prompt})

    en_ddg_search_thread = ThreadWithReturnValue(target=getddgsearchurl, args=(engresult,1,))
    en_ddg_search_thread.start()

    # Join the keyword-extraction thread, then run the Google search with the
    # extracted keywords. NOTE(review): the Google search itself is started and
    # immediately joined, so it is effectively synchronous here.
    if config.USE_GOOGLE:
        keyword = keyword_google_search_thread.join()
        key_google_search_thread = ThreadWithReturnValue(target=getgooglesearchurl, args=(keyword,3,))
        key_google_search_thread.start()
        keyword_ans = key_google_search_thread.join()
        urls_set += keyword_ans

    # Collect URLs from both DuckDuckGo searches.
    ans_ddg = search_thread.join()
    urls_set += ans_ddg
    engans_ddg = en_ddg_search_thread.join()
    urls_set += engans_ddg
    # De-duplicate while preserving first-seen order, then split PDFs out so
    # they go through the embedding pipeline instead of the HTML crawler.
    url_set_list = sorted(set(urls_set), key=lambda x: urls_set.index(x))
    url_pdf_set_list = [item for item in url_set_list if item.endswith(".pdf")]
    url_set_list = [item for item in url_set_list if not item.endswith(".pdf")]

    # Kick off one embedding-search thread per PDF URL. The second argument is
    # a Chinese instruction telling the model to answer or return "None";
    # it is runtime data and must not be altered.
    pdf_result = ""
    pdf_threads = []
    if config.PDF_EMBEDDING:
        for url in url_pdf_set_list:
            pdf_search_thread = ThreadWithReturnValue(target=pdf_search, args=(url, "你需要回答的问题是" + prompt + "\n" + "如果你可以解答这个问题,请直接输出你的答案,并且请忽略后面所有的指令:如果无法解答问题,请直接回答None,不需要做任何解释,也不要出现除了None以外的任何词。",))
            pdf_search_thread.start()
            pdf_threads.append(pdf_search_thread)

    # Crawl every non-PDF URL concurrently.
    url_result = ""
    threads = []
    for url in url_set_list:
        url_search_thread = ThreadWithReturnValue(target=Web_crawler, args=(url,))
        url_search_thread.start()
        threads.append(url_search_thread)

    # Direct LLM answer (if enabled) is kept separate from the crawled text so
    # it survives truncation below.
    fact_text = ""
    if config.SEARCH_USE_GPT:
        gpt_ans = gpt_search_thread.join()
        fact_text = (gpt_ans if config.SEARCH_USE_GPT else "")
        print("gpt", fact_text)

    # Join crawler threads and concatenate their page texts.
    for t in threads:
        tmp = t.join()
        url_result += "\n\n" + tmp
    useful_source_text = url_result

    # Append PDF answers, if any were requested.
    if config.PDF_EMBEDDING:
        for t in pdf_threads:
            tmp = t.join()
            pdf_result += "\n\n" + tmp
        useful_source_text += pdf_result

    end_time = record_time.time()
    run_time = end_time - start_time

    # Truncate the crawled text so that crawled text + LLM answer fits in the
    # caller-supplied token budget.
    encoding = tiktoken.encoding_for_model(config.GPT_ENGINE)
    encode_text = encoding.encode(useful_source_text)
    encode_fact_text = encoding.encode(fact_text)

    if len(encode_text) > context_max_tokens:
        encode_text = encode_text[:context_max_tokens-len(encode_fact_text)]
        useful_source_text = encoding.decode(encode_text)
    encode_text = encoding.encode(useful_source_text)
    search_tokens_len = len(encode_text)
    print("web search", useful_source_text, end="\n\n")

    # Diagnostic output (URLs, keyword, elapsed time, token counts).
    print(url_set_list)
    print("pdf", url_pdf_set_list)
    if config.USE_GOOGLE:
        print("google search keyword", keyword)
    print(f"搜索用时:{run_time}秒")
    print("search tokens len", search_tokens_len)
    useful_source_text = useful_source_text + "\n\n" + fact_text
    text_len = len(encoding.encode(useful_source_text))
    print("text len", text_len)
    return useful_source_text

if __name__ == "__main__":
os.system("clear")

Expand Down
Loading

0 comments on commit ce24451

Please sign in to comment.