Add automatic summarization of web pages based on URL

yym68686 · Nov 23, 2023 · d277f5b
1 parent 84c9347
commit d277f5b
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 16 deletions.
diff --git a/chatgpt2api/chatgpt2api.py b/chatgpt2api/chatgpt2api.py
@@ -126,7 +126,7 @@ def __init__(
         self.api_key: str = api_key
         self.system_prompt: str = system_prompt
         self.max_tokens: int = max_tokens or (
-            4000
+            4096
             if "gpt-4-1106-preview" in engine
             else 31000
             if "gpt-4-32k" in engine
@@ -140,6 +140,7 @@ def __init__(
             if "claude-2-web" in engine or "claude-2" in engine
             else 4000
         )
+        # context max tokens
         self.truncate_limit: int = truncate_limit or (
             16000
             # 126500 Control the number of search characters to prevent excessive spending
@@ -201,11 +202,15 @@ def add_to_conversation(
         message: str,
         role: str,
         convo_id: str = "default",
+        function_name: str = "",
     ) -> None:
         """
         Add a message to the conversation
         """
-        self.conversation[convo_id].append({"role": role, "content": message})
+        if function_name == "":
+            self.conversation[convo_id].append({"role": role, "content": message})
+        else:
+            self.conversation[convo_id].append({"role": role, "name": function_name, "content": message})
 
     def __truncate_conversation(self, convo_id: str = "default") -> None:
         """
@@ -252,6 +257,7 @@ def get_max_tokens(self, convo_id: str) -> int:
         """
         Get max tokens
         """
+        # print(self.max_tokens, self.get_token_count(convo_id))
         return self.max_tokens - self.get_token_count(convo_id)
 
     def ask_stream(
@@ -261,6 +267,7 @@ def ask_stream(
         convo_id: str = "default",
         model: str = None,
         pass_history: bool = True,
+        function_name: str = "",
         **kwargs,
     ):
         """
@@ -269,8 +276,9 @@ def ask_stream(
         # Make conversation if it doesn't exist
         if convo_id not in self.conversation or pass_history == False:
             self.reset(convo_id=convo_id, system_prompt=self.system_prompt)
-        self.add_to_conversation(prompt, "user", convo_id=convo_id)
+        self.add_to_conversation(prompt, role, convo_id=convo_id, function_name=function_name)
         self.__truncate_conversation(convo_id=convo_id)
+        # print(self.conversation[convo_id])
         # Get response
         if os.environ.get("API_URL") and os.environ.get("MODEL_NAME"):
             # https://learn.microsoft.com/en-us/azure/cognitive-services/openai/chatgpt-quickstart?tabs=command-line&pivots=rest-api
@@ -305,13 +313,15 @@ def ask_stream(
                 ),
                 "n": kwargs.get("n", self.reply_count),
                 "user": role,
-                "max_tokens": min(
-                    self.get_max_tokens(convo_id=convo_id),
-                    kwargs.get("max_tokens", self.max_tokens),
-                ),
+                "max_tokens": kwargs.get("max_tokens", self.max_tokens),
+                # "max_tokens": min(
+                #     self.get_max_tokens(convo_id=convo_id),
+                #     kwargs.get("max_tokens", self.max_tokens),
+                # ),
         }
         if config.SEARCH_USE_GPT:
             json_post.update(function_call_list["web_search"])
+        json_post.update(function_call_list["url_fetch"])
         response = self.session.post(
             url,
             headers=headers,
@@ -325,7 +335,8 @@ def ask_stream(
             )
         response_role: str or None = None
         full_response: str = ""
-        need_function_call = False
+        function_call_name: str = ""
+        need_function_call: bool = False
         for line in response.iter_lines():
             if not line:
                 continue
@@ -347,13 +358,21 @@ def ask_stream(
                 content = delta["content"]
                 full_response += content
                 yield content
-            if "function_call" in delta and config.SEARCH_USE_GPT:
+            if "function_call" in delta:
                 need_function_call = True
                 function_call_content = delta["function_call"]["arguments"]
+                if "name" in delta["function_call"]:
+                    function_call_name = delta["function_call"]["name"]
                 full_response += function_call_content
         if need_function_call:
-            keywords = json.loads(full_response)["prompt"]
-            yield from self.search_summary(keywords, convo_id=convo_id, need_function_call=True)
+            response_role = "function"
+            if function_call_name == "get_web_search_results":
+                keywords = json.loads(full_response)["prompt"]
+                yield from self.search_summary(keywords, convo_id=convo_id, need_function_call=True)
+            if function_call_name == "get_url_content":
+                url = json.loads(full_response)["url"]
+                function_response = Web_crawler(url)
+                yield from self.ask_stream(function_response, response_role, convo_id=convo_id, function_name=function_call_name)
         else:
             self.add_to_conversation(full_response, response_role, convo_id=convo_id)
 
@@ -396,10 +415,11 @@ async def ask_stream_async(
                 ),
                 "n": kwargs.get("n", self.reply_count),
                 "user": role,
-                "max_tokens": min(
-                    self.get_max_tokens(convo_id=convo_id),
-                    kwargs.get("max_tokens", self.max_tokens),
-                ),
+                "max_tokens": kwargs.get("max_tokens", self.max_tokens),
+                # "max_tokens": min(
+                #     self.get_max_tokens(convo_id=convo_id),
+                #     kwargs.get("max_tokens", self.max_tokens),
+                # ),
             },
             timeout=kwargs.get("timeout", self.timeout),
         ) as response:

diff --git a/test/test_Web_crawler.py b/test/test_Web_crawler.py
@@ -117,7 +117,8 @@ def Web_crawler(url: str) -> str:
 # for url in ['https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403', 'https://www.hostinger.com/tutorials/what-is-403-forbidden-error-and-how-to-fix-it', 'https://beebom.com/what-is-403-forbidden-error-how-to-fix/']:
 # for url in ['https://www.lifewire.com/403-forbidden-error-explained-2617989']:
 # for url in ['https://www.usnews.com/news/best-countries/articles/2022-02-24/explainer-why-did-russia-invade-ukraine']:
-for url in ['https://github.com/EAimTY/tuic/issues/107']:
+# for url in ['https://github.com/EAimTY/tuic/issues/107']:
+for url in ['https://mp.weixin.qq.com/s/Itad7Y-QBcr991JkF3SrIg']:
 # for url in ['https://zhidao.baidu.com/question/317577832.html']:
 # for url in ['https://www.cnn.com/2023/09/06/tech/huawei-mate-60-pro-phone/index.html']:
 # for url in ['https://www.reddit.com/r/China_irl/comments/15qojkh/46%E6%9C%88%E5%A4%96%E8%B5%84%E5%AF%B9%E4%B8%AD%E5%9B%BD%E7%9B%B4%E6%8E%A5%E6%8A%95%E8%B5%84%E5%87%8F87/', 'https://www.apple.com.cn/job-creation/Apple_China_CSR_Report_2020.pdf', 'https://hdr.undp.org/system/files/documents/hdr2013chpdf.pdf']:

diff --git a/utils/function_call.py b/utils/function_call.py
@@ -41,6 +41,25 @@
         ],
         "function_call": "auto"
   },
+  "url_fetch": {
+      "functions": [
+          {
+            "name": "get_url_content",
+            "description": "Get the webpage content of a URL",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "url": {
+                  "type": "string",
+                  "description": "The url to get the webpage content"
+                }
+              },
+              "required": ["url"]
+            }
+          }
+        ],
+        "function_call": "auto"
+  },
   # "web_search": {
   #     "functions": [
   #         {