1. Remove the dependency on the vector database

2. Fix the problem of extra backslashes when displaying LaTeX formulas
3. Support document Q&A for all models (including pdf and txt files)
yym68686 · Dec 7, 2023 · 6145f88 · 6145f88
1 parent 4f74fa7
commit 6145f88
Show file tree

Hide file tree

Showing 6 changed files with 50 additions and 62 deletions.
diff --git a/bot.py b/bot.py
@@ -8,7 +8,7 @@
 from utils.chatgpt2api import Chatbot as GPT
 from utils.chatgpt2api import claudebot
 from telegram.constants import ChatAction
-from utils.agent import docQA, get_doc_from_local, claudeQA
+from utils.agent import docQA, get_doc_from_local, Document_extract, pdfQA
 from telegram import BotCommand, InlineKeyboardButton, InlineKeyboardMarkup
 from telegram.ext import CommandHandler, MessageHandler, ApplicationBuilder, filters, CallbackQueryHandler, Application, AIORateLimiter
 from config import WEB_HOOK, PORT, BOT_TOKEN
@@ -76,10 +76,7 @@ async def command_bot(update, context, language=None, prompt=translator_prompt,
 
         file_name = pdf_file.file_name
         docpath = os.getcwd() + "/" + file_name
-        if  "cluade" in config.GPT_ENGINE:
-            result = await claudeQA(file_url, question)
-        else:
-            result = await pdfQA(file_url, docpath, question)
+        result = await pdfQA(file_url, docpath, question)
         print(result)
         await context.bot.send_message(chat_id=update.message.chat_id, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True)
 
@@ -517,58 +514,27 @@ async def info(update, context):
     messageid = message.message_id
     await context.bot.delete_message(chat_id=update.effective_chat.id, message_id=update.message.message_id)
 
-from utils.agent import pdfQA, getmd5, persist_emdedding_pdf, get_doc_from_url
-from pdfminer.high_level import extract_text
 @decorators.Authorization
 async def handle_pdf(update, context):
     # 获取接收到的文件
     pdf_file = update.message.document
     # 得到文件的url
-    # file_name = pdf_file.file_name
-    # docpath = os.getcwd() + "/" + file_name
     file_id = pdf_file.file_id
     new_file = await context.bot.get_file(file_id)
     file_url = new_file.file_path
-    filename = get_doc_from_url(file_url)
-    docpath = os.getcwd() + "/" + filename
-    if config.ClaudeAPI:
-        text = extract_text(docpath)
-        prompt = (
-            "Here is the document, inside <document></document> XML tags:"
-            "<document>"
-            "{}"
-            "</document>"
-        )
-        # print(prompt.format(text))
-        config.claudeBot.add_to_conversation(prompt.format(text), "Human", str(update.effective_chat.id))
-        message = (
-            f"文档上传成功！\n\n"
-        )
-        os.remove(docpath)
-        await context.bot.send_message(chat_id=update.message.chat_id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True)
-
-    # persist_db_path = getmd5(docpath)
-    # match_embedding = os.path.exists(persist_db_path)
-    # file_id = pdf_file.file_id
-    # new_file = await context.bot.get_file(file_id)
-    # file_url = new_file.file_path
-
-    # question = update.message.caption
-    # if question is None:
-    #     if not match_embedding:
-            # persist_emdedding_pdf(file_url, persist_db_path)
-    #     message = (
-    #         f"已成功解析文档！\n\n"
-    #         f"请输入 `要问的问题`\n\n"
-    #         f"例如已经上传某文档 ，问题是 蘑菇怎么分类？\n\n"
-    #         f"先左滑文档进入回复模式，并在聊天框里面输入 `蘑菇怎么分类？`\n\n"
-    #     )
-    #     await context.bot.send_message(chat_id=update.effective_chat.id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True)
-    #     return
-
-    # result = await pdfQA(file_url, docpath, question)
-    # print(result)
-    # await context.bot.send_message(chat_id=update.message.chat_id, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True)
+    extracted_text_with_prompt = Document_extract(file_url)
+    # print(extracted_text_with_prompt)
+    if config.ClaudeAPI and "claude" in config.GPT_ENGINE:
+        robot = config.claudeBot
+        role = "Human"
+    else:
+        robot = config.ChatGPTbot
+        role = "user"
+    robot.add_to_conversation(extracted_text_with_prompt, role, str(update.effective_chat.id))
+    message = (
+        f"文档上传成功！\n\n"
+    )
+    await context.bot.send_message(chat_id=update.message.chat_id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True)
 
 @decorators.Authorization
 async def qa(update, context):
@@ -651,7 +617,7 @@ async def post_init(application: Application) -> None:
     application.add_handler(CommandHandler("zh2en", lambda update, context: command_bot(update, context, "english", robot=config.ChatGPTbot)))
     application.add_handler(CommandHandler("info", info))
     application.add_handler(CommandHandler("qa", qa))
-    application.add_handler(MessageHandler(filters.Document.MimeType('application/pdf'), handle_pdf))
+    application.add_handler(MessageHandler(filters.Document.PDF | filters.Document.TXT | filters.Document.DOC, handle_pdf))
     application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, lambda update, context: command_bot(update, context, prompt=None, title=f"`🤖️ {config.GPT_ENGINE}`\n\n", robot=config.ChatGPTbot, has_command=False)))
     application.add_handler(MessageHandler(filters.COMMAND, unknown))
     application.add_error_handler(error)

diff --git a/requirements.txt b/requirements.txt
@@ -4,12 +4,13 @@ requests
 python-telegram-bot[webhooks,rate-limiter]==20.6
 
 # langchain
-chromadb
+# chromadb
+# unstructured[md,pdf]
+# unstructured[md,pdf]
 wikipedia
 fake_useragent
 openai==0.28.1
 google-api-python-client
-unstructured[md,pdf]
 duckduckgo-search==3.9.6
 # duckduckgo-search==3.8.5
 langchain==0.0.271

diff --git a/test/test_pdf.py b/test/test_pdf.py
@@ -1,5 +1,6 @@
 from pdfminer.high_level import extract_text
-text = extract_text('/Users/yanyuming/Library/Mobile Documents/iCloud~QReader~MarginStudy/Documents/论文/VersatileGait- A Large-Scale Synthetic Gait Dataset with Fine-Grained Attributes and Complicated Scenarios.pdf')
+text = extract_text('/Users/yanyuming/Desktop/中国计算机学会推荐中文科技期刊目录.pdf')
+# text = extract_text('/Users/yanyuming/Library/Mobile Documents/iCloud~QReader~MarginStudy/Documents/论文/VersatileGait- A Large-Scale Synthetic Gait Dataset with Fine-Grained Attributes and Complicated Scenarios.pdf')
 # print(repr(text))
 print(text)
 

diff --git a/utils/agent.py b/utils/agent.py
@@ -162,15 +162,6 @@ async def pdfQA(docurl, docpath, query_message, model="gpt-3.5-turbo"):
     result = qa({"query": query_message})
     return result['result']
 
-async def claudeQA(docurl, query_message):
-    from pdfminer.high_level import extract_text
-    filename = get_doc_from_url(docurl)
-    docpath = os.getcwd() + "/" + filename
-    text = extract_text(docpath)
-    print(text)
-    prompt = f"""你需要回答的问题是：{query_message}"""
-    return text
-
 def pdf_search(docurl, query_message, model="gpt-3.5-turbo"):
     chatllm = ChatOpenAI(temperature=0.5, openai_api_base=config.bot_api_url.v1_url, model_name=model, openai_api_key=os.environ.get('API', None))
     embeddings = OpenAIEmbeddings(openai_api_base=config.bot_api_url.v1_url, openai_api_key=os.environ.get('API', None))
@@ -194,6 +185,24 @@ def pdf_search(docurl, query_message, model="gpt-3.5-turbo"):
     result = qa({"query": query_message})
     return result['result']
 
+def Document_extract(docurl):
+    filename = get_doc_from_url(docurl)
+    docpath = os.getcwd() + "/" + filename
+    if filename[-3:] == "pdf":
+        from pdfminer.high_level import extract_text
+        text = extract_text(docpath)
+    if filename[-3:] == "txt":
+        with open(docpath, 'r') as f:
+            text = f.read()
+    prompt = (
+        "Here is the document, inside <document></document> XML tags:"
+        "<document>"
+        "{}"
+        "</document>"
+    ).format(text)
+    os.remove(docpath)
+    return prompt
+
 from typing import Optional, List
 from langchain.llms.base import LLM
 import g4f

diff --git a/utils/chatgpt2api.py b/utils/chatgpt2api.py
@@ -351,6 +351,8 @@ def add_to_conversation(
         """
         Add a message to the conversation
         """
+        if convo_id not in self.conversation:
+            self.reset(convo_id=convo_id)
         if function_name == "" and message != "":
             self.conversation[convo_id].append({"role": role, "content": message})
         else:

diff --git a/utils/md2tgmd.py b/utils/md2tgmd.py
@@ -44,6 +44,10 @@ def escape(text, flag=0):
     # In all other places characters
     # _ * [ ] ( ) ~ ` > # + - = | { } . !
     # must be escaped with the preceding character '\'.
+    text = re.sub(r"\\\[", '@->@', text)
+    text = re.sub(r"\\\]", '@<-@', text)
+    text = re.sub(r"\\\(", '@-->@', text)
+    text = re.sub(r"\\\)", '@<--@', text)
     if flag:
         text = re.sub(r"\\\\", '@@@', text)
     text = re.sub(r"\\", r"\\\\", text)
@@ -59,6 +63,10 @@ def escape(text, flag=0):
     text = re.sub(r"\]", '\]', text)
     text = re.sub(r"\(", '\(', text)
     text = re.sub(r"\)", '\)', text)
+    text = re.sub(r"\@\-\>\@", '\[', text)
+    text = re.sub(r"\@\<\-\@", '\]', text)
+    text = re.sub(r"\@\-\-\>\@", '\(', text)
+    text = re.sub(r"\@\<\-\-\@", '\)', text)
     text = re.sub(r"\@{3}(.*?)\@{3}\^{3}(.*?)\^{3}", '[\\1](\\2)', text)
     text = re.sub(r"~", '\~', text)
     text = re.sub(r">", '\>', text)
@@ -130,6 +138,7 @@ def escape(text, flag=0):
 Cxy = abs (Pxy)**2/ (Pxx*Pyy)
 
 `a`a-b-c`n`
+\[ E[X^4] = \int_{-\infty}^{\infty} x^4 f(x) dx \]
 
 `-a----++++`++a-b-c`-n-`
 `[^``]*`a``b-c``d``