diff --git a/bot.py b/bot.py index a09fac68..8aabecdf 100644 --- a/bot.py +++ b/bot.py @@ -8,7 +8,7 @@ from utils.chatgpt2api import Chatbot as GPT from utils.chatgpt2api import claudebot from telegram.constants import ChatAction -from utils.agent import docQA, get_doc_from_local, claudeQA +from utils.agent import docQA, get_doc_from_local, Document_extract, pdfQA from telegram import BotCommand, InlineKeyboardButton, InlineKeyboardMarkup from telegram.ext import CommandHandler, MessageHandler, ApplicationBuilder, filters, CallbackQueryHandler, Application, AIORateLimiter from config import WEB_HOOK, PORT, BOT_TOKEN @@ -76,10 +76,7 @@ async def command_bot(update, context, language=None, prompt=translator_prompt, file_name = pdf_file.file_name docpath = os.getcwd() + "/" + file_name - if "cluade" in config.GPT_ENGINE: - result = await claudeQA(file_url, question) - else: - result = await pdfQA(file_url, docpath, question) + result = await pdfQA(file_url, docpath, question) print(result) await context.bot.send_message(chat_id=update.message.chat_id, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True) @@ -517,58 +514,27 @@ async def info(update, context): messageid = message.message_id await context.bot.delete_message(chat_id=update.effective_chat.id, message_id=update.message.message_id) -from utils.agent import pdfQA, getmd5, persist_emdedding_pdf, get_doc_from_url -from pdfminer.high_level import extract_text @decorators.Authorization async def handle_pdf(update, context): # 获取接收到的文件 pdf_file = update.message.document # 得到文件的url - # file_name = pdf_file.file_name - # docpath = os.getcwd() + "/" + file_name file_id = pdf_file.file_id new_file = await context.bot.get_file(file_id) file_url = new_file.file_path - filename = get_doc_from_url(file_url) - docpath = os.getcwd() + "/" + filename - if config.ClaudeAPI: - text = extract_text(docpath) - prompt = ( - "Here is the document, inside XML tags:" - "" - "{}" - "" - ) - # print(prompt.format(text)) - config.claudeBot.add_to_conversation(prompt.format(text), "Human", str(update.effective_chat.id)) - message = ( - f"文档上传成功!\n\n" - ) - os.remove(docpath) - await context.bot.send_message(chat_id=update.message.chat_id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True) - - # persist_db_path = getmd5(docpath) - # match_embedding = os.path.exists(persist_db_path) - # file_id = pdf_file.file_id - # new_file = await context.bot.get_file(file_id) - # file_url = new_file.file_path - - # question = update.message.caption - # if question is None: - # if not match_embedding: - # persist_emdedding_pdf(file_url, persist_db_path) - # message = ( - # f"已成功解析文档!\n\n" - # f"请输入 `要问的问题`\n\n" - # f"例如已经上传某文档 ,问题是 蘑菇怎么分类?\n\n" - # f"先左滑文档进入回复模式,并在聊天框里面输入 `蘑菇怎么分类?`\n\n" - # ) - # await context.bot.send_message(chat_id=update.effective_chat.id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True) - # return - - # result = await pdfQA(file_url, docpath, question) - # print(result) - # await context.bot.send_message(chat_id=update.message.chat_id, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True) + extracted_text_with_prompt = Document_extract(file_url) + # print(extracted_text_with_prompt) + if config.ClaudeAPI and "claude" in config.GPT_ENGINE: + robot = config.claudeBot + role = "Human" + else: + robot = config.ChatGPTbot + role = "user" + robot.add_to_conversation(extracted_text_with_prompt, role, str(update.effective_chat.id)) + message = ( + f"文档上传成功!\n\n" + ) + await context.bot.send_message(chat_id=update.message.chat_id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True) @decorators.Authorization async def qa(update, context): @@ -651,7 +617,7 @@ async def post_init(application: Application) -> None: application.add_handler(CommandHandler("zh2en", lambda update, context: command_bot(update, context, "english", robot=config.ChatGPTbot))) application.add_handler(CommandHandler("info", info)) application.add_handler(CommandHandler("qa", qa)) - application.add_handler(MessageHandler(filters.Document.MimeType('application/pdf'), handle_pdf)) + application.add_handler(MessageHandler(filters.Document.PDF | filters.Document.TXT | filters.Document.DOC, handle_pdf)) application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, lambda update, context: command_bot(update, context, prompt=None, title=f"`🤖️ {config.GPT_ENGINE}`\n\n", robot=config.ChatGPTbot, has_command=False))) application.add_handler(MessageHandler(filters.COMMAND, unknown)) application.add_error_handler(error) diff --git a/requirements.txt b/requirements.txt index 7ff7d953..1c78e9a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,12 +4,13 @@ requests python-telegram-bot[webhooks,rate-limiter]==20.6 # langchain -chromadb +# chromadb +# unstructured[md,pdf] +# unstructured[md,pdf] wikipedia fake_useragent openai==0.28.1 google-api-python-client -unstructured[md,pdf] duckduckgo-search==3.9.6 # duckduckgo-search==3.8.5 langchain==0.0.271 diff --git a/test/test_pdf.py b/test/test_pdf.py index 05bb5491..03a447e2 100644 --- a/test/test_pdf.py +++ b/test/test_pdf.py @@ -1,5 +1,6 @@ from pdfminer.high_level import extract_text -text = extract_text('/Users/yanyuming/Library/Mobile Documents/iCloud~QReader~MarginStudy/Documents/论文/VersatileGait- A Large-Scale Synthetic Gait Dataset with Fine-Grained Attributes and Complicated Scenarios.pdf') +text = extract_text('/Users/yanyuming/Desktop/中国计算机学会推荐中文科技期刊目录.pdf') +# text = extract_text('/Users/yanyuming/Library/Mobile Documents/iCloud~QReader~MarginStudy/Documents/论文/VersatileGait- A Large-Scale Synthetic Gait Dataset with Fine-Grained Attributes and Complicated Scenarios.pdf') # print(repr(text)) print(text) diff --git a/utils/agent.py b/utils/agent.py index d26056ff..f156a3e4 100644 --- a/utils/agent.py +++ b/utils/agent.py @@ -162,15 +162,6 @@ async def pdfQA(docurl, docpath, query_message, model="gpt-3.5-turbo"): result = qa({"query": query_message}) return result['result'] -async def claudeQA(docurl, query_message): - from pdfminer.high_level import extract_text - filename = get_doc_from_url(docurl) - docpath = os.getcwd() + "/" + filename - text = extract_text(docpath) - print(text) - prompt = f"""你需要回答的问题是:{query_message}""" - return text - def pdf_search(docurl, query_message, model="gpt-3.5-turbo"): chatllm = ChatOpenAI(temperature=0.5, openai_api_base=config.bot_api_url.v1_url, model_name=model, openai_api_key=os.environ.get('API', None)) embeddings = OpenAIEmbeddings(openai_api_base=config.bot_api_url.v1_url, openai_api_key=os.environ.get('API', None)) @@ -194,6 +185,24 @@ def pdf_search(docurl, query_message, model="gpt-3.5-turbo"): result = qa({"query": query_message}) return result['result'] +def Document_extract(docurl): + filename = get_doc_from_url(docurl) + docpath = os.getcwd() + "/" + filename + if filename[-3:] == "pdf": + from pdfminer.high_level import extract_text + text = extract_text(docpath) + if filename[-3:] == "txt": + with open(docpath, 'r') as f: + text = f.read() + prompt = ( + "Here is the document, inside XML tags:" + "" + "{}" + "" + ).format(text) + os.remove(docpath) + return prompt + from typing import Optional, List from langchain.llms.base import LLM import g4f diff --git a/utils/chatgpt2api.py b/utils/chatgpt2api.py index ce8c6cd6..01478bb1 100644 --- a/utils/chatgpt2api.py +++ b/utils/chatgpt2api.py @@ -351,6 +351,8 @@ def add_to_conversation( """ Add a message to the conversation """ + if convo_id not in self.conversation: + self.reset(convo_id=convo_id) if function_name == "" and message != "": self.conversation[convo_id].append({"role": role, "content": message}) else: diff --git a/utils/md2tgmd.py b/utils/md2tgmd.py index 8c1fb102..ea9052b1 100644 --- a/utils/md2tgmd.py +++ b/utils/md2tgmd.py @@ -44,6 +44,10 @@ def escape(text, flag=0): # In all other places characters # _ * [ ] ( ) ~ ` > # + - = | { } . ! # must be escaped with the preceding character '\'. + text = re.sub(r"\\\[", '@->@', text) + text = re.sub(r"\\\]", '@<-@', text) + text = re.sub(r"\\\(", '@-->@', text) + text = re.sub(r"\\\)", '@<--@', text) if flag: text = re.sub(r"\\\\", '@@@', text) text = re.sub(r"\\", r"\\\\", text) @@ -59,6 +63,10 @@ def escape(text, flag=0): text = re.sub(r"\]", '\]', text) text = re.sub(r"\(", '\(', text) text = re.sub(r"\)", '\)', text) + text = re.sub(r"\@\-\>\@", '\[', text) + text = re.sub(r"\@\<\-\@", '\]', text) + text = re.sub(r"\@\-\-\>\@", '\(', text) + text = re.sub(r"\@\<\-\-\@", '\)', text) text = re.sub(r"\@{3}(.*?)\@{3}\^{3}(.*?)\^{3}", '[\\1](\\2)', text) text = re.sub(r"~", '\~', text) text = re.sub(r">", '\>', text) @@ -130,6 +138,7 @@ def escape(text, flag=0): Cxy = abs (Pxy)**2/ (Pxx*Pyy) `a`a-b-c`n` +\[ E[X^4] = \int_{-\infty}^{\infty} x^4 f(x) dx \] `-a----++++`++a-b-c`-n-` `[^``]*`a``b-c``d``