diff --git a/bot.py b/bot.py
index a09fac68..8aabecdf 100644
--- a/bot.py
+++ b/bot.py
@@ -8,7 +8,7 @@
from utils.chatgpt2api import Chatbot as GPT
from utils.chatgpt2api import claudebot
from telegram.constants import ChatAction
-from utils.agent import docQA, get_doc_from_local, claudeQA
+from utils.agent import docQA, get_doc_from_local, Document_extract, pdfQA
from telegram import BotCommand, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import CommandHandler, MessageHandler, ApplicationBuilder, filters, CallbackQueryHandler, Application, AIORateLimiter
from config import WEB_HOOK, PORT, BOT_TOKEN
@@ -76,10 +76,7 @@ async def command_bot(update, context, language=None, prompt=translator_prompt,
file_name = pdf_file.file_name
docpath = os.getcwd() + "/" + file_name
- if "cluade" in config.GPT_ENGINE:
- result = await claudeQA(file_url, question)
- else:
- result = await pdfQA(file_url, docpath, question)
+ result = await pdfQA(file_url, docpath, question)
print(result)
await context.bot.send_message(chat_id=update.message.chat_id, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True)
@@ -517,58 +514,27 @@ async def info(update, context):
messageid = message.message_id
await context.bot.delete_message(chat_id=update.effective_chat.id, message_id=update.message.message_id)
-from utils.agent import pdfQA, getmd5, persist_emdedding_pdf, get_doc_from_url
-from pdfminer.high_level import extract_text
@decorators.Authorization
async def handle_pdf(update, context):
# 获取接收到的文件
pdf_file = update.message.document
# 得到文件的url
- # file_name = pdf_file.file_name
- # docpath = os.getcwd() + "/" + file_name
file_id = pdf_file.file_id
new_file = await context.bot.get_file(file_id)
file_url = new_file.file_path
- filename = get_doc_from_url(file_url)
- docpath = os.getcwd() + "/" + filename
- if config.ClaudeAPI:
- text = extract_text(docpath)
- prompt = (
- "Here is the document, inside XML tags:"
- ""
- "{}"
- ""
- )
- # print(prompt.format(text))
- config.claudeBot.add_to_conversation(prompt.format(text), "Human", str(update.effective_chat.id))
- message = (
- f"文档上传成功!\n\n"
- )
- os.remove(docpath)
- await context.bot.send_message(chat_id=update.message.chat_id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True)
-
- # persist_db_path = getmd5(docpath)
- # match_embedding = os.path.exists(persist_db_path)
- # file_id = pdf_file.file_id
- # new_file = await context.bot.get_file(file_id)
- # file_url = new_file.file_path
-
- # question = update.message.caption
- # if question is None:
- # if not match_embedding:
- # persist_emdedding_pdf(file_url, persist_db_path)
- # message = (
- # f"已成功解析文档!\n\n"
- # f"请输入 `要问的问题`\n\n"
- # f"例如已经上传某文档 ,问题是 蘑菇怎么分类?\n\n"
- # f"先左滑文档进入回复模式,并在聊天框里面输入 `蘑菇怎么分类?`\n\n"
- # )
- # await context.bot.send_message(chat_id=update.effective_chat.id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True)
- # return
-
- # result = await pdfQA(file_url, docpath, question)
- # print(result)
- # await context.bot.send_message(chat_id=update.message.chat_id, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True)
+ extracted_text_with_prompt = Document_extract(file_url)
+ # print(extracted_text_with_prompt)
+ if config.ClaudeAPI and "claude" in config.GPT_ENGINE:
+ robot = config.claudeBot
+ role = "Human"
+ else:
+ robot = config.ChatGPTbot
+ role = "user"
+ robot.add_to_conversation(extracted_text_with_prompt, role, str(update.effective_chat.id))
+ message = (
+ f"文档上传成功!\n\n"
+ )
+ await context.bot.send_message(chat_id=update.message.chat_id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True)
@decorators.Authorization
async def qa(update, context):
@@ -651,7 +617,7 @@ async def post_init(application: Application) -> None:
application.add_handler(CommandHandler("zh2en", lambda update, context: command_bot(update, context, "english", robot=config.ChatGPTbot)))
application.add_handler(CommandHandler("info", info))
application.add_handler(CommandHandler("qa", qa))
- application.add_handler(MessageHandler(filters.Document.MimeType('application/pdf'), handle_pdf))
+ application.add_handler(MessageHandler(filters.Document.PDF | filters.Document.TXT | filters.Document.DOC, handle_pdf))
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, lambda update, context: command_bot(update, context, prompt=None, title=f"`🤖️ {config.GPT_ENGINE}`\n\n", robot=config.ChatGPTbot, has_command=False)))
application.add_handler(MessageHandler(filters.COMMAND, unknown))
application.add_error_handler(error)
diff --git a/requirements.txt b/requirements.txt
index 7ff7d953..1c78e9a2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,12 +4,13 @@ requests
python-telegram-bot[webhooks,rate-limiter]==20.6
# langchain
-chromadb
+# chromadb
+# unstructured[md,pdf]
+# unstructured[md,pdf]
wikipedia
fake_useragent
openai==0.28.1
google-api-python-client
-unstructured[md,pdf]
duckduckgo-search==3.9.6
# duckduckgo-search==3.8.5
langchain==0.0.271
diff --git a/test/test_pdf.py b/test/test_pdf.py
index 05bb5491..03a447e2 100644
--- a/test/test_pdf.py
+++ b/test/test_pdf.py
@@ -1,5 +1,6 @@
from pdfminer.high_level import extract_text
-text = extract_text('/Users/yanyuming/Library/Mobile Documents/iCloud~QReader~MarginStudy/Documents/论文/VersatileGait- A Large-Scale Synthetic Gait Dataset with Fine-Grained Attributes and Complicated Scenarios.pdf')
+text = extract_text('/Users/yanyuming/Desktop/中国计算机学会推荐中文科技期刊目录.pdf')
+# text = extract_text('/Users/yanyuming/Library/Mobile Documents/iCloud~QReader~MarginStudy/Documents/论文/VersatileGait- A Large-Scale Synthetic Gait Dataset with Fine-Grained Attributes and Complicated Scenarios.pdf')
# print(repr(text))
print(text)
diff --git a/utils/agent.py b/utils/agent.py
index d26056ff..f156a3e4 100644
--- a/utils/agent.py
+++ b/utils/agent.py
@@ -162,15 +162,6 @@ async def pdfQA(docurl, docpath, query_message, model="gpt-3.5-turbo"):
result = qa({"query": query_message})
return result['result']
-async def claudeQA(docurl, query_message):
- from pdfminer.high_level import extract_text
- filename = get_doc_from_url(docurl)
- docpath = os.getcwd() + "/" + filename
- text = extract_text(docpath)
- print(text)
- prompt = f"""你需要回答的问题是:{query_message}"""
- return text
-
def pdf_search(docurl, query_message, model="gpt-3.5-turbo"):
chatllm = ChatOpenAI(temperature=0.5, openai_api_base=config.bot_api_url.v1_url, model_name=model, openai_api_key=os.environ.get('API', None))
embeddings = OpenAIEmbeddings(openai_api_base=config.bot_api_url.v1_url, openai_api_key=os.environ.get('API', None))
@@ -194,6 +185,53 @@ def pdf_search(docurl, query_message, model="gpt-3.5-turbo"):
result = qa({"query": query_message})
return result['result']
+def Document_extract(docurl):
+    """Download a document and wrap its text in a prompt string.
+
+    The file name returned by ``get_doc_from_url`` is resolved against the
+    current working directory, its text is extracted, and that local copy
+    is deleted again before returning, whether or not extraction succeeded.
+
+    Args:
+        docurl: URL of the document, passed straight to ``get_doc_from_url``.
+
+    Returns:
+        The prompt text with the document contents substituted in.
+
+    Raises:
+        ValueError: If the file name ends in neither "pdf" nor "txt"
+            (compared case-insensitively).
+    """
+    filename = get_doc_from_url(docurl)
+    docpath = os.getcwd() + "/" + filename
+    # Lower-case the suffix so "REPORT.PDF" is treated like "report.pdf".
+    suffix = filename[-3:].lower()
+    try:
+        if suffix == "pdf":
+            from pdfminer.high_level import extract_text
+            text = extract_text(docpath)
+        elif suffix == "txt":
+            # Decode explicitly instead of relying on the platform default;
+            # undecodable bytes are replaced rather than aborting the upload.
+            with open(docpath, 'r', encoding="utf-8", errors="replace") as f:
+                text = f.read()
+        else:
+            # Fail with a clear error rather than reaching the prompt
+            # formatting below with `text` never having been assigned.
+            raise ValueError(f"Unsupported document type: {filename!r}")
+    finally:
+        # Always clean up the downloaded copy, even when extraction fails.
+        os.remove(docpath)
+    # NOTE(review): the prompt mentions XML tags, but the literals wrapped
+    # around the text are empty as written -- confirm this is intended.
+    prompt = (
+        "Here is the document, inside XML tags:"
+        ""
+        "{}"
+        ""
+    ).format(text)
+    return prompt
+
from typing import Optional, List
from langchain.llms.base import LLM
import g4f
diff --git a/utils/chatgpt2api.py b/utils/chatgpt2api.py
index ce8c6cd6..01478bb1 100644
--- a/utils/chatgpt2api.py
+++ b/utils/chatgpt2api.py
@@ -351,6 +351,8 @@ def add_to_conversation(
"""
Add a message to the conversation
"""
+ if convo_id not in self.conversation:
+ self.reset(convo_id=convo_id)
if function_name == "" and message != "":
self.conversation[convo_id].append({"role": role, "content": message})
else:
diff --git a/utils/md2tgmd.py b/utils/md2tgmd.py
index 8c1fb102..ea9052b1 100644
--- a/utils/md2tgmd.py
+++ b/utils/md2tgmd.py
@@ -44,6 +44,10 @@ def escape(text, flag=0):
# In all other places characters
# _ * [ ] ( ) ~ ` > # + - = | { } . !
# must be escaped with the preceding character '\'.
+ text = re.sub(r"\\\[", '@->@', text)
+ text = re.sub(r"\\\]", '@<-@', text)
+ text = re.sub(r"\\\(", '@-->@', text)
+ text = re.sub(r"\\\)", '@<--@', text)
if flag:
text = re.sub(r"\\\\", '@@@', text)
text = re.sub(r"\\", r"\\\\", text)
@@ -59,6 +63,10 @@ def escape(text, flag=0):
text = re.sub(r"\]", '\]', text)
text = re.sub(r"\(", '\(', text)
text = re.sub(r"\)", '\)', text)
+ text = re.sub(r"\@\-\>\@", '\[', text)
+ text = re.sub(r"\@\<\-\@", '\]', text)
+ text = re.sub(r"\@\-\-\>\@", '\(', text)
+ text = re.sub(r"\@\<\-\-\@", '\)', text)
text = re.sub(r"\@{3}(.*?)\@{3}\^{3}(.*?)\^{3}", '[\\1](\\2)', text)
text = re.sub(r"~", '\~', text)
text = re.sub(r">", '\>', text)
@@ -130,6 +138,7 @@ def escape(text, flag=0):
Cxy = abs (Pxy)**2/ (Pxx*Pyy)
`a`a-b-c`n`
+\[ E[X^4] = \int_{-\infty}^{\infty} x^4 f(x) dx \]
`-a----++++`++a-b-c`-n-`
`[^``]*`a``b-c``d``