Skip to content

Commit

Permalink
1. Remove the dependency on the vector database
Browse files Browse the repository at this point in the history
2. Fix the problem of extra backslashes when displaying LaTeX formulas

3. Support document Q&A for all models (including PDF and TXT files)
  • Loading branch information
yym68686 committed Dec 7, 2023
1 parent 4f74fa7 commit 6145f88
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 62 deletions.
66 changes: 16 additions & 50 deletions bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from utils.chatgpt2api import Chatbot as GPT
from utils.chatgpt2api import claudebot
from telegram.constants import ChatAction
from utils.agent import docQA, get_doc_from_local, claudeQA
from utils.agent import docQA, get_doc_from_local, Document_extract, pdfQA
from telegram import BotCommand, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import CommandHandler, MessageHandler, ApplicationBuilder, filters, CallbackQueryHandler, Application, AIORateLimiter
from config import WEB_HOOK, PORT, BOT_TOKEN
Expand Down Expand Up @@ -76,10 +76,7 @@ async def command_bot(update, context, language=None, prompt=translator_prompt,

file_name = pdf_file.file_name
docpath = os.getcwd() + "/" + file_name
if "cluade" in config.GPT_ENGINE:
result = await claudeQA(file_url, question)
else:
result = await pdfQA(file_url, docpath, question)
result = await pdfQA(file_url, docpath, question)
print(result)
await context.bot.send_message(chat_id=update.message.chat_id, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True)

Expand Down Expand Up @@ -517,58 +514,27 @@ async def info(update, context):
messageid = message.message_id
await context.bot.delete_message(chat_id=update.effective_chat.id, message_id=update.message.message_id)

from utils.agent import pdfQA, getmd5, persist_emdedding_pdf, get_doc_from_url
from pdfminer.high_level import extract_text
@decorators.Authorization
async def handle_pdf(update, context):
# 获取接收到的文件
pdf_file = update.message.document
# 得到文件的url
# file_name = pdf_file.file_name
# docpath = os.getcwd() + "/" + file_name
file_id = pdf_file.file_id
new_file = await context.bot.get_file(file_id)
file_url = new_file.file_path
filename = get_doc_from_url(file_url)
docpath = os.getcwd() + "/" + filename
if config.ClaudeAPI:
text = extract_text(docpath)
prompt = (
"Here is the document, inside <document></document> XML tags:"
"<document>"
"{}"
"</document>"
)
# print(prompt.format(text))
config.claudeBot.add_to_conversation(prompt.format(text), "Human", str(update.effective_chat.id))
message = (
f"文档上传成功!\n\n"
)
os.remove(docpath)
await context.bot.send_message(chat_id=update.message.chat_id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True)

# persist_db_path = getmd5(docpath)
# match_embedding = os.path.exists(persist_db_path)
# file_id = pdf_file.file_id
# new_file = await context.bot.get_file(file_id)
# file_url = new_file.file_path

# question = update.message.caption
# if question is None:
# if not match_embedding:
# persist_emdedding_pdf(file_url, persist_db_path)
# message = (
# f"已成功解析文档!\n\n"
# f"请输入 `要问的问题`\n\n"
# f"例如已经上传某文档 ,问题是 蘑菇怎么分类?\n\n"
# f"先左滑文档进入回复模式,并在聊天框里面输入 `蘑菇怎么分类?`\n\n"
# )
# await context.bot.send_message(chat_id=update.effective_chat.id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True)
# return

# result = await pdfQA(file_url, docpath, question)
# print(result)
# await context.bot.send_message(chat_id=update.message.chat_id, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True)
extracted_text_with_prompt = Document_extract(file_url)
# print(extracted_text_with_prompt)
if config.ClaudeAPI and "claude" in config.GPT_ENGINE:
robot = config.claudeBot
role = "Human"
else:
robot = config.ChatGPTbot
role = "user"
robot.add_to_conversation(extracted_text_with_prompt, role, str(update.effective_chat.id))
message = (
f"文档上传成功!\n\n"
)
await context.bot.send_message(chat_id=update.message.chat_id, text=escape(message), parse_mode='MarkdownV2', disable_web_page_preview=True)

@decorators.Authorization
async def qa(update, context):
Expand Down Expand Up @@ -651,7 +617,7 @@ async def post_init(application: Application) -> None:
application.add_handler(CommandHandler("zh2en", lambda update, context: command_bot(update, context, "english", robot=config.ChatGPTbot)))
application.add_handler(CommandHandler("info", info))
application.add_handler(CommandHandler("qa", qa))
application.add_handler(MessageHandler(filters.Document.MimeType('application/pdf'), handle_pdf))
application.add_handler(MessageHandler(filters.Document.PDF | filters.Document.TXT | filters.Document.DOC, handle_pdf))
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, lambda update, context: command_bot(update, context, prompt=None, title=f"`🤖️ {config.GPT_ENGINE}`\n\n", robot=config.ChatGPTbot, has_command=False)))
application.add_handler(MessageHandler(filters.COMMAND, unknown))
application.add_error_handler(error)
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ requests
python-telegram-bot[webhooks,rate-limiter]==20.6

# langchain
chromadb
# chromadb
# unstructured[md,pdf]
# unstructured[md,pdf]
wikipedia
fake_useragent
openai==0.28.1
google-api-python-client
unstructured[md,pdf]
duckduckgo-search==3.9.6
# duckduckgo-search==3.8.5
langchain==0.0.271
Expand Down
3 changes: 2 additions & 1 deletion test/test_pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pdfminer.high_level import extract_text
text = extract_text('/Users/yanyuming/Library/Mobile Documents/iCloud~QReader~MarginStudy/Documents/论文/VersatileGait- A Large-Scale Synthetic Gait Dataset with Fine-Grained Attributes and Complicated Scenarios.pdf')
text = extract_text('/Users/yanyuming/Desktop/中国计算机学会推荐中文科技期刊目录.pdf')
# text = extract_text('/Users/yanyuming/Library/Mobile Documents/iCloud~QReader~MarginStudy/Documents/论文/VersatileGait- A Large-Scale Synthetic Gait Dataset with Fine-Grained Attributes and Complicated Scenarios.pdf')
# print(repr(text))
print(text)

Expand Down
27 changes: 18 additions & 9 deletions utils/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,15 +162,6 @@ async def pdfQA(docurl, docpath, query_message, model="gpt-3.5-turbo"):
result = qa({"query": query_message})
return result['result']

async def claudeQA(docurl, query_message):
    """Return the text that pdfminer extracts from the document at *docurl*.

    The document is obtained through ``get_doc_from_url`` (presumably a
    download into the current working directory -- confirm against that
    helper) and parsed with ``pdfminer.high_level.extract_text``.  The
    extracted text is echoed to stdout and returned unchanged.

    Note that *query_message* is only interpolated into a local prompt
    string that is never used; no question answering happens here.
    """
    from pdfminer.high_level import extract_text

    local_name = get_doc_from_url(docurl)
    extracted = extract_text(os.getcwd() + "/" + local_name)
    print(extracted)
    # Built but never consumed: the caller receives the raw document text.
    prompt = f"""你需要回答的问题是:{query_message}"""
    return extracted

def pdf_search(docurl, query_message, model="gpt-3.5-turbo"):
chatllm = ChatOpenAI(temperature=0.5, openai_api_base=config.bot_api_url.v1_url, model_name=model, openai_api_key=os.environ.get('API', None))
embeddings = OpenAIEmbeddings(openai_api_base=config.bot_api_url.v1_url, openai_api_key=os.environ.get('API', None))
Expand All @@ -194,6 +185,24 @@ def pdf_search(docurl, query_message, model="gpt-3.5-turbo"):
result = qa({"query": query_message})
return result['result']

def Document_extract(docurl):
    """Fetch a document and wrap its text in a ``<document>`` prompt.

    The file is obtained through ``get_doc_from_url`` (presumably a download
    into the current working directory -- confirm against that helper), its
    text is extracted according to the file extension, and the local copy is
    removed afterwards, even when extraction fails.

    Args:
        docurl: URL of the document to fetch.

    Returns:
        A prompt string containing the document text inside
        ``<document></document>`` tags.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.txt``.
            Previously this surfaced as an ``UnboundLocalError`` and left the
            downloaded file on disk.
    """
    filename = get_doc_from_url(docurl)
    docpath = os.getcwd() + "/" + filename
    # Compare case-insensitively so that e.g. "REPORT.PDF" is recognised.
    extension = os.path.splitext(filename)[1].lower()
    try:
        if extension == ".pdf":
            # Imported lazily so pdfminer is only required for PDF input.
            from pdfminer.high_level import extract_text
            text = extract_text(docpath)
        elif extension == ".txt":
            # Decode explicitly instead of relying on the platform default;
            # undecodable bytes are replaced rather than aborting the upload.
            with open(docpath, 'r', encoding="utf-8", errors="replace") as f:
                text = f.read()
        else:
            raise ValueError(
                "Unsupported document type: {!r} (expected .pdf or .txt)".format(filename)
            )
    finally:
        # Always clean up the local copy, whether or not extraction succeeded.
        os.remove(docpath)
    prompt = (
        "Here is the document, inside <document></document> XML tags:"
        "<document>"
        "{}"
        "</document>"
    ).format(text)
    return prompt

from typing import Optional, List
from langchain.llms.base import LLM
import g4f
Expand Down
2 changes: 2 additions & 0 deletions utils/chatgpt2api.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,8 @@ def add_to_conversation(
"""
Add a message to the conversation
"""
if convo_id not in self.conversation:
self.reset(convo_id=convo_id)
if function_name == "" and message != "":
self.conversation[convo_id].append({"role": role, "content": message})
else:
Expand Down
9 changes: 9 additions & 0 deletions utils/md2tgmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ def escape(text, flag=0):
# In all other places characters
# _ * [ ] ( ) ~ ` > # + - = | { } . !
# must be escaped with the preceding character '\'.
text = re.sub(r"\\\[", '@->@', text)
text = re.sub(r"\\\]", '@<-@', text)
text = re.sub(r"\\\(", '@-->@', text)
text = re.sub(r"\\\)", '@<--@', text)
if flag:
text = re.sub(r"\\\\", '@@@', text)
text = re.sub(r"\\", r"\\\\", text)
Expand All @@ -59,6 +63,10 @@ def escape(text, flag=0):
text = re.sub(r"\]", '\]', text)
text = re.sub(r"\(", '\(', text)
text = re.sub(r"\)", '\)', text)
text = re.sub(r"\@\-\>\@", '\[', text)
text = re.sub(r"\@\<\-\@", '\]', text)
text = re.sub(r"\@\-\-\>\@", '\(', text)
text = re.sub(r"\@\<\-\-\@", '\)', text)
text = re.sub(r"\@{3}(.*?)\@{3}\^{3}(.*?)\^{3}", '[\\1](\\2)', text)
text = re.sub(r"~", '\~', text)
text = re.sub(r">", '\>', text)
Expand Down Expand Up @@ -130,6 +138,7 @@ def escape(text, flag=0):
Cxy = abs (Pxy)**2/ (Pxx*Pyy)
`a`a-b-c`n`
\[ E[X^4] = \int_{-\infty}^{\infty} x^4 f(x) dx \]
`-a----++++`++a-b-c`-n-`
`[^``]*`a``b-c``d``
Expand Down

0 comments on commit 6145f88

Please sign in to comment.