From fd204036df8e2071eac9b37bc682d25b70b329f7 Mon Sep 17 00:00:00 2001
From: yym68686
Date: Thu, 19 Oct 2023 22:52:53 +0800
Subject: [PATCH] update g4f version to 0.1.6.7. fixed bug: pdf load error

---
 agent.py                  |  9 +++++--
 requirements.txt          |  2 +-
 test/test_Faucet.py       | 10 +++++++
 test/test_download_pdf.py | 56 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 74 insertions(+), 3 deletions(-)
 create mode 100644 test/test_Faucet.py
 create mode 100644 test/test_download_pdf.py

diff --git a/agent.py b/agent.py
index 86986f19..ad3224a9 100644
--- a/agent.py
+++ b/agent.py
@@ -7,6 +7,7 @@
 import requests
 import threading
 import traceback
+import urllib.parse
 from typing import Any
 import time
 from datetime import date
@@ -126,7 +127,7 @@ async def docQA(docpath, query_message, persist_db_path="db", model = "gpt-3.5-t
     return result
 def get_doc_from_url(url):
-    filename = url.split("/")[-1]
+    filename = urllib.parse.unquote(url.split("/")[-1])
     response = requests.get(url, stream=True)
     with open(filename, 'wb') as f:
         for chunk in response.iter_content(chunk_size=1024):
             f.write(chunk)
@@ -167,7 +168,11 @@ def pdf_search(docurl, query_message, model="gpt-3.5-turbo"):
     filename = get_doc_from_url(docurl)
     docpath = os.getcwd() + "/" + filename
     loader = UnstructuredPDFLoader(docpath)
-    documents = loader.load()
+    try:
+        documents = loader.load()
+    except:
+        print("pdf load error! docpath:", docpath)
+        return ""
     os.remove(docpath)
     # 初始化加载器
     text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=25)
diff --git a/requirements.txt b/requirements.txt
index 0741d6b3..b8b73391 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,4 @@ unstructured[pdf]
 duckduckgo-search==3.8.5
 langchain==0.0.271
 oauth2client==3.0.0
-g4f==0.1.6.6
\ No newline at end of file
+g4f==0.1.6.7
\ No newline at end of file
diff --git a/test/test_Faucet.py b/test/test_Faucet.py
new file mode 100644
index 00000000..6aa836e5
--- /dev/null
+++ b/test/test_Faucet.py
@@ -0,0 +1,10 @@
+from langchain.chat_models import ChatOpenAI
+from langchain.schema import HumanMessage
+
+def gptsearch(result, llm):
+    response = llm([HumanMessage(content=result)])
+    response = response.content
+    return response
+
+
+print(gptsearch("鲁迅和周树人为什么打架", chainllm))
\ No newline at end of file
diff --git a/test/test_download_pdf.py b/test/test_download_pdf.py
new file mode 100644
index 00000000..b5dc7298
--- /dev/null
+++ b/test/test_download_pdf.py
@@ -0,0 +1,56 @@
+# import requests
+# import urllib.parse
+# import os
+# import sys
+# sys.path.append(os.getcwd())
+# import config
+
+# from langchain.chat_models import ChatOpenAI
+# from langchain.embeddings.openai import OpenAIEmbeddings
+# from langchain.vectorstores import Chroma
+# from langchain.text_splitter import CharacterTextSplitter
+# from langchain.document_loaders import UnstructuredPDFLoader
+# from langchain.chains import RetrievalQA
+
+
+# def get_doc_from_url(url):
+#     filename = urllib.parse.unquote(url.split("/")[-1])
+#     response = requests.get(url, stream=True)
+#     with open(filename, 'wb') as f:
+#         for chunk in response.iter_content(chunk_size=1024):
+#             f.write(chunk)
+#     return filename
+
+# def pdf_search(docurl, query_message, model="gpt-3.5-turbo"):
+#     chatllm = ChatOpenAI(temperature=0.5, openai_api_base=config.API_URL.split("chat")[0], model_name=model, openai_api_key=os.environ.get('API', None))
+#     embeddings = OpenAIEmbeddings(openai_api_base=config.API_URL.split("chat")[0], openai_api_key=os.environ.get('API', None))
+#     filename = get_doc_from_url(docurl)
+#     docpath = os.getcwd() + "/" + filename
+#     loader = UnstructuredPDFLoader(docpath)
+#     print(docpath)
+#     documents = loader.load()
+#     os.remove(docpath)
+#     # 初始化加载器
+#     text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=25)
+#     # 切割加载的 document
+#     split_docs = text_splitter.split_documents(documents)
+#     vector_store = Chroma.from_documents(split_docs, embeddings)
+#     # 创建问答对象
+#     qa = RetrievalQA.from_chain_type(llm=chatllm, chain_type="stuff", retriever=vector_store.as_retriever(),return_source_documents=True)
+#     # 进行问答
+#     result = qa({"query": query_message})
+#     return result['result']
+
+# pdf_search("https://www.nsfc.gov.cn/csc/20345/22468/pdf/2001/%E5%86%BB%E7%BB%93%E8%A3%82%E9%9A%99%E7%A0%82%E5%B2%A9%E4%BD%8E%E5%91%A8%E5%BE%AA%E7%8E%AF%E5%8A%A8%E5%8A%9B%E7%89%B9%E6%80%A7%E8%AF%95%E9%AA%8C%E7%A0%94%E7%A9%B6.pdf", "端水实验的目的是什么?")
+
+from PyPDF2 import PdfReader
+
+def has_text(pdf_path):
+    with open(pdf_path, 'rb') as file:
+        pdf = PdfReader(file)
+        page = pdf.pages[0]
+        text = page.extract_text()
+        return text
+
+pdf_path = '/Users/yanyuming/Downloads/GitHub/ChatGPT-Telegram-Bot/冻结裂隙砂岩低周循环动力特性试验研究.pdf'
+print(has_text(pdf_path))
\ No newline at end of file
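Note (not part of the patch): a minimal sketch of the filename decoding that the urllib.parse.unquote change in get_doc_from_url addresses, using the percent-encoded NSFC URL that test/test_download_pdf.py downloads.

import urllib.parse

url = "https://www.nsfc.gov.cn/csc/20345/22468/pdf/2001/%E5%86%BB%E7%BB%93%E8%A3%82%E9%9A%99%E7%A0%82%E5%B2%A9%E4%BD%8E%E5%91%A8%E5%BE%AA%E7%8E%AF%E5%8A%A8%E5%8A%9B%E7%89%B9%E6%80%A7%E8%AF%95%E9%AA%8C%E7%A0%94%E7%A9%B6.pdf"

# Old behaviour: the downloaded file keeps the raw percent-encoded name.
encoded_name = url.split("/")[-1]
# New behaviour: the name is decoded back to the original Chinese filename
# ("冻结裂隙砂岩低周循环动力特性试验研究.pdf"), i.e. the same basename as the
# pdf_path used by the PyPDF2 check in test_download_pdf.py.
decoded_name = urllib.parse.unquote(encoded_name)
print(encoded_name)
print(decoded_name)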
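Note (not part of the patch): the new guard in pdf_search uses a bare except:. A narrower variant is sketched below as a hypothetical helper; the function name and the empty-list fallback are illustrative only, not the patch's behaviour.

from langchain.document_loaders import UnstructuredPDFLoader

def load_pdf_documents(docpath):
    # Same guarded load as the pdf_search hunk above, but catching Exception
    # so KeyboardInterrupt/SystemExit are not swallowed, and reporting the cause.
    loader = UnstructuredPDFLoader(docpath)
    try:
        return loader.load()
    except Exception as e:
        print("pdf load error! docpath:", docpath, "error:", e)
        return []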