
Commit d520dee

Document query
1 parent 57d7793 commit d520dee

File tree

3 files changed: +181 −0 lines changed

Diff for: docquery.py

+101
@@ -0,0 +1,101 @@
from langchain.embeddings import SentenceTransformerEmbeddings  # HuggingFaceInstructEmbeddings is an alternative
from langchain.vectorstores import FAISS
import os
import copy
from langchain.llms import GooglePalm
from langchain import PromptTemplate
from langchain.chains import RetrievalQA

# Read the PaLM key from the environment rather than hardcoding a secret in source.
PALM_API = os.environ.get("PALM_API_KEY")


def getmodel():
    """Load the local FAISS index and wrap it in a PaLM-backed RetrievalQA chain."""
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.load_local("faiss", embeddings)
    retriever = db.as_retriever(search_kwargs={'k': 10})
    # prompt = getprompt()  # the 'refine' chain type does not accept a single 'prompt' kwarg
    llm = GooglePalm(google_api_key=PALM_API, temperature=0.00003, max_output_tokens=512)
    qa_llm = RetrievalQA.from_chain_type(llm=llm,
                                         chain_type='refine',
                                         retriever=retriever,
                                         return_source_documents=True,
                                         # chain_type_kwargs={'prompt': prompt},
                                         verbose=True)
    return qa_llm


def getprompt():
    """Build the custom QA prompt (currently unused; see getmodel)."""
    template = """Use the information to elaborate in points about the user's query.
If the user mentions something not in the 'Context', just answer that you don't know.
If you don't know the answer, just say that you don't know; don't try to make up an answer.

Context: {context}

Query: {question}

Only return the helpful answer below and nothing else.

Helpful answer:
"""
    prompt = PromptTemplate(template=template,
                            input_variables=['context', 'question'])
    return prompt


def parseresult(result):
    """Deep-copy the chain output and add a 'source_pages' list from document metadata."""
    parsed = copy.deepcopy(result)
    parsed['source_pages'] = [d.metadata['page'] for d in parsed['source_documents']]
    return parsed


def getsources(result):
    """Return the metadata of each retrieved source document, formatted as strings."""
    return [f"{s.metadata}" for s in result['source_documents']]


def EXTRACT():
    """Interactive loop: read a query, run the chain, print the answer and its sources."""
    print(f"{'>>>'*17} QUERY DOCS{'<<<'*17}")
    try:
        llm = getmodel()
    except Exception:
        print("CANNOT LOAD MODEL OR DATABASE")
        print(f"{'###'*40}")
        return
    while True:
        print(f"{'###'*40}")
        prompt = input("(To stop querying enter exit) \n Query : ")
        if not prompt:
            continue
        if prompt.strip().lower() == 'exit':
            return
        try:
            result = parseresult(llm(prompt))
            sources = getsources(result)
            result = result["result"]
        except Exception:
            result = 'Error occurred!'
            sources = []
            print(f"{'!!!'*40}")
        print(f"QUERY: {prompt}")
        print(f"{'###'*40}")
        print("RESULT:")
        print(result)
        print(f"{'$$$'*40}")
        print("SOURCES:")
        print(sources)
        print(f"{'>>>'*40}")

Diff for: ingest.py

+44
@@ -0,0 +1,44 @@
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings  # HuggingFaceInstructEmbeddings is an alternative
from langchain.vectorstores import FAISS

import os
import copy  # needed by pageextract


def getdoctext(dirpath):
    """Split every PDF in dirpath into overlapping text chunks."""
    texts = []
    text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", ".", " "],
                                                   chunk_size=2000, chunk_overlap=100)
    for file in os.listdir(dirpath):
        if file.lower().endswith(".pdf"):
            print(file)
            texts = texts + text_splitter.split_documents(PyPDFLoader(os.path.join(dirpath, file)).load())
    return texts


def pageextract(texts):
    """Recombine chunks into one Document per page (not used by INGESTER).

    The hardcoded offset of 7 skips front-matter pages and is specific to the source PDF.
    """
    PAGES = []
    id = 0
    while id < len(texts):
        if (texts[id].metadata['page'] - 7) == len(PAGES):
            temp = ''
            while id < len(texts) and (texts[id].metadata['page'] - 7) == len(PAGES):
                temp = temp + texts[id].page_content
                id = id + 1
            PAGES.append(copy.deepcopy(texts[0]))
            PAGES[-1].page_content = temp
            PAGES[-1].metadata['page'] = len(PAGES)
        else:
            id = id + 1  # advance past out-of-range chunks instead of looping forever
    return PAGES


def makedb(chunks, embeddings):
    """Embed the chunks and persist a FAISS index to the 'faiss' directory."""
    db = FAISS.from_documents(chunks, embeddings)
    db.save_local("faiss")


def INGESTER():
    chunks = getdoctext("data")
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    makedb(chunks, embeddings)
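
A quick sanity check of the ingest step, as a sketch: it assumes a data/ directory containing at least one PDF, and it reloads the saved index with the same embedding model it was built with (FAISS.load_local must be given matching embeddings). The query string is only an example:

    from ingest import INGESTER
    from langchain.embeddings import SentenceTransformerEmbeddings
    from langchain.vectorstores import FAISS

    INGESTER()  # reads data/*.pdf, writes the 'faiss' directory

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.load_local("faiss", embeddings)
    for doc in db.similarity_search("introduction", k=3):
        print(doc.metadata, doc.page_content[:80])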

Diff for: streamlit_app.py

+36
@@ -0,0 +1,36 @@
import time
# import streamlit as st  # not used yet: despite the file name, this runs as a console menu

while True:
    choice = input(f"{'>>'*10} \n Choose : 0-> Ingest ; 1->Query ; 2->Exit \n : ")

    if choice == '0':
        from ingest import INGESTER  # imported lazily so querying doesn't pull in the loaders
        print("INGESTING!")
        INGESTER()
        print("Ingested")
    elif choice == '1':
        from docquery import EXTRACT
        print("QUERYING!")
        EXTRACT()
    elif choice == '2':
        print("Exiting!")
        time.sleep(2)
        break
    else:
        print("Invalid choice!")
