Skip to content

Commit 28656b5

Browse files
authored
Simple Page-By-Page summarisation
1 parent ac4577e commit 28656b5

File tree

2 files changed

+189
-1
lines changed

2 files changed

+189
-1
lines changed

App.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from langchain.llms import GooglePalm
1111
from langchain import PromptTemplate
1212
from langchain.chains import RetrievalQA
13+
from langchain.chains.summarize import load_summarize_chain
14+
from tempfile import NamedTemporaryFile
1315

1416
import streamlit
1517
import streamlit.web.cli as stcli
@@ -28,7 +30,7 @@ def resolve_path(path):
2830

2931

3032
while True:
31-
choice=input(f"{'>>'*10} \n Choose : 0-> Ingest ; 1->Query ; 2->Exit \n : ")
33+
choice=input(f"{'>>'*10} \n Choose : 0-> Ingest ; 1->Query ; 2->Summary ; 3->Exit \n : ")
3234
if choice=='0':
3335
from ingest import INGESTER
3436
print(f"\n\n{'>>'*10}INGESTING!{'<<'*10}\n")
@@ -54,7 +56,20 @@ def resolve_path(path):
5456
except:
5557
print("\n\nERROR WHILE INITIATING QUERYING!\n\n")
5658
#EXTRACT()
59+
5760
elif choice=='2':
61+
print("\nSTARTING Summary-ing!\n")
62+
try:
63+
sys.argv = [
64+
"streamlit",
65+
"run",
66+
resolve_path("DocSumm.py"),
67+
"--global.developmentMode=false",
68+
]
69+
sys.exit(stcli.main())
70+
except:
71+
print("\n\nERROR WHILE INITIATING Summary-ing!\n\n")
72+
elif choice=='3':
5873
print("Exiting!")
5974
break
6075
else:

DocSumm.py

+173
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
from langchain.embeddings import SentenceTransformerEmbeddings #HuggingFaceInstructEmbeddings
2+
from langchain.vectorstores import FAISS
3+
import os
4+
import copy
5+
import pprint
6+
#import google.generativeai as palm
7+
from langchain.llms import GooglePalm
8+
from langchain import PromptTemplate
9+
from langchain.document_loaders import PyPDFLoader
10+
from langchain.text_splitter import RecursiveCharacterTextSplitter
11+
from langchain.chains import RetrievalQA
12+
from langchain.chains.summarize import load_summarize_chain
13+
from tempfile import NamedTemporaryFile
14+
import streamlit as st
15+
from ingest import pageextract
16+
import warnings
17+
warnings.filterwarnings("ignore")
18+
19+
20+
21+
# Summary modes offered in the sidebar; the first entry is the default mode.
MODES = ["Page-By-Page", "Complete"]

# Documents offered for summarisation. Only PDFs are listed because getData()
# opens the selection with PyPDFLoader, and a missing "data" folder yields an
# empty list instead of crashing the app while the script is loading.
try:
    FILES = sorted(
        name for name in os.listdir("data") if name.lower().endswith(".pdf")
    )
except FileNotFoundError:
    FILES = []


# Seed the session state only when the keys are absent, so values already
# stored for this session are left untouched.
if 'count' not in st.session_state:
    st.session_state.count = 0

if 'mode' not in st.session_state:
    st.session_state.view = False
    st.session_state.mode = MODES[0]
    st.session_state.page = 0
33+
34+
35+
@st.cache_resource
def getapi():
    """Return the API key stored in ``API.txt``.

    The file is opened as UTF-8 relative to the current working directory
    and closed again before returning. Surrounding whitespace is stripped so
    a trailing newline in the file does not become part of the key.

    Returns:
        The key as a ``str``.

    Raises:
        OSError: if ``API.txt`` cannot be opened.
    """
    with open("API.txt", "r", encoding='utf-8') as key_file:
        return key_file.read().strip()


# Read once while the script loads; getmodel() hands it to GooglePalm.
PALM_API = getapi()
41+
#palm.configure(api_key=PALM_API)
42+
43+
44+
@st.cache_resource
def getmodel():
    """Build the GooglePalm LLM used for summarisation.

    The model is configured with the module-level ``PALM_API`` key,
    ``temperature=0`` and ``max_output_tokens=4000``.
    """
    return GooglePalm(
        google_api_key=PALM_API,
        temperature=0,
        max_output_tokens=4000,
    )
48+
49+
@st.cache_resource
def getprompt():
    """Return the context-grounded query prompt.

    The template takes two input variables, ``context`` and ``question``.
    """
    prompt_text = """Use the information to elaborate in points about the user's query.
If user mentions something not in the 'Context', just answer that you don't know.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}

Query: {question}

Only return the helpful answer below and nothing else.

Helpful answer:
"""
    return PromptTemplate(
        template=prompt_text,
        input_variables=['context', 'question'],
    )
67+
68+
def parseresult(result):
    """Return a deep copy of *result* with a ``'source_pages'`` entry added.

    Args:
        result: Mapping with a ``'source_documents'`` sequence whose items
            expose a ``metadata`` mapping containing ``'page'``.

    Returns:
        An independent deep copy of ``result``. Its ``'source_pages'`` entry
        lists the ``'page'`` metadata of each source document, in order.
        The input mapping is not modified.

    Raises:
        KeyError: if ``'source_documents'`` or a document's ``'page'``
            metadata is missing.
    """
    parsed = copy.deepcopy(result)
    parsed['source_pages'] = [
        doc.metadata['page'] for doc in parsed['source_documents']
    ]
    return parsed
78+
79+
def getsources(result):
    """Return the metadata of every source document rendered as a string.

    Args:
        result: Mapping with a ``'source_documents'`` sequence whose items
            expose a ``metadata`` attribute.

    Returns:
        A list with one formatted-metadata string per source document, in
        the documents' order.
    """
    return [f"{doc.metadata}" for doc in result['source_documents']]
84+
85+
def startview():
    """Button callback: rewind to page 0 and load the selected document.

    Stores the sidebar's ``mode`` in the session state and puts the result
    of ``getData()`` under ``st.session_state.data``. A loading failure is
    reported in the app and on the console instead of being raised.
    """
    st.session_state.page = 0
    st.session_state.mode = mode
    try:
        st.session_state.data = getData()
    except Exception as exc:
        # Catch Exception rather than everything, so interpreter-level
        # signals (KeyboardInterrupt, SystemExit) still propagate, and keep
        # the cause visible on the console for debugging.
        print(f">>Data loading failed: {exc!r}")
        st.write("ERROR IN LOADING DATA.")
93+
94+
95+
def resetview():
    """Radio ``on_change`` callback: hide the current view and go to page 0.

    Also stores the module-level ``mode`` in the session state.
    NOTE(review): ``mode`` is read from module globals when the callback
    fires -- confirm it already reflects the newly picked option.
    """
    state = st.session_state
    state.page = 0
    state.mode = mode
    state.view = False
99+
100+
101+
102+
103+
def getdata(fi):
    """Write an uploaded PDF to a temporary file, load and split it.

    Args:
        fi: Uploaded-file-like object providing ``getbuffer()`` and
            ``read()``.

    Returns:
        The result of ``fi.read()``.
        NOTE(review): the split chunks are computed but not returned, so the
        load only acts as a check that the PDF can be parsed -- confirm
        whether callers were meant to receive the chunks instead.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " "], chunk_size=2000, chunk_overlap=100
    )
    tmp = NamedTemporaryFile(dir='.', suffix='.pdf', delete=False)
    try:
        # Close the handle before loading so the bytes are flushed to disk
        # and the file can be reopened by name.
        with tmp:
            tmp.write(fi.getbuffer())
        text_splitter.split_documents(PyPDFLoader(tmp.name).load())
    finally:
        # Remove the temporary file even when writing or loading fails.
        os.remove(tmp.name)
    return fi.read()
115+
# Sidebar controls: the summary mode (changing it runs resetview) and the
# document to summarise.
mode = st.sidebar.radio("Pick one", options=MODES, on_change=resetview)
file = st.sidebar.selectbox("Pick one", options=FILES)
117+
def getData():
    """Load the PDF picked in the sidebar and return its extracted pages.

    Reads ``data/<file>`` with ``PyPDFLoader``, splits it with
    ``chunk_size=1000`` and ``chunk_overlap=100``, stores a deep copy of the
    first chunk in ``st.session_state.dummy`` and returns
    ``pageextract(chunks)``.

    Raises:
        IndexError: if the document yields no chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " "], chunk_size=1000, chunk_overlap=100
    )
    source_path = os.path.join("data", file)
    texts = text_splitter.split_documents(PyPDFLoader(source_path).load())
    print(">>Data recieved.")
    # Deep copy: the page script overwrites this document's page_content, so
    # it must not share state with the chunks handed to pageextract.
    st.session_state.dummy = copy.deepcopy(texts[0])
    return pageextract(texts)
125+
126+
# Session-state key under which the page slider keeps its value.
PAGE_SLIDER_KEY = "page_slider"


def sliderch():
    """Slider ``on_change`` callback: jump to the page picked on the slider.

    The value is read from the session state under the slider's key. The
    module-level ``BP`` cannot be used here: when the callback fires it
    still holds the slider value returned during the previous script run.
    """
    st.session_state.page = st.session_state[PAGE_SLIDER_KEY]


st.session_state.file = file
st.title(f'{mode} Summary')

#file=st.sidebar.file_uploader("Upload a CSV")#,on_change=getdata)
prompt = False#st.sidebar.text_input("Enter query")
but = st.sidebar.button("Click me", on_click=startview)

if (but or st.session_state.view) and (st.session_state.mode == MODES[0]) and file:
    # Page-by-page mode: preview in the middle column, summary on the right.
    st.session_state.view = True
    col1, col2, col3 = st.columns([1, 3, 3])
    col3.header("Summary : ")

    # startview() leaves "data" unset when loading failed; treat that the
    # same as a document without pages instead of crashing on len().
    pages = st.session_state.data if 'data' in st.session_state else None
    if pages is None or len(pages) == 0:
        col2.write("NO PAGES TO SHOW.")
    else:
        last_page = len(pages) - 1
        BB = col1.button("Next page")
        PP = col1.button("Prev page")

        # Apply the button presses, then keep the index inside the document.
        current = st.session_state.page
        if BB:
            current += 1
        if PP:
            current -= 1
        current = min(max(current, 0), last_page)
        st.session_state.page = current

        if last_page > 0:
            # Set the slider's state before it is created so it follows the
            # Next/Prev buttons; its maximum is the last valid page index.
            st.session_state[PAGE_SLIDER_KEY] = current
            BP = col1.slider(
                "Pick a page", 0, last_page,
                key=PAGE_SLIDER_KEY, on_change=sliderch,
            )
        else:
            # A slider needs min < max; a single page leaves nothing to pick.
            BP = 0

        chain = None
        try:
            chain = load_summarize_chain(getmodel(), chain_type="stuff")
        except Exception:
            st.write("ERROR IN LOADING MODEL.")

        try:
            page_text = pages[current].page_content
            col2.header(f"Page {current} Preview: ")
            col2.write(page_text[0:600] + " .......")
            if chain is None:
                col3.write("ISSUES IN GENERATING SUMMARY")
            else:
                # Reuse the stored template document, swapping in this
                # page's text, and summarise just that one document.
                st.session_state.dummy.page_content = page_text
                col3.write(chain.run([st.session_state.dummy]))
        except Exception:
            col3.write("ISSUES IN GENERATING SUMMARY")
elif (but or st.session_state.view) and (st.session_state.mode == MODES[1]) and file:
    # Whole-document mode currently only shows a placeholder.
    col1, col2 = st.columns([2, 3])
    col1.write("WORK IN")
    col2.write("PROGRESS ")
    st.session_state.view = True
    st.session_state.file = file

0 commit comments

Comments
 (0)