From f4590e7d55df5246d7394ee9e3aa206862abcd70 Mon Sep 17 00:00:00 2001
From: Nisita-M <131473392+Nisita-M@users.noreply.github.com>
Date: Mon, 30 Dec 2024 10:36:06 +0530
Subject: [PATCH] Update app.py

---
 app.py | 182 ++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 121 insertions(+), 61 deletions(-)

diff --git a/app.py b/app.py
index ca5fd73..0e66356 100644
--- a/app.py
+++ b/app.py
@@ -6,12 +6,21 @@
 import pytesseract
 from pytesseract import Output
 from PIL import Image
-from flask import Flask, render_template, request
+from flask import Flask, render_template, request ,redirect , url_for , session 
+from flask_session import Session
+# from flask_session import FileSystemSessionInterface
 import PyPDF2
+from io import BytesIO
+import uuid
+import os
+
+nltk.download('words')
+nltk.download('punkt')
+nltk.download('stopwords')
+
+UPLOAD_FOLDER = 'uploads'
+
 
-# nltk.download('words')
-# nltk.download('punkt')
-# nltk.download('stopwords')
 
 def preprocess_text(text):
     text = re.sub(r'[^\w\s]', '', text)
@@ -52,15 +61,17 @@ def readable_summary(text):
 # Specify the Tesseract executable path
 pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
 
+
 app = Flask(__name__)
 
 # nltk.download('words')
 # nltk.download('punkt')
 # nltk.download('stopwords')
+app.secret_key ='supra_me'
 
 def extract_text_from_pdf(pdf_file):
     text = ""
-    pdf_reader = PdfFileReader(pdf_file)
+    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
     for page_number in range(pdf_reader.numPages):
         page = pdf_reader.getPage(page_number)
         text += page.extractText()
@@ -68,73 +79,122 @@ def extract_text_from_pdf(pdf_file):
 
 @app.route('/')
 def index():
-    return render_template('index.html')
+    return render_template('index.ejs')
 
 @app.route('/process_file', methods=['POST'])
 def process_file():
+    session['questions']=[]
+    session['answers']=[]
+    
     # Get the choice of the user (image or pdf)
     file_type = request.form['file_type']
-
+    session['file_type'] = file_type
+    
     if file_type == 'image':
         # Example: Get the uploaded image file
         uploaded_file = request.files['file']
-
-        # Perform OCR on the image
-        with Image.open(uploaded_file) as image:
-            txt = pytesseract.image_to_string(image)
-
-    elif file_type == 'pdf':
+        name=uploaded_file.filename
+        if uploaded_file.filename == '':
+            return 'No selected file', 400
+        file_id = str(uuid.uuid4())  # Generate a unique ID for the file
+        file_path = os.path.join(UPLOAD_FOLDER, file_id)
+        uploaded_file.save(file_path)
+        session['file_path'] = file_path
+        # print(uploaded_file)
+
+        
+
+    if file_type == 'pdf':
         # Example: Get the uploaded PDF file
         uploaded_file = request.files['file']
-
+        name=uploaded_file.filename
+        if uploaded_file.filename == '':
+            return 'No selected file', 400
+        file_id = str(uuid.uuid4())  # Generate a unique ID for the file
+        file_path = os.path.join(UPLOAD_FOLDER, file_id)
+        uploaded_file.save(file_path)
+        session['file_path'] = file_path
+        print(name)
+    return redirect(url_for('quest',filename=name))
+       
+@app.route('/quest/<filename>',methods=['GET'])
+def quest(filename):
         # Extract text from the PDF using PyPDF2
-        pdf_reader = PyPDF2.PdfReader(uploaded_file)
-
-        # Get the number of pages in the PDF
-        num_pages =num_pages = len(pdf_reader.pages)
-
-        # Initialize an empty string to store the extracted text
-        txt = ""
-
-        # Iterate through all pages
-        for page_num in range(num_pages):
-            # Get the page
-            page = pdf_reader.pages[page_num]
-
-            # Extract text from the page
-            etext = page.extract_text()
-
-            # Append the text to the result string
-            txt+=etext
-
-    sample_text = txt
-    # Preprocess text using the loaded function
-    preprocessed_text = preprocess_text(sample_text)
-
-    # Divide documents into sentences using the loaded function
-    sentences = divide_documents(preprocessed_text)
-
-    # Generate summaries using the loaded function
-    summaries = generate_summaries(sentences)
-
-    # Check if the user wants to view the summary
-    show_summary = request.form.get('show_summary')
-
-    if show_summary:
-        # Use the loaded function to generate a readable summary
-        summary = readable_summary(sample_text)
-    else:
-        summary = None
-
-    # Example: Get the question from the user
-    user_question = request.form['question']
-
-    # Use the loaded QA pipeline
-    answer = qa_pipeline(question=user_question, context=sample_text)
-
-    # Render the result on the webpage
-    return render_template('result.html', question=user_question, answer=answer['answer'], summary=summary)
+        # print("Meowwww")
+
+        return render_template("question.ejs",filename=filename)
+        # uploaded_file
+
+@app.route('/quest/<filename>',methods=['POST'])
+def upload(filename):
+        print("broo")
+        items=session.get('questions')
+        print("Session before appending:", items)
+        print('hello')
+        file_path = session.get('file_path')
+
+        if file_path is None or not os.path.exists(file_path):
+            return "File not found", 400
+        
+        if session.get('file_type') == 'pdf':
+            with open(file_path, 'rb') as file:
+                file_data = file.read()
+            pdf_reader = PyPDF2.PdfReader(BytesIO(file_data))
+
+            # Get the number of pages in the PDF
+            num_pages =num_pages = len(pdf_reader.pages)
+
+            # Initialize an empty string to store the extracted text
+            txt = ""
+
+            # Iterate through all pages
+            for page_num in range(num_pages):
+                # Get the page
+                page = pdf_reader.pages[page_num]
+
+                # Extract text from the page
+                etext = page.extract_text()
+
+                # Append the text to the result string
+                txt+=etext
+        
+        if session.get('file_type') == 'image':
+            # Perform OCR on the image
+            with Image.open(file_path) as image:
+                txt = pytesseract.image_to_string(image)
+
+        sample_text = txt
+        if sample_text == '':
+            sample_text = "No answers found"
+        # Preprocess text using the loaded function
+        preprocessed_text = preprocess_text(sample_text)
+
+        # Divide documents into sentences using the loaded function
+        sentences = divide_documents(preprocessed_text)
+
+        # Generate summaries using the loaded function
+        summaries = generate_summaries(sentences)
+
+        # Check if the user wants to view the summary
+        show_summary = request.form.get('show_summary')
+
+        if show_summary:
+            # Use the loaded function to generate a readable summary
+            summary = readable_summary(sample_text)
+        else:
+            summary = None
+
+        # Example: Get the question from the user
+        user_question = request.form['question']
+        session['questions'].append(user_question)
+        session.modified = True
+        print("Session after appending:", session.get('questions'))
+        # Use the loaded QA pipeline
+        answer = qa_pipeline(question=user_question, context=sample_text)
+        session['answers'].append(answer['answer'])
+
+        # Render the result on the webpage
+        return render_template('question.ejs', question=session.get('questions'), answer=session.get('answers'), summary=summary,filename=filename,len=len(session.get('questions')))
 
 if __name__ == '__main__':
     app.run(debug=True)
-