Merge branch 'main' of github.com:librairy/muheqa

librairy · May 5, 2022 · 31107fe · 31107fe
2 parents 622e78d + b502336
commit 31107fe
Show file tree

Hide file tree

Showing 23 changed files with 549 additions and 33,018 deletions.
diff --git a/...tasets/LC-QuAD_2.0/data/parsedDataset.csv → ...ion/datasets/LC-QuAD_2.0/data/LC-Quad.csv b/...tasets/LC-QuAD_2.0/data/parsedDataset.csv → ...ion/datasets/LC-QuAD_2.0/data/LC-Quad.csv
diff --git a/application/datasets/LC-QuAD_2.0/data/LC-Quad_Dataset.csv b/application/datasets/LC-QuAD_2.0/data/LC-Quad_Dataset.csv
diff --git a/...ts/VANiLLA/data/Vanilla_Dataset_Test.json → ...n/datasets/VANiLLA/data/VANiLLA_Test.json b/...ts/VANiLLA/data/Vanilla_Dataset_Test.json → ...n/datasets/VANiLLA/data/VANiLLA_Test.json
diff --git a/application/datasets/VANiLLA/retrieve.py b/application/datasets/VANiLLA/retrieve.py
@@ -6,6 +6,8 @@
 import multiprocessing as mp
 import pandas as pd
 
+datasetUrl = "data/VANiLLA_Test.json"
+
 def JSONLineToDict(JSONRoute):
     '''
     Funcion auxiliar que dado un archivo json con JSONObjects en cada linea,
@@ -116,4 +118,4 @@ def retriever(pool, rows, counter, JSONroute, queryURL, csvRoute, writeHeader =
         queryUrl = "http://localhost:5000/eqakg/dbpedia/en?evidence=true"
         #queryUrl = "https://librairy.linkeddata.es/eqakg/dbpedia/en?text=false" 
 
-        retriever(pool,rows,counter,"data/Vanilla_Dataset_Test.json",queryUrl,"results/VANiLLA.csv", writeHeader=True)
+        retriever(pool,rows,counter,datasetUrl,queryUrl,"results/VANiLLA.csv", writeHeader=True)
diff --git a/application/datasets/VQuAnDa/data/test.json → ...n/datasets/VQuAnDa/data/VQuAnDA_test.json b/application/datasets/VQuAnDa/data/test.json → ...n/datasets/VQuAnDa/data/VQuAnDA_test.json
diff --git a/application/datasets/VQuAnDa/retrieve.py b/application/datasets/VQuAnDa/retrieve.py
@@ -5,8 +5,8 @@
 import time
 import multiprocessing as mp
 import pandas as pd
-import traceback
-from pprint import pprint
+
+datasetUrl = "data/VQuAnDA_test.json"
 
 def jsonToDict(route):
     '''
@@ -120,4 +120,4 @@ def retriever(pool, rows, counter, JSONroute, queryURL, csvRoute, writeHeader =
         queryUrl = "http://localhost:5000/muheqa/dbpedia/en?evidence=true"
         #queryUrl = "https://librairy.linkeddata.es/muheqa/dbpedia/en?evidence=true" 
 
-        retriever(pool,rows,counter,"data/test.json",queryUrl,"results/VQuAnDa.csv", writeHeader=True)
+        retriever(pool,rows,counter,datasetUrl,queryUrl,"results/VQuAnDa.csv", writeHeader=True)
diff --git a/application/datasets/parseDatasets.py b/application/datasets/parseDatasets.py
diff --git a/ui/pages/datasetManagement.py b/ui/pages/datasetManagement.py
@@ -0,0 +1,40 @@
+import streamlit as st
+from utils import dbManager
+from utils import parseDatasets
+
+dbDirection = "mongodb://localhost:27017"
+
+def main():
+    """Render the Dataset Management page and import an uploaded CSV/JSON dataset into the database."""
+    #Section subtitle
+    st.subheader('Dataset Management')
+
+    #Upload guidelines shown in the page body
+    st.markdown(""" 
+    You may upload your dataset below. For it to be processed and uploaded to our database, please follow these guidelines:
+    - 1. Upload your dataset either on .CSV or .JSON format.
+    - 2. JSONs may be on JSON lines or JSON array format.
+    - 3. Answers should be on the "answer" column/key, and Questions on the "question" column/key.
+    - 4. If your Answer is verbalized, you shall name its key/column "verbalized_answer", and format it with the answer between brackets, i.e. "Fernando Alonso was born in [Oviedo]."
+    """, unsafe_allow_html=True)
+    #The label names what is actually accepted here: a dataset file, not an image
+    inputBuffer = st.file_uploader("Upload a Dataset", type=["csv","json"])
+
+    if inputBuffer:
+        try:
+            db = dbManager.DbManager(dbDirection)
+            #Split on the LAST dot only, so a name such as "my.data.csv" keeps its real extension
+            datasetName, _, extension = inputBuffer.name.rpartition(".")
+            datasetDict = parseDatasets.parseDataset(inputBuffer, isCsv=(extension.lower() == "csv"))
+            datasetName = datasetName.lower()
+            if datasetDict:
+                db.importDataset(datasetDict, datasetName)
+                if datasetName in db.getCollections():
+                    st.success("✨ Your dataset has been registered on our database!")
+                    st.write("A dataset with name ", datasetName, "and length ", len(datasetDict), " questions has been registered on MongoDB")
+                else:
+                    st.error("We could not upload your dataset on our database. Please contact the administrator.")
+            else:
+                st.error("Your dataset could not be processed correctly. Please revise the format or contact the administrator")
+        except Exception as e:  #UI boundary: show any failure on the page
+            st.exception(e)
diff --git a/ui/pages/dataset_upload.py b/ui/pages/dataset_upload.py
diff --git a/ui/pages/question_answering.py → ui/pages/questionAnswering.py b/ui/pages/question_answering.py → ui/pages/questionAnswering.py
@@ -1,9 +1,32 @@
-import streamlit as st
+import pytz
 import requests
-from annotated_text import annotated_text
 import operator
-from utils import db
-import random
+from utils import dbManager
+import streamlit as st
+from utils import spreadManager
+from datetime import datetime
+from annotated_text import annotated_text
+
+"""
+Variables globales:
+- timezone: Huso horario cuyas horas vamos a usar en nuestra hoja
+- knowledgeBases: Lista de bases de conocimiento para nuestra consulta
+- QAService: Url del servicio de Question-Answering
+- dbDirection: Direccion de la base de datos
+- spreadsheet: Nombre del Libro de Calculo 
+- spreadsheetId: Identificador de nuestro Libro de Calculo
+- validationSheet: Nombre de la Hoja a modificar (hoja de validacion)
+"""
+
+timezone = pytz.timezone("Europe/Madrid")
+
+knowledgeBases = ["wikidata","dbpedia","cord19"]
+QAService = "http://127.0.0.1:5000/muheqa/"
+dbDirection = "mongodb://localhost:27017"
+
+spreadsheet = "MuHeQa_Validation"
+spreadsheetId = "1TY6Tj1OwITOW3o1nYRFFRY1bunvHNImUj-J0omRq4-I"
+validationSheet = "Validation"
 
 def queryJSON(queryURL, question):
     """
@@ -20,48 +43,43 @@ def queryJSON(queryURL, question):
 def main():
 
     @st.cache(show_spinner=False, allow_output_mutation=True)
-    def getAnswers(data):
+    def getAnswers(question):
         """
         Funcion auxiliar que obtiene una lista con todas las respuestas sobre las distintas bases de conocimiento
         """
         answerList = [
-
         ]
 
         for i in knowledgeBases:
-            queryURL = "http://127.0.0.1:5000/muheqa/" + i + "/en?evidence=true"
-            answer = queryJSON(queryURL,data["question"])
+            queryURL = QAService + i + "/en?evidence=true"
+            answer = queryJSON(queryURL,question)
             #Si la respuesta es distinta de None, guardamos la fuente y agregamos la respuesta a la lista de contestaciones
             if answer:
                 answer["source"] = i
                 answerList.append(answer)
 
         return answerList
 
-    def annotateContext(response):
+    def annotateContext(response, answer, context, answerStart, answerEnd):
         '''
         Funcion auxiliar que anota la respuesta sobre el texto de evidencia
         '''
-        #Por defecto la etiqueta del texto anotado sera "ANSWER" y el color verde
         tag = "ANSWER"
         color = "#adff2f"
-        #Guardamos la respuesta, el contexto, y su principio y final en el texto
-        answer = response["answer"]
-        context = response["evidence"]["summary"]
-        answerStart = response["evidence"]["start"]
-        answerEnd = response["evidence"]["end"]
+        #Slice the answer out of the raw evidence summary. NOTE(review): the context slices below reuse these offsets, yet the visible caller passes "..." + summary + "..." as context, so they look shifted by 3 characters - confirm
         answerInText = (response["evidence"]["summary"])[answerStart:answerEnd]
         #Si la respuesta en el texto es distinta de la respuesta en el json:
         if answer != answerInText:
             #Cambiamos la etiqueta a "EVIDENCE" y el color a a azul
             tag = "EVIDENCE"
             color = "#8ef"
-        #Marcamos en el texto de evidencia la respuesta
+        #Highlight the answer inside the evidence text and render it in the UI
         annotated_text(context[:answerStart],(answerInText,tag,color),context[answerEnd:],)
 
-    #Creamos la conexion para la base de datos de validacion
-    worksheet = db.connectToSheet()
-
+    #Creamos la conexion para la base de datos (datasets) y el Libro de Calculo (validacion)
+    spread = spreadManager.SpreadManager(spreadsheet, spreadsheetId, validationSheet)
+    db = dbManager.DbManager(dbDirection)
+
     #Subtitulo de la seccion de pregunta y respuesta
     st.subheader('MuHeQa UI - Question Answering over Multiple and Heterogeneous Knowledge Bases')
 
@@ -70,22 +88,12 @@ def annotateContext(response):
     Write any question below or use a random one from a pre-loaded datasets!
     """, unsafe_allow_html=True)
 
-    #Lista de Hojas de Calculo con Datasets en nuestro Libro 
-    datasetList = db.getDatasetsInSheet(worksheet)
-
-    #Obtenemos el contenido de cada una de estas hojas
-    recordList = []
-    #Creamos una lista de listas para dicho contenido, donde cada lista sera un dataset (hoja)
-    for i in datasetList:
-        recordList.append(db.getRecordsInSheet(i))    
-
+    #Lista de Hojas de Calculo con Datasets en nuestra base de datos
+    selectorList = ["All"] 
+    selectorList.extend(db.getCollections())
+
     #Buscador para realizar preguntas
     question = st.text_input("")
-
-    #Creamos la lista para el selector
-    selectorList = ["All"]
-    #Quitamos "_Validation" del nombre de las hojas del Libro de Calculo
-    selectorList.extend([i.split("_")[0] for i in datasetList])
 
     #Selector para el Dataset del que provendran las preguntas aleatorias
     dataset = st.selectbox("Select a DataSet", selectorList)
@@ -96,24 +104,14 @@ def annotateContext(response):
     modelAnswer = None
 
     if randomQuestion:
-        randomDict = random.choice(random.choices(recordList, weights=map(len, recordList))[0])
-        print(randomDict)
+        randomDict = db.getRandomDocument(1,dataset)[0]
         question = randomDict["question"]
         modelAnswer = randomDict["answer"]
 
-    data = {
-        'question': question,
-        'answerNumber': 10
-    }
-
     #Establecemos el titulo de la barra lateral
     st.sidebar.subheader('Options')
     #Control deslizante para el numero de respuestas a mostrar
-    answerNumber = st.sidebar.slider('How many relevant answers do you want?', 1, 10, 5)
-
-    #Lista de bases de conocimiento sobre las que haremos nuestra consulta
-    knowledgeBases = ["wikidata","dbpedia","cord19"]
-
+    answerNumber = st.sidebar.slider('How many relevant answers do you want?', 1, 10, 1)
     if question:
         st.write("**Question: **", question)
         if modelAnswer:
@@ -125,37 +123,43 @@ def annotateContext(response):
         #Mensaje de carga para las preguntas. Se muestra mientras que estas se obtienen.
         with st.spinner(text=':hourglass: Looking for answers...'):
             counter = 0
-            buttonKey = 1
-            results = getAnswers(data)
+            highestScoreAnswer = {}
+            results = getAnswers(question)
             results.sort(key = operator.itemgetter('confidence'), reverse = True)
-            for response in results:
+            for idx,response in enumerate(results):
                 if counter >= answerNumber:
                     break
                 counter += 1
                 answer = response['answer']
-                if answer:
+                if answer and answer != "-":
                     context = "..." + response["evidence"]["summary"] + "..."
                     source = response["source"]
-                    relevance = response["confidence"]
-                    annotateContext(response)
+                    confidence = response["confidence"]
+                    annotateContext(response, answer, context, response["evidence"]["start"], response["evidence"]["end"])
                     st.write("**Answer: **", answer)
-                    st.write('**Relevance:** ', relevance , '**Source:** ' , source)
-                    col1, col2 = st.columns([1,1])
-                    with col1:
-                        isRight = st.button("👍", buttonKey)
-                    with col2:
-                        isWrong = st.button("👎", buttonKey + 1)
-                    buttonKey += 2
-                    #Si se pulsa el boton de correcto/incorrecto:
-                    if isRight or isWrong:
-                        #Mensaje de que el input del usuario ha sido registrado
-                        st.success("✨ Thanks for your input!")
-                        #Insertamos en la Spreadsheet de Google
-                        #db.insert(conn, [[question,source,answer,isRight]])
-                        #Reseteamos los valores de los botones
-                        isRight = False
-                        isWrong = False
-
+                    st.write('**Relevance:** ', confidence , '**Source:** ' , source)
+                    if idx == 0:
+                        highestScoreAnswer = {
+                            "answer": answer,
+                            "confidence": confidence
+                        }
+        st.write("Please rate if our answer has been helpful to you so we can further improve our system!")
+        #Botones para validar la respuesta por parte del usuario en columnas separadas          
+        col1, col2 = st.columns([1,1])
+        with col1:
+            isRight = st.button("👍")
+        with col2:
+            isWrong = st.button("👎")
+
+        #Si se pulsa el boton de correcto/incorrecto:
+        if isRight or isWrong:
+            #Insertamos en la Spreadsheet de Google
+            spread.insertRow([[question, highestScoreAnswer["answer"], str(highestScoreAnswer["confidence"]), isRight, str(datetime.now(tz=timezone))]])
+            #Reseteamos los valores de los botones
+            isRight = False
+            isWrong = False
+            #Mensaje de que el input del usuario ha sido registrado
+            st.success("✨ Thanks for your input!")
 
     #Checkbox. Si tenemos respuesta y la caja es marcada, imprimimos las respuestas JSON obtenidas.
     if question and st.sidebar.checkbox('Show JSON Response', key = 0):