Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions test/extract_questions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 26 15:03:02 2019

@author: domonique_hodge
"""
import pickle
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from summa.summarizer import summarize

import pynlp
from pynlp import StanfordCoreNLP

# Extract questions from papers.
#
# Reads the cleaned-articles JSON (one entry per paper, each carrying a
# 'text' field), sentence-tokenizes every article, keeps the sentences that
# end in a question mark, de-duplicates them while preserving first-seen
# order, and writes one question per line to a plain-text output file.
with open("/Users/domonique_hodge/Documents/External Clients/DARPA/cleanedArticles-03042019.json", "rb") as f:
    papers = json.load(f)

all_questions = []

for paper_id in papers:
    print(paper_id)

    all_text = papers[paper_id]['text']

    try:
        # Use nltk.sent_tokenize via the module import: the bare name
        # `sent_tokenize` was never imported, so the original call raised
        # NameError for every article and the bare `except:` silently
        # swallowed it, leaving the output file empty.
        sentences = nltk.sent_tokenize(all_text)
    except Exception as exc:
        # Narrowed from a bare `except:`; report which article failed.
        print('Error occurred while tokenizing article {}: {}'.format(paper_id, exc))
        continue

    # A sentence counts as a question iff its last character is '?'.
    all_questions.extend(s for s in sentences if s.endswith("?"))

# dict.fromkeys de-duplicates while keeping insertion order (Python 3.7+).
no_duplicate_quest = list(dict.fromkeys(all_questions))

# Write one question per line to the output text file.
with open('/Users/domonique_hodge/Documents/External Clients/DARPA/QG Net update/questions_v2.txt', 'w') as f:
    for item in no_duplicate_quest:
        f.write("%s\n" % item)

3,201 changes: 3,201 additions & 0 deletions test/input_subset_used.txt

Large diffs are not rendered by default.

11,204 changes: 11,204 additions & 0 deletions test/inputall.txt

Large diffs are not rendered by default.

109 changes: 109 additions & 0 deletions test/key_sentence_extraction_summa_taylor.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os \n",
    "import summa\n",
    "import json\n",
    "\n",
    "os.chdir('/Users/taylor_bolt/PycharmProjects/multivac/multivac/data')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parse Article Json file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "article_json = json.load(open('cleanedArticles-03042019.json'))\n",
    "all_article_text = [ article['text'] for article in article_json.values() ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract Key Sentences w/ TextRank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from summa.summarizer import summarize\n",
    "key_sentences = []\n",
    "for count,article in enumerate(all_article_text):\n",
    "    print('{} out of {} articles'.format(count,len(all_article_text)))\n",
    "    try:\n",
    "        summary = summarize(article, split=True)\n",
    "        key_sentences.append(summary[0:10])\n",
    "    except ValueError: \n",
    "        # Append a one-element list (not a bare string) so every entry of\n",
    "        # key_sentences has the same list-of-sentences shape; a bare string\n",
    "        # would iterate character-by-character downstream.\n",
    "        key_sentences.append(['No text found for article'])\n",
    "    "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
181 changes: 181 additions & 0 deletions test/preprocess_First_submitted.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 30 15:27:25 2019

@author: domonique_hodge
"""

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 28 17:31:50 2019

@author: domonique_hodge
"""

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 2 11:21:36 2019
@author: domonique_hodge
"""

"""
This code preprocess pdfs for the QG-Net (Query Generation) algorithm
"""

import os
os.environ["CORENLP_HOME"] = r'Users/domonique_hodge/Downloads/stanford-corenlp-full-2018-10-05'

import corenlp
import warnings
import logging
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sortedcontainers import SortedDict


import logging
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sortedcontainers import SortedDict


def preprocess_pdf(abstract, features, tfidf, num_sentences=5):
    '''
    Chooses top `num_sentences` based on the sum of their terms' TF-IDF scores,
    and then picks the word/term in each sentence with the top TF-IDF score as
    the "answer."

    Annotates `abstract` with the module-level CoreNLP client `nlp`, scores
    each sentence by summing the TF-IDF scores of vocabulary terms appearing
    in it, then walks sentences from highest to lowest score, printing a
    QG-Net style line of "token|U-or-L|POS|NER|A-or--" fields for sentences
    where the top term was located ('A' marks the answer tokens).

    Parameters
    ----------
    abstract : str
        Raw text to annotate.
    features : array-like
        This document's row of the TF-IDF matrix, flattened with reshape(-1);
        must align positionally with tfidf.get_feature_names().
    tfidf : TfidfVectorizer
        Fitted vectorizer supplying the term vocabulary.
    num_sentences : int
        NOTE(review): the break below compares the sentence's *position in
        the document* to this value, not a running count of emitted
        sentences -- confirm whether "top N sentences" was the actual intent.

    Returns nothing; output goes to stdout.
    '''
    document = nlp.annotate(abstract)
    # Keep only vocabulary terms with a positive TF-IDF score for this doc.
    term_scores = [(w,s) for w, s in zip(tfidf.get_feature_names(), features.reshape(-1)) if s > 0]
    sent_scores = SortedDict()

    for i, sentence in enumerate(document['sentences']):
        # Sentence score = sum of scores of terms substring-matched against
        # the raw annotation repr (may over-match across token boundaries).
        score = sum([s for w, s in term_scores if w in str(sentence)])
        # NOTE(review): keyed by score, so sentences with identical scores
        # overwrite each other here -- only one of the ties survives.
        sent_scores[score] = i

    warnings.filterwarnings('ignore', category=UserWarning, append=True)

    # SortedDict iterates keys ascending; reversed() gives descending score.
    for i in reversed(sent_scores.values()):
        out_str = ''
        if i >= num_sentences:
            break

        sentence = document['sentences'][i]
        str_sent = ' '.join([t['originalText'] for t in sentence['tokens']])
        # Highest-scoring term present in this sentence, split into words so
        # multi-word terms can be matched token-by-token below.
        top_term = sorted([(w,s) for w, s in term_scores if w in str_sent],
                          key=lambda x: x[1], reverse=True)[0][0]
        top_term = top_term.split()

        # `i` is reused here, shadowing the sentence index above; harmless
        # because the outer loop re-binds `i` each iteration, but confusing.
        for i, token in enumerate(sentence['tokens']):
            tok_text = token['originalText']

            # 'U' / 'L' flag for an upper- or lower-case initial character.
            if tok_text[0].isupper():
                low = 'U'
            else:
                low = 'L'

            if len(top_term) > 0:
                ans = 'A'
            else:
                ans = '-'

            # Tentatively mark 'A', then demote to '-' unless the next
            # len(top_term) tokens match the remaining term words in order.
            # NOTE(review): `i+j` can run past the last token of the sentence
            # and raise IndexError -- confirm inputs make this unreachable.
            for j, t in enumerate(top_term):
                if sentence['tokens'][i+j]['originalText'] != top_term[j]:
                    ans = '-'
                    break

            # Consume one matched term word per confirmed 'A' token.
            if ans =='A' and len(top_term) > 0:
                del top_term[0]
            # Emit one "token|case|POS|NER|answer-flag" field per token.
            out_str += "{}│{}│{}│{}│{} ".format(tok_text,low,token['pos'],token['ner'],ans)

        # Only print sentences in which an answer span was actually marked.
        if '│A' in out_str:
            print(out_str)





def load_data(jsonPath, picklePath = None):
    """Load the article corpus.

    If `picklePath` is given, the list of document texts is unpickled from it
    and no JSON metadata is available, so the first element of the returned
    tuple is None.  Otherwise the JSON file at `jsonPath` is parsed, two
    known-bad entries are dropped, and the non-empty 'text' fields collected.

    Parameters
    ----------
    jsonPath : str
        Path to the cleaned-articles JSON file (dict of id -> article dict).
    picklePath : str or None
        Optional path to a pickled list of document texts.

    Returns
    -------
    (datastore, l_docs) : (dict or None, list of str)
        The parsed JSON (None when loading from pickle) and document texts.
    """
    # Local import: `pickle` is never imported at module level in this file,
    # so the original pickle branch raised NameError.
    import pickle

    if picklePath is not None:
        # NOTE(review): only load pickles you produced yourself -- pickle
        # deserialization executes arbitrary code on untrusted input.
        with open(picklePath, "rb") as f:
            l_docs = pickle.load(f)
        # The original fell through to `return datastore, l_docs` with
        # `datastore` unbound (NameError); return None explicitly instead.
        datastore = None
    else:
        # Read JSON data into the datastore variable - this comes from Peter
        # and Domonique's effort.
        with open(jsonPath, 'r') as f:
            datastore = json.load(f)

        # These were some bad files - nothing substantive in them, or they
        # were retrieved in bad format.
        for e in ['1805.10677v1', '0911.5378v1']:
            if e in datastore:
                del datastore[e]

        # Extract non-empty texts (the `list(...)[0:]` copy was redundant).
        l_docs = [value['text'] for value in datastore.values() if value['text']]

    print('# of documents: ', len(l_docs))

    return datastore, l_docs


def create_tf_idf(docs, writeFile=True, pathToFolders=''):
    """Create a TF-IDF matrix of terms in the corpus.

    Fits a TfidfVectorizer (unigrams through trigrams, English stop words
    removed, sublinear TF, min document frequency 10) over `docs`.

    Parameters
    ----------
    docs : list of str
        Corpus documents to vectorize.
    writeFile : bool
        When True, pickle {'features', 'tfidf'} to disk and return True.
        When False, return the fitted objects instead.
    pathToFolders : str or pathlib.Path
        Output directory; required (non-empty) when writeFile=True.

    Returns
    -------
    True when writeFile=True, else (features, tfidf).

    Raises
    ------
    ValueError
        If writeFile=True and no output directory was provided.
    """
    tfidf = TfidfVectorizer(sublinear_tf=True,
                            min_df=10,
                            norm=None,
                            ngram_range=(1, 3),
                            stop_words='english',
                            use_idf=True,
                            smooth_idf=True)

    features = tfidf.fit_transform(docs)

    if writeFile:
        if not pathToFolders:
            # The original referenced an undefined `settings` module here,
            # which raised NameError; fail fast with a clear message instead.
            raise ValueError("pathToFolders must be provided when writeFile=True")

        # Local import: `pickle` is never imported at module level here.
        import pickle

        # os.path.join accepts both str and Path; the original `/` operator
        # only worked when a pathlib.Path was passed, never the str default.
        with open(os.path.join(pathToFolders, 'multivac_tfidf.pkl'), 'wb') as f:
            pickle.dump({'features': features, 'tfidf': tfidf}, f)

        return True
    else:
        return features, tfidf


# ####################
#
# Driver script.  Substitute this hard-coded path for a path passed during
# the automated pre-processing.
#
# ####################

jsonObj, allDocs = load_data("/Users/domonique_hodge/Documents/External Clients/DARPA/cleanedArticles-03042019.json")

# Collect one abstract per article, preferring the 'summary' metadata field
# and falling back to 'abstract'; articles with neither are skipped.
abstracts = []
for entry in jsonObj.values():
    meta = entry['metadata']
    if "summary" in meta:
        abstracts.append(meta["summary"])
    elif "abstract" in meta:
        abstracts.append(meta["abstract"])

# CoreNLP client; used as a module-level global by preprocess_pdf.
nlp = corenlp.CoreNLPClient(output_format='json',
                            properties={'timeout': '50000'})

# Once we've determined the proper filepath to output, we can change
# this call to write the files out to disk.
features, tfidf = create_tf_idf(abstracts, False)

for idx, abstract in enumerate(abstracts):
    preprocess_pdf(abstract, features[idx, :].toarray(), tfidf)
Loading