-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
117 lines (82 loc) · 3.64 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import requests
import os
import json
import gensim
from gensim.summarization import summarize
from flask import escape
from google.cloud import language_v1
def parse_content(captions_url: str) -> str:
"""Parse the JSON from the captions URL and return the text.
Args:
captions_url (str): URL of the captions file from YouTube
Returns:
str: Text content of the captions
"""
data = requests.get(f"{captions_url}&fmt=json3").json()
# create a string of all the words in utf format.
events = data['events']
words = []
for event in events:
segments = event['segs'] if 'segs' in event.keys() else None
if segments is not None:
for segment in segments:
words.append(segment['utf8'])
words = [word for word in words if word != '\n']
return ''.join(words)
def create_summary(text: str) -> str:
summary = summarize(text, word_count = 100)
return summary
path = "key.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
def get_top_keywords(content):
client = language_v1.LanguageServiceClient()
type_ = language_v1.Document.Type.PLAIN_TEXT
language = "en"
document = {"content": content, "type_": type_, "language": language}
encoding_type = language_v1.EncodingType.UTF8
response = client.analyze_entities(request = {'document': document, 'encoding_type': encoding_type})
number_of_keyword = 0
keywords = []
for entity in response.entities:
print(u"Representative name for the entity: {}".format(entity.name))
keyword = str(entity.name).strip('.\n ')
if keyword not in keywords and len(keyword.split()) < 4:
keywords.append(keyword)
number_of_keyword +=1
if number_of_keyword == 5:
break
##### If we want to display more information about the keywords
# Get entity type, e.g. PERSON, LOCATION, ADDRESS, NUMBER, et al
#print(u"Entity type: {}".format(language_v1.Entity.Type(entity.type_).name))
# Get the salience score associated with the entity in the [0, 1.0] range
#print(u"Salience score: {}".format(entity.salience))
#for metadata_name, metadata_value in entity.metadata.items():
#print(u"{}: {}".format(metadata_name, metadata_value))
# Loop over the mentions of this entity in the input document.
# The API currently supports proper noun mentions.
#for mention in entity.mentions:
#print(u"Mention text: {}".format(mention.text.content))
# Get the mention type, e.g. PROPER for proper noun
#print(u"Mention type: {}".format(language_v1.EntityMention.Type(mention.type_).name))
return keywords
def get_summary(request):
headers = {}
if request.method == 'OPTIONS':
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'POST',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600'
}
return ('', 204, headers)
# Set CORS headers for the main request
headers = {
'Access-Control-Allow-Origin': '*'
}
URL = request.get_json(silent=True, force=True)['url']
text = parse_content(URL)
text_with_punctuation = requests.post("http://bark.phon.ioc.ee/punctuator", {"text": text})
text = text_with_punctuation.content.decode('utf-8')
summary = create_summary(text)
keywords = get_top_keywords(text)
return (json.dumps({"summary" : summary, "keywords" : keywords}), 200, headers)