-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpptParser.py
152 lines (129 loc) · 5.35 KB
/
pptParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# TODO:
# 1) Finish this program
# a. Figure out how we want the user to choose the PowerPoint file
# b. Error handling involved with choosing a file
# c. finalize as a group what we want to be our metadata
# d. figure out/edit how the metadata is being inserted to the program
# 2) Set up code delivery in a way where our user only has to do 1 simple installation to use our program
from __future__ import division, unicode_literals
from textblob import TextBlob as tb
import os
import operator
import sys
# from Tkinter import *
# import Tkinter, Tkconstants, tkFileDialog
from stopwords import filter_stop
from pptx import Presentation # pip install python-pptx
from tfidfForStringLists import sortedTfIdfLists
# 1) Find the .pptx files to process (currently every .pptx file in this script's directory; the file-selection dialog is disabled and .ppt files are not supported)
# 2) For each presentation, parse the presentation for text
# 3) Take the text and categorize it into core properties
# 4) Insert those core properties into each presentation
# Maximum number of keywords main() asks parseMetaData() to keep per presentation.
DEFAULT_VAL = 10
def main():
    """Tag every presentation returned by findFile() with generated keywords.

    For each (presentation, filename) pair the text is extracted with
    parseText(), turned into a keyword string by parseMetaData() (at most
    DEFAULT_VAL keywords) and written back with populateCoreProperties().

    Exits the process through sys.exit() when no presentation was loaded.
    """
    # findFile() returns ( [list of Presentations], [list of filenames] )
    presentations, filenames = findFile()

    if not presentations:
        print("No pptx presentations found. Exiting program.\n")
        # "pause" is the Windows shell command that waits for a key press.
        os.system("pause")
        sys.exit()

    # Walk both lists in lockstep instead of indexing with a manual counter.
    for prs, filename in zip(presentations, filenames):
        print("Parsing: " + os.path.basename(filename))

        # Text of the presentation, as prepared for the keyword ranking
        wordList = parseText(prs)

        # Rank the words to decide which ones become tags
        metadata = parseMetaData(wordList, DEFAULT_VAL)

        # Insert metadata (Core Properties) to tags
        populateCoreProperties(prs, metadata, filename)

        # Blank line between files in the console output
        print("")

    print("Successfully added tags to all .pptx files in directory")
    os.system("pause")
def findFile():
    """Load every .pptx file located in the same directory as this script.

    Returns:
        A 2-tuple ``(presentations, filenames)`` of parallel lists:
        ``presentations[i]`` is the Presentation opened from the full path
        ``filenames[i]``. Files that fail to open are reported on stdout and
        left out of BOTH lists, so the two lists always have the same length.
    """
    # NOTE: an interactive file picker (tkFileDialog.askopenfilenames) was
    # prototyped here but is disabled; see TODO 1a at the top of the file.
    print("Finding all .pptx files..")
    currentPath = os.path.dirname(os.path.abspath(__file__))

    powerPoints = []  # list of Presentations
    pptx_files = []   # list of filenames (full paths)

    for entry in os.listdir(currentPath):
        if not entry.endswith(".pptx"):
            continue

        pptx_fullPath = os.path.join(currentPath, entry)
        try:
            loaded = Presentation(pptx_fullPath)
        except Exception:
            # Deliberately best-effort: one unreadable file must not stop the
            # rest of the directory from being processed.
            print("Error with File:" + pptx_fullPath)
            continue

        # Record the pair only after the load succeeded so that the two
        # lists cannot drift out of step.
        powerPoints.append(loaded)
        pptx_files.append(pptx_fullPath)

    # return 2-tuple of lists of Presentations, and filenames
    return (powerPoints, pptx_files)
def parseText(presentation):
    """Extract the stop-word-filtered, lower-cased text of a presentation.

    Every text run of every shape that has a text frame is encoded to ASCII
    (non-ASCII characters are dropped, so data loss is possible), split on
    whitespace and passed through filter_stop(). Font size is not taken into
    account.

    Args:
        presentation: object exposing ``slides``; each slide exposes
            ``shapes`` (main() passes a pptx Presentation).

    Returns:
        A list with one TextBlob per slide.
    """
    collected = []    # filtered words gathered so far
    listOfTexts = []

    for slide in presentation.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    # unicode -> ascii, dropping anything that cannot be
                    # represented; split() also discards extra white space
                    ascii_text = run.text.encode('ascii', 'ignore')
                    collected.extend(filter_stop(ascii_text.split()))

        # Join with single spaces so that the last word of one run is never
        # fused with the first word of the next one.
        # NOTE(review): the word list is never reset, so each slide's entry
        # also contains the text of all previous slides, and one entry is
        # appended per slide - confirm this is the intended document
        # granularity for the TF-IDF ranking.
        listOfTexts.append(tb(" ".join(collected).lower()))

    return listOfTexts
def parseMetaData(wordList, val):
    """Build the comma-separated keyword string for one presentation.

    Args:
        wordList: passed unchanged to sortedTfIdfLists(); main() supplies the
            list returned by parseText().
        val: maximum number of keywords to keep. Values of zero or less
            produce an empty string.

    Returns:
        A string holding up to ``val`` keywords separated by ", " (no leading
        or trailing separator); empty when there is nothing to report.
    """
    sorted_words = sortedTfIdfLists(wordList)

    # The first element of each entry is the word itself.
    # NOTE(review): presumably entries are (word, score) pairs ordered
    # best-first - verify against sortedTfIdfLists.
    top_entries = sorted_words[:max(val, 0)]

    # join() yields a string with no extra white space or dangling commas,
    # whether there are zero, one or many keywords.
    keywords = ", ".join(entry[0] for entry in top_entries)

    print("Adding tags: " + keywords)
    return keywords
def populateCoreProperties(presentation, metadata, filename):
    """Write *metadata* to the presentation's keywords (tags) and save it.

    Args:
        presentation: object exposing ``core_properties`` and ``save()``.
        metadata: value stored as ``core_properties.keywords``; it is
            assigned as-is (main() passes a single keyword string).
        filename: path handed to ``presentation.save()``.

    This may need to change if more metadata fields are added or if the way
    they are inserted changes.
    """
    properties = presentation.core_properties
    properties.keywords = metadata
    presentation.save(filename)
def printAllKeywords():
    """Unfinished debugging helper: prints "hello" for each file from findFile().

    The presentation belonging to each filename is looked up but not used
    yet; no keywords are actually printed.
    """
    presentations, filenames = findFile()
    for position, filename in enumerate(filenames):
        # Placeholder lookup kept from the original stub.
        # NOTE(review): the print is assumed to belong inside the loop.
        prs = presentations[position]
        print("hello")
# Run the tagger only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()