-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.py
More file actions
104 lines (75 loc) · 3.59 KB
/
script.py
File metadata and controls
104 lines (75 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import xml.etree.ElementTree as ET
import nltk as nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os
from lxml import etree
def _article_stem(filename):
    """Return the part of *filename* before its first '.'.

    A name that contains no dot is returned unchanged (slicing with the
    result of ``str.find`` would silently drop its last character).
    """
    return filename.partition('.')[0]


def _read_abstract(header_path):
    """Parse *header_path* and return ``(ns, abstract_text)`` or ``None``.

    ``ns`` maps the prefix ``'tei'`` to the namespace of the document's root
    element.  ``None`` is returned when the root element carries no namespace
    or when the document has no ``tei:abstract/tei:p`` element.
    """
    root = ET.parse(header_path).getroot()
    # ElementTree spells namespaced tags as '{uri}local'.
    if not root.tag.startswith('{'):
        return None
    ns = {'tei': root.tag[1:root.tag.index('}')]}
    paragraph = root.find('.//tei:abstract/tei:p', ns)
    if paragraph is None:
        return None
    # itertext() also collects text that follows inline child elements and
    # never yields None, unlike the bare ``.text`` attribute.
    return ns, ''.join(paragraph.itertext())


def _save_word_cloud(tokens, out_path):
    """Render a word cloud of the token frequencies and save it to *out_path*."""
    frequencies = nltk.FreqDist(tokens)
    cloud = WordCloud().generate_from_frequencies(frequencies)
    # Use a dedicated figure per image so drawings from earlier files do not
    # pile up on one shared figure.
    fig = plt.figure()
    try:
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis("off")
        plt.savefig(out_path)
    finally:
        plt.close(fig)


def _write_bibl_structs(tree, ns, out_path):
    """Write every ``tei:biblStruct`` element of *tree* to *out_path* as XML text."""
    with open(out_path, "w", encoding="utf-8") as out:
        for bibl_struct in tree.xpath("//tei:biblStruct", namespaces=ns):
            out.write(etree.tostring(bibl_struct, pretty_print=True, encoding='unicode') + '\n')


def _save_figures_per_article_chart(figure_counts, out_path):
    """Save a bar chart with one bar per article showing its figure count."""
    fig = plt.figure()
    try:
        if figure_counts:
            plt.bar(list(figure_counts), list(figure_counts.values()), width=0.8)
        plt.xlabel("Name of Articles", fontsize=12)
        plt.xticks(rotation=25, ha='right', fontsize=8)
        # Leave room for the rotated article names.
        plt.subplots_adjust(bottom=0.4, left=0.3)
        plt.ylabel("Number of Figures")
        plt.savefig(out_path)
    finally:
        plt.close(fig)


def action(folderPath, folderComplPath, wcf, lolf, fpaf):
    """Build word clouds, reference lists and a figure-count chart from TEI XML.

    For every regular file in ``folderPath`` (processed in sorted order):

    1. The text of its ``tei:abstract/tei:p`` element is lower-cased,
       tokenized, stripped of English stop words and rendered as a word
       cloud saved to ``<wcf>/<stem>.png``.
    2. The file of the same name in ``folderComplPath`` is parsed; its
       ``tei:figure`` elements are counted and its ``tei:biblStruct``
       elements are written to ``<lolf>/<stem>.txt``.

    ``<stem>`` is the file name up to its first dot.  A file is skipped
    entirely when its root element has no namespace, when it has no abstract
    paragraph, or when the abstract contains nothing but stop words.

    After all files are processed, a bar chart of the number of figures per
    article is saved to ``<fpaf>/figure.png``.  Articles with zero figures are
    included in the chart.

    Args:
        folderPath: Directory whose files supply the abstracts.
        folderComplPath: Directory holding, under the same file names, the
            documents searched for figures and bibliographic entries.
        wcf: Output directory for the word-cloud images.
        lolf: Output directory for the per-article biblStruct text files.
        fpaf: Output directory for the figures-per-article chart.

    All five directories are created if they do not exist.
    """
    for directory in (folderPath, folderComplPath, wcf, lolf, fpaf):
        os.makedirs(directory, exist_ok=True)

    # Still published as a module-level global, as before, in case other code
    # reads it after this function has run.
    global stop_words
    stop_words = set(stopwords.words('english'))

    # Article stem -> number of figures, in processing order.
    figure_counts = {}

    for filename in sorted(os.listdir(folderPath)):
        header_path = os.path.join(folderPath, filename)
        if not os.path.isfile(header_path):
            continue
        print(header_path)

        parsed = _read_abstract(header_path)
        if parsed is None:
            continue
        ns, abstract = parsed

        tokens = [token for token in word_tokenize(abstract.lower())
                  if token not in stop_words]
        if not tokens:
            # WordCloud cannot be generated from an empty frequency table.
            continue

        stem = _article_stem(filename)
        _save_word_cloud(tokens, os.path.join(wcf, stem + '.png'))

        # Number of figures per article.
        print(folderComplPath, filename)
        tree = etree.parse(os.path.join(folderComplPath, filename))
        num_figures_in_file = len(tree.xpath("//tei:figure", namespaces=ns))
        print(num_figures_in_file)
        figure_counts[stem] = figure_counts.get(stem, 0) + num_figures_in_file

        # List of bibliographic entries.
        _write_bibl_structs(tree, ns, os.path.join(lolf, stem + '.txt'))

    _save_figures_per_article_chart(figure_counts, os.path.join(fpaf, 'figure.png'))
# Script entry point. There is no `if __name__ == "__main__"` guard, so this
# call also runs whenever the module is imported.
# Arguments, in order: the folder read for abstracts, the folder read for
# figures and bibliographic entries, and the output folders for word clouds,
# reference lists and the figures-per-article chart.
action('outputs/header/', 'outputs/', 'wordClouds', 'listOfLinks', 'figuresperArticle')