-
Notifications
You must be signed in to change notification settings - Fork 0
/
indexer.h
300 lines (262 loc) · 8.35 KB
/
indexer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#ifndef INDEXER_H_
#define INDEXER_H_
#include "document.h"
#include "stopwords.h"
#include "tokenizer.h"
#include <vector>
#include <iomanip>
#include <iostream>
#include <map>
#include <math.h>
#include <stdexcept>
using namespace std;
template<int size>
class indexer{
private:
//! @brief A private int: the number of documents.
int N;
//! @brief a private boolean: whether the indexer has been normalized.
bool normalized;
//! @brief a private stopwords: the stopwords the indexer should keep track of.
stopwords *stpw;
//! @brief a private vector of documents: the list of documents which's score the indexer should keep.
vector<document> documents;
//! @brief a private map of string, int: the total number of documents that a token appears in.
map<string,int> dft;
//! @brief a private map of string, vector<int>: the token frequency in documents represented by the vector<int>.
/*!
* The string is the token. Every document will have a term frequency, which is stored in the vector<int>.
* The first int in the vector will be the first document's frequency, etc...
*/
map<string,vector<int> > tftd1;
//! @brief a private map of string, vector<int>: The same as tftd1, but with stopwords removed.
map<string,vector<int> > tftd2;
//! @brief a private map of string, vector<double>: the weight of each token in each document, stored similarly to tftd1 but for weight instead of frequency.
map<string,vector<double> > wtd;
//! @brief a private vector int: the total number of words per document, stored similarly to the vector<int> in tftd1.
vector<int>total1;
//! @brief a private vector int: the same as total1, but without stopwords.
vector<int>total2;
public:
//! @brief a default constructor.
/*!
* Initializes the indexer's N to 0, normalized to false, and stpw to a stopwords ("stopwords.txt").
*/
indexer<size>()
:N(0),normalized(false),stpw(new stopwords("stopwords.txt"))
{
}
~indexer<size>(){
delete stpw;
}
//! @brief an accessor for size.
/*!
* @return the size of the indexer.
*/
int indexersize(){
return size;
}
//! @brief an accessor for normalize.
/*!
* @return if the indexer is normalized.
*/
bool isNormalize(){
return normalized;
}
//! @brief an accessor for stpw.
/*!
* @return the indexer's stopwords.
*/
stopwords* getstpw(){
return stpw;
}
//! @brief an accessor for documents.
/*!
* @return the documents in indexer.
*/
vector<document> & getdocuments(){
return documents;
}
//! @brief an accessor for tftd2.
/*!
* @return the indexer's tftd2.
*/
map<string,vector<int> > & getTFtd2(){
return tftd2;
}
//! @brief an accessor for wtd.
/*!
* @return the indexer's wtd.
*/
map<string,vector<double> > & getWtd(){
return wtd;
}
//! @brief a function which normalizes the indexer.
/*!
* Calculates the frequency of a token (how many documents it appears in).
* Then, calculates the weight of a token in each document.
*/
void normalize(){
normalized = true;
// looping on the filtered tokens and calculating the document frequency for each token
for(map<string, vector<int> >::const_iterator it = tftd2.begin();it != tftd2.end();it++){
string t = it->first;
int df = 0;
for(int j=0;j<tftd2[t].size();j++){
if(tftd2[t][j] > 0)
df++;
}
dft[t] = df;
}
wtd.clear();
// looping on the filtered tokens and calculating the document weight wt,d for each token
for(map<string, vector<int> >::const_iterator it = tftd2.begin();it != tftd2.end();it++){
string t = it->first;
for(int j=0;j<tftd2[t].size();j++){
// formula from assignment
double s = indexersize();
double d = dft[t];
double tf = tftd2[t][j];
double temp1 = 1+((double)log(tf)/(double)log(10));
double temp2 = (double)log(s/d)/(double)log(10);
double Wtd = temp1*temp2;
if(Wtd == -INFINITY || Wtd == INFINITY || isnan(Wtd))
wtd[t].push_back(0);
else
wtd[t].push_back(Wtd);
}
}
}
//! @brief an operator[] overload.
/*!
* Gets the indexer's n'th document.
* @param n: an int index.
* @return the document at index n.
*/
const document & operator [](int n){
return documents[n];
}
//! @brief an operator>> overload.
/*!
* Adds a new document to the indexer, then calculates term frequency for that document and normalizes the indexer.
* @param d: the document to add to the indexer.
* @param idx: the indexer to which we are adding a document.
* @return the document d.
*/
friend const document & operator >>(document & d,indexer<size> & idx){
idx.normalized = false; // reading new document so indexer not normalized
idx.documents.push_back(d); // pushing document to documnet vector
string cont = d.content(); // string content of document
int total_1 = 0;
int total_2 = 0;
tokenizer* t = new tokenizer;
vector<string> tokens = t->tokenize(cont); // changing the content of document into tokens
// same as assignment 1 pushing zeros then incrementing the token
for(int i=0;i<tokens.size();i++){
string s = tokens[i];
if(idx.stpw->operator ()(s)){
if(idx.tftd1[s].size() == 0){
for(int i=0;i<size;i++){
idx.tftd1[s].push_back(0);
}
}
idx.tftd1[s][idx.N]++;
total_1++;
}
else{
if(idx.tftd1[s].size() == 0){
for(int i=0;i<size;i++){
idx.tftd1[s].push_back(0);
}
}
idx.tftd1[s][idx.N]++;
if(idx.tftd2[s].size() == 0){
for(int i=0;i<size;i++){
idx.tftd2[s].push_back(0);
}
}
idx.tftd2[s][idx.N]++;
total_1++;
total_2++;
}
}
idx.total1.push_back(total_1);
idx.total2.push_back(total_2);
idx.normalize();
idx.N++; // counter for index of which document is read
delete t;
return d;
}
//! @brief an operator<< overload
/*!
* Fills an outstream with the document matrices information (term frequency, weight etc.)
* @param os: the outstream which will receive the matrix information
* @param idx: the indexer from which we take information
* @return the outstream with matrix information.
*/
friend ostream & operator <<(ostream & os,indexer<size> & idx){
os << endl << "******* Full Document Matrix version *********" <<endl;
os << left << setw(20) << "Dictionary";
for(int i=0;i<size;++i){
os << right << setw(20) << idx[i].name();
}
os << endl;
for(map<string,vector<int> >::iterator it = idx.tftd1.begin();it != idx.tftd1.end();++it){
os << left << setw(20) << it->first;
for(int i=0;i<it->second.size();i++){
os << right << setw(20) << it->second[i];
}
os << endl;
}
os << left << setw(20) << "Total";
for(int i=0;i<idx.total1.size();i++){
os << right << setw(20) << idx.total1[i];
}
os << endl;
os << endl << "******* Filtered Document Matrix version *********" <<endl;
os << left << setw(20) << "Dictionary";
for(int i=0;i<size;++i){
os << right << setw(20) << idx[i].name();
}
os << endl;
for(map<string,vector<int> >::iterator it = idx.tftd2.begin();it != idx.tftd2.end();++it){
os << left << setw(20) << it->first;
for(int i=0;i<it->second.size();i++){
os << right << setw(20) << it->second[i];
}
os << endl;
}
os << left << setw(20) << "Total";
for(int i=0;i<idx.total2.size();i++){
os << right << setw(20) << idx.total2[i];
}
os << endl;
os << endl << "******* Document Frequency *********" <<endl;
os << left << setw(20) << "Dictionary" << right << setw(20) << "Document Frequency" << endl;
for(map<string,int >::iterator it = idx.dft.begin();it != idx.dft.end();++it)
os << left << setw(20) << it->first << right << setw(20) << it->second << endl;
os << endl << "******* Tf-idf weight *********" <<endl;
os << left << setw(20) << "Dictionary";
for(int i=0;i<size;++i){
os << right << setw(20) << idx[i].name();
}
os << endl;
for(map<string,vector<double> >::iterator it = idx.wtd.begin();it != idx.wtd.end();++it){
os << left << setw(20) << it->first;
for(int i=0;i<it->second.size();i++){
os << right << setw(20) << it->second[i];
}
os << endl;
}
return os;
}
//! @brief an exception to be thrown when attemptin to query an un-normalized indexer.
enum Exceptions { INDEX_NOT_NORMALIZED };
};
//! @brief Compares two (document, score) pairs by score.
/*!
 * Only declared here; the definition lives in another translation unit.
 * @param a: the first document/score pair
 * @param b: the second document/score pair
 * @return a boolean: whether a's score is greater than b's (as stated by the original
 *         documentation - NOTE(review): confirm against the definition).
 */
bool sortpairs(const pair<document,double> &a,const pair<document,double> &b);
#endif /* INDEXER_H_ */