-
Notifications
You must be signed in to change notification settings - Fork 0
/
searchTfIdf.c
146 lines (112 loc) · 3.25 KB
/
searchTfIdf.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/*
* COMP2521 - Data Structures and Algorithms
* Assignment 2 - Simple Search Engines
*
* Group: TwoBrothers
*
* Partners: Kurt Banwell-Pachernegg (Z5022859)
* Sam Eager (Z3414861)
*
*
* Part 2. - Content-based Search Engine
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <ctype.h>
#include "collection.h"
#include "BST.h"
#include "HashMap.h"
#include "Queue.h"
#include "PriorityQueue.h"
#define MAX_RESULTS 30
struct Result {
char * url;
int count;
double rank;
};
typedef struct Result * Result;
static Result newResult(char * url);
static int resultCompare(void * a, void * b);
static int stringCompare(void * a, void * b);
int main (int argc, char *argv[])
{
Queue terms = newQueue();
Queue urls = getUrls();
PriorityQueue termsTemp = newPriorityQueue(stringCompare);
int i, n;
// Convert search terms to lowercase and load into words.
for(i = 1; i < argc; i++) {
n = 0;
while(argv[i][n] != '\0') {
argv[i][n] = tolower(argv[i][n]);
n++;
}
addPriorityQueue(termsTemp, argv[i]);
addQueue(terms, argv[i]);
}
/* Generates HashMap with search terms as keys and the number of
urls that contain the term as values. */
HashMap urlCounts = getUrlCount(termsTemp);
// For ease of iteration.
QueueIterator urlIterator = newQueueIterator(urls);
QueueIterator termsIterator = newQueueIterator(terms);
char * url;
char * term;
HashMap wordCounts;
double tf;
double idf;
Result result;
PriorityQueue results = newPriorityQueue(resultCompare);
/* Create a priority queue full of results. Each result is a url with its
corresponding number of matches and rank. */
while(hasNextQueueIterator(urlIterator)) {
url = nextQueueIterator(urlIterator);
/* Generates a HashMap with words contained in the given url as
keys and their corrensponding frequency as values. Also contains
a special ' ' key that maps to the total number of words in the url.*/
wordCounts = getUrlWordsFrequency(url);
result = newResult(url);
while(hasNextQueueIterator(termsIterator)) {
term = nextQueueIterator(termsIterator);
// Calculate tf id values.
tf = (double) getInteger(getHashMap(wordCounts, term)) / getInteger(getHashMap(wordCounts, " "));
idf = log10((double) sizeQueue(urls) / getInteger(getHashMap(urlCounts, term)));
result->rank += tf*idf;
if(existsHashMap(wordCounts, term)) result->count++;
}
resetQueueIterator(termsIterator);
addPriorityQueue(results, result);
}
int count = 0;
// Print out the urls in order (max 30)
while(count++ < MAX_RESULTS && !emptyPriorityQueue(results)) {
result = nextPriorityQueue(results);
printf("%s %.6f\n", result->url, result->rank);
}
return 1;
}
static Result newResult(char * url)
{
Result result = malloc(sizeof(struct Result));
result->url = url;
result->count = 0;
result->rank = 0;
return result;
}
static int resultCompare(void * a, void * b)
{
Result A = (Result) a;
Result B = (Result) b;
int result = A->count - B->count;
if(result != 0) return result;
if(A->rank - B->rank > 0) return 1;
if(A->rank - B->rank < 0) return -1;
return strcmp(A->url, B->url);
}
static int stringCompare(void * a, void * b)
{
return strcmp((char *) b, (char *) a);
}