-
Notifications
You must be signed in to change notification settings - Fork 0
/
trie.h
186 lines (148 loc) · 9.56 KB
/
trie.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#ifndef _TRIE_H
#define _TRIE_H 1
#include <malloc.h> // for dynamic memory allocations
#include <stdio.h> // for writing to streams
#include <stdlib.h> // for exit(), in case of abnormal conditions rearing their ugly head :P
#define DEFAULT_STR_SIZE 8
// ============================ Structs/Types ============================
struct trie_node; // Forward declaration
typedef struct char_n_addr // Holds a character-address pair
{
char ch;
struct trie_node *addr;
} char_n_addr;
typedef struct trie_node // Holds information related to a Trie Node.
{
unsigned int frequency, child_count, children_size;
char_n_addr **children;
} trie_node;
typedef trie_node* trie_t;
// ============================ Functions/Operations ============================
static trie_node* create_trie_node() // Creates a new Trie node with the default values.
{
trie_node* node = malloc(sizeof(trie_node)); // allocate memory for an instance of a Trie node.
node->frequency = 0; // by default, the frequency..
node->child_count = 0; // ..child_count..
node->children_size = 0; // ..children_size is made 0 since it is a new node..
node->children = NULL; // ..and we make the "children" pointer a NULL pointer.
return node; // finally, the address of the new node is returned.
}
trie_t create_trie() // Creates an empty trie which is nothing but a single
{ // node; which is called the root node.
return create_trie_node(); // hence, we just call create_node() and that node
} // is the root node; whose address is returned.
void add_string(trie_t root, const char* string) // Adds the string pointed to by "string" to the Trie pointed by "root".
{
int i;
trie_node* next_node = NULL; // We need to traverse the Trie to the bottom and then add the new
// node (or increment the frequency if it is already present). Thus,
// this pointer stores the next node that we need to look at.
if (*string == '\0') // if it is the end of the string..
{
++root->frequency; // then just increase the frequency
return; // and return.. This is the base/terminating condition
} // for the recursive calls made to add_string().
for(i = 0; i < root->child_count; ++i) // checking whether the current character is present or not.
if(root->children[i]->ch == *string) // if it is..
{
next_node = root->children[i]->addr; // ..then store the address in next_node
break; // and break out of the loop.
}
if(next_node == NULL) // if such a character is not present, then create a new node for it.
{
int pos = root->child_count; // This is the position where the new node would be inserted to root->children.
++root->child_count; // a new child is created, so increment the child_count.
char_n_addr *char_n_addr_node = malloc(sizeof(char_n_addr)); // allocate space for a character-address pair. that node's
// address is stored in this variable.
if(root->child_count > root->children_size) // if the number of children is greater than the size of the children array..
{
if(root->children_size == 0) // ..and if children_size is 0, i.e., its default value
root->children_size = 1; // ..then make the size of the array to hold one character-address pair.
else // ..else, we double the size of the
root->children_size <<= 1; // ..children array by bit-shifting one place to the left
root->children = realloc(root->children, root->children_size * sizeof(char_n_addr*)); // we reallocate in powers of 2 to
} // reduce the number of reallocations
if(root->children == NULL) // if the reallocation failed, then display an error message and quit.
{
fprintf(stderr, "Error during re-allocation in function add_string() in file trie.h\n");
exit(EXIT_FAILURE);
}
next_node = create_trie_node(); // the next Trie node that we need to jump to, does not exist, so create it
char_n_addr_node->ch = *string; // we assign the character
char_n_addr_node->addr = next_node; // -address pair..
root->children[pos] = char_n_addr_node; // and add it to the children array at the position "pos"
}
add_string(next_node, ++string); // move to the next character and add it to the next node.
}
/* Purpose: Recurses through the input trie; concatenating the character at each node of the trie and writes the frequency to the output stream, fout.
**
** Idea: Each recursive call to this function concatenates one character to a string. So, to create the string "ABC", 3 calls in total would be made.
** The position where the new character would be inserted is indicated by the parameter "pos".
** Say, we have two strings in the trie "AC" and "AD". The trie node "A" has two children, viz. "C" and "D". In such cases, a loop is run over all
** the children, and each child is appended one at a time to the position indicated by parameter "pos".
**
** The author wanted to create a dynamically allocated string which is used to contenate the characters at each trie node and would be shared by all
** the instances of this recursive function to reduce the number of memory allocations (which takes precious time and memory). Also, to decrease the
** number of reallocations, the author wanted to increase the size of the string in powers of 2; so initially the size would be 8 (it is rare to come
** across strings that are less than 7 chars; note that an extra char is required to store the NULL); then if extra memory is needed, then the size
** is increased to 16, then 32 and so on. The macro DEFAULT_STR_SIZE defines the default string size.
** To achieve this, the address of the length (of the string) and the address of the string is passed among the various recursive instances.
** A string is a char*; so a pointer to a char* would be char**. We can think of it as a dynamic jagged 2D array where we only use the first string, i.e.
** string[0] contains the intermediate string (and this is how the author dereferences the string in this function). The length's address is passed,
** so we need int* length. This explains the parameters- char** string and int *length
*/
static void write_freq_worker(FILE *fout, trie_node* root, char** string, int pos, int* length)
{
if(root->frequency != 0) // if the frequency is non-zero, it means that it is a whole word..
fprintf(fout, "%-50s %d\n", *string, root->frequency); // hence print the string created so far and its frequency.
if(root->child_count != 0) // if the root has any children, then we proceed, else the function exits
{
int i, prev_length = *length; // prev_length helps detect whether we need extra memory or not.
if(*length == 0) // if *length is 0, it means that the string does not exist.
*length = DEFAULT_STR_SIZE; // so, allocate 8 characters - it is rare to have strings less than 7 chars.
else if (pos + 2 > *length) // the current position + 2 indicates how many characters there would be..and if that is
(*length) <<= 1; // greater than the current length, then double the length of the string.
if(prev_length != *length) // if more memory needs to be allocated..
{
string[0] = realloc(*string, *length * sizeof(char)); // then reallocate it..
if(string[0] == NULL) // and in case reallocation was not possible, output an error message to stderr.
{
fprintf(stderr, "Error during re-allocation in function write_freq_worker() in file trie.h\n");
exit(EXIT_FAILURE);
}
}
string[0][pos + 1] = '\0'; // a character would be added at position "pos". pos + 1 should contain a NULL character to terminate the string.
for(i = 0; i < root->child_count; ++i) // for each child of the current root..
{
string[0][pos] = root->children[i]->ch; // add the character to the appropriate position..
// and call the same function, but this time with an updated root and position. The next child is at root->children[i]->addr
write_freq_worker(fout, root->children[i]->addr, string, pos + 1, length);
}
string[0][pos] = '\0'; // "erasing" the newly inserted character so that the previous recusive callers do not accidentally print it.
}
}
void write_freq(FILE *fout, trie_t root) // writes the frequency of each string in the trie to the output stream fout
{
char **string_ptr = malloc(sizeof(char*)); // as explained in the function description of write_freq_worker(), we need a char**.
// we dynamically allocate the pointer
int len_var = 0; // the length is stored in a local variable of this function.
string_ptr[0] = NULL; // the 0th string (as explained in the worker function description), is initialised to NULL
fprintf(fout, "%-50s FREQUENCY\n\n", "TRACKER NAME");
write_freq_worker(fout, root, string_ptr, 0, &len_var); // call the worker function..
free(string_ptr[0]); // free the string..
free(string_ptr); // and free the dynamic pointer as well.
}
void destroy_trie(trie_t root) // frees memory allocated to a trie.
{
if(root->child_count != 0) // if the root has any children..
{
for(int i = 0; i < root->child_count; ++i) // ..then, for each child..
{
destroy_trie(root->children[i]->addr); // ..consider the child as the next root and recursively free it..
free(root->children[i]); // and free the character-address pair associated with that child.
}
free(root->children); // once all children are freed, free up the children array.
}
free(root); // lastly, destroy the root itself.
}
#endif // trie.h included