-
Notifications
You must be signed in to change notification settings - Fork 0
/
WiktionaryParser.cs
250 lines (233 loc) · 7.71 KB
/
WiktionaryParser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
using System;
using System.Collections.Generic;
using System.Data;
using System.Data.SqlClient;
using System.IO;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;
using WiktionaryCrawler.Data;
using WiktionaryCrawler.Models;
using WiktionaryCrawler.Serializers;
namespace WiktionaryCrawler
{
/// <summary>
/// Parses HTML pages from wiktionary.org into dictionary entries and hands them to a serializer.
/// </summary>
public class WiktionaryParser
{
/// <summary>
/// Gets or sets the accessor used to fetch the start page and the pages behind relative Urls.
/// </summary>
public IDataAccessor DataAccess { get; set; }
/// <summary>
/// Gets or sets the serializer that receives each page's list of dictionary entries.
/// </summary>
public IWiktionarySerializer WikiSerializer { get; set; }
#region Constructors
/// <summary>
/// Creates a WiktionaryParser wired to the default collaborators:
/// a <see cref="WebAccessor"/> for data access and a <see cref="FileManager"/> for serialization.
/// </summary>
public WiktionaryParser()
    // Named arguments keep the construction order: WebAccessor first, then FileManager.
    : this(dataAccess: new WebAccessor(), wikiSerializer: new FileManager())
{
}
/// <summary>
/// Creates a WiktionaryParser that uses the supplied serializer and data accessor.
/// </summary>
/// <param name="wikiSerializer">The serializer that will receive lists of dictionary entries (implements IWiktionarySerializer).</param>
/// <param name="dataAccess">The accessor used to reach wiktionary.org (implements IDataAccessor).</param>
public WiktionaryParser(IWiktionarySerializer wikiSerializer, IDataAccessor dataAccess)
{
    this.WikiSerializer = wikiSerializer;
    this.DataAccess = dataAccess;
}
/// <summary>
/// Copies an existing WiktionaryParser. The copy is shallow: the new parser shares the
/// same data accessor and serializer instances as <paramref name="toCopy"/>.
/// </summary>
/// <param name="toCopy">The WiktionaryParser to copy.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="toCopy"/> is null.</exception>
public WiktionaryParser(WiktionaryParser toCopy)
{
    // Fail with a descriptive argument error instead of a bare NullReferenceException.
    if (toCopy == null)
    {
        throw new ArgumentNullException("toCopy");
    }
    DataAccess = toCopy.DataAccess;
    WikiSerializer = toCopy.WikiSerializer;
}
#endregion
/// <summary>
/// Retrieves the current version of wiktionary.org and saves it to the specified medium.
/// Each letter page is parsed and handed to the serializer separately.
/// </summary>
/// <returns>True if the entries of at least one page were passed to the serializer, false otherwise.</returns>
public bool GetDictionary()
{
    List<string> pages = GetListOfLetterPages();
    bool serializedAny = false;
    foreach (string page in pages)
    {
        List<DictionaryEntry> pageEntries = GetWordsFromPage(page);
        // GetWordsFromPage returns null when the page has no index entries;
        // there is nothing to save for such a page, so don't hand null to the serializer.
        if (pageEntries == null)
        {
            continue;
        }
        WikiSerializer.SerializeWiktionary(pageEntries);
        serializedAny = true;
    }
    return serializedAny;
}
/// <summary>
/// To comply with the Wikimedia Terms of Service, this method exposes the Urls from which the dictionary is scraped.
/// </summary>
/// <returns>
/// A list of Urls as strings: every href found in the start page's link paragraph,
/// followed by the start Url itself. If no links are found, the list holds only the start Url.
/// </returns>
public List<string> GetListOfLetterPages()
{
    HtmlDocument wikiDoc = DataAccess.GetStartHtmlDocument();
    List<string> letterUrls = new List<string>();
    HtmlNodeCollection links = wikiDoc.DocumentNode.SelectNodes("//div[@id='mw-content-text']/center/p/a[@href]");
    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so guard before iterating.
    if (links != null)
    {
        foreach (HtmlNode link in links)
        {
            foreach (HtmlAttribute att in link.Attributes.AttributesWithName("href"))
            {
                letterUrls.Add(att.Value);
            }
        }
    }
    //Include the starting Url, since it's disabled on the start page
    letterUrls.Add(DataAccess.StartUrl);
    return letterUrls;
}
/// <summary>
/// Gets the list of words from a page of wiktionary.org.
/// </summary>
/// <param name="path">The relative path of the page of words to be parsed.</param>
/// <returns>
/// A list of DictionaryEntries, one per word and part of speech,
/// or null if the page contains no index list items.
/// </returns>
public List<DictionaryEntry> GetWordsFromPage(string path)
{
    HtmlDocument wikiDoc = DataAccess.GetHtmlDocumentFromRelativeUrl(path);
    HtmlNodeCollection hnc = wikiDoc.DocumentNode.SelectNodes("//div[@id='mw-content-text']/div[@class='index']/ol/li");
    if (hnc == null || hnc.Count < 1)
    {
        return null;
    }
    List<DictionaryEntry> wiktionaryPage = new List<DictionaryEntry>();
    foreach (HtmlNode entry in hnc)
    {
        // An empty list item has no word link to read; skip it instead of dereferencing null.
        if (entry.FirstChild == null)
        {
            continue;
        }
        //Omit words that have no part of speech
        if (entry.LastChild.HasAttributes)
        {
            continue;
        }
        string currWord = entry.FirstChild.InnerText;
        string currWordUri = GetFirstAttributeByName(entry.FirstChild, "href");
        // Without an href there is no word page to fetch definitions from.
        if (currWordUri == null)
        {
            continue;
        }
        Dictionary<PartOfSpeech, string> currDefinitions = GetDefinitions(currWordUri);
        if (currDefinitions == null)
        {
            continue;
        }
        // Iterate key/value pairs directly to avoid a second lookup per key.
        foreach (KeyValuePair<PartOfSpeech, string> definition in currDefinitions)
        {
            wiktionaryPage.Add(new DictionaryEntry(currWord, definition.Key, currWordUri, definition.Value));
        }
    }
    return wiktionaryPage;
}
/// <summary>
/// Looks up the first attribute with the given name on a node.
/// </summary>
/// <param name="node">The node whose attributes are searched.</param>
/// <param name="attributeName">The name of the attribute to retrieve.</param>
/// <returns>The value of the first matching attribute, or null when the node has no such attribute.</returns>
internal string GetFirstAttributeByName(HtmlNode node, string attributeName)
{
    string firstValue = null;
    // Only the first match is wanted, so stop after a single iteration.
    foreach (HtmlAttribute candidate in node.Attributes.AttributesWithName(attributeName))
    {
        firstValue = candidate.Value.ToString();
        break;
    }
    return firstValue;
}
/// <summary>
/// Gets the definitions listed on a page for a single word.
/// </summary>
/// <param name="wordUrl">The relative Url of the word.</param>
/// <returns>
/// A Dictionary of parts of speech and definitions from the page, or null when the Url is
/// null or contains "redlink", or when the page has no content div or no definition lists.
/// </returns>
internal Dictionary<PartOfSpeech, string> GetDefinitions(string wordUrl)
{
    // NOTE(review): "redlink" presumably marks a link to a page that does not exist yet - confirm.
    if (wordUrl == null || wordUrl.Contains("redlink"))
    {
        return null;
    }
    HtmlDocument wordPage = DataAccess.GetHtmlDocumentFromRelativeUrl(wordUrl);
    HtmlNode textContent = GetFirstNode(wordPage.DocumentNode.SelectNodes("//div[@id='mw-content-text']"));
    // GetFirstNode returns null when the content div is missing from the page.
    if (textContent == null)
    {
        return null;
    }
    HtmlNodeCollection definitionNodes = textContent.SelectNodes("ol");
    if (definitionNodes == null)
    {
        return null;
    }
    Dictionary<PartOfSpeech, List<string>> defsDict = new Dictionary<PartOfSpeech, List<string>>();
    StringBuilder defBuilder = new StringBuilder();
    foreach (HtmlNode definitionList in definitionNodes)
    {
        // The part-of-speech heading is read from the node two siblings before the list.
        // Either hop can be missing, so check each one before dereferencing.
        HtmlNode precedingNode = definitionList.PreviousSibling;
        HtmlNode currPosNode = precedingNode == null ? null : precedingNode.PreviousSibling;
        if (currPosNode == null)
        {
            continue;
        }
        // Drop anything from the first '[' onward, then trim in both cases so
        // the name lookup sees the same shape of text with or without a bracket.
        string currPosText = currPosNode.InnerText;
        int bracketIndex = currPosText.IndexOf('[');
        string posName = bracketIndex >= 0 ? currPosText.Substring(0, bracketIndex) : currPosText;
        PartOfSpeech currPos = PartOfSpeech.FromPosName(posName.Trim());
        if (currPos == null)
        {
            continue;
        }
        // SelectNodes returns null for a list without items; nothing to record then.
        HtmlNodeCollection listItems = definitionList.SelectNodes("li");
        if (listItems == null)
        {
            continue;
        }
        // Single lookup: reuse the existing list for this part of speech or create it.
        List<string> definitions;
        if (!defsDict.TryGetValue(currPos, out definitions))
        {
            definitions = new List<string>();
            defsDict.Add(currPos, definitions);
        }
        foreach (HtmlNode li in listItems)
        {
            foreach (HtmlNode child in li.ChildNodes)
            {
                defBuilder.Append(child.InnerText);
            }
            definitions.Add(defBuilder.ToString());
            defBuilder.Clear();
        }
    }
    return FinalizeDictionary(defsDict);
}
/// <summary>
/// Returns the first node of a node collection.
/// </summary>
/// <param name="hnc">The collection to take the first node from; may be null.</param>
/// <returns>The first node in the collection, or null when the collection is null or empty.</returns>
internal HtmlNode GetFirstNode(HtmlNodeCollection hnc)
{
    bool hasNodes = hnc != null && hnc.Count > 0;
    return hasNodes ? hnc[0] : null;
}
/// <summary>
/// Collapses each part of speech's list of definitions into a single numbered string.
/// Every definition is prefixed with its 1-based position followed by ".) ", and the
/// prefixed definitions are concatenated in list order.
/// </summary>
/// <param name="workingDict">The dictionary of definition lists, keyed by part of speech.</param>
/// <returns>A dictionary with the same keys, each mapped to its combined definition string.</returns>
internal Dictionary<PartOfSpeech, string> FinalizeDictionary(Dictionary<PartOfSpeech, List<string>> workingDict)
{
    Dictionary<PartOfSpeech, string> finalDict = new Dictionary<PartOfSpeech, string>();
    foreach (KeyValuePair<PartOfSpeech, List<string>> pair in workingDict)
    {
        List<string> definitions = pair.Value;
        StringBuilder numbered = new StringBuilder();
        for (int position = 0; position < definitions.Count; position++)
        {
            // Numbering restarts at 1 for every part of speech.
            numbered.Append((position + 1).ToString());
            numbered.Append(".) ");
            numbered.Append(definitions[position]);
        }
        finalDict.Add(pair.Key, numbered.ToString());
    }
    return finalDict;
}
}
}