-
Notifications
You must be signed in to change notification settings - Fork 0
/
SimpleTokenizer.cs
57 lines (49 loc) · 1.6 KB
/
SimpleTokenizer.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
using System.Collections.Generic;
namespace MySearchEngine.Core.Analyzer.Tokenizers
{
/// <summary>
/// Tokenizer that treats every maximal run of letters and digits as a term
/// and every other character (whitespace, punctuation, symbols) as a separator.
/// </summary>
public class SimpleTokenizer : ITokenizer
{
    /// <summary>
    /// Splits <paramref name="text"/> into terms and groups repeated terms into a single token.
    /// </summary>
    /// <param name="text">The text to tokenize. Must not be null.</param>
    /// <returns>
    /// One <see cref="Token"/> per distinct term; each token's positions receive the
    /// zero-based start index of every occurrence of that term in <paramref name="text"/>.
    /// Terms are compared case-sensitively (default string equality).
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="text"/> is null.</exception>
    public IEnumerable<Token> Tokenize(string text)
    {
        if (text is null)
        {
            throw new System.ArgumentNullException(nameof(text));
        }

        // The term table is local to this call so that reusing one tokenizer instance
        // for several texts never mixes terms or positions from earlier calls, and a
        // previously returned result is never mutated by a later call.
        var tokensByTerm = new Dictionary<string, Token>();

        // Start index of the term currently being scanned, or -1 when between terms.
        var tokenStart = -1;

        // Iterate one step past the last character so that a term touching the end
        // of the text is flushed by the same code path as any other term.
        for (var index = 0; index <= text.Length; index++)
        {
            var atSeparator = index == text.Length || ShouldEscape(text[index]);

            if (!atSeparator)
            {
                if (tokenStart < 0)
                {
                    tokenStart = index;
                }

                continue;
            }

            if (tokenStart >= 0)
            {
                AddToToken(tokensByTerm, text[tokenStart..index], tokenStart);
                tokenStart = -1;
            }
        }

        return tokensByTerm.Values;
    }

    /// <summary>
    /// Records one occurrence of <paramref name="term"/> starting at <paramref name="index"/>,
    /// creating the token for that term on first sight.
    /// </summary>
    private static void AddToToken(Dictionary<string, Token> tokensByTerm, string term, int index)
    {
        // Single lookup on the common (already seen) path.
        if (!tokensByTerm.TryGetValue(term, out var token))
        {
            token = new Token(term);
            tokensByTerm.Add(term, token);
        }

        token.Positions.Add(index);
    }

    /// <summary>
    /// Returns true when <paramref name="c"/> is a separator, i.e. anything that is
    /// not a letter or a digit.
    /// </summary>
    private static bool ShouldEscape(char c)
    {
        // Keep it simple: only letters and digits are part of a term.
        return !char.IsLetterOrDigit(c);
    }
}
}