-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcanonicalstrings.pas
133 lines (113 loc) · 3.77 KB
/
canonicalstrings.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
{$MODE OBJFPC} { -*- delphi -*- }
{$INCLUDE settings.inc}
unit canonicalstrings;
interface
{$IFDEF CPU64}
// On 64-bit CPUs there is a DWord of padding in the TAnsiRec header.
// We could in theory use this padding to store the hash.
// Unfortunately, this only works if no string constants are involved, since
// we can't modify a constant's header (it's on disk).
// If you want to use this, uncomment the following line:
// {$DEFINE PACKHASH}
{$ENDIF}
type
// Handle to an interned UTF8String: a raw pointer to the string plus, unless
// PACKHASH is defined, a copy of the hash code recorded when it was interned.
// Values are produced by Intern() in this unit.
TCanonicalString = record
private
// Untyped pointer that GetValue casts back to UTF8String.
FValue: Pointer;
// Hash code stored in the record itself. Omitted when PACKHASH is defined,
// in which case the hash is kept in the string's header instead.
{$IFNDEF PACKHASH} FHashCode: DWord; {$ENDIF}
function GetHashCode(): DWord; inline;
procedure SetHashCode(const Value: DWord);
function GetValue(): UTF8String; inline;
public
// The string this handle refers to.
property AsString: UTF8String read GetValue;
// The hash code previously stored via SetHashCode.
property HashCode: DWord read GetHashCode;
// Equals compares the FValue pointers rather than the characters.
// LessThan and GreaterThan order by pointer address, so the resulting
// ordering is NOT lexicographic.
class function Equals(const A, B: TCanonicalString): Boolean; static; inline;
class function LessThan(const A, B: TCanonicalString): Boolean; static; inline;
class function GreaterThan(const A, B: TCanonicalString): Boolean; static; inline;
end;
// Equality operator for TCanonicalString; forwards to TCanonicalString.Equals.
operator = (const Op1, Op2: TCanonicalString): Boolean; inline;
// Returns Key.HashCode. Presumably intended as the hash callback for
// containers keyed by TCanonicalString -- confirm against callers.
function CanonicalStringHash32(const Key: TCanonicalString): DWord; inline;
// Passes S to the Intern method of this unit's shared string hash set and
// wraps the returned string and hash code in a TCanonicalString.
function Intern(const S: UTF8String): TCanonicalString; inline;
implementation
uses
hashset, hashfunctions, stringutils;
{$IFDEF PACKHASH}
// Only compiled when PACKHASH is defined: a local mirror of the string header
// so that GetHashCode/SetHashCode can reach the DWord slot inside it.
type
PAnsiRec = ^TAnsiRec;
TAnsiRec = record
// based on TAnsiRec in astrings.inc
// hopefully the layout won't change any time soon
CodePage: TSystemCodePage;
ElementSize: Word;
HashCode: DWord; // this is Dummy in astrings.inc's record definition
// SetHashCode asserts that Ref is > 0 before it writes HashCode.
Ref: SizeInt;
Len: SizeInt;
// Empty record, so SizeOf(TAnsiRec) covers the header fields only; the
// accessors subtract that size from FValue to locate the header.
Data: record end; // this is where the string data goes
end;
// Compile-time guard: abort the build if the record's size is not exactly the
// sum of its field sizes (for example because the compiler inserted padding).
{$IF SizeOf(TAnsiRec) <> SizeOf(TSystemCodePage) +
SizeOf(Word) +
SizeOf(DWord) +
SizeOf(SizeInt) +
SizeOf(SizeInt) } {$FATAL TAnsiRec size is unexpected} {$ENDIF}
{$ENDIF}
// Returns the hash code associated with this canonical string.
// With PACKHASH defined, the value is read from the HashCode slot of the
// TAnsiRec header located SizeOf(TAnsiRec) bytes before FValue; otherwise it
// is the FHashCode field of the record itself.
function TCanonicalString.GetHashCode(): DWord;
{$IFDEF PACKHASH}
var
   Header: PAnsiRec;
{$ENDIF}
begin
   {$IFDEF PACKHASH}
   // Step back from the string pointer to the header that precedes it.
   Header := PAnsiRec(FValue - SizeOf(TAnsiRec));
   Result := Header^.HashCode;
   {$ELSE}
   Result := FHashCode;
   {$ENDIF}
end;
// Stores Value as the hash code of this canonical string.
// With PACKHASH defined, the value is written into the HashCode slot of the
// TAnsiRec header located SizeOf(TAnsiRec) bytes before FValue, so FValue
// must already be assigned; otherwise it is stored in FHashCode.
procedure TCanonicalString.SetHashCode(const Value: DWord);
{$IFDEF PACKHASH}
var
   Header: PAnsiRec;
{$ENDIF}
begin
   {$IFDEF PACKHASH}
   // There is no header to write to without a string pointer.
   Assert(FValue <> nil);
   Header := PAnsiRec(FValue - SizeOf(TAnsiRec));
   // Only headers with a positive reference count are written to; per the
   // note at the top of the unit, a constant's header cannot be modified.
   Assert(Header^.Ref > 0);
   Header^.HashCode := Value;
   {$ELSE}
   FHashCode := Value;
   {$ENDIF}
end;
// Getter for the AsString property: reinterprets the stored pointer as a
// UTF8String and returns it.
function TCanonicalString.GetValue(): UTF8String;
begin
   Result := UTF8String(FValue);
end;
// Returns True when A and B hold the same string pointer.
// The characters themselves are only compared inside the assertion.
class function TCanonicalString.Equals(const A, B: TCanonicalString): Boolean;
begin
   Result := (A.FValue = B.FValue);
   // Sanity check (when assertions are enabled): pointer identity must agree
   // with a comparison of the actual string contents.
   Assert(Result = (UTF8String(A.FValue) = UTF8String(B.FValue)));
end;
// Returns True when A's string pointer is lower than B's.
// This orders by address, not by string contents.
class function TCanonicalString.LessThan(const A, B: TCanonicalString): Boolean;
begin
   Result := (B.FValue > A.FValue);
end;
// Returns True when A's string pointer is higher than B's.
// This orders by address, not by string contents.
class function TCanonicalString.GreaterThan(const A, B: TCanonicalString): Boolean;
begin
   Result := (B.FValue < A.FValue);
end;
// Equality operator for TCanonicalString values.
// Delegates to TCanonicalString.Equals, which compares the string pointers.
operator = (const Op1, Op2: TCanonicalString): Boolean;
begin
   Result := TCanonicalString.Equals(Op1, Op2);
end;
// Free-function form of the HashCode property: returns the hash code already
// stored for Key without recomputing anything.
function CanonicalStringHash32(const Key: TCanonicalString): DWord;
begin
   Result := Key.HashCode;
end;
type
// Hash set of UTF8Strings, specialised with the UTF8StringUtils helper
// (THashSet and UTF8StringUtils are declared outside this unit).
TUTF8StringHashSet = specialize THashSet<UTF8String, UTF8StringUtils>;
var
// The single set behind Intern(). It is created in this unit's
// initialization section and freed in its finalization section.
Strings: TUTF8StringHashSet;
// Returns the TCanonicalString for S.
// S is handed to the shared Strings set; the string it returns is stored as a
// raw pointer, and the hash code it reports is recorded alongside.
function Intern(const S: UTF8String): TCanonicalString;
var
   StringHash: DWord;
begin
   // Strings.Intern fills in StringHash (presumably an out parameter carrying
   // the hash of S -- confirm in hashset). Only the raw pointer of the
   // returned string is kept here.
   Result.FValue := Pointer(Strings.Intern(S, StringHash));
   // Must follow the FValue assignment: with PACKHASH defined, SetHashCode
   // writes the hash through FValue.
   Result.SetHashCode(StringHash);
end;
initialization
   // Create the shared string set used by Intern(), hashing with
   // UTF8StringHash32. The second argument (8) is forwarded to the THashSet
   // constructor -- presumably an initial size hint; confirm in hashset.
   Strings := TUTF8StringHashSet.Create(@UTF8StringHash32, 8);
finalization
   // after this point, all TCanonicalString instances are going to be bogus
   // since they all point to UTF8Strings that have been dereffed
   Strings.Free();
   // Clear the reference so that a late Intern() call (e.g. from a unit that
   // finalizes after this one) dereferences nil instead of a freed object.
   Strings := nil;
end.