-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlangid.h
507 lines (440 loc) · 19.8 KB
/
langid.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
/****************************** -*- C++ -*- *****************************/
/* */
/* LA-Strings: language-aware text-strings extraction */
/* by Ralf Brown / Carnegie Mellon University */
/* */
/* File: langid.h */
/* Version: 1.30 */
/* LastEdit: 2019-07-14 */
/* */
/* (c) Copyright 2010,2011,2012,2013,2014,2015,2019 */
/* Ralf Brown/Carnegie Mellon University */
/* This program is free software; you can redistribute it and/or */
/* modify it under the terms of the GNU General Public License as */
/* published by the Free Software Foundation, version 3. */
/* */
/* This program is distributed in the hope that it will be */
/* useful, but WITHOUT ANY WARRANTY; without even the implied */
/* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR */
/* PURPOSE. See the GNU General Public License for more details. */
/* */
/* You should have received a copy of the GNU General Public */
/* License (file COPYING) along with this program. If not, see */
/* http://www.gnu.org/licenses/ */
/* */
/************************************************************************/
#ifndef __LANGID_H_INCLUDED
#define __LANGID_H_INCLUDED
#include "mtrie.h"
#include "ptrie.h"
using namespace std ;
/************************************************************************/
/* Manifest Constants */
/************************************************************************/
// current binary file format version
#define LANGID_FILE_VERSION 6
// magic bytes identifying a database file: a readable banner followed by
//   CR LF, Ctrl-Z (0x1A), 0x04 and a NUL
// NOTE(review): presumably what LanguageIdentifier::checkSignature() tests
//   for -- confirm against its definition
#define LANGID_FILE_SIGNATURE "Language Identification Database\r\n\x1A\004\0"
// minimum file version still supported
#define LANGID_MIN_FILE_VERSION 6
// reserved space for future additions to the file header
#define LANGID_PADBYTES_1 64
// NOTE(review): looks like a byte offset within the file header; what "DM"
//   stands for is not visible in this header -- confirm before relying on it
#define LANGID_FILE_DMOFFSET 96
// version 1-4 file format uses fixed-length string fields for simplicity
//   (note that LANGID_MIN_FILE_VERSION above is 6)
#define LANGID_STRING_LENGTH 64
// directory holding the installed databases; like the database names below,
//   it can be predefined by the build to override the value given here
#ifndef DBDIR
#define DBDIR "/usr/share/langident"
#endif
// locations of the language-identification database
// NOTE(review): the names suggest they are tried in the order default,
//   alternate, fallback -- the search itself is not in this header
#ifndef DEFAULT_LANGID_DATABASE
#define DEFAULT_LANGID_DATABASE DBDIR "/languages.db"
#endif
#ifndef ALTERNATE_LANGID_DATABASE
#define ALTERNATE_LANGID_DATABASE "~/.langident/languages.db"
#endif
#ifndef FALLBACK_LANGID_DATABASE
#define FALLBACK_LANGID_DATABASE "./languages.db"
#endif
// locations of the character-set identification database
#ifndef DEFAULT_CHARSET_DATABASE
#define DEFAULT_CHARSET_DATABASE DBDIR "/charsets.db"
#endif
#ifndef ALTERNATE_CHARSET_DATABASE
#define ALTERNATE_CHARSET_DATABASE "~/.langident/charsets.db"
#endif
#ifndef FALLBACK_CHARSET_DATABASE
#define FALLBACK_CHARSET_DATABASE "./charsets.db"
#endif
// since the bigram byte model is much weaker than the long-ngram
// model, give it proportionally less weight so that it basically
// acts as a tie-breaker when there are no long ngram hits
#ifndef DEFAULT_BIGRAM_WEIGHT
#define DEFAULT_BIGRAM_WEIGHT 0.15
#endif
// consider any language score up to this value to be the same as zero
// to avoid random noise
#define LANGID_ZERO_SCORE 0.01
// how much above the minimal score must a language score be to be considered
// even a guess (evaluates to 0.2 with the zero score defined above)
#define GUESS_CUTOFF (20 * LANGID_ZERO_SCORE)
// at what point are we so sure that we don't flag the identification even
// if it is highly ambiguous? (evaluates to 8.0 with the zero score above)
#define SURE_THRESHOLD (800 * LANGID_ZERO_SCORE)
// the range of length- and frequency-weighted ngram coverages; this
// determines the scaling of the 32-bit integer actually stored in
// the binary model file
#define MAX_WEIGHTED_COVER 32.0
#define MAX_FREQ_COVER 100.0
#define MAX_MATCH_FACTOR 16.0
/************************************************************************/
/************************************************************************/
class NybbleTrie ;
// Occurrence counters for every possible three-byte sequence, kept in one
// flat array of 256*256*256 32-bit slots (64 MiB per instance).
class TrigramCounts
   {
   private:
      uint32_t m_counts[256 * 256 * 256] ;

      // position of trigram (c1,c2,c3) in m_counts: c1 supplies the top
      //   eight bits of the 24-bit index, c3 the bottom eight
      static size_t slot(uint8_t c1, uint8_t c2, uint8_t c3)
         {
         return (static_cast<size_t>(c1) << 16)
                | (static_cast<size_t>(c2) << 8)
                | static_cast<size_t>(c3) ;
         }

   public:
      // start out with every counter at zero
      TrigramCounts()
         {
         std::fill_n(m_counts,lengthof(m_counts),0) ;
         }
      TrigramCounts(const TrigramCounts *orig) ;
      ~TrigramCounts() {}

      // accessors
      // stored count for the byte sequence c1,c2,c3
      uint32_t count(uint8_t c1, uint8_t c2, uint8_t c3) const
         {
         return m_counts[slot(c1,c2,c3)] ;
         }
      uint32_t totalCount(uint8_t c1, uint8_t c2) const ;
      bool enumerate(NybbleTrie &ngrams) const ;

      // modifiers
      void copy(const TrigramCounts *orig) ;
      // reset the counter for c1,c2,c3 to zero
      void clear(uint8_t c1, uint8_t c2, uint8_t c3)
         {
         m_counts[slot(c1,c2,c3)] = 0 ;
         }
      // add cnt (one by default) to the counter for c1,c2,c3; the sum is
      //   ordinary unsigned 32-bit arithmetic and so wraps on overflow
      void incr(uint8_t c1, uint8_t c2, uint8_t c3, uint32_t cnt = 1)
         {
         m_counts[slot(c1,c2,c3)] += cnt ;
         }
      void filter(unsigned K, unsigned max_len, bool verbose) ;
      void filter(int32_t threshold) ;

      // I/O
      static TrigramCounts *load(Fr::CFile& f) ;
      bool read(Fr::CFile& f) ;
      bool save(Fr::CFile& f) const ;
   } ;
//----------------------------------------------------------------------
// Occurrence counters for every possible two-byte sequence plus a separately
// maintained grand total that serves as the denominator for probability().
// The inline modifiers (clear/set/incr) change individual counters only;
// they never touch the stored total.
class BigramCounts
   {
   private:
      uint64_t m_total ;
      uint32_t m_counts[256 * 256] ;
   public:
      // start out with all counters and the total at zero
      BigramCounts() : m_total(0)
         {
         std::fill_n(m_counts,lengthof(m_counts),0) ;
         }
      BigramCounts(Fr::CFile& f) ;
      BigramCounts(const BigramCounts *) ;
      BigramCounts(const TrigramCounts &) ;
      BigramCounts(const TrigramCounts *) ;
      ~BigramCounts() {}

      // accessors
      // stored count for the byte pair c1,c2
      uint32_t count(uint8_t c1, uint8_t c2) const
         { return m_counts[(c1 << 8) + c2] ; }
      // the stored grand total
      uint64_t totalCount() const { return m_total ; }
      // Relative frequency of the byte pair c1,c2: its count divided by the
      //   stored total.  An empty table (total of zero) yields 0.0 rather
      //   than dividing by zero, which would produce NaN or infinity.
      double probability(uint8_t c1, uint8_t c2) const
         {
         const uint64_t total = this->totalCount() ;
         if (total == 0)
            return 0.0 ;
         return this->count(c1,c2) / static_cast<double>(total) ;
         }
      double averageProbability(const char *buffer, size_t buflen) const ;

      // modifiers
      void copy(const BigramCounts *orig) ;
      // reset the counter for c1,c2 to zero (total is left unchanged)
      void clear(uint8_t c1, uint8_t c2)
         { m_counts[(c1 << 8) + c2] = 0 ; }
      // overwrite the counter for c1,c2 (total is left unchanged)
      void set(uint8_t c1, uint8_t c2, uint32_t cnt)
         { m_counts[(c1 << 8) + c2] = cnt ; }
      // add cnt (one by default) to the counter for c1,c2 (total is left
      //   unchanged)
      void incr(uint8_t c1, uint8_t c2, uint32_t cnt = 1)
         { m_counts[(c1 << 8) + c2] += cnt ; }
      // multiply the stored total by factor
      void scaleTotal(unsigned factor) { m_total *= factor ; }

      // I/O
      static BigramCounts *load(Fr::CFile& f) ;
      bool read(Fr::CFile& f) ;
      bool readBinary(Fr::CFile& f) ;
      bool dumpCounts(Fr::CFile& f) const ;
      bool save(Fr::CFile& f) const ;
   } ;
//----------------------------------------------------------------------
// Descriptive record for one language model: language, region, encoding,
// source and script names, together with coverage statistics, the amount
// of training data and an alignment value.
class LanguageID
   {
   public:
      LanguageID() ;
      LanguageID(const char *lang, const char *reg, const char *enc,
                 const char *source = nullptr,
                 const char *script = "UNKNOWN") ;
      LanguageID(const LanguageID &orig) ;
      LanguageID(const LanguageID *orig) ;
      LanguageID& operator= (LanguageID& orig) ;
      LanguageID& operator= (LanguageID&& orig) ;
      ~LanguageID() ;

      // accessors -- each simply hands back the corresponding stored field
      const char *language() const
         { return m_language ; }
      const char *friendlyName() const
         { return m_friendlyname ; }
      const char *region() const
         { return m_region ; }
      const char *encoding() const
         { return m_encoding ; }
      const char *source() const
         { return m_source ; }
      const char *script() const
         { return m_script ; }
      unsigned alignment() const
         { return m_alignment ; }
      // the stored coverage when it is strictly positive; any other stored
      //   value is reported as the neutral factor 1.0
      double coverageFactor() const
         {
         if (m_coverage > 0.0)
            return m_coverage ;
         return 1.0 ;
         }
      double countedCoverage() const
         { return m_countcover ; }
      double freqCoverage() const
         { return m_freqcover ; }
      double matchFactor() const
         { return m_matchfactor ; }
      uint64_t trainingBytes() const
         { return m_trainbytes ; }

      // modifiers
      void setLanguage(const char *lang, const char *friendly = nullptr) ;
      void setRegion(const char *region) ;
      void setEncoding(const char *encoding) ;
      void setSource(const char *source) ;
      void setScript(const char *scr) ;
      void setAlignment(unsigned align)
         { m_alignment = align ; }
      void setAlignment(const char *align) ;
      void setCoverageFactor(double coverage) ;
      void setCountedCoverage(double coverage) ;
      void setFreqCoverage(double coverage) ;
      void setMatchFactor(double match) ;
      void setTraining(uint64_t train_bytes)
         { m_trainbytes = train_bytes ; }
      bool guessScript() ;

      // operators
      bool sameLanguage(const LanguageID &other, bool ignore_region) const ;
      bool matches(const LanguageID *lang_info) const ;
      bool matches(const char *language, const char *region,
                   const char *encoding, const char *source) const ;
      bool operator == (const LanguageID &) const ;

      // I/O
      static LanguageID* read(Fr::CFile& f, unsigned version) ;
      static bool read(Fr::CFile& f, LanguageID *langID, unsigned version) ;
      bool write(Fr::CFile& f) const ;

   protected:
      void clear() ;

   private:
      Fr::CharPtr m_language ;
      Fr::CharPtr m_region ;
      Fr::CharPtr m_encoding ;
      Fr::CharPtr m_source ;
      Fr::CharPtr m_script ;
      // NOTE(review): a plain pointer, unlike the fields above; whatever
      //   owns the text it points at is not visible here -- confirm
      const char *m_friendlyname ;
      double m_coverage ;       // percent of training covered by ngrams
      double m_countcover ;     // coverage weighted by count of matches
      double m_freqcover ;      // coverage weighted by frequencies of matches
      double m_matchfactor ;
      uint64_t m_trainbytes ;
      unsigned m_alignment ;
   } ;
//----------------------------------------------------------------------
// A collection of per-language scores.  Every entry pairs a score with a
// language number; the collection additionally carries an opaque user-data
// pointer, an "active language" number and a flag reporting whether the
// entries are currently sorted.
class LanguageScores
   {
   public: // types
      // One (score, language number) entry.  No constructor is declared, so
      //   a default-initialized Info holds indeterminate values until init()
      //   or the setters have been called.
      class Info
         {
         public:
            void init(double sc, unsigned short new_id)
               { m_score = sc ; m_id = new_id ; }
            // accessors
            double score() const { return m_score ; }
            unsigned short id() const { return m_id ; }
            // manipulators
            void setScore(double sc) { m_score = sc ; }
            void incrScore(double inc) { m_score += inc ; }
            void decrScore(double dec) { m_score -= dec ; }
            void setLang(unsigned short id) { m_id = id ; }
            // assigning a double replaces the score only
            Info& operator= (double sc) { m_score = sc ; return *this ; }
            // assigning an int replaces the language number and zeroes
            //   the score
            Info& operator= (int id) { m_id = id ; m_score = 0.0 ; return *this ; }
            static void swap(Info&, Info&) ;
            // comparison
            static int compare(const Info&, const Info&) ;
            bool operator< (const Info& other) const
               { return compare(*this,other) < 0 ; }
            // exact comparison of the stored score; const-qualified so that
            //   it is also usable on const entries
            bool operator== (double sc) const { return sc == m_score ; }
         private:
            double m_score ;
            unsigned short m_id ;
         } ;
      using iter_type = Fr::ItemPoolFlat<Info>::iter_type ;
      using const_iter_type = Fr::ItemPoolFlat<Info>::const_iter_type ;

   public:
      LanguageScores(size_t num_languages) ;
      LanguageScores(const LanguageScores *orig) ;
      LanguageScores(const LanguageScores *orig, double scale) ;
      ~LanguageScores() = default ;

      // accessors
      void *userData() const { return m_userdata ; }
      bool sorted() const { return m_sorted ; }
      unsigned numLanguages() const { return m_info.size() ; }
      unsigned maxLanguages() const { return m_info.capacity() ; }
      unsigned activeLanguage() const { return m_active_language ; }
      // language number of the first entry; like languageNumber(), yields
      //   ~0 instead of reading past the end when there are no entries
      unsigned topLanguage() const { return languageNumber(0) ; }
      // language number of entry N, or ~0 (all bits set) if N is out of range
      unsigned languageNumber(size_t N) const
         { return (N < numLanguages()) ? m_info[N].id() : ~0 ; }
      // score of entry N, or -1.0 if N is out of range
      double score(size_t N) const
         { return (N < numLanguages()) ? m_info[N].score() : -1.0 ; }
      double highestScore() const ;
      unsigned highestLangID() const ;
      unsigned nonzeroScores() const ;

      // manipulators -- the per-entry ones silently ignore an out-of-range N
      void setUserData(void *u) { m_userdata = u ; }
      void clear() ;
      void reserve(size_t N) ;
      void setScore(size_t N, double val)
         { if (N < numLanguages()) m_info[N].setScore(val) ; }
      void increment(size_t N, double incr = 1.0)
         { if (N < numLanguages()) m_info[N].incrScore(incr) ; }
      void decrement(size_t N, double decr = 1.0)
         { if (N < numLanguages()) m_info[N].decrScore(decr) ; }
      void scaleScore(size_t N, double scale_factor)
         { if (N < numLanguages()) m_info[N].setScore(m_info[N].score() * scale_factor) ; }
      void scaleScores(double scale_factor) ;
      void sqrtScores() ;
      void add(const LanguageScores *scores, double weight = 1.0) ;
      void addThresholded(const LanguageScores *scores, double threshold,
                          double weight = 1.0) ;
      void subtract(const LanguageScores *scores, double weight = 1.0) ;
      bool lambdaCombineWithPrior(LanguageScores *prior, double lambda,
                                  double smoothing) ;
      void filter(double cutoff_ratio) ;
      void sort(double cutoff_ratio = 0.0) ;
      void sort(double cutoff_ratio, unsigned max_langs) ;
      void mergeDuplicateNamesAndSort(const LanguageID *langinfo) ;
      void filterDuplicates(const class LanguageIdentifier *,
                            bool ignore_region = false) ;
      void setLanguage(unsigned lang)
         { m_active_language = lang ; }

      // iterator support
      iter_type begin() const { return m_info.begin() ; }
      const_iter_type cbegin() const { return m_info.cbegin() ; }
      iter_type end() const { return m_info.end() ; }
      const_iter_type cend() const { return m_info.cend() ; }

   protected: // methods
      void sortByName(const LanguageID *langinfo) ;

   protected: // members
      Fr::ItemPoolFlat<Info> m_info ;
      void* m_userdata ;
      unsigned m_active_language ;
      bool m_sorted ;
   } ;
//----------------------------------------------------------------------
// LanguageScores extended with one weight per entry, held in m_weights.
class WeightedLanguageScores : public LanguageScores
   {
   public:
      WeightedLanguageScores(size_t num_languages,
                             double def_weight = 1.0) ;
      ~WeightedLanguageScores() = default ;

      // accessors
      // weight of entry N; an out-of-range N yields 0.0
      double weight(size_t N) const
         {
         if (N < numLanguages())
            return m_weights[N] ;
         return 0.0 ;
         }

      // manipulators -- both do nothing when N is out of range
      // replace the weight of entry N
      void setWeight(size_t N, double wt)
         {
         if (N >= numLanguages())
            return ;
         m_weights[N] = wt ;
         }
      // add wt to the weight of entry N
      void incrWeight(size_t N, double wt)
         {
         if (N >= numLanguages())
            return ;
         m_weights[N] += wt ;
         }
      void sqrtWeights() ;

   private:
      Fr::DoublePtr m_weights ;
   } ;
//----------------------------------------------------------------------
// The language identifier proper: bundles the n-gram model data, the
// per-language LanguageID records and assorted settings, and offers the
// identify() family for scoring a buffer of bytes.  A second identifier,
// possibly the object itself, is kept for character-set identification.
class LanguageIdentifier
   {
   public:
      // language number denoting "no such language" (all bits set)
      static constexpr uint32_t unknown_lang = static_cast<uint32_t>(~0) ;
      // how much above the minimal score must a language score be to
      //   be considered a reliable identification (and not get a
      //   question mark)?
      static constexpr double UNSURE_CUTOFF = 120 * LANGID_ZERO_SCORE ;
      // set the multiplicative factor by which to decay the prior
      //   scores for each new string
      static constexpr double SMOOTHING_DECAY_FACTOR = 0.25 ;

   public:
      LanguageIdentifier(const char *language_data_file,
                         bool verbose = false) ;
      ~LanguageIdentifier()
         {
         // the charset identifier may be this very object, which must not
         //   be deleted a second time; deleting a null pointer is harmless
         if (m_charsetident != this)
            delete m_charsetident ;
         }

      // factory
      // set charset_file to NULL for default search, "" to not use a separate
      //   database (use the main database for charset ID as well as lang ID)
      static Fr::Owned<LanguageIdentifier> load(const char* db_file,
                                                const char* charset_file,
                                                bool create = false,
                                                bool verbose = false) ;
      static void unload(LanguageIdentifier* id) ;

      // accessors
      // true when model data is present and itself reports good()
      bool good() const
         { return m_langdata && m_langdata->good() ; }
      bool verbose() const
         { return m_verbose ; }
      bool smoothingScores() const
         { return m_smooth ; }
      // true only if the option is switched on AND adjustment factors exist
      bool applyCoverageFactor() const
         { return m_apply_cover_factor && m_adjustments ; }
      size_t allocLanguages() const
         { return m_langinfo.capacity() ; }
      size_t numLanguages() const
         { return m_langinfo.size() ; }
      // adjustment factor for language N; N is not range-checked here
      double adjustmentFactor(size_t N) const
         { return m_adjustments[N] ; }
      LanguageIdentifier *charsetIdentifier() const
         { return m_charsetident ; }
      class LangIDPackedMultiTrie* trie() const
         { return m_langdata.get() ; }
      LangIDPackedMultiTrie *packedTrie() ;
      class LangIDMultiTrie *unpackedTrie() ;
      const char *databaseLocation() const
         { return m_directory ; }
      const char *languageName(size_t N) const ;
      const char *friendlyName(size_t N) const ;
      const char *languageScript(size_t N) const ;
      const uint8_t *alignments() const
         { return m_alignments ; }
      Fr::CharPtr languageDescriptor(size_t N) const ;
      const char *languageEncoding(size_t N) const ;
      const char *languageSource(size_t N) const ;
      // record for language N, or a null pointer if N is out of range
      const LanguageID *languageInfo(size_t N) const
         {
         if (N >= numLanguages())
            return nullptr ;
         return &m_langinfo[N] ;
         }
      // training-data size recorded for language N, or 0 if N is out of range
      uint64_t trainingBytes(size_t N) const
         {
         if (N >= numLanguages())
            return 0 ;
         return m_langinfo[N].trainingBytes() ;
         }
      unsigned languageNumber(const LanguageID *lang_info) const ;
      unsigned languageNumber(const char *langdescript) const ;

      // identification
      bool identify(LanguageScores *scores, const char *buffer,
                    size_t buflen, const uint8_t *alignments,
                    bool ignore_whitespace = false,
                    bool apply_stop_grams = true,
                    size_t length_normalization = 0) const ;
      LanguageScores *identify(const char *buffer, size_t buflen,
                               bool ignore_whitespace = false,
                               bool apply_stop_grams = true,
                               bool enforce_alignments = true) const ;
      LanguageScores *identify(LanguageScores *scores, /* may be NULL */
                               const char *buffer, size_t buflen,
                               bool ignore_whitespace = false,
                               bool apply_stop_grams = true,
                               bool enforce_alignments = true) const ;
      bool finishIdentification(LanguageScores *scores,
                                unsigned select_highestN = 0,
                                double cutoff_ratio = 0.1) const ;
      Fr::Owned<LanguageScores> smoothedScores(LanguageScores* rawscores,
                                               int buflen) const ;
      Fr::Owned<LanguageScores> similarity(unsigned langid) const ;
      bool sameLanguage(size_t L1, size_t L2,
                        bool ignore_region = false) const ;
      double bigramWeight() const
         { return m_bigram_weight ; }

      // modifiers
      uint32_t addLanguage(const LanguageID &info, uint64_t train_bytes) ;
      // install the identifier used for character sets; passing a null
      //   pointer makes this object its own charset identifier
      void charsetIdentifier(LanguageIdentifier *id)
         {
         if (!id)
            id = this ;
         m_charsetident = id ;
         }
      void setBigramWeight(double weight)
         { m_bigram_weight = weight ; }
      void useFriendlyName(bool friendly = true)
         { m_friendly_name = friendly ; }
      void smoothScores(bool sm = true)
         { m_smooth = sm ; }
      void runVerbosely(bool v)
         { m_verbose = v ; }
      void applyCoverageFactor(bool apply)
         { m_apply_cover_factor = apply ; }
      void incrStringCount(size_t langnum) ;
      bool computeSimilarities() ;

      // I/O
      static bool checkSignature(Fr::CFile& f, unsigned *version = nullptr) ;
      bool writeStatistics(Fr::CFile& f) const ;
      bool writeHeader(Fr::CFile& f) const ;
      bool write(Fr::CFile& f) ;
      bool write(const char* filename) const ;
      bool dump(Fr::CFile& f, bool show_ngrams = false) const ;

   private:
      void setAlignments() ;
      bool setAdjustmentFactors() ;
      static Fr::Owned<LanguageIdentifier> tryLoading(const char* db_file,
                                                      bool verbose) ;

   private:
      Fr::Owned<LangIDPackedMultiTrie> m_langdata { nullptr } ;
      Fr::Owned<LangIDMultiTrie> m_uncomplangdata { nullptr } ;
      mutable Fr::Owned<LanguageScores> m_prior_scores { nullptr } ;
      Fr::ItemPoolFlat<LanguageID> m_langinfo ;
      Fr::DoublePtr m_length_factors ;
      Fr::DoublePtr m_adjustments ;
      Fr::UInt8Ptr m_alignments ;
      Fr::UInt8Ptr m_unaligned ;
      Fr::NewPtr<size_t> m_string_counts ;
      Fr::CharPtr m_directory ;
      // deleted by the destructor unless it points back at this object
      LanguageIdentifier* m_charsetident ;
      double m_bigram_weight ;
      bool m_friendly_name ;
      bool m_apply_cover_factor ;
      bool m_verbose ;
      bool m_smooth { true } ;
   } ;
/************************************************************************/
/* Procedural interface */
/************************************************************************/
// Judging by its name, adjusts the penalty weight used for stop-grams.
// NOTE(review): only the declaration is visible in this header; presumably
//   the return value is the previously active weight -- confirm against
//   the definition before relying on it.
double set_stopgram_penalty(double wt) ;
#endif /* !__LANGID_H_INCLUDED */
// end of file langid.h //