Skip to content

Commit 0ec045c

Browse files
author
nshmyrev
committed
Fix crash when an ARPA line is not properly formatted
git-svn-id: svn+ssh://svn.code.sf.net/p/cmusphinx/code/trunk/sphinxbase@13273 94700074-3cef-4d97-a70e-9c8c206c02f5
1 parent 69c473c commit 0ec045c

File tree

5 files changed

+32
-28
lines changed

5 files changed

+32
-28
lines changed

src/libsphinxbase/lm/ngram_model_trie.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,12 @@ read_1grams_arpa(lineiter_t ** li, uint32 count, ngram_model_t * base,
119119
*li = lineiter_next(*li);
120120
if (*li == NULL) {
121121
E_ERROR
122-
("Unexpected end of ARPA file. Failed to read %dth unigram\n",
122+
("Unexpected end of ARPA file. Failed to read unigram %d\n",
123123
i + 1);
124124
return -1;
125125
}
126126
if ((n = str2words((*li)->buf, wptr, 3)) < n_parts) {
127-
E_ERROR("Format error at line %s, Failed to read unigrams\n", (*li)->buf);
127+
E_ERROR("Format error at line %d, Failed to read unigrams\n", (*li)->lineno);
128128
return -1;
129129
}
130130

src/libsphinxbase/lm/ngrams_raw.c

+25-26
Original file line numberDiff line numberDiff line change
@@ -68,28 +68,20 @@ ngram_ord_comparator(const void *a_raw, const void *b_raw)
6868
}
6969

7070
static int
71-
read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
72-
logmath_t * lmath, int order, int order_max,
73-
ngram_raw_t * raw_ngram)
71+
ngrams_raw_read_line(lineiter_t *li, hash_table_t *wid,
72+
logmath_t *lmath, int order, int order_max,
73+
ngram_raw_t *raw_ngram)
7474
{
75-
int n;
75+
int n, i;
7676
int words_expected;
77-
int i;
7877
char *wptr[NGRAM_MAX_ORDER + 1];
7978
uint32 *word_out;
8079

81-
if (*li)
82-
*li = lineiter_next(*li);
83-
if (*li == NULL) {
84-
E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
85-
order);
86-
return -1;
87-
}
8880
words_expected = order + 1;
8981
if ((n =
90-
str2words((*li)->buf, wptr,
82+
str2words(li->buf, wptr,
9183
NGRAM_MAX_ORDER + 1)) < words_expected) {
92-
E_ERROR("Format error; %d-gram ignored: %s\n", order, (*li)->buf);
84+
E_ERROR("Format error; %d-gram ignored at line %d\n", order, li->lineno);
9385
return -1;
9486
}
9587

@@ -136,12 +128,12 @@ read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
136128
}
137129

138130
static int
139-
ngrams_raw_read_order(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
140-
hash_table_t * wid, logmath_t * lmath, uint32 count,
131+
ngrams_raw_read_section(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
132+
hash_table_t * wid, logmath_t * lmath, uint32 *count,
141133
int order, int order_max)
142134
{
143135
char expected_header[20];
144-
uint32 i;
136+
uint32 i, cur;
145137

146138
sprintf(expected_header, "\\%d-grams:", order);
147139
while (*li && strcmp((*li)->buf, expected_header) != 0) {
@@ -153,14 +145,21 @@ ngrams_raw_read_order(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
153145
return -1;
154146
}
155147

156-
*raw_ngrams = (ngram_raw_t *) ckd_calloc(count, sizeof(ngram_raw_t));
157-
for (i = 0; i < count; i++) {
158-
if (read_ngram_instance(li, wid, lmath, order, order_max,
159-
&((*raw_ngrams)[i])) < 0)
160-
break;
148+
*raw_ngrams = (ngram_raw_t *) ckd_calloc(*count, sizeof(ngram_raw_t));
149+
for (i = 0, cur = 0; i < *count && *li != NULL; i++) {
150+
*li = lineiter_next(*li);
151+
if (*li == NULL) {
152+
E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
153+
order);
154+
return -1;
155+
}
156+
if (ngrams_raw_read_line(*li, wid, lmath, order, order_max,
157+
*raw_ngrams + cur) == 0) {
158+
cur++;
159+
}
161160
}
162-
163-
qsort(*raw_ngrams, count, sizeof(ngram_raw_t), &ngram_ord_comparator);
161+
*count = cur;
162+
qsort(*raw_ngrams, *count, sizeof(ngram_raw_t), &ngram_ord_comparator);
164163
return 0;
165164
}
166165

@@ -175,8 +174,8 @@ ngrams_raw_read_arpa(lineiter_t ** li, logmath_t * lmath, uint32 * counts,
175174
(ngram_raw_t **) ckd_calloc(order - 1, sizeof(*raw_ngrams));
176175

177176
for (order_it = 2; order_it <= order; order_it++) {
178-
if (ngrams_raw_read_order(&raw_ngrams[order_it - 2], li, wid, lmath,
179-
counts[order_it - 1], order_it, order) < 0)
177+
if (ngrams_raw_read_section(&raw_ngrams[order_it - 2], li, wid, lmath,
178+
counts + order_it - 1, order_it, order) < 0)
180179
break;
181180
}
182181

test/unit/test_ngram/107.lm.gz

291 Bytes
Binary file not shown.

test/unit/test_ngram/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ EXTRA_DIST = \
3131
104.lm.gz \
3232
105.lm.gz \
3333
106.lm.gz \
34+
107.lm.gz \
3435
turtle.lm \
3536
turtle.lm.dmp \
3637
turtle.ug.lm \

test/unit/test_ngram/test_lm_read.c

+4
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ main(int argc, char *argv[])
7373
model = ngram_model_read(NULL, LMDIR "/106.lm.gz", NGRAM_ARPA, lmath);
7474
TEST_EQUAL(NULL, model);
7575

76+
/* Read a corrupted language model containing a badly formatted line; an error is expected */
77+
model = ngram_model_read(NULL, LMDIR "/107.lm.gz", NGRAM_ARPA, lmath);
78+
TEST_EQUAL(0, ngram_model_free(model));
79+
7680
/* Read a language model */
7781
model = ngram_model_read(NULL, LMDIR "/100.lm.bz2", NGRAM_ARPA, lmath);
7882
test_lm_vals(model);

0 commit comments

Comments
 (0)