Edinburgh Speech Tools
2.4-release
Loading...
Searching...
No Matches
ngram_test_main.cc
1
/*************************************************************************/
2
/* */
3
/* Centre for Speech Technology Research */
4
/* University of Edinburgh, UK */
5
/* Copyright (c) 1995,1996 */
6
/* All Rights Reserved. */
7
/* */
8
/* Permission is hereby granted, free of charge, to use and distribute */
9
/* this software and its documentation without restriction, including */
10
/* without limitation the rights to use, copy, modify, merge, publish, */
11
/* distribute, sublicense, and/or sell copies of this work, and to */
12
/* permit persons to whom this work is furnished to do so, subject to */
13
/* the following conditions: */
14
/* 1. The code must retain the above copyright notice, this list of */
15
/* conditions and the following disclaimer. */
16
/* 2. Any modifications must be clearly marked as such. */
17
/* 3. Original authors' names are not deleted. */
18
/* 4. The authors' names are not used to endorse or promote products */
19
/* derived from this software without specific prior written */
20
/* permission. */
21
/* */
22
/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25
/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30
/* THIS SOFTWARE. */
31
/* */
32
/*************************************************************************/
33
/* Authors: Simon King */
34
/* Date : July 1995 */
35
/*-----------------------------------------------------------------------*/
36
/* EST_Ngrammar test program */
37
/* */
38
/*=======================================================================*/
39
#include "EST.h"
40
#include "EST_Ngrammar.h"
41
42
43
/** @name <command>ngram_test</command> <emphasis> Test n-gram language model </emphasis>
44
@id ngram_test_manual
45
* @toc
46
*/
47
48
//@{
49
50
51
/**@name Synopsis
52
*/
53
//@{
54
55
//@synopsis
56
57
/**
58
ngram_test is for testing ngram models generated from
59
<link linkend=ngram-build-manual>ngram_build</link>.
60
61
<formalpara> <para> <title> How do we test an ngram model ? </title>
62
</para>
63
64
<para> ngram_test will compute the entropy (or perplexity, see below)
65
of some test data, given an ngram model. The entropy gives a measure
66
of how likely the ngram model is to have generated the test
67
data. Entropy is defined (for a sliding-window type ngram) as:
68
69
\[H = -\frac{1}{Q} \sum_{i=1}^{Q} log P(w_i | w_{i-1}, w_{i-2},... w_{i-N+1}) \]
70
71
where \(Q\) is the number of words of test data and \(N\) is the order
72
of the ngram model. Perplexity is a more intuitive measure, defined as:
73
74
\[B = 2^H \]
75
76
The perplexity of an ngram model with vocabulary size V will be
77
between 1 and V. Low perplexity indicates a more predictable language,
78
and in speech recognition, models with low perplexity on test data
79
(i.e. data NOT used to estimate the model in the first place)
80
typically give better recognition accuracy than models with higher
81
perplexity (this is not guaranteed, however).
82
83
ngram_test works with non-sliding-window type models when the input
84
format is <parameter>ngram_per_line</parameter>.
85
86
</para>
87
</formalpara>
88
89
<formalpara>
90
<para><title>Input data format</title></para>
91
<para> The data input format options are the same as
92
<link linkend=ngram-build-manual>ngram_build</link>, as is the treatment of sentence start/end using
93
special tags.
94
</para>
95
<para>
96
97
Note: To get meaningful entropy/perplexity figures, it is recommended that
98
you use the same data input format in both
99
<link linkend=ngram-build-manual>ngram_build</link> and <link linkend=ngram-test-manual>ngram_test</link>, and the treatment of
100
sentence start/end should be the same.
101
</para>
102
</formalpara>
103
104
105
@see ngram_build */
106
//@}
107
108
/**@name OPTIONS
109
*/
110
//@{
111
112
//@options
113
114
//@}
115
116
117
int
main(
int
argc
,
char
**
argv
)
118
{
119
//int order;
120
EST_StrList
files
,
script
;
121
EST_Option
al
, op;
122
EST_String
wordlist_file
,
script_file
,
in_file
,
format
;
123
EST_String
prev_tag
,
prev_prev_tag
,
last_tag
;
124
EST_Litem
*p;
125
//EST_Ngrammar::representation_t representation =
126
//EST_Ngrammar::dense;
127
128
EST_StrList
wordlist
;
129
EST_Ngrammar
ngrammar
;
130
bool
per_file_stats
=
false
;
131
bool
raw_stats
=
false
;
132
bool
brief
=
false
;
133
EST_String
input_format
;
134
135
double
raw_entropy
,count,entropy,
perplexity
,
total_raw_H
,
total_count
;
136
total_count
= 0;
137
total_raw_H
= 0;
138
139
parse_command_line
140
(
argc
,
argv
,
141
EST_String
(
"[input file0] [input file1] ...\n"
)+
142
"-g <ifile> grammar file (required)\n"
+
143
"-w <ifile> filename containing word list (required for some grammar formats)\n"
+
144
"-S <ifile> script file\n"
+
145
"-raw_stats print unnormalised entropy and sample count\n"
+
146
"-brief print results in brief format\n"
+
147
"-f print stats for each file\n"
+
148
"\n"
+
149
"-input_format <string>\n"
+
150
" format of input data (default sentence_per_line)\n"
+
151
" may also be sentence_per_file, or ngram_per_line.\n"
+
152
"\n"
+
153
"Pseudo-words :\n"
+
154
"-prev_tag <string>\n"
+
155
" tag before sentence start\n"
+
156
"-prev_prev_tag <string>\n"
+
157
" all words before 'prev_tag'\n"
+
158
"-last_tag <string>\n"
+
159
" after sentence end\n"
+
160
"-default_tags\n"
+
161
" use default tags of "
+SENTENCE_START_MARKER+
162
","
+SENTENCE_END_MARKER+
" and "
+SENTENCE_END_MARKER+
"\n"
+
163
" respectively\n"
,
164
files
,
al
);
165
166
167
if
(
al
.present(
"-w"
))
168
wordlist_file
=
al
.val(
"-w"
);
169
else
{
170
wordlist_file
=
""
;
171
}
172
173
if
(
al
.present(
"-f"
))
174
per_file_stats
=
true
;
175
if
(
al
.present(
"-input_format"
))
176
input_format
=
al
.val(
"-input_format"
);
177
else
178
input_format
=
"sentence_per_line"
;
179
180
if
(
al
.present(
"-raw_stats"
) ||
al
.present(
"-r"
))
181
raw_stats
=
true
;
182
183
if
(
al
.present(
"-brief"
) ||
al
.present(
"-b"
) )
184
brief
=
true
;
185
186
187
if
(
al
.present(
"-default_tags"
))
188
{
189
prev_tag
= SENTENCE_START_MARKER;
190
prev_prev_tag
= SENTENCE_END_MARKER;
191
last_tag
= SENTENCE_END_MARKER;
192
}
193
194
if
(
al
.present(
"-prev_tag"
))
195
{
196
if
(
al
.present(
"-default_tags"
))
197
cerr
<<
"test_ngram: WARNING : -prev_tag overrides -default_tags"
198
<<
endl
;
199
prev_tag
=
al
.val(
"-prev_tag"
);
200
}
201
202
if
(
al
.present(
"-prev_prev_tag"
))
203
{
204
if
(
al
.present(
"-default_tags"
))
205
cerr
<<
"test_ngram: WARNING : -prev_prev_tag overrides -default_tags"
<<
endl
;
206
prev_prev_tag
=
al
.val(
"-prev_prev_tag"
);
207
}
208
209
if
(
al
.present(
"-last_tag"
))
210
{
211
if
(
al
.present(
"-default_tags"
))
212
cerr
<<
"test_ngram: WARNING : -last_tag overrides -default_tags"
<<
endl
;
213
last_tag
=
al
.val(
"-last_tag"
);
214
}
215
216
if
( ( (
prev_tag
==
""
) || (
prev_prev_tag
==
""
) || (
last_tag
==
""
) )
217
&& ( (
prev_tag
!=
""
) || (
prev_prev_tag
!=
""
) || (
last_tag
!=
""
) ) )
218
{
219
cerr
<<
"test_ngram: ERROR : if any tags are given, ALL must be given"
<<
endl
;
220
exit
(1);
221
}
222
223
224
// script
225
if
(
al
.present(
"-S"
))
226
{
227
script_file
=
al
.val(
"-S"
);
228
229
if
(load_StrList(
script_file
,
script
) != format_ok)
230
{
231
cerr
<<
"test_ngram: Could not read script from file "
232
<<
script_file
<<
endl
;
233
exit
(1);
234
}
235
}
236
237
if
(
al
.present(
"-g"
))
238
in_file
=
al
.val(
"-g"
);
239
else
240
{
241
cerr
<<
"test_ngram: Must give a grammar filename using -g"
<<
endl
;
242
exit
(1);
243
}
244
245
// plus any files on command line
246
// except file "-" unless there is no script
247
if
(
script
.head()==NULL)
248
script
+=
files
;
249
else
250
for
(p=
files
.head();p!=0;p=p->next())
251
if
(
files
(p) !=
"-"
)
252
script
.append(
files
(p));
253
254
if
(
script
.head() == NULL)
255
{
256
cerr
<<
"test_ngram: No test files given"
<<
endl
;
257
exit
(1);
258
}
259
260
if
(
wordlist_file
!=
""
)
261
{
262
// load wordlist
263
if
(load_StrList(
wordlist_file
,
wordlist
) != format_ok)
264
{
265
cerr
<<
"test_ngram: Could not read wordlist from file "
<<
wordlist_file
266
<<
endl
;
267
exit
(1);
268
}
269
270
// load grammar using wordlist
271
if
(
ngrammar
.load(
in_file
,
wordlist
) != format_ok)
272
{
273
cerr
<<
"test_ngram: Failed to load grammar"
<<
endl
;
274
exit
(1);
275
}
276
}
277
else
278
{
279
if
(
ngrammar
.load(
in_file
) != format_ok)
280
{
281
cerr
<<
"test_ngram: Failed to load grammar"
<<
endl
;
282
exit
(1);
283
}
284
}
285
286
if
(!
brief
)
287
{
288
cout
<<
"Ngram Test Results"
<<
endl
;
289
cout
<<
"=================="
<<
endl
;
290
}
291
292
for
(p =
script
.head(); p; p = p->next())
293
{
294
// test each file
295
if
(test_stats(
ngrammar
,
296
script
(p),
297
raw_entropy
,count,
298
entropy,
perplexity
,
299
input_format
,
300
prev_tag
,
301
prev_prev_tag
))
302
{
303
total_raw_H
+=
raw_entropy
;
304
total_count
+= count;
305
306
if
(
per_file_stats
)
307
{
308
if
(
brief
)
309
cout
<< basename(
script
(p)) <<
" \t"
;
310
else
311
cout
<<
script
(p) <<
endl
;
312
313
if
(
raw_stats
)
314
{
315
if
(
brief
)
316
cout
<<
raw_entropy
<<
" "
<< count <<
" "
;
317
else
318
{
319
cout
<<
" raw entropy "
<<
raw_entropy
<<
endl
;
320
cout
<<
" count "
<< count <<
endl
;
321
}
322
}
323
324
if
(
brief
)
325
cout
<< entropy <<
" "
<<
perplexity
<<
endl
;
326
else
327
{
328
cout
<<
" entropy "
<< entropy <<
endl
;
329
cout
<<
" perplexity "
<<
perplexity
<<
endl
<<
endl
;
330
}
331
}
332
}
333
else
334
{
335
cerr
<<
"test_ngram: WARNING : file '"
<<
script
(p)
336
<<
"' could not be processed"
<<
endl
;
337
}
338
339
}
340
if
(
total_count
> 0)
341
{
342
if
(!
brief
)
343
cout
<<
"Summary for grammar "
<<
in_file
<<
endl
;
344
else
345
if
(
per_file_stats
)
346
cout
<<
"summary \t"
;
347
348
if
(
raw_stats
)
349
{
350
if
(
brief
)
351
cout
<<
total_raw_H
<<
" "
<<
total_count
<<
" "
;
352
else
353
{
354
cout
<<
" raw entropy "
<<
total_raw_H
<<
endl
;
355
cout
<<
" count "
<<
total_count
<<
endl
;
356
}
357
}
358
if
(
brief
)
359
{
360
cout
<<
total_raw_H
/
total_count
;
361
cout
<<
" "
<<
pow
(2.0,
total_raw_H
/
total_count
);
362
cout
<<
endl
;
363
}
364
else
365
{
366
cout
<<
" entropy "
<<
total_raw_H
/
total_count
<<
endl
;
367
cout
<<
" perplexity "
<<
pow
(2.0,
total_raw_H
/
total_count
);
368
cout
<<
endl
;
369
}
370
}
371
else
372
{
373
cerr
<<
"test_ngram: No data processed"
<<
endl
;
374
}
375
376
// everything went okay
377
return
0;
378
}
379
380
381
/* Hook called by the EST command-line machinery to let a program
 * override library option defaults.  ngram_test needs no overrides,
 * so both parameters are deliberately ignored.
 */
void override_lib_ops(EST_Option &a_list, EST_Option &al)
{
    // silence unused-parameter warnings; intentionally a no-op
    (void)a_list;
    (void)al;
}
386
387
/** @name Hints
388
389
<title>I got a perplexity of Infinity - what went wrong ?</title>
390
391
A perplexity of Infinity means that at least one of the ngrams in your
392
test data had a probability of zero. Possible reasons for this include:
393
394
<itemizedlist>
395
396
<listitem><para>The training data had no examples of this ngram, and
397
you did not specify a floor for zero frequency ngrams in
398
\Ref{ngram_build} </para></listitem>
399
<listitem><para>You used differing input formats for \Ref{ngram_build}
400
and \Ref{ngram_test}. </para></listitem>
401
<listitem><para>You used differing sentence start/end treatments in
402
\Ref{ngram_build} and \Ref{ngram_test}. </para></listitem>
403
</itemizedlist>
404
405
*/
406
407
//@{
408
//@}
409
410
//@}
EST_Hash_Pair
Definition
EST_THash.h:75
EST_Ngrammar
Definition
EST_Ngrammar.h:209
EST_Option
Definition
EST_Option.h:50
EST_String
Definition
EST_String.h:70
EST_TList
Definition
EST_TList.h:109
EST_UItem
Definition
EST_UList.h:51
main
ngram_test_main.cc
Generated on Tue Mar 12 2024 07:03:34 for Edinburgh Speech Tools by
1.9.8