Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
ngrammar_utils.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1999 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : February 1999 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* A rationalization of some of the general functions */
38/* */
39/*=======================================================================*/
40#include <iostream>
41#include <cstring>
42#include "EST_String.h"
43#include "EST_Token.h"
44#include "EST_error.h"
45#include "EST_Ngrammar.h"
46
47static int get_next_window(EST_TokenStream &ts,
50 EST_Ngrammar &ngram)
51{
52 int i;
53 if ((input_format == "sentence_per_line") ||
54 (input_format == "sentence_per_file"))
55 {
56 EST_String t = ts.get().string();
57 slide(window,-1);
58 window[ngram.order()-1] = t;
59 if (ngram.wordlist_index(t) == -1)
60 cerr << "EST_Ngrammar test: skipping bad word \"" <<
61 t << "\"" << endl;
62 }
63 else if (input_format == "ngram_per_line")
64 {
65 for (i=0; i < ngram.order(); i++)
66 {
67 EST_String t = ts.get().string();
68 window[i] = t;
69 if (ngram.wordlist_index(t) == -1)
70 cerr << "EST_Ngrammar test: skipping bad word \"" <<
71 t << "\"" << endl;
72 }
73 }
74 else
75 EST_error("EST_Ngrammar test: unknown input format \"%s\"\n",
76 (const char *)input_format);
77
78 // Sigh, you pull a little thread and it all falls down
79 // For the time being can only deal in StrVectors rather than
80 // IVectors
81 for (i=0; i < ngram.order(); i++)
82 if (ngram.wordlist_index(window(i)) == -1)
83 return FALSE;
84 return TRUE;
85}
86
87bool test_stats(EST_Ngrammar &ngram,
88 const EST_String &filename,
89 double &raw_entropy,
90 double &count,
91 double &entropy,
92 double &perplexity,
94 const EST_String &prev,
95 const EST_String &prev_prev,
96 const EST_String &last)
97{
98 // Apply an ngram to some data and report on its performance
99 // Output entropy and test set perplexity
100 // H = -1/Q . log P(wi | wi-1, wi-2, ... wi-n)
101 // H_p = 2^H
102 // Rabiner and Juang p450
104 double H,prob;
105 int Q;
106 EST_StrVector window(ngram.order());
107 (void)last;
108
109 if (filename == "-")
110 ts.open(stdin,FALSE);
111 else if (ts.open(filename) == -1)
112 EST_error("EST_Ngrammar test: unable to open test file \"%s\"\n",
113 (const char *)filename);
114
115 Q=0;
116 H=0.0;
117 ngram.fill_window_start(window,prev,prev_prev);
118
119 while (!ts.eof() &&
120 (get_next_window(ts,window,input_format,ngram) == TRUE))
121 {
122 prob = ngram.probability(window);
123 H += log(prob);
124 Q++;
125 if ((input_format == "sentence_per_line") && (ts.eoln()))
126 ngram.fill_window_start(window,prev,prev_prev);
127 }
128
129 count = Q;
130 raw_entropy = -1 * H;
131 entropy = -1 * (H/Q);
132 perplexity = pow(2.0,entropy);
133
134// printf("count %g entropy %g perplexity %g\n",
135// count,entropy,perplexity);
136
137 return true;
138}