Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
EST_DProbDist.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : July 1996 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* Simple statistics (for discrete probability distributions */
38/* */
39/*=======================================================================*/
40#include <iostream>
41#include <fstream>
42#include <cstdlib>
43#include <cstdio>
44#include <cstring>
45#include "EST_String.h"
46#include "EST_TKVL.h"
47#include "EST_simplestats.h"
48
49/* We share ints and pointers for two types of probability distributions */
50/* The know discrete sets can be indexed by ints which is *much* faster */
51/* the indices pass around a pointers but the lower part contain ints in */
52/* the discrete case */
53/* On 64bit architectures this is a issue so we need have some macros */
54/* to help us here. */
55
56const int est_64to32(void *c)
57{ /* this returns the bottom end of the pointer as an unsigned int */
58 /* I believe this is a safe way to do it, we check the bits in the */
59 /* 64 bit int and multiply them out in the 32 bit one */
60 /* there might be better ways, but I think you'd need to think about */
61 /* byte order then */
62 long long l;
63 int d;
64 int i,x;
65
66 l = (long long)c;
67
68 for (i=0,d=0,x=1; i<24; i++)
69 {
70 if (l & 1)
71 d += x;
72 l = l >> 1;
73 x += x;
74 }
75
76 return d;
77}
78/* #define tprob_int(X) ((sizeof(void *) != 8) ? est_64to32(X) : (int)X) */
79#define tprob_int(X) (est_64to32(X))
80
81
82EST_DiscreteProbDistribution::EST_DiscreteProbDistribution(const EST_Discrete *d,
83 const double n_samples, const EST_DVector &counts)
84{
85 type = tprob_discrete;
86 discrete = d;
87 num_samples = n_samples;
88
89 icounts = counts;
90}
91
92EST_DiscreteProbDistribution::EST_DiscreteProbDistribution(const EST_DiscreteProbDistribution &b)
93{
94 copy(b);
95}
96
98{
99 type = b.type;
100 num_samples = b.num_samples;
101 discrete = b.discrete;
102 icounts = b.icounts;
103 scounts = b.scounts;
104}
105
107{
108 icounts.resize(0);
109}
110
112{
113 type = tprob_string;
114 num_samples = 0;
115 discrete = 0;
116}
117
119{
120 int i;
121 clear();
122 type = tprob_discrete;
123 num_samples = 0;
124 discrete = new EST_Discrete(vocab);
125
126 icounts.resize(vocab.length());
127 for (i=0; i<icounts.length(); i++)
128 icounts.a_no_check(i) = 0;
129
130 return true;
131}
132
134{
135 int i;
136 clear();
137 type = tprob_discrete;
138 num_samples = 0;
139 discrete = d;
140 icounts.resize(d->length());
141 for (i=0; i<icounts.length(); i++)
142 icounts.a_no_check(i) = 0;
143}
144
146{
147 icounts[tprob_int(i)] += count;
148 num_samples += count;
149}
150
151void EST_DiscreteProbDistribution::cumulate(int i,double count)
152{
153 icounts[i] += count;
154 num_samples += count;
155}
156
158{
159 EST_Litem *p;
160
161 if (type == tprob_discrete)
162 {
163 int idx = discrete->index(s);
164 icounts[idx] += count;
165 }
166 else // its a (slow) string type
167 {
168 for (p=scounts.list.head(); p != 0; p=p->next())
169 {
170 if (scounts.list(p).k == s)
171 {
172 scounts.list(p).v += count;
173 break;
174 }
175 }
176 if (p == 0) // first occurrence
177 scounts.add_item(s,count,1); // add without search
178 }
179 num_samples += count;
180}
181
183{
184 EST_Litem *p,*t;
185 double max = 0;
186
187 if (type == tprob_discrete)
188 {
189 int i,pt=-1;
190 for (i=0; i < icounts.length(); i++)
191 if (icounts.a_no_check(i) > max)
192 {
193 pt = i;
194 max = icounts.a_no_check(i);
195 }
196 if (max == 0)
197 {
198 if(prob != NULL)
199 *prob = 0.0;
200 return EST_String::Empty;
201 }
202 else
203 {
204 if(prob != NULL)
205 *prob = probability(pt);
206 return discrete->name(pt);
207 }
208 }
209 else
210 {
211 t = 0;
212 for (p=scounts.list.head(); p != 0; p=p->next())
213 if (scounts.list(p).v > max)
214 {
215 t = p;
216 max = scounts.list(p).v;
217 }
218 if (max == 0)
219 {
220 if(prob != NULL)
221 *prob = 0.0;
222 return EST_String::Empty;
223 }
224 else
225 {
226 if(prob != NULL)
227 *prob = (double)max/num_samples;
228 return scounts.list(t).k;
229 }
230 }
231}
232
233double EST_DiscreteProbDistribution::probability(const EST_String &s) const
234{
235 if (frequency(s) == 0.0)
236 return 0.0;
237 else
238 return (double)frequency(s)/num_samples;
239}
240
241double EST_DiscreteProbDistribution::probability(const int i) const
242{
243 if (frequency(i) == 0.0)
244 return 0.0;
245 else
246 return (double)frequency(i)/num_samples;
247}
248
249double EST_DiscreteProbDistribution::frequency(const EST_String &s) const
250{
251 if (type == tprob_discrete)
252 return icounts.a_no_check(discrete->index(s));
253 else
254 return scounts.val_def(s,0);
255}
256
257double EST_DiscreteProbDistribution::frequency(const int i) const
258{
259 if (type == tprob_discrete)
260 return icounts(i);
261 else
262 {
263 cerr << "ProbDistribution: can't access string type pd with int\n";
264 return 0;
265 }
266}
267
269{
270 if (type == tprob_discrete)
271 {
272 num_samples -= icounts.a_no_check(discrete->index(s));
273 num_samples += c;
274 icounts.a_no_check(discrete->index(s)) = c;
275 }
276 else
277 {
278 num_samples -= scounts.val_def(s,0);
279 num_samples += c;
280 scounts.add_item(s,c);
281 }
282}
283
285{
286 if (type == tprob_discrete)
287 {
288 num_samples -= icounts[i];
289 num_samples += c;
290 icounts[i] = c;
291 }
292 else
293 {
294 cerr << "ProbDistribution: can't access string type pd with int\n";
295 }
296
297}
298
300{
301 if (type == tprob_discrete)
302 {
303 num_samples -= icounts[tprob_int(i)];
304 num_samples += c;
305 icounts[tprob_int(i)] = c;
306 }
307 else
308 {
309 cerr << "ProbDistribution: can't access string type pd with int\n";
310 }
311
312}
313
314
316{
317 if (type == tprob_discrete)
318 icounts.a_no_check(discrete->index(s)) = c;
319 else
320 scounts.add_item(s,c);
321}
322
324{
325 if (type == tprob_discrete)
326 icounts[i] = c;
327 else
328 cerr << "ProbDistribution: can't access string type pd with int\n";
329}
330
332{
333 if (type == tprob_discrete)
334 icounts[tprob_int(i)] = c;
335 else
336 cerr << "ProbDistribution: can't access string type pd with int\n";
337}
338
340{
341 // Returns the entropy of the current distribution
342 double e=0.0;
343 EST_Litem *p;
344 int i;
345
346 if (type == tprob_discrete)
347 {
348 for (i=0; i < icounts.length(); i++)
349 {
350 double prob = icounts.a_no_check(i)/num_samples;
351 if (prob != 0.0)
352 e += prob * log(prob); /* log10(prob)/log10(2) */
353 }
354 }
355 else
356 {
357 for (p=scounts.list.head(); p != 0; p=p->next())
358 {
359 double prob = scounts.list(p).v/num_samples;
360 if (prob != 0.0)
361 e += prob * log(prob); /* log10(prob)/log10(2) */
362 }
363 }
364
365 return -e;
366
367}
368
369// For iterating through members of a probability distribution
371{
372 if (type == tprob_discrete)
373 return NULL;
374 else
375 return scounts.list.head();
376}
377
379{
380 if (type == tprob_discrete)
381 return (tprob_int(idx) >= icounts.length());
382 else
383 return (idx == 0);
384}
385
387{
388 if (type == tprob_discrete)
389 return (EST_Litem *)(((unsigned char *)idx)+1);
390 else
391 return idx->next();
392}
393
395{
396 if (type == tprob_discrete)
397 return discrete->name(tprob_int(idx));
398 else
399 return scounts.list(idx).k;
400}
401
403{
404 if (type == tprob_discrete)
405 {
406 s = discrete->name(tprob_int(idx));
407 freq = icounts(tprob_int(idx));
408 }
409 else
410 {
411 s = scounts.list(idx).k;
412 freq = scounts.list(idx).v;
413 }
414}
415
417{
418 if (type == tprob_discrete)
419 {
420 prob = probability(tprob_int(idx));
421 s = discrete->name(tprob_int(idx));
422 }
423 else
424 {
425 s = scounts.list(idx).k;
426 prob = (double)scounts.list(idx).v/num_samples;
427 }
428}
429
431{
432 // Output best with probabilities
433 EST_Litem *i;
434 double prob;
435 double sum=0;
436 EST_String name;
437
438 s << "(";
439 for (i=pd.item_start(); !pd.item_end(i); i=pd.item_next(i))
440 {
441 pd.item_prob(i,name,prob);
442 s << "(" << name << "=" << prob << ") ";
443 sum+=prob;
444 }
445 s << "best=" << pd.most_probable(&prob) << " samples="
446 << pd.samples() << " sum=" << sum << ")";
447 return s;
448}
449
450EST_DiscreteProbDistribution &EST_DiscreteProbDistribution::operator=(const EST_DiscreteProbDistribution &a)
451{
452 // I'd much rather this was never called
453 copy(a);
454 return *this;
455}
456
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
EST_Litem * item_start() const
Used for iterating through members of the distribution.
void item_prob(EST_Litem *idx, EST_String &s, double &prob) const
During iteration returns name and probability given index.
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
const EST_String & item_name(EST_Litem *idx) const
During iteration returns name given index.
double samples(void) const
Total number of example found.
void clear(void)
Reset, clearing all counts and vocabulary.
void override_frequency(const EST_String &s, double c)
Sets the frequency of named item, without modifying {\tt num_samples}.
void copy(const EST_DiscreteProbDistribution &b)
Copy all data from another DPD to this.
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
void set_frequency(const EST_String &s, double c)
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
const int length(void) const
The number of members in the discrete.
static const EST_String Empty
Constant empty string.
Definition EST_String.h:111
int add_item(const K &rkey, const V &rval, int no_search=0)
add key-val pair to list
Definition EST_TKVL.cc:248
EST_TList< EST_TKVI< K, V > > list
Linked list of key-val pairs. Don't use this as it will be made private in the future.
Definition EST_TKVL.h:93
const V & val_def(const K &rkey, const V &def) const
value or default
Definition EST_TKVL.cc:151
void resize(int n, int set=1)
resize vector
INLINE int length() const
number of items in vector.
INLINE const T & a_no_check(int n) const
read-only const access operator: without bounds checking