asr_lib: lang_model.H Source File

00001 
00002 //  $Id: lang_model.H,v 1.4 2009/10/16 13:52:52 stanchen Exp $
00003 
00004 /** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **
00005 *   @file lang_model.H
00006 *   @brief Contains LangModel class.
00007 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00008 
00009 #ifndef _LANG_MODEL_H
00010 #define _LANG_MODEL_H
00011 
00012 
00013 #include "util.H"
00014 
00015 
00016 /** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **
00017 *   Language model class.
00018 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00019 class LangModel
00020     {
00021     public:
00022         /** Ctor; get configuration parameters from "params". **/
00023         LangModel(const map<string, string>& params = ParamsType());
00024 
00025         /** Get associated mapping from word spellings to integer indices. **/
00026         const SymbolTable& get_sym_table() const { return *m_symTable.get(); }
00027 
00028         /** Returns index of beginning-of-sentence token. **/
00029         int get_bos_index() const { return m_bosIdx; }
00030 
00031         /** Returns index of end-of-sentence token. **/
00032         int get_eos_index() const { return m_eosIdx; }
00033 
00034         /** Returns index of unknown token. **/
00035         int get_unknown_index() const { return m_unkIdx; }
00036 
00037         /** Returns "n" of n-gram. **/
00038         int get_ngram_length() const { return m_n; }
00039 
00040         /** Returns conditional prob of last word given previous words.
00041         *   The argument @p ngram can be any length from 1 to the
00042         *   value of #get_ngram_length().
00043         **/
00044         double get_prob(const vector<int>& ngram) const;
00045 
00046     private:
00047 
00048         /** Count n-grams in a training sentence. **/
00049         void count_sentence_ngrams(const vector<int>& wordIdxList);
00050 
00051         /** Returns conditional prob of last word given previous words
00052         *   under plus-delta smoothing.
00053         *   The argument @p ngram can be any length from 1 to the
00054         *   value of #get_ngram_length().
00055         **/
00056         double get_prob_plus_delta(const vector<int>& ngram) const;
00057 
00058         /** Returns conditional prob of last word given previous words
00059         *   under Witten-Bell smoothing.
00060         *   The argument @p ngram can be any length from 1 to the
00061         *   value of #get_ngram_length().
00062         **/
00063         double get_prob_witten_bell(const vector<int>& ngram) const;
00064 
00065         /** Write out all counts to a file, for debugging. **/
00066         void write_counts(const string& fileName) const;
00067 
00068     private:
00069         /** Stores copy of input parameters. **/
00070         map<string, string> m_params;
00071 
00072         /** Map from words to integer indices. **/
00073         shared_ptr<SymbolTable> m_symTable;
00074 
00075         /** Index of beginning-of-sentence token. **/
00076         int m_bosIdx;
00077 
00078         /** Index of end-of-sentence token. **/
00079         int m_eosIdx;
00080 
00081         /** Index of unknown token. **/
00082         int m_unkIdx;
00083 
00084         /** Value of "n". **/
00085         int m_n;
00086 
00087         /** The delta in plus-delta smoothing; if negative, do Witten-Bell
00088         *   smoothing instead.
00089         **/
00090         double m_delta;
00091 
00092         /** Map from pred n-grams to their counts. **/
00093         NGramCounter m_predCounts;
00094 
00095         /** Map from hist n-grams to their counts. **/
00096         NGramCounter m_histCounts;
00097 
00098         /** Map from hist n-grams to their "1+" counts. **/
00099         NGramCounter m_histOnePlusCounts;
00100     };
00101 
00102 
00103 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
00104 *
00105 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00106 
00107 #endif
00108 
00109