00001 00002 // $Id: lang_model.H,v 1.4 2009/10/16 13:52:52 stanchen Exp $ 00003 00004 /** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ** 00005 * @file lang_model.H 00006 * @brief Contains LangModel class. 00007 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00008 00009 #ifndef _LANG_MODEL_H 00010 #define _LANG_MODEL_H 00011 00012 00013 #include "util.H" 00014 00015 00016 /** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ** 00017 * Language model class. 00018 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00019 class LangModel 00020 { 00021 public: 00022 /** Ctor; get configuration parameters from "params". **/ 00023 LangModel(const map<string, string>& params = ParamsType()); 00024 00025 /** Get associated mapping from word spellings to integer indices. **/ 00026 const SymbolTable& get_sym_table() const { return *m_symTable.get(); } 00027 00028 /** Returns index of beginning-of-sentence token. **/ 00029 int get_bos_index() const { return m_bosIdx; } 00030 00031 /** Returns index of end-of-sentence token. **/ 00032 int get_eos_index() const { return m_eosIdx; } 00033 00034 /** Returns index of unknown token. **/ 00035 int get_unknown_index() const { return m_unkIdx; } 00036 00037 /** Returns "n" of n-gram. **/ 00038 int get_ngram_length() const { return m_n; } 00039 00040 /** Returns conditional prob of last word given previous words. 00041 * The argument @p ngram can be any length from 1 to the 00042 * value of #get_ngram_length(). 00043 **/ 00044 double get_prob(const vector<int>& ngram) const; 00045 00046 private: 00047 00048 /** Count n-grams in a training sentence. **/ 00049 void count_sentence_ngrams(const vector<int>& wordIdxList); 00050 00051 /** Returns conditional prob of last word given previous words 00052 * under plus-delta smoothing. 00053 * The argument @p ngram can be any length from 1 to the 00054 * value of #get_ngram_length(). 00055 **/ 00056 double get_prob_plus_delta(const vector<int>& ngram) const; 00057 00058 /** Returns conditional prob of last word given previous words 00059 * under Witten-Bell smoothing. 00060 * The argument @p ngram can be any length from 1 to the 00061 * value of #get_ngram_length(). 00062 **/ 00063 double get_prob_witten_bell(const vector<int>& ngram) const; 00064 00065 /** Write out all counts to a file, for debugging. **/ 00066 void write_counts(const string& fileName) const; 00067 00068 private: 00069 /** Stores copy of input parameters. **/ 00070 map<string, string> m_params; 00071 00072 /** Map from words to integer indices. **/ 00073 shared_ptr<SymbolTable> m_symTable; 00074 00075 /** Index of beginning-of-sentence token. **/ 00076 int m_bosIdx; 00077 00078 /** Index of end-of-sentence token. **/ 00079 int m_eosIdx; 00080 00081 /** Index of unknown token. **/ 00082 int m_unkIdx; 00083 00084 /** Value of "n". **/ 00085 int m_n; 00086 00087 /** The delta in plus-delta smoothing; if negative, do Witten-Bell 00088 * smoothing instead. 00089 **/ 00090 double m_delta; 00091 00092 /** Map from pred n-grams to their counts. **/ 00093 NGramCounter m_predCounts; 00094 00095 /** Map from hist n-grams to their counts. **/ 00096 NGramCounter m_histCounts; 00097 00098 /** Map from hist n-grams to their "1+" counts. **/ 00099 NGramCounter m_histOnePlusCounts; 00100 }; 00101 00102 00103 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 00104 * 00105 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00106 00107 #endif 00108 00109
1.5.5