00001 00002 // $Id: lab3_lm.H,v 1.7 2009/10/15 23:26:20 stanchen Exp $ 00003 00004 00005 /** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ** 00006 * @file lab3_lm.H 00007 * @brief Main loop for Lab 3 language model training/eval. 00008 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00009 00010 #ifndef _LAB3_LM_H 00011 #define _LAB3_LM_H 00012 00013 00014 #include "util.H" 00015 00016 00017 /** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ** 00018 * Encapsulation of main loop for LM training/eval. 00019 * 00020 * Holds global variables and has routines for initializing variables 00021 * and updating them for each utterance. 00022 * We do this so that we can call this code from Java as well. 00023 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00024 class Lab3LmMain 00025 { 00026 public: 00027 /** Initialize all data given parameters. **/ 00028 Lab3LmMain(const map<string, string>& params, 00029 const SymbolTable& symTable, int n, int bosIdx, int eosIdx, 00030 int unkIdx); 00031 00032 /** Called at the beginning of processing each utterance. 00033 * Returns whether at EOF. 00034 **/ 00035 bool init_utt(); 00036 00037 /** Called at the end of processing each utterance. **/ 00038 void finish_utt(); 00039 00040 /** Called at the beginning of processing each word. 00041 * Returns whether at end of sentence. 00042 **/ 00043 bool init_word(); 00044 00045 /** Called at the end of processing each word. **/ 00046 void finish_word(double curProb); 00047 00048 /** Called at end of program. **/ 00049 void finish(); 00050 00051 00052 /** Returns current n-gram. **/ 00053 const vector<int>& get_ngram() const { return m_ngramBuf; } 00054 00055 private: 00056 /** Program parameters. **/ 00057 map<string, string> m_params; 00058 00059 /** The LM vocab. **/ 00060 const SymbolTable& m_symTable; 00061 00062 /** Value of "n" for the LM, i.e., as in an n-gram model. **/ 00063 int m_n; 00064 00065 /** Index of beginning-of-sentence token in vocab. **/ 00066 int m_bosIdx; 00067 00068 /** Index of end-of-sentence token in vocab. **/ 00069 int m_eosIdx; 00070 00071 /** Index of unknown token in vocab. **/ 00072 int m_unkIdx; 00073 00074 /** Stream for reading input text. **/ 00075 ifstream m_inStrm; 00076 00077 /** File to write word probs to, if desired. **/ 00078 string m_wordProbFile; 00079 00080 /** Stream for writing word probs to. **/ 00081 ofstream m_wordProbStrm; 00082 00083 /** File to write sentence log probs to, if desired. **/ 00084 string m_sentProbFile; 00085 00086 /** Stream for writing sentence log probs to. **/ 00087 ofstream m_sentProbStrm; 00088 00089 /** For holding string representation of current sentence. **/ 00090 vector<string> m_wordList; 00091 00092 /** For holding int representation of current sentence. **/ 00093 vector<int> m_wordIdxList; 00094 00095 /** Buffer for holding the current n-gram. **/ 00096 vector<int> m_ngramBuf; 00097 00098 /** Current position in sentence. **/ 00099 int m_posIdx; 00100 00101 /** Total number of words processed so far. **/ 00102 int m_totWordCnt; 00103 00104 /** Total log prob of words processed so far. **/ 00105 double m_totLogProb; 00106 00107 /** Total log prob of words in current sentence processed so far. **/ 00108 double m_sentLogProb; 00109 }; 00110 00111 00112 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 00113 * 00114 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00115 00116 #endif 00117 00118