// NOTE(review): fragment of an n-gram prediction routine. The enclosing
// function header is not visible here, and the integer prefixes on many
// lines (220, 221, ...) plus the gaps in that numbering show that several
// statements and closing braces are missing from this view. Code below is
// left byte-identical; comments describe only what the visible code shows.
// Assumes `tokens`, `cardinality`, `max_partial_prediction_size`, `filter`,
// `deltas`, `db`, and `logger` are defined in the missing context — TODO confirm.
220 std::vector<std::string> prefixCompletionCandidates;
// Walk n-gram cardinalities from largest (k == cardinality) down toward 1,
// stopping early once enough completion candidates have been collected.
221 for (
size_t k =
cardinality; (k > 0 && prefixCompletionCandidates.size() < max_partial_prediction_size); k--) {
222 logger << DEBUG <<
"Building partial prefix completion table of cardinality: " << k <<
endl;
// Build the k-token prefix n-gram from the last k context tokens.
224 Ngram prefix_ngram(k);
225 copy(tokens.end() - k, tokens.end(), prefix_ngram.begin());
228 logger << DEBUG <<
"prefix_ngram: ";
229 for (
size_t r = 0; r < prefix_ngram.size(); r++) {
230 logger << DEBUG << prefix_ngram[r] <<
' ';
// Query the database for n-grams matching the prefix, capped so the total
// number of candidates does not exceed max_partial_prediction_size.
// (The branch selecting between the filtered/unfiltered call is among the
// lines missing from this view — presumably keyed on `filter`; verify.)
236 db->beginTransaction();
241 partial =
db->getNgramLikeTable(prefix_ngram,max_partial_prediction_size - prefixCompletionCandidates.size());
243 partial =
db->getNgramLikeTableFiltered(prefix_ngram,filter, max_partial_prediction_size - prefixCompletionCandidates.size());
246 db->endTransaction();
249 logger << DEBUG <<
"partial prefixCompletionCandidates" <<
endl
250 << DEBUG <<
"----------------------------------" <<
endl;
// Debug dump of the partial completion table (note: inner `k` shadows the
// outer cardinality loop variable `k`).
251 for (
size_t j = 0; j < partial.size(); j++) {
252 for (
size_t k = 0; k < partial[j].size(); k++) {
253 logger << DEBUG << partial[j][k] <<
" ";
259 logger << DEBUG <<
"Partial prefix completion table contains " << partial.size() <<
" potential completions." <<
endl;
// Collect unique candidate tokens from the partial table. The candidate is
// the second-to-last element of each returned n-gram (the last element is
// presumably the count column — TODO confirm against the DB schema).
265 std::vector<Ngram>::const_iterator it = partial.begin();
266 while (it != partial.end() && prefixCompletionCandidates.size() < max_partial_prediction_size) {
270 std::string candidate = *(it->end() - 2);
// O(n) membership test via std::find keeps candidates unique while
// preserving insertion order (a set would lose the ordering).
271 if (find(prefixCompletionCandidates.begin(),
272 prefixCompletionCandidates.end(),
273 candidate) == prefixCompletionCandidates.end()) {
274 prefixCompletionCandidates.push_back(candidate);
281 logger << DEBUG <<
"prefixCompletionCandidates" <<
endl
282 << DEBUG <<
"--------------------------" <<
endl;
283 for (
size_t j = 0; j < prefixCompletionCandidates.size(); j++) {
284 logger << DEBUG << prefixCompletionCandidates[j] <<
endl;
// Score each candidate: substitute it as the last context token, then
// accumulate delta-weighted frequencies across cardinalities (an inner
// loop over k whose header is missing from this view — TODO confirm).
290 db->beginTransaction();
293 int unigrams_counts_sum =
db->getUnigramCountsSum();
294 for (
size_t j = 0; (j < prefixCompletionCandidates.size() && j < max_partial_prediction_size); j++) {
296 tokens[
cardinality - 1] = prefixCompletionCandidates[j];
298 logger << DEBUG <<
"------------------" <<
endl;
301 double probability = 0;
// frequency = count(candidate n-gram) / count(its history); for k == 0 the
// denominator is the total unigram count sum.
303 double numerator =
count(tokens, 0, k+1);
305 double denominator = (k == 0 ? unigrams_counts_sum :
count(tokens, -1, k));
306 double frequency = ((denominator > 0) ? (numerator / denominator) : 0);
307 probability +=
deltas[k] * frequency;
309 logger << DEBUG <<
"numerator: " << numerator <<
endl;
310 logger << DEBUG <<
"denominator: " << denominator <<
endl;
311 logger << DEBUG <<
"frequency: " << frequency <<
endl;
// Sanity checks: an n-gram can never be more frequent than its history.
315 assert(numerator <= denominator);
316 assert(frequency <= 1);
320 logger << DEBUG <<
"probability: " << probability <<
endl;
// Only candidates with non-zero probability are kept (the body of this
// branch is missing from this view).
322 if (probability > 0) {
326 db->endTransaction();
// NOTE(review): fragment of a learning/update routine. As above, the
// function header is missing and the embedded line numbers (342, 346, ...)
// jump, so loop headers, braces, and several statements are absent from
// this view. Code is left byte-identical; comments are hedged accordingly.
// Maps each observed n-gram (as a token list) to the number of times it
// occurred in the current `change` — std::list is usable as a map key
// because it defines operator<.
342 std::map<std::list<std::string>,
int> ngramMap;
// Outer loop over n-gram cardinalities (loop condition/increment missing
// from this view).
346 for (
size_t curr_cardinality = 1;
351 int change_size = change.size();
353 std::list<std::string> ngram_list;
// Seed the sliding window with the first (curr_cardinality - 1) tokens.
357 (i < curr_cardinality - 1 && change_idx < change_size);
360 ngram_list.push_back(change[change_idx]);
// Slide the window across the remaining tokens: push the next token,
// count the resulting n-gram, then drop the oldest token.
364 while (change_idx < change_size)
366 ngram_list.push_back(change[change_idx++]);
367 ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
368 ngram_list.pop_front();
// Special case: prepend context tokens that precede the change (guard
// condition continues on a missing line — TODO confirm).
391 if (change.size() > 0 &&
396 std::list<std::string> ngram_list(change.begin(), change.begin() + 1);
411 std::string extra_token =
contextTracker->getExtraTokenToLearn(tk_idx, change);
412 logger << DEBUG <<
"Adding extra token: " << extra_token <<
endl;
// Stop extending once no more context tokens are available.
414 if (extra_token.empty())
418 ngram_list.push_front(extra_token);
420 ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
// Persist all accumulated counts in a single transaction; an existing
// n-gram is updated (current count + delta), a new one is inserted.
// Rollback on exception keeps the database consistent.
427 db->beginTransaction();
429 std::map<std::list<std::string>,
int>::const_iterator it;
430 for (it = ngramMap.begin(); it != ngramMap.end(); it++)
433 Ngram ngram((it->first).begin(), (it->first).end());
436 int count =
db->getNgramCount(ngram);
440 db->updateNgram(ngram,
count + it->second);
446 db->insertNgram(ngram, it->second);
450 db->endTransaction();
451 logger << INFO <<
"Committed learning update to database" <<
endl;
// Exception handler (the try/catch framing is among the missing lines).
455 db->rollbackTransaction();
456 logger << ERROR <<
"Rolling back learning update : " << ex.
what() <<
endl;
// NOTE(review): fragment of an n-gram count consistency check. Function
// header and closing braces are missing from this view (embedded numbering
// 472..494 has gaps); code is left byte-identical.
472 size_t size = ngram.size();
// Invariant being enforced: a longer n-gram's count must never exceed the
// count of any of its sub-(n-1)-grams. When violated, the sub-n-gram's
// count is incremented to restore consistency.
473 for (
size_t i = 0; i < size; i++) {
474 if (
count(ngram, -i, size - i) >
count(ngram, -(i + 1), size - (i + 1))) {
475 logger << INFO <<
"consistency adjustment needed!" <<
endl;
// NOTE(review): `offset` is negative but `i` is size_t, so -(i + 1) in the
// count() call above relies on the callee's signed parameter — verify the
// count() signature takes a signed offset.
477 int offset = -(i + 1);
478 int sub_ngram_size = size - (i + 1);
480 logger << DEBUG <<
"i: " << i <<
" | offset: " << offset <<
" | sub_ngram_size: " << sub_ngram_size <<
endl;
// Copy the offending sub-n-gram out of `ngram` (offset shifts the window
// back from the end).
482 Ngram sub_ngram(sub_ngram_size);
483 copy(ngram.end() - sub_ngram_size + offset, ngram.end() + offset, sub_ngram.begin());
// Debug dump; note the inner `i` shadows the outer loop variable `i`.
486 logger <<
"ngram to be count adjusted is: ";
487 for (
size_t i = 0; i < sub_ngram.size(); i++) {
488 logger << sub_ngram[i] <<
' ';
493 db->incrementNgramCount(sub_ngram);
494 logger << DEBUG <<
"consistency adjusted" <<
endl;
// Tracks user interaction and context.
Predictor(Configuration *configuration, ContextTracker *contextTracker, const char *predictorName="Predictor", const char *shortDescription="", const char *longDescription="")