///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// $Id: FrequencyDB.cc,v 1.26 2003/08/30 21:42:24 bburton Exp $
//
// Copyright (C) 2000 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//

#include <stdexcept>
#include <unistd.h>
#include <fcntl.h>
#include "WordData.h"
#include "FrequencyDB.h"
#include "FrequencyDBImpl_split.h"
#include "FrequencyDBImpl_cache.h"
#include "FrequencyDBImpl_dual.h"
#include "FrequencyDBImpl_dbm.h"
#include "FrequencyDBImpl_bdb.h"
#include "FrequencyDBImpl_pbl.h"

/// Associates a database filename prefix with the factory that builds the
/// matching FrequencyDBImpl.  Instances are aggregate-initialized in the
/// DBTYPES table, so the member order must match the initializers there.
struct DBType {
  /// Filename prefix that selects this type (for example "pbl:").  Empty for
  /// entries that createDB() applies to filenames containing no ':'.
  const string prefix;
  /// Factory called with no arguments to create the implementation.  A null
  /// factory marks the end of the DBTYPES table.
  FrequencyDBImplFactory_t factory;
};
/// Table of the database types compiled into this build.  createDB() and
/// removeTypePrefix() scan it from the top, so the first matching entry wins
/// and the order of the entries is significant.
static DBType DBTYPES[] = {
  // Entries selected by an explicit "type:" prefix on the filename.
#ifdef USE_MMAP
  { "hash:", FrequencyDBImpl_split::factory },
#endif
#ifdef USE_PBL
  { "pbl:", FrequencyDBImpl_pbl::factory },
#endif
#ifdef USE_DB
  { "bdb:", FrequencyDBImpl_bdb::factory },
#endif
#ifdef USE_DBM
  { "gdbm:", FrequencyDBImpl_dbm::factory },
#endif
  // Empty-prefix entries: createDB() picks the first one compiled in when the
  // filename contains no ':'.  There is no empty-prefix entry for USE_MMAP.
#ifdef USE_PBL
  { "", FrequencyDBImpl_pbl::factory },
#endif
#ifdef USE_DB
  { "", FrequencyDBImpl_bdb::factory },
#endif
#ifdef USE_DBM
  { "", FrequencyDBImpl_dbm::factory },
#endif
  // Sentinel: the scanning loops stop at the first null factory.
  { "", 0 }
};

/// Storage for WordData's static s_today member.  It is defined in this file
/// so that no separate WordData.cc containing only this line is needed.
unsigned long WordData::s_today = 0;

/// Prepended to a message digest to form the key of that message's own
/// database record (see getMessageCount(), addMessage() and removeMessage()).
static const string DIGEST_PREFIX("__MD5__");
/// Key of the record that holds the good and spam message totals.
const string FrequencyDB::COUNT_WORD("__COUNT__");

/// Mode constants for FrequencyDBImpl.  PRIVATE_DB_MODE is the mode argument
/// both FrequencyDB::open() overloads pass to the implementation's open().
/// NOTE(review): the values look like octal file permission bits -- confirm
/// in the FrequencyDBImpl implementations.
const int FrequencyDBImpl::SHARED_DB_MODE = 0666;
const int FrequencyDBImpl::PRIVATE_DB_MODE = 0600;

/// Constructs a FrequencyDB with no database attached.  The body does no
/// work; a database is attached later by one of the open() overloads.
FrequencyDB::FrequencyDB()
{
}

/// Destroys the object, calling close() to drop any attached database.
FrequencyDB::~FrequencyDB()
{
  close();
}

/// Creates the database implementation selected by a filename.
///
/// When filename contains a ':' it must begin with one of the non-empty
/// prefixes in DBTYPES; otherwise the first empty-prefix entry is used.  The
/// matched prefix is removed from filename in place before returning.
///
/// @param filename database filename; its type prefix (if any) is stripped
/// @return a new implementation object produced by the matching factory
/// @throws runtime_error if no entry in DBTYPES applies to filename
FrequencyDBImpl *FrequencyDB::createDB(string &filename)
{
  const bool typed_name = filename.find(':') != string::npos;

  for (DBType *entry = DBTYPES; entry->factory; ++entry) {
    const string &prefix = entry->prefix;

    // Names with a colon need an explicit prefix match; names without one
    // are served only by the empty-prefix entries.
    const bool applies = typed_name
      ? (!prefix.empty() && starts_with(filename, prefix))
      : prefix.empty();
    if (!applies) {
      continue;
    }

    filename.erase(0, prefix.length());
    if (is_debug) {
      cerr << "USING DATABASE TYPE " << prefix << endl;
    }
    return (entry->factory)();
  }

  throw runtime_error(string("no database type known for filename: ") + filename);
}

/// Returns db_path with its database type prefix removed.
///
/// If db_path starts with one of the non-empty prefixes in DBTYPES, everything
/// up to and including the LAST ':' in the path is dropped.  Paths that match
/// no prefix are returned unchanged.
///
/// NOTE(review): createDB() strips only the prefix itself, while this strips
/// through the last colon.  Presumably that is intended to also discard extra
/// colon-separated parameters after the prefix -- confirm, since a path that
/// itself contains ':' would be truncated.
///
/// @param db_path database path, optionally carrying a type prefix
/// @return the path with the prefix portion removed
string FrequencyDB::removeTypePrefix(const string &db_path)
{
  string path(db_path);

  for (DBType *entry = DBTYPES; entry->factory; ++entry) {
    if (entry->prefix.empty() || !starts_with(path, entry->prefix)) {
      continue;
    }
    const string::size_type colon_pos = path.rfind(':');
    path.erase(0, colon_pos + 1);
    return path;
  }

  return path;
}

/// Opens a single database, replacing any database that is currently open.
///
/// A type prefix on arg_filename (for example "pbl:") selects the
/// implementation through createDB(), which also strips that prefix before
/// the name is handed to the implementation.  The implementation is wrapped
/// in a FrequencyDBImpl_cache and opened with PRIVATE_DB_MODE.
///
/// @param arg_filename database filename, optionally carrying a type prefix
/// @param read_only    forwarded unchanged to the implementation's open()
/// @return the result of the implementation's open()
/// @throws runtime_error from createDB() if no database type matches
bool FrequencyDB::open(const string &arg_filename,
                       bool read_only)
{
  close();

  WordData::setTodayDate();

  // createDB() edits the name in place, so work on a private copy.
  string filename(arg_filename);
  m_db.set(new FrequencyDBImpl_cache(createDB(filename)));
  return m_db->open(filename, read_only, FrequencyDBImpl::PRIVATE_DB_MODE);
}

bool FrequencyDB::open(const string &arg_shared_filename,
                       const string &arg_private_filename,
                       bool read_only)
{
  close();

  WordData::setTodayDate();

  string shared_filename(arg_shared_filename);
  NewPtr<FrequencyDBImpl> shared_db(createDB(shared_filename));

  string private_filename(arg_private_filename);
  NewPtr<FrequencyDBImpl> private_db(createDB(private_filename));

  m_db.set(new FrequencyDBImpl_cache(new FrequencyDBImpl_dual(shared_db.release(),
                                                              private_db.release(),
                                                              shared_filename)));
  return m_db->open(private_filename, read_only, FrequencyDBImpl::PRIVATE_DB_MODE);
}

/// Detaches the current database by clearing m_db.  It is called at the
/// start of both open() overloads and from the destructor.
void FrequencyDB::close()
{
  m_db.clear();
}

/// Forwards to the implementation's flush().  A database must be open; this
/// is checked only by assert.
void FrequencyDB::flush()
{
  assert(m_db.get());

  m_db->flush();
}

/// Forwards to the implementation's beginTransaction().  A database must be
/// open; this is checked only by assert.
void FrequencyDB::beginTransaction()
{
  assert(m_db.get());

  m_db->beginTransaction();
}

/// Forwards to the implementation's endTransaction().  A database must be
/// open; this is checked only by assert.
///
/// @param commit passed through unchanged; presumably selects commit versus
///               rollback -- confirm in the implementation
void FrequencyDB::endTransaction(bool commit)
{
  assert(m_db.get());

  m_db->endTransaction(commit);
}

void FrequencyDB::setWordCounts(const string &word,
                                int good_count,
                                int spam_count)
{
  assert(m_db.get());
  assert(good_count >= 0 && spam_count >= 0);

  WordData counts(good_count, spam_count);
  counts.adjustDate();
  m_db->writeWord(word, counts);
}

void FrequencyDB::touchWord(const string &word)
{
  assert(m_db.get());

  WordData counts;
  if (m_db->readWord(word, counts)) {
    counts.adjustDate();
    m_db->writeWord(word, counts);
  }
}

void FrequencyDB::addWord(const string &word,
                          int good_count,
                          int spam_count)
{
  assert(m_db.get());

  WordData counts;
  m_db->readWord(word, counts);

  counts.adjustGoodCount(good_count);
  counts.adjustSpamCount(spam_count);
  counts.adjustDate();

  m_db->writeWord(word, counts);
}

void FrequencyDB::addWord(const string &word,
                          int good_count,
                          int spam_count,
                          unsigned long flags)
{
  assert(m_db.get());

  WordData counts;
  bool exists = m_db->readWord(word, counts);

  counts.adjustGoodCount(good_count);
  counts.adjustSpamCount(spam_count);
  if (exists) {
    counts.adjustDate();
  } else {
    counts.setFlags(flags);
  }

  m_db->writeWord(word, counts);
}

/// Subtracts counts from a word by calling the three-argument addWord() with
/// both values negated.
///
/// @param word       key of the record to update
/// @param good_count amount to subtract from the good count
/// @param spam_count amount to subtract from the spam count
void FrequencyDB::removeWord(const string &word,
                             int good_count,
                             int spam_count)
{
  assert(m_db.get());

  addWord(word, -good_count, -spam_count);
}

void FrequencyDB::getWordCounts(const string &word,
                                int &good_count,
                                int &spam_count)
{
  assert(m_db.get());

  WordData counts;
  m_db->readWord(word, counts);
  good_count = counts.goodCount();
  spam_count = counts.spamCount();
}

/// Reads the good and spam message totals, which are stored as the counts of
/// the COUNT_WORD record.
///
/// @param good_message_count receives the good count of COUNT_WORD
/// @param spam_message_count receives the spam count of COUNT_WORD
void FrequencyDB::getMessageCounts(int &good_message_count,
                                   int &spam_message_count)
{
  getWordCounts(COUNT_WORD, good_message_count, spam_message_count);
}

/// Returns the sum of the good and spam counts stored in the COUNT_WORD
/// record.
///
/// @return good message count plus spam message count
int FrequencyDB::getTotalMessageCount()
{
  int good_total = 0;
  int spam_total = 0;
  getWordCounts(COUNT_WORD, good_total, spam_total);

  return good_total + spam_total;
}

int FrequencyDB::getMessageCount(const Message &msg,
                                 bool &is_spam)
{
  assert(m_db.get());

  WordData counts;
  if (!m_db->readWord(DIGEST_PREFIX + msg.getDigest(), counts)) {
    is_spam = false;
    return 0;
  }

  assert(counts.goodCount() >= 0 && counts.spamCount() >= 0);
  assert(!(counts.goodCount() > 0 && counts.spamCount() > 0));

  is_spam = (counts.spamCount() > 0);
  return is_spam ? counts.spamCount() : counts.goodCount();
}

/// Reports whether a message is recorded, i.e. whether getMessageCount()
/// returns a value greater than zero for it.
///
/// @param msg     message to look up
/// @param is_spam set by getMessageCount() to the recorded classification
/// @return true if the message's recorded count is positive
bool FrequencyDB::containsMessage(const Message &msg,
                                  bool &is_spam)
{
  return getMessageCount(msg, is_spam) > 0;
}

/// Applies a delta to either the spam or the good count of a word.
///
/// Delegates to the three-argument addWord(), passing delta for the selected
/// class and 0 for the other.
///
/// @param word    key of the record to update
/// @param delta   amount to add (may be negative)
/// @param is_spam true to adjust the spam count, false for the good count
void FrequencyDB::adjustWordCounts(const string &word,
                                   int delta,
                                   bool is_spam)
{
  assert(m_db.get());

  const int good_delta = is_spam ? 0 : delta;
  const int spam_delta = is_spam ? delta : 0;
  addWord(word, good_delta, spam_delta);
}

void FrequencyDB::touchMessage(const Message &msg)
{
  assert(m_db.get());
  assert(msg.getDigest().length() > 0);

  for (int i = 0; i < msg.getTokenCount(); ++i) {
    const Token *tok = msg.getToken(i);
    touchWord(tok->getWord());
  }

  if (is_debug) {
    cerr << "Updated terms from message " << msg.getID()
         << "/" << msg.getDigest()
         << " in database"
         << endl;
  }
}

void FrequencyDB::addMessage(const Message &msg,
                             bool new_is_spam,
                             bool force_update)
{
  assert(m_db.get());
  assert(msg.getDigest().length() > 0);

  bool already_is_spam = false;
  bool already_exists = containsMessage(msg, already_is_spam);

  assert(!already_exists || (!new_is_spam == !already_is_spam));

  if (already_exists) {
    assert(!already_is_spam == !new_is_spam);

    if (!force_update) {
      // message already counted, do nothing
      return;
    }

    // force_update causes us to pretend it doesn't already exist so
    // we wind up adding its terms again even though they are already counted
  }

  for (int i = 0; i < msg.getTokenCount(); ++i) {
    const Token *tok = msg.getToken(i);
    adjustWordCounts(tok->getWord(), tok->getCount(), new_is_spam);
  }
  adjustWordCounts(DIGEST_PREFIX + msg.getDigest(), 1, new_is_spam);

  if (!already_exists) {
    adjustWordCounts(COUNT_WORD, 1, new_is_spam);
  }

  if (is_debug) {
    cerr << "Updated message " << msg.getID()
         << "/" << msg.getDigest()
         << " in database as "
         << (new_is_spam ? "spam." : "good.")
         << endl;
  }
}

void FrequencyDB::removeMessage(const Message &msg)
{
  assert(m_db.get());
  assert(msg.getDigest().length() > 0);

  bool is_spam = false;
  int message_count = getMessageCount(msg, is_spam);
  if (message_count == 0) {
    // not in database
    return;
  }

  for (int i = 0; i < msg.getTokenCount(); ++i) {
    const Token *tok = msg.getToken(i);
    adjustWordCounts(tok->getWord(), -message_count * tok->getCount(), is_spam);
  }
  adjustWordCounts(DIGEST_PREFIX + msg.getDigest(), -message_count, is_spam);
  adjustWordCounts(COUNT_WORD, -1, is_spam);

  assert(getMessageCount(msg, is_spam) == 0);

  if (is_debug) {
    cerr << "Removed message " << msg.getID()
         << "/" << msg.getDigest()
         << " from database." << endl;
  }
}

/// Forwards to the implementation's firstWord().  Presumably this starts an
/// iteration over all stored records that nextWord() continues -- confirm in
/// FrequencyDBImpl.
///
/// @param word   passed by reference to the implementation to fill in
/// @param counts passed by reference to the implementation to fill in
/// @return the implementation's result
bool FrequencyDB::firstWord(string &word,
                            WordData &counts) const
{
  assert(m_db.get());

  return m_db->firstWord(word, counts);
}

/// Forwards to the implementation's nextWord().  Presumably this advances an
/// iteration begun with firstWord() -- confirm in FrequencyDBImpl.
///
/// @param word   passed by reference to the implementation to fill in
/// @param counts passed by reference to the implementation to fill in
/// @return the implementation's result
bool FrequencyDB::nextWord(string &word,
                           WordData &counts) const
{
  assert(m_db.get());

  return m_db->nextWord(word, counts);
}

/// Returns the implementation's getDatabaseType() string, or "unknown" when
/// no database is attached.
string FrequencyDB::getDatabaseType() const
{
  return m_db.get() ? m_db->getDatabaseType() : "unknown";
}

/// Forwards a sweep request to the implementation's sweepOutOldTerms().
///
/// Both arguments are passed through unchanged; their exact meaning is
/// defined by the implementation.  NOTE(review): presumably junk_limit is a
/// count threshold and max_age an age limit for terms to discard -- confirm
/// in FrequencyDBImpl.
///
/// @param junk_limit passed through to the implementation
/// @param max_age    passed through to the implementation
void FrequencyDB::sweepOutOldTerms(int junk_limit,
                                   int max_age)
{
  assert(m_db.get());

  m_db->sweepOutOldTerms(junk_limit, max_age);
}
