
/******************************************************************************
 *
 *  This file is part of canu, a software program that assembles whole-genome
 *  sequencing reads into contigs.
 *
 *  This software is based on:
 *    'Celera Assembler' (http://wgs-assembler.sourceforge.net)
 *    the 'kmer package' (http://kmer.sourceforge.net)
 *  both originally distributed by Applera Corporation under the GNU General
 *  Public License, version 2.
 *
 *  Canu branched from Celera Assembler at its revision 4587.
 *  Canu branched from the kmer project at its revision 1994.
 *
 *  Modifications by:
 *
 *    Brian P. Walenz from 2015-MAY-29 to 2015-JUL-01
 *      are Copyright 2015 Battelle National Biodefense Institute, and
 *      are subject to the BSD 3-Clause License
 *
 *    Brian P. Walenz beginning on 2015-DEC-07
 *      are a 'United States Government Work', and
 *      are released in the public domain
 *
 *  File 'README.licenses' in the root directory of this distribution contains
 *  full conditions and disclaimers for each license.
 */

#include "findErrors.H"

#include "Binomial_Bound.H"

//  Forward declarations of routines used below; their definitions are not in this part
//  of the file.

//  Called once per overlap by Threaded_Process_Stream(), with the overlap record, the
//  bases of the overlap's b read, a 'shredded' flag (always false from this file) and
//  the calling thread's work area.
void
Process_Olap(Olap_Info_t        *olap,
             char               *b_seq,
             bool                shredded,
             Thread_Work_Area_t *wa);

//  Called by main() after the read ID range in G has been clamped to the store.
//  NOTE(review): presumably loads the reads to be corrected into G -- confirm in its definition.
void
Read_Frags(feParameters   *G,
           gkStore        *gkpStore);

//  Called by main() after Read_Frags(); main() sorts G->olaps[0..olapsLen) right after,
//  so this is expected to have populated that array.
void
Read_Olaps(feParameters   *G,
           gkStore        *gkpStore);

//  Called by main() once all overlaps have been processed.
//  NOTE(review): presumably writes the corrections file named by -o -- confirm in its definition.
void
Output_Corrections(feParameters *G);




//  Load the next batch of reads that are the 'b' read of some overlap in G->olaps,
//  starting with the overlap at index nextOlap, and save their IDs and sequences in 'fl'.
//
//  The overlaps are walked in array order and all overlaps with the same b_iid are expected
//  to be adjacent (main() sorts G->olaps before this is called).  Each distinct b_iid is
//  loaded once.
//
//    G         - parameters; only G->olaps and G->olapsLen are used here.
//    gkpStore  - store the reads and their sequence are loaded from.
//    fl        - batch to fill.  Its previous contents are discarded; its buffers are
//                reallocated (20% larger than needed) only when too small.
//    nextOlap  - in:  index of the first overlap to consider.
//                out: index of the first overlap NOT covered by this batch.
//
//  On return, fl->readsLen is zero if there were no more overlaps.  Otherwise each
//  fl->readBases[i] points into fl->bases at a NUL-terminated, lowercase, acgt-only copy of
//  the sequence for read fl->readIDs[i].

static
void
extractReads(feParameters *G,
             gkStore      *gkpStore,
             Frag_List_t  *fl,
             uint64       &nextOlap) {

  //  Clear the buffer.

  fl->readsLen = 0;
  fl->basesLen = 0;

  //  The original converted to lowercase, and made non-acgt be 'a'.

  char  filter[256];

  for (uint32 i=0; i<256; i++)
    filter[i] = 'a';

  filter['A'] = filter['a'] = 'a';
  filter['C'] = filter['c'] = 'c';
  filter['G'] = filter['g'] = 'g';
  filter['T'] = filter['t'] = 't';

  //  Return if we've exhausted the overlaps.

  if (nextOlap >= G->olapsLen)
    return;

  //  Count the amount of stuff we're loading.

  uint64 lastOlap = nextOlap;
  uint32 loID     = G->olaps[lastOlap].b_iid;  //  Actual ID we're extracting
  uint32 hiID     = loID;
  uint64 maxBases = 512 * 1024 * 1024;

  //  Find the highest read ID that we can load without exceeding maxBases.  The limit is
  //  tested before a read is added, so the batch can overshoot maxBases by one read.

  while ((fl->basesLen < maxBases) &&
         (lastOlap     < G->olapsLen)) {
    hiID = G->olaps[lastOlap].b_iid;                        //  Grab the ID of the overlap we're at.

    gkRead *read = gkpStore->gkStore_getRead(hiID);         //  Grab that read.

    fl->readsLen += 1;                                      //  Add the read to our set.
    fl->basesLen += read->gkRead_sequenceLength() + 1;      //  +1 for the terminating NUL.

    lastOlap++;                                             //  Advance to the next overlap
    while ((lastOlap < G->olapsLen) &&                      //
           (G->olaps[lastOlap].b_iid == hiID))              //  If we've exceeded the max size or hit the last overlap,
      lastOlap++;                                           //  the loop will stop on the next iteration.
  }

  //  If nothing to load, just return.

  if (fl->readsLen == 0)
    return;

  //  Report what we're going to do.

  fprintf(stderr, "extractReads()-- Loading reads " F_U32 " to " F_U32 " (" F_U32 " reads with " F_U64 " bases) overlaps " F_U64 " through " F_U64 ".\n",
          loID, hiID, fl->readsLen, fl->basesLen, nextOlap, lastOlap);

  //  Ensure there is space.  Old contents are not preserved; everything is reloaded below.

  if (fl->readsMax < fl->readsLen) {
    delete [] fl->readIDs;
    delete [] fl->readBases;

    fl->readsMax  = 12 * fl->readsLen / 10;
    fl->readIDs   = new uint32 [fl->readsMax];
    fl->readBases = new char * [fl->readsMax];
  }

  if (fl->basesMax < fl->basesLen) {
    delete [] fl->bases;

    fl->basesMax    = 12 * fl->basesLen / 10;
    fl->bases       = new char [fl->basesMax];
  }

  //  Load the sequence data for reads loID to hiID, as long as the read has an overlap.
  //  The lengths are reset and recomputed as each read is appended.

  gkReadData *readData = new gkReadData;

  fl->readsLen = 0;
  fl->basesLen = 0;

  while ((loID <= hiID) &&
         (nextOlap < G->olapsLen)) {
    gkRead *read       = gkpStore->gkStore_getRead(loID);

    fl->readIDs[fl->readsLen]   = loID;                          //  Save the ID of _this_ read.
    fl->readBases[fl->readsLen] = fl->bases + fl->basesLen;      //  Set the data pointer to where this read should start.

    gkpStore->gkStore_loadReadData(read, readData);

    uint32  readLen    = read->gkRead_sequenceLength();
    char   *readBases  = readData->gkReadData_getSequence();

    //  Index the table with an unsigned value: a plain 'char' may be signed, and a byte
    //  above 0x7f would otherwise become a negative index and read outside filter[].

    for (uint32 bb=0; bb<readLen; bb++)
      fl->readBases[fl->readsLen][bb] = filter[(unsigned char)readBases[bb]];

    fl->readBases[fl->readsLen][readLen] = 0;                    //  All good reads end.

    fl->basesLen += readLen + 1;                                 //  Update basesLen to account for this read.
    fl->readsLen += 1;                                           //  And note that we loaded a read.

    nextOlap++;                                                  //  Advance past all the overlaps for this read.
    while ((nextOlap < G->olapsLen) &&
           (G->olaps[nextOlap].b_iid == loID))
      nextOlap++;

    if (nextOlap < G->olapsLen)                                  //  If we have valid overlap, grab the read ID.
      loID = G->olaps[nextOlap].b_iid;                           //  If we don't have a valid overlap, the loop will stop.
  }

  delete readData;

  fprintf(stderr, "extractReads()-- Loaded.\n");
}



//  Thread entry point.  'ptr' is the Thread_Work_Area_t for this thread.
//
//  Walks every read in wa->frag_list in order and, for each, walks the overlaps in
//  wa->G->olaps (starting at wa->nextOlap) whose b_iid is that read.  Only overlaps with
//    a_iid % numThreads == thread_id
//  are passed to Process_Olap(); the others are left for the other threads.
//
//  Overlaps whose b_iid is smaller than the current read are skipped with a "SKIP" message.
//  If the read has no overlap at all at the current position, the two lists are out of sync
//  and the whole process exits with status 1.
//
//  The thread finishes through pthread_exit(ptr), so a joiner receives 'ptr' back.

void *
Threaded_Process_Stream(void *ptr) {
  Thread_Work_Area_t  *wa = (Thread_Work_Area_t *)ptr;

  for (int32 i=0; i<wa->frag_list->readsLen; i++) {
    int32  skip_id = -1;

    //  Skip overlaps for reads before this one, reporting each skipped read once.  The
    //  bound on nextOlap keeps us from reading past the end of the overlap array.

    while ((wa->nextOlap < wa->G->olapsLen) &&
           (wa->frag_list->readIDs[i] > wa->G->olaps[wa->nextOlap].b_iid)) {
      if (wa->G->olaps[wa->nextOlap].b_iid != skip_id) {
        fprintf(stderr, "SKIP:  b_iid = %d\n", wa->G->olaps[wa->nextOlap].b_iid);
        skip_id = wa->G->olaps[wa->nextOlap].b_iid;
      }
      wa->nextOlap++;
    }

    //  Every read in the list must have at least one overlap here.  Running out of
    //  overlaps is the same failure as finding an overlap for some other read.

    if (wa->nextOlap >= wa->G->olapsLen) {
      fprintf (stderr, "ERROR:  Lists don't match\n");
      fprintf (stderr, "frag_list iid = %d  no overlaps remain  i = %d\n",
               wa->frag_list->readIDs[i], i);
      exit (1);
    }

    if (wa->frag_list->readIDs[i] != wa->G->olaps[wa->nextOlap].b_iid) {
      fprintf (stderr, "ERROR:  Lists don't match\n");
      fprintf (stderr, "frag_list iid = %d  nextOlap = %d  i = %d\n",
               wa->frag_list->readIDs[i],
               wa->G->olaps[wa->nextOlap].b_iid, i);
      exit (1);
    }

    //  Reset the per-thread rev_id marker before starting a new b read.
    //  NOTE(review): presumably this invalidates a cached reverse-complement kept in
    //  wa->rev_seq -- confirm in Process_Olap().

    wa->rev_id = UINT32_MAX;

    //  Process all overlaps to this read that belong to this thread.

    while ((wa->nextOlap < wa->G->olapsLen) && (wa->G->olaps[wa->nextOlap].b_iid == wa->frag_list->readIDs[i])) {
      if (wa->G->olaps[wa->nextOlap].a_iid % wa->G->numThreads == wa->thread_id) {
        Process_Olap(wa->G->olaps + wa->nextOlap,
                     wa->frag_list->readBases[i],
                     false,  //  shredded
                     wa);
      }

      wa->nextOlap++;
    }
  }

  pthread_exit(ptr);

  return(NULL);  //  Not reached; pthread_exit() does not return.
}



//  Process every overlap in G->olaps, one batch of 'b' reads at a time, using
//  G->numThreads pthreads per batch.
//
//  Batches are double buffered: while the worker threads process the reads in the current
//  Frag_List_t, this (the calling) thread loads the next batch with extractReads().  The
//  workers are joined before the two lists are swapped, so a list is never loaded into
//  while it is being read.
//
//  Every thread walks all overlaps of the batch, but Threaded_Process_Stream() only
//  processes those whose a_iid maps to its thread_id.
//
//    G            - parameters; supplies olaps, numThreads and errorRate.
//    gkpStore     - store the read sequences are loaded from.
//    passedOlaps  - out: sum over all threads of their passedOlaps counter.
//    failedOlaps  - out: sum over all threads of their failedOlaps counter.
//
//  A failure to create or join a thread is reported and the process exits with status 1.

static
void
Threaded_Stream_Old_Frags(feParameters *G,
                          gkStore      *gkpStore,
                          uint64       &passedOlaps,
                          uint64       &failedOlaps) {

  pthread_attr_t  attr;

  pthread_attr_init(&attr);
  pthread_attr_setstacksize(&attr, THREAD_STACKSIZE);

  pthread_t           *thread_id = new pthread_t          [G->numThreads];
  Thread_Work_Area_t  *thread_wa = new Thread_Work_Area_t [G->numThreads];

  //  Set up one work area per thread.  These persist across batches so the per-thread
  //  counters accumulate over the whole run.

  for (uint32 i=0; i<G->numThreads; i++) {
    thread_wa[i].thread_id    = i;
    thread_wa[i].nextOlap     = 0;
    thread_wa[i].G            = G;
    thread_wa[i].frag_list    = NULL;
    thread_wa[i].rev_id       = UINT32_MAX;
    thread_wa[i].passedOlaps  = 0;
    thread_wa[i].failedOlaps  = 0;

    memset(thread_wa[i].rev_seq, 0, sizeof(char) * AS_MAX_READLEN);

    thread_wa[i].ped.initialize(G, G->errorRate);
  }

  uint64 frstOlap = 0;   //  First overlap of the batch being processed.
  uint64 nextOlap = 0;   //  First overlap not yet loaded into any batch.

  Frag_List_t   frag_list_1;
  Frag_List_t   frag_list_2;

  Frag_List_t  *curr_frag_list = &frag_list_1;
  Frag_List_t  *next_frag_list = &frag_list_2;

  extractReads(G, gkpStore, curr_frag_list, nextOlap);

  while (curr_frag_list->readsLen > 0) {

    // Process fragments in curr_frag_list in background

    for (uint32 i=0; i<G->numThreads; i++) {
      thread_wa[i].nextOlap  = frstOlap;
      thread_wa[i].frag_list = curr_frag_list;

      int status = pthread_create(thread_id + i, &attr, Threaded_Process_Stream, thread_wa + i);

      if (status != 0)
        fprintf(stderr, "pthread_create error:  %s\n", strerror(status)), exit(1);
    }

    // Read next batch of fragments

    frstOlap = nextOlap;

    extractReads(G, gkpStore, next_frag_list, nextOlap);

    // Wait for background processing to finish

    fprintf(stderr, "processReads()-- Waiting for compute.\n");

    for (uint32 i=0; i<G->numThreads; i++) {
      void  *ptr;

      int status = pthread_join(thread_id[i], &ptr);

      if (status != 0)
        fprintf(stderr, "pthread_join error: %s\n", strerror(status)), exit(1);
    }

    //  Swap the lists and compute another block

    {
      Frag_List_t *s = curr_frag_list;
      curr_frag_list = next_frag_list;
      next_frag_list = s;
    }
  }

  //  No more threads will be created; release the attribute object.

  pthread_attr_destroy(&attr);

  //  Threads all done, sum up stats.

  passedOlaps = 0;
  failedOlaps = 0;

  for (uint32 i=0; i<G->numThreads; i++) {
    passedOlaps += thread_wa[i].passedOlaps;
    failedOlaps += thread_wa[i].failedOlaps;
  }

  delete [] thread_id;
  delete [] thread_wa;
}









int
main(int argc, char **argv) {
  feParameters  *G = new feParameters();

  argc = AS_configure(argc, argv);

  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      G->gkpStorePath = argv[++arg];

    } else if (strcmp(argv[arg], "-R") == 0) {
      G->bgnID = atoi(argv[++arg]);
      G->endID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-O") == 0) {
      G->ovlStorePath = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      G->errorRate = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-l") == 0) {
      G->minOverlap = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-o") == 0) {  //  For 'corrections' file output
      G->outputFileName = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      G->numThreads = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-d") == 0) {
      G->Degree_Threshold = strtol(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "-k") == 0) {
      G->Kmer_Len = strtol(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "-p") == 0) {
      G->Use_Haplo_Ct = FALSE;

    } else if (strcmp(argv[arg], "-V") == 0) {
      G->Vote_Qualify_Len = strtol(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "-x") == 0) {
      G->End_Exclude_Len = strtol(argv[++arg], NULL, 10);

    } else {
      fprintf(stderr, "Unknown option '%s'\n", argv[arg]);
      err++;
    }

    arg++;
  }

  if (G->gkpStorePath == NULL)
    err++;
  if (G->ovlStorePath == NULL)
    err++;
  if (G->numThreads == 0)
    err++;

  if (err > 0) {
    fprintf(stderr, "usage: %s[-ehp][-d DegrThresh][-k KmerLen][-x ExcludeLen]\n", argv[0]);
    fprintf(stderr, "        [-F OlapFile][-S OlapStore][-o CorrectFile]\n");
    fprintf(stderr, "        [-t NumPThreads][-v VerboseLevel]\n");
    fprintf(stderr, "        [-V Vote_Qualify_Len]\n");
    fprintf(stderr, "          <FragStore> <lo> <hi>\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Makes corrections to fragment sequence based on overlaps\n");
    fprintf(stderr, "and recomputes overlaps on corrected fragments\n");
    fprintf(stderr, "Fragments come from <FragStore> <lo> and <hi> specify\n");
    fprintf(stderr, "the range of fragments to modify\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Options:\n");
    fprintf(stderr, "-d   set keep flag on end of frags with less than this many olaps\n");
    fprintf(stderr, "-F   specify file of sorted overlaps to use (in the format produced\n");
    fprintf(stderr, "     by  get-olaps\n");
    fprintf(stderr, "-h   print this message\n");
    fprintf(stderr, "-k   minimum exact-match region to prevent change\n");
    fprintf(stderr, "-o   specify output file to hold correction info\n");
    fprintf(stderr, "-p   don't use haplotype counts to correct\n");
    fprintf(stderr, "-S   specify the binary overlap store containing overlaps to use\n");
    fprintf(stderr, "-t   set number of p-threads to use\n");
    fprintf(stderr, "-v   specify level of verbose outputs, higher is more\n");
    fprintf(stderr, "-V   specify number of exact match bases around an error to vote to change\n");
    fprintf(stderr, "-x   length of end of exact match to exclude in preventing change\n");

    if (G->gkpStorePath == NULL)
      fprintf(stderr, "ERROR: no gatekeeper store (-G) supplied.\n");
    if (G->ovlStorePath == NULL)
      fprintf(stderr, "ERROR: no overlap store (-O) supplied.\n");
    if (G->numThreads == 0)
      fprintf(stderr, "ERROR: number of compute threads (-t) must be larger than zero.\n");

    exit(1);
  }

  //  Initialize Globals

  double MAX_ERRORS = 1 + (uint32)(G->errorRate * AS_MAX_READLEN);

  Initialize_Match_Limit(G->Edit_Match_Limit, G->errorRate, MAX_ERRORS);

  for  (uint32 i = 0;  i <= AS_MAX_READLEN;  i++)
    G->Error_Bound[i] = (int)ceil(i * G->errorRate);

  //  Load data.

  gkStore *gkpStore = gkStore::gkStore_open(G->gkpStorePath);

  if (G->bgnID < 1)
    G->bgnID = 1;

  if (gkpStore->gkStore_getNumReads() < G->endID)
    G->endID = gkpStore->gkStore_getNumReads();

  Read_Frags(G, gkpStore);
  Read_Olaps(G, gkpStore);

  //  Sort overlaps, process each.

  sort(G->olaps, G->olaps + G->olapsLen);

  uint64  passedOlaps = 0;
  uint64  failedOlaps = 0;

  Threaded_Stream_Old_Frags(G, gkpStore, passedOlaps, failedOlaps);

  //  All done.  Sum up what we did.

  fprintf(stderr, "\n");
  fprintf(stderr, "Passed overlaps = %10" F_U64P " %8.4f%%\n", passedOlaps, 100.0 * passedOlaps / (failedOlaps + passedOlaps));
  fprintf(stderr, "Failed overlaps = %10" F_U64P " %8.4f%%\n", failedOlaps, 100.0 * failedOlaps / (failedOlaps + passedOlaps));

  //  Dump output.

  //Output_Details(G);
  Output_Corrections(G);

  //  Cleanup and exit!

  gkpStore->gkStore_close();

  delete G;

  fprintf(stderr, "\n");
  fprintf(stderr, "Bye.\n");

  exit(0);
}

