/* -*- C++ -*- */

/*
  The Hoard Multiprocessor Memory Allocator
  www.hoard.org

  Author: Emery Berger, http://www.cs.umass.edu/~emery
 
  Copyright (c) 1998-2004, The University of Texas at Austin

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
  
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*/


/*
 * @file   libhoard.cpp
 * @brief  This file replaces malloc etc. in your application.
 * @author Emery Berger <http://www.cs.umass.edu/~emery>
 */

// Ensure that all pthread_* calls get strong linkage.
// Otherwise, our versions here won't replace them!

#include <new>

// Undefined so that the pthread_* replacements defined in this file get
// strong linkage and actually override the library versions.
#undef __GXX_WEAK__ 

// anyThreadCreated is nonzero once a thread may exist. It starts at 0
// unless HOARD_NO_LOCK_OPT is set, and the pthread_create / thr_create
// replacements in this file set it to 1.
#if HOARD_NO_LOCK_OPT
// Disable lock optimization.
volatile int anyThreadCreated = 1;
#else
// The normal case. See heaplayers/spinlock.h.
volatile int anyThreadCreated = 0;
#endif

// True iff a user's thread stack has been assigned
// (see our definition of pthread_attr_setstackaddr).
// Disable the various thread stack based optimizations
// if anyone dynamically creates a special thread stack.
// Once set, malloc and free in this file bypass the TLAB.

volatile int anyThreadStackCreated = 0;

/// The maximum amount of memory that each TLAB may hold, in bytes.
/// Passed as a template argument to the TLAB typedef.
enum { MAX_MEMORY_PER_TLAB = 64 * 1024 };

/// The maximum number of threads supported (sort of).
/// Passed as the first template argument of HoardHeap.
enum { MaxThreads = 1024 };

/// The maximum number of heaps supported.
/// Passed as the second template argument of HoardHeap.
enum { NumHeaps = 128 };

#include "computethreadstacksize.h"
#include "cpuinfo.h"
#include "hoard.h"
#include "heapmanager.h"
#include "tlab.h"

// Define HL_EXECUTABLE_HEAP as 1 in heaplayers/hldefines.h if you
// want that (i.e., you're doing dynamic code generation).

// Protection flags passed to mmap when pthread_create (below) maps a
// thread stack. PROT_EXEC is added only when HL_EXECUTABLE_HEAP is set.
#if HL_EXECUTABLE_HEAP
#define HOARD_MMAP_PROTECTION_MASK (PROT_READ | PROT_WRITE | PROT_EXEC)
#else
#define HOARD_MMAP_PROTECTION_MASK (PROT_READ | PROT_WRITE)
#endif


/// The concrete heap type used throughout this file: a HeapManager
/// (locked with TheLockType) over a HoardHeap parameterized by
/// MaxThreads and NumHeaps. It adds no members of its own.
class TheCustomHeapType :
  public HeapManager<TheLockType, HoardHeap<MaxThreads, NumHeaps> > {};

/// Return the custom (Hoard) heap.

inline static TheCustomHeapType * getCustomHeap (void) {
  // The heap is built lazily, by placement new into static storage, the
  // first time this function runs. That guarantees it is constructed
  // before anyone uses it, whatever the static-initialization order of
  // the rest of the program.

  // Raw storage for the heap object (an array of doubles large enough
  // to hold one TheCustomHeapType).
  static double heapStorage[sizeof(TheCustomHeapType) / sizeof(double) + 1];

  // Constructed exactly once; later calls just return the pointer.
  static TheCustomHeapType * theHeap = new (heapStorage) TheCustomHeapType;
  return theHeap;
}

// Windows-only compiler settings: trim <windows.h>, allow deep inlining,
// and silence warning 4273.
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#pragma inline_depth(255)
#pragma warning(disable:4273)
#endif

/// The thread-local allocation buffer type used by malloc/free below.
/// Its size classes come from bins<NoHeader, SUPERBLOCK_SIZE>, it may hold
/// up to MAX_MEMORY_PER_TLAB bytes, and it is backed by the per-thread
/// heap type of TheCustomHeapType.
typedef ThreadLocalAllocationBuffer<bins<NoHeader, SUPERBLOCK_SIZE>::NUM_BINS,
				    bins<NoHeader, SUPERBLOCK_SIZE>::getSizeClass,
				    bins<NoHeader, SUPERBLOCK_SIZE>::getClassSize,
				    MAX_MEMORY_PER_TLAB,
				    TheCustomHeapType::SuperblockType,
				    SUPERBLOCK_SIZE,
				    TheCustomHeapType::PerThreadHeap> TLAB;

// On Windows each thread gets raw storage for one TLAB (tlabBuf) plus a
// pointer to it (tlab); the pointer stays NULL until the TLAB is
// constructed by getTLABslowPath().
#if defined(_WIN32) // defined(USE_THREAD_KEYWORD) && defined(_WIN32)
__declspec(thread) char tlabBuf[sizeof(TLAB)];
__declspec(thread) TLAB * tlab;
#endif


/// The address of the main stack (before any thread is created).
/// Recorded by the first call to the non-Windows getTLAB(), which stores
/// its masked stack address here and afterwards treats any matching
/// address as "the main thread".
static void * mainThreadStackLocation = 0x0;

#if defined(_WIN32)

/// Construct the calling thread's TLAB inside its thread-local buffer,
/// attach it to the heap returned by getHeap(), and cache the pointer
/// in the thread-local variable tlab.
static TLAB * getTLABslowPath (void) {
  TheCustomHeapType * heap = getCustomHeap();
  tlab = new (tlabBuf) TLAB (&heap->getHeap());
  return tlab;
}

/// Return the calling thread's TLAB (Windows version).
static __forceinline TLAB * getTLAB (void) {
  // The TLAB pointer is plain thread-specific data: use it if it has been
  // set, otherwise construct the TLAB on demand.
  return (tlab != NULL) ? tlab : getTLABslowPath();
}

#else // !defined(_WIN32)

/// Return the current thread-local allocation buffer (TLAB).
static inline TLAB * getTLAB (void) {
  // A thread's TLAB sits at the base of its stack region, so it can be
  // found by rounding the address of any local variable down to a multiple
  // of the thread stack size. This only works with aligned thread stacks.
  char stackProbe;
  const unsigned long alignMask = ~((unsigned long) ComputeThreadStackSize::VALUE - 1UL);
  const unsigned long probeAddress = (unsigned long) &stackProbe;
  TLAB * candidate = (TLAB *) (probeAddress & alignMask);

  // The very first call is taken to come from the main thread; remember
  // where its (masked) stack lives.
  if (mainThreadStackLocation == 0x0) {
    mainThreadStackLocation = candidate;
  }

  // Any other stack region: the TLAB is at the masked address itself.
  if (candidate != (TLAB *) mainThreadStackLocation) {
    return candidate;
  }

  // Main thread: its TLAB is a function-local static, constructed the
  // first time control reaches this point.
  static TLAB mainTLAB (&getCustomHeap()->getHeap());
  return &mainTLAB;
}

#endif // !defined(_WIN32)


extern "C" void * malloc (size_t sz) {

  if (sz < 2 * sizeof(size_t)) {
    // Make sure it's at least big enough to hold two pointers. 
    sz = 2 * sizeof(size_t);
  }

  // Allocate small objects locally.
  if (sz <= TheCustomHeapType::BIG_OBJECT) {
    // Use the TLAB, if we haven't created a stack yet.
    if (!anyThreadStackCreated) {
      TLAB * t = getTLAB();
      return t->malloc (sz);
    }
  }

  {
    // Otherwise, just use the base heap.
    // Wrapped in braces here to avoid the static check
    // when not needed.
    static TheCustomHeapType * heap = getCustomHeap();
    void * ptr = heap->malloc (sz);
    return ptr;
  }
}


/// Replacement for free.
/// A NULL pointer is ignored, as the C standard requires. Otherwise the
/// object's size is looked up in the Hoard heap: small objects (up to
/// BIG_OBJECT bytes) are returned to the calling thread's TLAB unless a
/// custom thread stack has been installed; everything else is returned
/// directly to the shared heap.
extern "C" void free (void * ptr) {

  // free(NULL) must be a no-op; do not hand NULL to the size lookup.
  if (ptr == NULL) {
    return;
  }

  // Use the TLAB for small objects, and if we haven't created a thread stack.
  size_t sz = getCustomHeap()->getSize (ptr);
  if (!anyThreadStackCreated && (sz <= TheCustomHeapType::BIG_OBJECT)) {
    TLAB * t = getTLAB();
    t->free (ptr);
  } else {
    static TheCustomHeapType * heap = getCustomHeap();
    heap->free (ptr);
  }
}

/*** below are generic replacement functions for the malloc family ***/

/// Replacement for calloc: allocate zero-filled storage for nelem
/// elements of elsize bytes each.
/// Returns NULL if the allocation fails or if nelem * elsize does not
/// fit in a size_t (previously the product silently wrapped around and
/// a too-small block was returned).
extern "C" void * calloc (size_t nelem, size_t elsize)
{
  size_t n = nelem * elsize;
  // Detect multiplication overflow: dividing back must recover elsize.
  if (nelem != 0 && n / nelem != elsize) {
    return NULL;
  }
  void * ptr = malloc (n);
  // Zero out the malloc'd block.
  if (ptr != NULL) {
    memset (ptr, 0, n);
  }
  return ptr;
}


/// Replacement for strndup: return a newly malloc'd, NUL-terminated copy
/// of at most sz characters of s.
/// Returns NULL if s is NULL or the allocation fails.
/// The length is bounded by sz on every platform (the previous non-Linux
/// branch used strlen and ignored sz), and the scan never reads past the
/// first sz characters of s.
extern "C" char * strndup (const char * s, size_t sz)
{
  if (s == NULL) {
    return NULL;
  }

  // Length of s, capped at sz (a portable strnlen).
  size_t cappedLength = 0;
  while (cappedLength < sz && s[cappedLength] != '\0') {
    ++cappedLength;
  }

  char * newString = (char *) malloc (cappedLength + 1);
  if (newString != NULL) {
    memcpy (newString, s, cappedLength);
    newString[cappedLength] = '\0';
  }
  return newString;
}

/// Replacement for strdup: return a newly malloc'd copy of s, or NULL if
/// s is NULL or the allocation fails.
extern "C" char * strdup (const char * s)
{
  if (s == NULL) {
    return NULL;
  }

  // Room for the characters plus the terminating NUL.
  char * copy = (char *) malloc (strlen (s) + 1);
  if (copy != NULL) {
    strcpy (copy, s);
  }
  return copy;
}


/// Replacement for realloc, working directly on the shared Hoard heap.
///  - ptr == NULL behaves like an allocation of sz bytes.
///  - sz == 0 frees ptr and returns NULL.
///  - If the existing object is already at least sz bytes, ptr is
///    returned unchanged.
///  - Otherwise a new block is allocated, the old contents are copied
///    over, and the old block is freed.
/// If the new allocation fails, NULL is returned and the original block
/// is left intact (previously it was freed, losing the caller's data).
extern "C" void * realloc (void * ptr, size_t sz)
{
  static TheCustomHeapType * theCustomHeap = getCustomHeap();
  if (ptr == NULL) {
    return theCustomHeap->malloc (sz);
  }
  if (sz == 0) {
    theCustomHeap->free (ptr);
    return NULL;
  }

  size_t objSize = theCustomHeap->getSize (ptr);
  if (objSize >= sz) {
    // The current block is already big enough.
    return ptr;
  }

  void * buf = theCustomHeap->malloc (sz);
  if (buf == NULL) {
    // Out of memory: the caller still owns the untouched original.
    return NULL;
  }

  // Here objSize < sz, so the whole old object fits in the new block.
  memcpy (buf, ptr, objSize);

  // Free the old block and hand back the new one.
  theCustomHeap->free (ptr);
  return buf;
}


/// Throwing scalar new, routed through malloc.
/// Throws std::bad_alloc when the allocation fails: callers of the
/// throwing form are entitled to assume the result is never NULL.
void * operator new (size_t sz)
{
  void * ptr = malloc (sz);
  if (ptr == NULL) {
    throw std::bad_alloc();
  }
  return ptr;
}

namespace std {
  struct nothrow_t;
}

/// Non-throwing scalar new: returns NULL when the allocation fails.
void * operator new (size_t sz, const std::nothrow_t&) throw() {
  return malloc (sz);
}

/// Scalar delete, routed through free.
void operator delete (void * ptr)
{
  free (ptr);
}

/// Throwing array new, routed through malloc.
/// Throws std::bad_alloc when the allocation fails.
void * operator new[] (size_t sz) {
  void * ptr = malloc (sz);
  if (ptr == NULL) {
    throw std::bad_alloc();
  }
  return ptr;
}

/// Non-throwing array new: returns NULL when the allocation fails.
void * operator new[] (size_t sz, const std::nothrow_t&) throw() {
  return malloc (sz);
}

/// Array delete, routed through free.
void operator delete[] (void * ptr)
{
  free (ptr);
}


/// Stand-in for memalign.
/// NOTE: the alignment argument is ignored; this simply forwards the
/// size to malloc.
extern "C" void * memalign (size_t, size_t size)
{
  void * block = malloc (size);
  return block;
}

/// Stand-in for valloc: forwards to malloc without adding any
/// alignment of its own.
extern "C" void * valloc (size_t sz) {
  void * block = malloc (sz);
  return block;
}

/// Stand-in for pvalloc: forwards to malloc without rounding the size
/// or adding any alignment of its own.
extern "C" void * pvalloc (size_t sz) {
  void * block = malloc (sz);
  return block;
}

/// Stand-in for mallopt: no tuning option is supported, so every
/// request reports failure (0).
extern "C" int mallopt (int, int) {
  const int failure = 0;
  return failure;
}


#if !defined(_WIN32)

/****************/
/***** UNIX *****/
/****************/

/* Here is where we hijack pthread_create and company.
   NOTE: Relies on libpthread being a shared library. */

#include <dlfcn.h>
#include <errno.h>
#include <pthread.h>

#include <utility> // STL

typedef char * getcwdFunction (char *, size_t);

/// Replacement for getcwd that makes the "allocate a buffer for me" case
/// (buf == NULL) allocate through this file's malloc, then delegates to
/// the next getcwd in link order.
///
/// @param buf   caller's buffer, or NULL to have one allocated.
/// @param size  size of buf; when buf is NULL, 0 means PATH_MAX.
/// @return the buffer holding the path, or NULL on failure. A buffer
///         allocated here is released again if the real getcwd fails
///         (previously it leaked), and an allocation failure is reported
///         as NULL instead of being passed on as a NULL buffer.
extern "C" char * getcwd (char * buf, size_t size)
{
  static getcwdFunction * real_getcwd
    = (getcwdFunction *) dlsym (RTLD_NEXT, "getcwd");

  // Same policy as the other interposed functions in this file: without
  // the real implementation there is nothing sensible to do.
  if (real_getcwd == NULL) {
    abort();
  }

  if (buf != NULL) {
    // Caller supplied the storage; just forward.
    return (real_getcwd)(buf, size);
  }

  if (size == 0) {
    size = PATH_MAX;
  }
  char * ownBuf = (char *) malloc (size);
  if (ownBuf == NULL) {
    return NULL;
  }

  char * result = (real_getcwd)(ownBuf, size);
  if (result == NULL) {
    // The caller never sees ownBuf on failure, so release it here.
    free (ownBuf);
  }
  return result;
}


// Signature of a thread entry point (C linkage), as taken by
// pthread_create / thr_create.
extern "C" {
typedef void * (*threadFunctionType) (void *);
}

// Pointer to the real pthread_attr_setstackaddr, looked up with dlsym.
typedef
int (*pthread_attr_setstackaddr_function) (pthread_attr_t * attr,
					   void * stack);

// Pointer to the real pthread_create, looked up with dlsym.
typedef  
int (*pthread_create_function) (pthread_t *thread,
				const pthread_attr_t *attr,
				threadFunctionType start_routine,
				void *arg);

// Pointer to the real thread-exit routine (pthread_exit or thr_exit),
// looked up with dlsym.
typedef
void (*pthread_exit_function) (void *);

// A special routine we call on thread exits to free up some resources.
static void exitRoutine (void) {

  // Clear the TLAB's buffer.
  TLAB * t = getTLAB();
  t->clear();

  // Relinquish the assigned heap.
  getCustomHeap()->releaseHeap();
}


extern "C" void * startMeUp (void * a)
{
  getCustomHeap()->findUnusedHeap();

  // Instantiate the thread-local allocation buffer here.
  char dummy;
  void * buf = (void *) (((unsigned long) &dummy) & ~(ComputeThreadStackSize::VALUE-1));
  TheCustomHeapType::PerThreadHeap * th = &getCustomHeap()->getHeap();
  TLAB * tl = new (buf) TLAB (th);
  
  pair<threadFunctionType, void *> * z
    = (pair<threadFunctionType, void *> *) a;
  threadFunctionType f = z->first;
  void * arg = z->second;
  delete z;
  void * result = (*f)(arg);
  exitRoutine();
  return result;
}

/// Return the next pthread_attr_setstackaddr in link order (the one this
/// file's replacement shadows). The lookup is done once and cached; the
/// process aborts if the symbol cannot be found.
pthread_attr_setstackaddr_function getReal_pthread_attr_setstackaddr (void) {
  static pthread_attr_setstackaddr_function realFunction = NULL;

  // Already resolved on an earlier call.
  if (realFunction != NULL) {
    return realFunction;
  }

  realFunction = (pthread_attr_setstackaddr_function)
    dlsym (RTLD_NEXT, "pthread_attr_setstackaddr");
  if (realFunction == NULL) {
    abort();
  }
  return realFunction;
}


extern "C" int pthread_attr_setstackaddr (pthread_attr_t * attr,
					  void * stack)
{
  // Someone has created their own thread stack.
  // Now we have to abandon using thread-local allocation buffers :(.
  anyThreadStackCreated = true;
  return (getReal_pthread_attr_setstackaddr())(attr, stack);
}

extern "C" int pthread_create (pthread_t *thread,
			       const pthread_attr_t *attr,
			       void * (*start_routine) (void *),
			       void * arg)
{
  static pthread_create_function f = NULL;
  static TLAB * t = getTLAB(); // Force initialization of the TLAB before our first thread is created.
#if defined(linux)
  char fname[] = "pthread_create";
#else
  char fname[] = "_pthread_create";
#endif
  
  if (f == NULL) {
    f = (pthread_create_function) dlsym (RTLD_NEXT, fname);
    if (f == NULL) {
      abort();
    }
  }

  anyThreadCreated = 1;

  // Check to see whether the user set a custom stack size.
  // If so, we have to revert to using pthread_self() to get
  // thread IDs (see heaplayers/cpuinfo.h).

  size_t size;
  pthread_attr_getstacksize ((pthread_attr_t *) attr, &size);

  if (!anyThreadStackCreated) {

    // Use an aligned thread stack.
    // This allows us to compute thread IDs very cheaply: we take
    // address of a stack variable and mask it. Again, see
    // heaplayers/cpuinfo.h.
    int stackSize = ComputeThreadStackSize::VALUE;

    // NOTE: The default permissions here make the thread stack *non* executable.
    // This may provide some measure of safety from stack smashing attacks.
    // It may also interfere with JIT compilation.

#if defined(__SVR4) && defined(MAP_ALIGN)
    void * buf = mmap ((char *) stackSize, stackSize, HOARD_MMAP_PROTECTION_MASK, MAP_PRIVATE | MAP_ALIGN | MAP_ANON, -1, 0);
#else
    // We have to align the buffer ourselves.
    // Get a big chunk from mmap,
    // then unmap the non-aligned extra parts.

    //  012  345678  9..
    // [---][------][---]
    //  ^    ^
    //  |    +- buf
    //  |
    //  originalBuf

    void * originalBuf = mmap (NULL, stackSize * 2, HOARD_MMAP_PROTECTION_MASK, MAP_PRIVATE | MAP_ANON, -1, 0);

    // Get one aligned part.

    void * buf = (void *) (((size_t) originalBuf + stackSize - 1) & ~(stackSize - 1));

    // Chop off the extra part below the stack.
    munmap (originalBuf, (size_t) buf - (size_t) originalBuf - 1);


    // Chop off the extra part above the stack.
    munmap ((void *) ((size_t) buf + stackSize), ((size_t) originalBuf + stackSize * 2 - ((size_t) buf + stackSize - 1)));

#endif

    char * stack = (char *) buf + sizeof(TLAB);
    stackSize -= sizeof(TLAB);

    (getReal_pthread_attr_setstackaddr()) ((pthread_attr_t *) attr, stack);
    pthread_attr_setstacksize ((pthread_attr_t *) attr, stackSize);
  }
  
  pair<threadFunctionType, void *> * newarg
    = new pair<threadFunctionType, void *>(start_routine, arg);
  
  int result = (*f)(thread, attr, startMeUp, newarg);
  
  return result;
}

extern "C" void pthread_exit (void * arg)
{
#if defined(linux)
  static pthread_exit_function f
    = (pthread_exit_function) dlsym (RTLD_NEXT, "pthread_exit");
#else
  static pthread_exit_function f
    = (pthread_exit_function) dlsym (RTLD_NEXT, "_pthread_exit");
#endif

  exitRoutine();
  (*f)(arg);
}

#if defined(__SVR4) // Solaris

typedef
int (*thr_create_function) (void * stack_base,
			   size_t stack_size,
			   void * (*start_func) (void *),
			   void * arg,
			   long flags,
			   thread_t *new_thread_ID);


extern "C" int thr_create (void * stack_base,
			   size_t stack_size,
			   void * (*start_func) (void *),
			   void * arg,
			   long flags,
			   thread_t *new_thread_ID)
{
  static thr_create_function f = NULL;
  char fname[] = "thr_create";

  if (f == NULL) {
    f = (thr_create_function) dlsym (RTLD_NEXT, fname);
    if (f == NULL) {
      abort();
    }
  }

  anyThreadCreated = 1;
  
  pair<threadFunctionType, void *> * newarg
    = new pair<threadFunctionType, void *>(start_func, arg);
  
  if (stack_base != NULL) {
    anyThreadStackCreated = true;
  }

  int result = (*f)(stack_base, stack_size, startMeUp, newarg, flags, new_thread_ID);
  
  return result;
}

extern "C" void thr_exit (void * arg)
{
  static pthread_exit_function f
    = (pthread_exit_function) dlsym (RTLD_NEXT, "thr_exit");
  exitRoutine();
  (*f)(arg);
}
#endif


#elif defined(_WIN32)

/*****************/
/***** WIN32 *****/
/*****************/

#include <stdio.h>

/// DLL entry point: keeps the per-thread heap state in step with thread
/// creation and destruction on Windows.
///  - DLL_PROCESS_ATTACH: prints the Hoard banner to stderr.
///  - DLL_THREAD_ATTACH:  assigns the new thread a heap (heap 0 on a
///    single-processor machine) and resets its TLAB pointer so the TLAB
///    is constructed on first use.
///  - DLL_THREAD_DETACH:  empties the thread's TLAB, if it ever created
///    one, and on multiprocessors releases the thread's heap.
/// Always returns TRUE.
extern "C"
BOOL WINAPI DllMain(HANDLE hinstDLL, DWORD fdwReason, LPVOID lpreserved)
{
  static int np = CPUInfo::computeNumProcessors();

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    fprintf (stderr, "This software uses the Hoard scalable memory allocator (version 3.2.2).\nCopyright (C) 2004 Emery Berger, The University of Texas at Austin,\nand University of Massachusetts, Amherst.\nFor more information, see http://www.hoard.org\n");
    break;

  case DLL_THREAD_ATTACH:
    if (np == 1) {
      // Assign the thread to heap 0.
      getCustomHeap()->chooseZero();
    } else {
      getCustomHeap()->findUnusedHeap();
    }
    tlab = NULL;
    break;

  case DLL_THREAD_DETACH:
    // Dump the memory from the TLAB. The pointer is still NULL for a
    // thread that never went through getTLAB(), so there is nothing to
    // clear in that case (previously this dereferenced NULL).
    if (tlab != NULL) {
      tlab->clear();
    }
    if (np != 1) {
      getCustomHeap()->releaseHeap();
    }
    break;

  default:
    return TRUE;
  }

  return TRUE;
}

#endif
