/*
  Copyright Mission Critical Linux, 2000

  Kimberlite is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  Kimberlite is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Kimberlite; see the file COPYING.  If not, write to the
  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
  MA 02139, USA.
*/
/*
 * Author: Tim Burke (burke@missioncriticallinux.com)
 * $Revision: 1.8 $
 *
 * Definitions related to the shared state disk subsystem.
 */
#ifndef	_DISKSTATE_H
#define	_DISKSTATE_H	1

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <time.h>
#include <clu_lock.h>
#include "clusterdefs.h"

/* 
 * The following defines describe the set of configurable parameters which
 * can be specified in the cluster configuration file.
 *
 * "quorum%sameTimeNetup" & "quorum%sameTimeNetdown"
 * The main mechanism used to detect if a partner node has died is to
 * read in the timestamp to see if it has changed.  It's probably not
 * prudent to assume that a partner is dead merely after missing only one
 * timestamp update.  Perhaps they hit an IO activity spike or something like
 * that.  The "quorum%sameTimeNetdown" parameter defines the number of 
 * consecutive polls that the timestamp is allowed to remain unchanging 
 * before initiating a shootdown of the failed partner cluster node.  
 * If the heartbeat daemon believes the other node to
 * be up, we will give it longer to update its disk timestamp based on the
 * "quorum%sameTimeNetup" parameter.  This allows
 * us to have quicker failover times for the case where a node is truly down;
 * while allowing us to not prematurely suspect a node is down due to an
 * IO activity spike. Therefore the value of the "quorum%sameTimeNetup"
 * parameter should be greater than the "quorum%sameTimeNetdown" parameter.
 * These 2 parameters are optional.
 *                                      
 * "quorum%pingInterval"
 * This parameter specifies the number of seconds to delay between
 * each iteration of the quorumd process.  It controls how often the
 * interval timer is updated on disk, as well as how often the
 * partner node's interval timer is read in.
 * This parameter is optional.
 *                                                          
 * "quorum%logLevel"
 * Controls the level of diagnostic messages.  
 *
 * "quorum%scanDelay"
 * Controls the delay between reading in a category of shared state disk
 * information for the purposes of performing a read verification check.
 * This will result in repair of any corruption due to user error.
 */                      

/* Keys used to look up quorum settings in the cluster configuration file. */

/* Seconds between iterations of the quorumd process. */
#define CFG_DISK_PING_INTERVAL "quorum%pingInterval"

/* Unchanged-timestamp polls tolerated while the heartbeat reports the partner up. */
#define CFG_DISK_SAMETIME_NETUP "quorum%sameTimeNetup"

/* Unchanged-timestamp polls tolerated before a partner shootdown is initiated. */
#define CFG_DISK_SAMETIME_NETDOWN "quorum%sameTimeNetdown"

/* NOTE(review): this key has no description in this header -- confirm meaning. */
#define CFG_DISK_POWER_CHECK_INTERVAL "quorum%powerCheckInterval"

/* Level of diagnostic messages. */
#define CFG_DISK_VERBOSE "quorum%logLevel"

/* Delay between read-verification scans of shared state categories. */
#define CFG_DISK_SCAN_DELAY "quorum%scanDelay"

/*
 * Definitions of on disk formats for data structures representing
 * shared cluster state.
 */
/* Magic numbers identifying each on-disk structure (all arbitrarily chosen). */
#define SHARED_STATE_MAGIC_NUMBER  0x39119FCD
#define STATUS_BLOCK_MAGIC_NUMBER  0xF1840DCE
#define SERVICE_BLOCK_MAGIC_NUMBER 0x19FAD022
#define LOCK_BLOCK_MAGIC_NUMBER    0xFEEDFACE
#define CONFIG_DB_MAGIC_NUMBER     0x0122345F
#define NET_BLOCK_MAGIC_NUMBER     0xF08DEC99

/* Latest header version number for each on-disk structure. */
#define SHARED_STATE_LATEST_VERSION  1
#define STATUS_BLOCK_LATEST_VERSION  1
#define SERVICE_BLOCK_LATEST_VERSION 1
#define LOCK_BLOCK_LATEST_VERSION    1
#define CONFIG_DB_LATEST_VERSION     1
#define NET_BLOCK_LATEST_VERSION     1

/* Maximum number of tolerated successive I/O errors. */
#define MAX_CONSECUTIVE_IO_ERRORS 1
/* Bounce I/O length limit in bytes: 1 page. */
#define MAX_BOUNCEIO_LENGTH 4096

/*
 * When DISKSTATE_DEBUG is defined it enables a set of debug validation checks.
 */
//#define DISKSTATE_DEBUG 1

/*
 * All shared state data structures live at fixed byte offsets inside the
 * partition.  The first of these offsets is that of the header, which sits
 * at the very start.
 */
#define OFFSET_HEADER 0

/*
 * Shared state disk header.  Describes cluster global information.
 *
 * This is an on-disk format, so members must not be reordered or resized.
 * The node status blocks start at OFFSET_FIRST_STATUS_BLOCK, so this
 * structure has to stay smaller than that offset.
 * NOTE(review): ulong and time_t have platform-dependent widths, so the
 * layout presumably only matches between nodes built for the same ABI --
 * confirm.  MAX_NODES, MAXHOSTNAMELEN and MAX_DESC are not defined in this
 * header (presumably clusterdefs.h or system headers).
 */
typedef struct {
	ulong	magic_number;		// presumably SHARED_STATE_MAGIC_NUMBER - confirm
        ulong   check_sum;		// checksum; algorithm not visible in this header
	int 	version;		// presumably SHARED_STATE_LATEST_VERSION - confirm
	char	nodenames[MAX_NODES][MAXHOSTNAMELEN];	// one name slot per node
	char	description[MAX_DESC];
	time_t	timestamp;		// time of last update
	int	updateNodenum;		// ID of node doing last update
        ulong   pad;            // makes check sum easier
} SharedStateHeader;

/*
 * The node specific status structures will be after the shared state header.
 * Specify an offset which will safely exceed the size of the header by
 * an amount likely to meet disk block granularity sizes (which are assumed
 * to be in units of 512 bytes).
 * XXX - actually these offsets should be dynamically calculated.
 * Following that give each node specific status structure a comfortable
 * amount of space.  The node will write its own status at a location
 * equal to (OFFSET_FIRST_STATUS_BLOCK + (nodeNumber * SPACE_PER_STATUS_BLOCK))
 */
/* Bytes reserved for each node's status block. */
#define SPACE_PER_STATUS_BLOCK 2048
/* Byte offset of the status block belonging to node number 0. */
#define OFFSET_FIRST_STATUS_BLOCK 2048

/*
 * Node specific status structure.  Used to represent which services an
 * individual node is serving.  Also contains the interval timer which gets
 * updated to indicate that the server is alive.
 * Under normal operating circumstances, this status structure is 
 * write-only by the server and read-only to the other cluster members.
 *
 * I wanted to try to keep the size of this to be under 512 bytes in
 * length in order to enable it to be written atomically to the disk.
 * Initially I had the services as an array of character (strings) but
 * that ended up taking up too much space.
 *
 * This is an on-disk format, so members must not be reordered or resized.
 * NOTE(review): ulong and time_t have platform-dependent widths -- confirm
 * that all cluster members share the same layout.
 */
typedef struct {
	ulong	magic_number;		// presumably STATUS_BLOCK_MAGIC_NUMBER - confirm
        ulong   check_sum;		// checksum; algorithm not visible in this header
	int 	version;		// presumably STATUS_BLOCK_LATEST_VERSION - confirm
	char	nodename[MAXHOSTNAMELEN];
	time_t	timestamp;		// time of last update
	int	updateNodenum;		// ID of node doing last update
	time_t	incarnationNumber;	// cluster startup time
	int	state;			// running or stopped
	time_t  configTimestamp;        // date on config database
        ulong   pad;            // makes check sum easier
} NodeStatusBlock;

/*
 * Each node has a "Lock Block" which is used to provide synchronized
 * access to the service state descriptions.
 */
/* Bytes reserved for each lock block. */
#define SPACE_PER_LOCK_BLOCK 512
/* Byte offset of the first lock block within the partition. */
#define OFFSET_FIRST_LOCK_BLOCK 8192

/* DiskLockBlock defined in clu_lock.h */

/*
 * Allowable values representing the lockData member.
 * XXX - need to coordinate these values with Dave.
 */
#define DISK_LOCK_FREE  0 /* Lock not held */
#define DISK_LOCK_TAKEN 1 /* Lock held */

/*
 * The service descriptions are kept separate from the timestamp info
 * used by the quorum daemon.  There is a separate service description block
 * per service.
 */
/* Bytes reserved for each service description block. */
#define SPACE_PER_SERVICE_BLOCK 512
/* Byte offset of the first service description block. */
#define OFFSET_FIRST_SERVICE_BLOCK 10240

/*
 * DiskServiceBlock
 *
 * This structure is the on-disk representation of a service.
 * Contained within it is the memory resident version of a service.
 *
 * One of these is kept per service, each in its own slot of
 * SPACE_PER_SERVICE_BLOCK bytes starting at OFFSET_FIRST_SERVICE_BLOCK.
 * NOTE(review): the structure presumably has to fit in one such slot; its
 * size depends on ServiceBlock, which is not visible here -- confirm.
 * This is an on-disk format, so members must not be reordered or resized.
 */
typedef struct {
        ulong	magic_number;		// presumably SERVICE_BLOCK_MAGIC_NUMBER - confirm
        ulong   check_sum;		// checksum; algorithm not visible in this header
	int 	version;		// presumably SERVICE_BLOCK_LATEST_VERSION - confirm
	ServiceBlock svcblk;	// defined in clusterdefs.h
        ulong   pad;            // makes check sum easier
} DiskServiceBlock;

/*
 * The cluster configuration "database" (formerly /etc/cluster.cfg) is
 * stored on the shared state partition to avoid synchronization issues
 * inherent in a filesystem based scheme.  Place this after the service
 * descriptions with a little buffer zone for safety.
 * Arbitrarily chose a size of 1MB for the max length.
 */
/* Maximum length in bytes of the configuration database data (1MB). */
#define SPACE_FOR_CONFIG_DATABASE (1024 * 1024)
/* Bytes reserved for the configuration database header. */
#define SPACE_DB_HEADER 512
/* The header starts 1024 bytes past the end of the last service block. */
#define OFFSET_CONFIG_DB_HEADER \
	(OFFSET_FIRST_SERVICE_BLOCK + (SPACE_PER_SERVICE_BLOCK * MAX_SERVICES) + 1024)
/* The database data follows immediately after the header space. */
#define OFFSET_CONFIG_DB_DATA (OFFSET_CONFIG_DB_HEADER + SPACE_DB_HEADER)
/*
 * DiskConfigDBHeader
 *
 * Header describing the configuration database.  The actual contents or
 * data in the "database" is treated as a binary "blob". 
 * Must not exceed SPACE_DB_HEADER in size!
 *
 * The header is located at OFFSET_CONFIG_DB_HEADER and the blob it
 * describes at OFFSET_CONFIG_DB_DATA.
 * This is an on-disk format, so members must not be reordered or resized.
 */
typedef struct {
	ulong	magic_number;	// presumably CONFIG_DB_MAGIC_NUMBER - confirm
        ulong   check_sum;	// presumably covers this header - confirm
        ulong   db_check_sum;	// presumably covers the database blob - confirm
	int 	version;	// presumably CONFIG_DB_LATEST_VERSION - confirm
	ssize_t	length;		// Amount of real "data" saved.
				// NOTE(review): signed type; presumably bounded
				// by SPACE_FOR_CONFIG_DATABASE - confirm.
} DiskConfigDBHeader;

/*
 * The "net block" is used to store communication authentication info.
 * It is stored on the shared state partition as it is a common token among
 * cluster members.
 */
/* Total bytes reserved on disk for the net block. */
#define SPACE_FOR_NET_BLOCK (512)
/* Bytes of data payload carried inside the net block. */
#define SPACE_NET_BLOCK_DATA (450)
/* The net block starts 1024 bytes past the end of the config database area. */
#define OFFSET_NET_BLOCK \
	(OFFSET_CONFIG_DB_DATA + SPACE_FOR_CONFIG_DATABASE + 1024)
/*
 * DiskNetBlock
 *
 * Starts with the usual boilerplate header, followed by the actual data
 * payload itself.  
 * Non-ideal in that the data portion isn't block aligned.
 *
 * Located at OFFSET_NET_BLOCK.
 * NOTE(review): the whole structure presumably has to fit within
 * SPACE_FOR_NET_BLOCK bytes -- confirm.
 * This is an on-disk format, so members must not be reordered or resized.
 */
typedef struct {
	ulong	magic_number;		// presumably NET_BLOCK_MAGIC_NUMBER - confirm
        ulong   check_sum;		// checksum; algorithm not visible in this header
	int 	version;		// presumably NET_BLOCK_LATEST_VERSION - confirm
	int	state;			// valid or uninitialized
	char	data[SPACE_NET_BLOCK_DATA];	// data payload, treated as opaque here
} DiskNetBlock;
/* Values stored in the state field of DiskNetBlock. */
#define DISK_NET_BLOCK_INVALID 0 /* uninitialized, inactive */
#define DISK_NET_BLOCK_VALID   1 /* currently in use */

/*
 * Minimum size, in bytes, that the shared state partition must have.
 * The net block is the last thing laid out on the partition, so the end
 * of its reserved space defines the sizing requirement.
 */
#define END_OF_DISK ((OFFSET_NET_BLOCK) + (SPACE_FOR_NET_BLOCK))

/*
 * BOUNCEIO_READ / BOUNCEIO_WRITE name the routines used for disk reads and
 * writes.  With DO_RAW_BOUNCEIO defined (as it is here) they map to
 * diskRawRead / diskRawWrite, which support the buffer alignment that
 * RAW IO requires.  Without it they map directly to read / write.
 */
#define DO_RAW_BOUNCEIO

#ifndef DO_RAW_BOUNCEIO
#  define BOUNCEIO_READ  read
#  define BOUNCEIO_WRITE write
#else /* DO_RAW_BOUNCEIO */
#  define BOUNCEIO_READ  diskRawRead
#  define BOUNCEIO_WRITE diskRawWrite
#endif /* DO_RAW_BOUNCEIO */
#endif /* diskstate.h */
