/*
 *             Automatically Tuned Linear Algebra Software v3.1.1Dev
 **************** THIS IS AN UNSUPPORTED DEVELOPER RELEASE *****************
 *                      (C) Copyright 1999 Camm Maguire                      
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University, the ATLAS group, or the names of its 
 *      contributers may not be used to endorse or promote products derived
 *      from this software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */
#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>

#include "camm_util.h"


/*
 * Compile-time guard for ALIGN builds: the only accepted combination is
 * SREAL together with either STRIDE being a multiple of 4 or NDPM == 1.
 * Any other configuration that defines ALIGN is rejected here.
 */
#if defined(ALIGN) && !( defined(SREAL) && ( ( STRIDE % 4 == 0 ) || ( NDPM == 1 ) ) )
#error Can only align SREAL with NDPM 1 or STRIDE % 4 = 0
#endif


/******************************************************************************
 *  Single Precision Complex Macros
 ******************************************************************************/  

#ifdef SCPLX
/*
 * Compiled only when SCPLX is defined.  The instruction emitters used below
 * (pl, pld, pls, pu, pud, pm, pa, pas, ps, px, pc, phl, pf) and the P()
 * helper are defined outside this file.
 * NOTE(review): presumably they come from camm_util.h and emit SSE
 * instructions, with P(x,1) naming register x+1 -- confirm there.
 */

#ifdef NO_TRANSPOSE

#if NDPM > 3 
#error Max NDPM is 3 for SCPLX NO_TRANSPOSE
#endif

/* plax is appended to plaa() and plaa1_2(); it expands to nothing here. */
#define plax

/*
 * Register numbers handed to dp()/ddp() by the bla1..bla4 and blb1..blb4
 * macros, one per dot product.  R3 and R4 are both 6; NDPM is limited to 3
 * by the check above.
 */
#define R1 2
#define R2 4
#define R3 6
#define R4 6

/*
 * TREG: scratch register used by dpr()/dpi().
 * SREG: receives the constant array w at the start of the kernel function
 *       and is consumed by sign().
 * CREG: register that plaa() loads from the si-relative operand.
 * SREG and CREG share register number 0 in this variant.
 */
#define TREG 1
#define SREG 0
#define CREG 0

#ifdef GER
/* GER: each dp step writes AREG back to (a_,b_) through wb()/wbd();
   the w()/w1_2() write-backs of CREG are disabled. */
#define AREG 0
#define targ(a_)        AREG
#define wb(a_,b_)       pu(AREG,a_,b_)
#define wbd(a_,b_)      pud(AREG,a_,b_)
#define w(a_)
#define w1_2(a_)
#else
/* Non-GER: products are accumulated into CREG, which w()/w1_2() write to
   offset a_##0 from si; the per-column write-backs are disabled. */
#define AREG TREG
#define targ(a_)        CREG
#define wb(a_,b_)
#define wbd(a_,b_)
#define w(a_)           pu(CREG,a_ ## 0,si)
#define w1_2(a_)        pud(CREG,a_ ## 0,si)
#endif

/* src() is the register argument itself: it is the multiplier in dpr()/dpi(). */
#define src(a_)         a_
/* mpx() fills registers a_ and P(a_,1) from offsets 0 and 4 of si and then
   applies sign().  NOTE(review): presumably a scalar load plus broadcast of
   the real and imaginary parts -- confirm pls/ps semantics. */
#define mpx(a_)         pls(0,si,a_) ps(0,a_,a_) pls(4,si,P(a_,1)) \
                        ps(0,P(a_,1),P(a_,1)) sign(a_)
#define madd(a_,b_,c_)  pas(a_,b_,c_)
/* Nothing to flush at the end of the kernel in this variant. */
#define ulfa(a_)

#else

/* Transpose variant: R1..R4 are the targets of dpr()/dpi() (targ(a_) is a_)
   and are written out by ulfa(). */
#define R1 4
#define R2 5
#define R3 6
#define R4 7

#define TREG 3
#define SREG 2
#define CREG 0
#define targ(a_)        a_
#define src(a_)         0
#define w(a_)
#define w1_2(a_)
/* mpx() expands to px(a_).  NOTE(review): presumably clears the
   accumulator -- confirm px semantics. */
#define mpx(a_)        px(a_)
/* ulfa() post-processes register a_ and writes it to offset 0 of si with
   pud(); without BETA0 the value at 0(si) is first loaded into TREG and
   added in. */
#ifdef BETA0
#define ulfa(a_)       phl(a_,0) pa(0,a_) pud(a_,0,si)
#else
#define ulfa(a_)       pl(0,si,TREG) phl(a_,0) pa(0,a_) pa(TREG,a_) pud(a_,0,si)
#endif
#define AREG TREG
#define wb(a_,b_)
#define wbd(a_,b_)
#define wbs(a_,b_)


/* Here plax derives register 1 from CREG, shuffles both with the immediate
   patterns 160 and 245, and applies sign() to CREG. */
#define plax       pc(CREG,1) ps(160,CREG,CREG) ps(245,1,1) sign(CREG)



#endif

/* sign() applies SREG to a_ itself for conjugated non-GER builds and to
   P(a_,1) in every other build. */
#if defined(Conj_) && ! defined(GER) 
#define sign(a_)       pm(SREG,a_)
#else		   
#define sign(a_)       pm(SREG,P(a_,1))
#endif



/* Load (a_,b_) into AREG; plbd() first applies px() to AREG and then uses
   the pld form of the load. */
#define plb(a_,b_)           pl(a_,b_,AREG)
#define plbd(a_,b_)          px(AREG) pld(a_,b_,AREG)

/*
 * dpr:  pm(src, TREG) followed by pa(TREG, targ).
 * dprp: the same, preceded by a prefetch pf(b_,c_).
 * dpi:  uses P(src,1) and applies ps(177,...) to TREG before the pa().
 * NOTE(review): presumably the real-part and imaginary-part halves of a
 * complex multiply-accumulate -- confirm against pm/pa/ps definitions.
 */
#define dpr(a_)              pm(src(a_),TREG) pa(TREG,targ(a_))
#define dprp(a_,b_,c_)       pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
#define dpi(a_)              pm(P(src(a_),1),TREG) ps(177,TREG,TREG) pa(TREG,targ(a_))

#ifndef GER


/* Non-GER full-width step: load chunk a_ from si into CREG (plaa), then for
   each column pointer b_ reload the matrix chunk before dpr() and again
   before dpi(). */
#define plaa(a_)                pl(a_ ## 0,si,CREG) plax
#define wa(a_)                  w(a_)
#define dp(a_,b_,c_)            plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_)
#define dpp(a_,b_,c_,d_,e_)     plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_)
#define ddp(a_,b_,c_)           dp(a_,b_,c_)       
#define ddpp(a_,b_,c_,d_,e_)    dpp(a_,b_,c_,d_,e_)

/* Half-width (_1_2) counterparts built on the pld/pud forms. */
#define plaa1_2(a_)             px(CREG) pld(a_ ## 0,si,CREG) plax
#define wa1_2(a_)               w1_2(a_)
#define dp1_2(a_,b_,c_)         plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_)
#define dpp1_2(a_,b_,c_,d_,e_)  plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_)
#define ddp1_2(a_,b_,c_)        dp1_2(a_,b_,c_)       
#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)


#else

/* GER: lqc()/lqc1_2() load chunk a_ from si into TREG; it is loaded once
   before dpr() and once more before dpi(). */
#define lqc(a_)              pl(a_ ## 0,si,TREG)
#define lqc1_2(a_)           px(TREG) pld(a_ ## 0,si,TREG)


/* plaa()/wa() are empty for GER: every dp step loads its own operands and
   writes the matrix chunk back via wb()/wbd(). */
#define plaa(a_) 
#define wa(a_)
#define dp(a_,b_,c_)         lqc(a_) plb(a_ ## 0,b_) dpr(c_) \
                             lqc(a_) dpi(c_) wb(a_ ## 0,b_)
#define dpp(a_,b_,c_,d_,e_)  lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
                             lqc(a_) dpi(c_) wb(a_ ## 0,b_)
#define ddp(a_,b_,c_)        dp(a_,b_,c_)       
#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)

#define plaa1_2(a_)
#define wa1_2(a_)
#define dp1_2(a_,b_,c_)         lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \
                                lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_)
#define dpp1_2(a_,b_,c_,d_,e_)  lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
                                lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_)
#define ddp1_2(a_,b_,c_)        dp1_2(a_,b_,c_)       
#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)

#endif

#endif

/******************************************************************************
 *  Single Precision Real Macros
 ******************************************************************************/  

#ifdef SREAL
/*
 * Compiled only when SREAL is defined.  The p*() instruction emitters
 * (pl, pla, pld, pls, pu, pud, pus, pm, pmsr, pa, pas, pasr, ps, px, pc,
 * phl, pf) are defined outside this file.
 * NOTE(review): presumably camm_util.h SSE emitters -- confirm there.
 */

#ifdef NO_TRANSPOSE

/* mpx() loads from offset 0 of si into register a_ with pls() and then
   applies ps(0,...).  NOTE(review): presumably scalar load + broadcast. */
#define mpx(a_)        pls(0,si,a_) ps(0,a_,a_)
#define madd(a_,b_,c_) pas(a_,b_,c_)
#define TREG 1
/* Every product is accumulated into register 0; the R register passed to
   dpr() is the multiplier. */
#define targ(a_)        0
#define src(a_)         a_
/* Nothing to flush at the end of the kernel in this variant. */
#define ulfa(a_)

#ifdef GER
/* GER: cp copies CREG to TREG before each step and the result in AREG is
   written back to (a_,b_) by wb()/wbd()/wbs(); w()-style stores are off. */
#define w(a_)
#define w1_2(a_)
#define w1_4(a_)
#define CREG 2
#define AREG 0
#define cp pc(CREG,TREG)
#define wb(a_,b_) pu(AREG,a_,b_)
#define wbd(a_,b_) pud(AREG,a_,b_)
#define wbs(a_,b_) pus(AREG,a_,b_)
#else
/* Non-GER: CREG (register 0, also the accumulation target) is written to
   offset a_##0 of si by w()/w1_2()/w1_4(); per-column write-backs are off. */
#define CREG 0
#define AREG TREG
#define cp
#define wb(a_,b_)
#define wbd(a_,b_)
#define wbs(a_,b_)
#define w(a_)           pu(CREG,a_ ## 0,si)
#define w1_2(a_)        pud(CREG,a_ ## 0,si)
#define w1_4(a_)        pus(CREG,a_ ## 0,si)
#endif

#else

/* Transpose variant: the R register is the accumulation target and
   register 0 is the multiplier. */
#define mpx(a_)        px(a_)
/* With BETA0 the madd() step inside ulfa() is dropped. */
#ifdef BETA0
#define madd(a_,b_,c_)
#else
#define madd(a_,b_,c_) pas(a_,b_,c_)
#endif
#define TREG 3
#define targ(a_)        a_
#define src(a_)         0
#define w(a_)
#define w1_2(a_)
#define w1_4(a_)
/* ulfa() combines register a_ with itself via register 0 (phl/pa, then
   pc/ps/pa), applies madd() with offset 0 of si, and stores with pus().
   NOTE(review): presumably a horizontal sum of the accumulator followed by
   a scalar store -- confirm emitter semantics. */
#define ulfa(a_)       phl(a_,0) pa(0,a_) pc(a_,0) ps(1,0,0) pa(0,a_) \
                       madd(0,si,a_) pus(a_,0,si) 

#define CREG 0
#define AREG TREG
#define cp
#define wb(a_,b_)
#define wbd(a_,b_)
#define wbs(a_,b_)

#endif

/* plb() uses pla() instead of pl() when ALIGN is defined and either STRIDE
   is a multiple of 4 or NDPM is 1.  NOTE(review): presumably pla is the
   aligned-load form -- confirm. */
#if defined(ALIGN) && ( ( STRIDE % 4  == 0 ) || ( NDPM == 1 ) )
#define plb(a_,b_)           pla(a_,b_,AREG)
#else
#define plb(a_,b_)           pl(a_,b_,AREG)
#endif
#define plbd(a_,b_)          px(AREG) pld(a_,b_,AREG)
#define plbs(a_,b_)          pls(a_,b_,AREG)
/* dpr/dprp: pm(src,TREG) then pa(TREG,targ), optionally preceded by a
   prefetch; dprs/dprps are the same built on pmsr/pasr. */
#define dpr(a_)              pm(src(a_),TREG) pa(TREG,targ(a_))
#define dprp(a_,b_,c_)       pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
#define dprs(a_)             pmsr(src(a_),TREG) pasr(TREG,targ(a_))
#define dprps(a_,b_,c_)      pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))

/* Full-width step: plaa() loads chunk a_ from si into CREG; dp()/dpp()
   handle one column pointer b_ with register c_. */
#define plaa(a_)             pl(a_ ## 0,si,CREG) 
#define wa(a_)               w(a_)
#define dp(a_,b_,c_)         cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_)
#define dpp(a_,b_,c_,d_,e_)  cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_)
#define ddp(a_,b_,c_)        dp(a_,b_,c_)       
#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)

/* Half-width (_1_2) step built on the pld/pud forms. */
#define plaa1_2(a_)             px(CREG) pld(a_ ## 0,si,CREG) 
#define wa1_2(a_)               w1_2(a_)
#define dp1_2(a_,b_,c_)         cp plbd(a_ ## 0,b_) dpr(c_) wbd(a_ ## 0,b_)
#define dpp1_2(a_,b_,c_,d_,e_)  cp plbd(a_ ## 0,b_) dprp(c_,d_,e_) wbd(a_ ## 0,b_)
#define ddp1_2(a_,b_,c_)        dp1_2(a_,b_,c_)       
#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)

/* Quarter-width (_1_4) step built on the pls/pus forms and dprs(). */
#define plaa1_4(a_)             pls(a_ ## 0,si,CREG) 
#define wa1_4(a_)               w1_4(a_)
#define dp1_4(a_,b_,c_)         cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_)
#define dpp1_4(a_,b_,c_,d_,e_)  cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_)
#define ddp1_4(a_,b_,c_)        dp1_4(a_,b_,c_)       
#define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_)



/* Register numbers handed to dp()/ddp() by bla1..bla4 and blb1..blb4. */
#define R1 4
#define R2 5
#define R3 6
#define R4 7

#endif

/******************************************************************************
 *  Double Precision Real Macros
 ******************************************************************************/  

#ifdef DREAL
/*
 * Compiled only when DREAL is defined.  The f*() emitters (fl, fp, fm, fmp,
 * fap, faa, fc, fd, fz, fx1) and the P()/M() helpers are defined outside
 * this file.
 * NOTE(review): presumably x87 FPU stack instructions, with P(x,n)/M(x,n)
 * meaning x+n / x-n to track stack depth -- confirm in camm_util.h.
 */

#ifdef NO_TRANSPOSE

/* t0/t8 name the target and s0/s8 the source operand for the values at
   offsets 0 and 8 of a chunk.  Here the targets are fixed (1 and 2) and the
   source is the R register passed in. */
#define t0(a_)         1
#define s0(a_)         a_
#define t8(a_)         2
#define s8(a_)         a_
/* w()/w1_2() store to offsets a_##0 (and a_##8) of si via fp(). */
#define w(a_)          fp(a_ ## 0,si) fp(a_ ## 8,si)
#define w1_2(a_)       fp(a_ ## 0,si)
#define mpx(a_)        fl(0,si) fc(M(a_,2))
#define madd(a_,b_,c_) faa(a_,b_)
#define ulfa(a_)       fc(0)

#else

/* Transpose variant: roles are swapped -- the R register is the target and
   the sources are fixed (1 and 2). */
#define t0(a_)         a_
#define s0(a_)         1
#define t8(a_)         a_
#define s8(a_)         2
#define w(a_)           
#define w1_2(a_)           
#define mpx(a_)        fz
/* With BETA0 the madd() step inside ulfa() is dropped. */
#ifdef BETA0
#define madd(a_,b_,c_)
#else
#define madd(a_,b_,c_) faa(a_,b_)
#endif
#define ulfa(a_)       madd(0,si,a_) fp(0,si)

#endif


/* Half-chunk (_1_2) steps: only the value at offset a_##0 is processed. */
#ifndef GER

#define plaa1_2(a_)              fl(a_ ## 0,si) 
#define wa1_2(a_)                w1_2(a_)
#ifdef NO_TRANSPOSE
#define ddp1_2(a_,b_,c_)         fl(a_ ## 0,b_) fm(M(s0(c_),1),0) fap(0,t0(c_)) 
#define dp1_2(a_,b_,c_)          ddp1_2(a_,b_,c_)
#else
#define ddp1_2(a_,b_,c_)         fl(a_ ## 0,b_) fm(s0(c_),0) fap(0,M(t0(c_),1)) 
#define dp1_2(a_,b_,c_)          fl(a_ ## 0,b_) fmp(0,s0(c_)) fap(0,M(t0(c_),2))
#endif

#else

/* GER: the result is combined with (a_##0,b_) via faa() and stored back to
   the same location via fp(). */
#define plaa1_2(a_)              fl(a_ ## 0,si) 
#define wa1_2(a_)
#define ddp1_2(a_,b_,c_)         fd(M(s0(c_),2)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) 
#define dp1_2(a_,b_,c_)          fm(M(s0(c_),2),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) 

#endif



/* Full chunk: load the values at offsets a_##0 and a_##8 of si, then fx1. */
#define plaa(a_)                 fl(a_ ## 0,si) fl(a_ ## 8,si) fx1

#ifndef GER


#define wa(a_)                   w(a_)


/* ddp()/ddpp() are the forms used for every column but the last in
   bla2..bla4 and blb2..blb4; dp()/dpp() are used for the last column.
   NOTE(review): presumably the last-column forms may consume the loaded
   values (fmp) while the others must preserve them -- confirm. */
#define ddp(a_,b_,c_)            fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \
                                 fm(P(s8(c_),1),0) fx1 fap(0,P(t0(c_),1)) \
                                 fap(0,t8(c_))
#define ddpp(a_,b_,c_,d_,e_)     fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \
                                 fm(P(s8(c_),1),0)  pf(d_,e_) fx1 fap(0,P(t0(c_),1)) \
                                 fap(0,t8(c_))

/* #define ddp(a_,b_,c_)            fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */
/*                                  fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) */
/* #define ddpp(a_,b_,c_,d_,e_)     fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */
/*                                   \ */
/*                                  fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) pf(d_,e_) */

#ifdef NO_TRANSPOSE

/* No separate last-column form in this variant. */
#define dp(a_,b_,c_)             ddp(a_,b_,c_)
#define dpp(a_,b_,c_,d_,e_)      ddpp(a_,b_,c_,d_,e_)

#else

#define dp(a_,b_,c_)             fl(a_ ## 0,b_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \
                                 fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2))
#define dpp(a_,b_,c_,d_,e_)      fl(a_ ## 0,b_)  pf(d_ ,e_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \
                                 fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2))

/* #define dp(a_,b_,c_)             fma(a_ ## 0,b_) fap(0,M(t0(c_),1))  \ */
/*                                  fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) */
/* #define dpp(a_,b_,c_,d_,e_)      fma(a_ ## 0,b_) fap(0,M(t0(c_),1))  \ */
/*                                   \ */
/* 			         fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) pf(d_,e_) */

#endif


#else

/* GER: each half is combined with the matrix entry at (a_##0,b_) or
   (a_##8,b_) via faa() and stored back there via fp(). */
#define wa(a_)
#define ddp(a_,b_,c_)            fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
                                 fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_)
#define ddpp(a_,b_,c_,d_,e_)     fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
                                 fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_)

#define dp(a_,b_,c_)             fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
                                 fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_)
#define dpp(a_,b_,c_,d_,e_)      fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
                                 fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_)

#endif


/* Operand numbers handed to dp()/ddp() by bla1..bla4 and blb1..blb4. */
#define R1 3
#define R2 4
#define R3 5
#define R4 6

#endif

/******************************************************************************
 *  Double Precision Complex Macros
 ******************************************************************************/  

#ifdef DCPLX
/*
 * Compiled only when DCPLX is defined.  The f*() emitters (fl, fp, fm, fmp,
 * fap, fsp, faa, fc, fd, fz, fx, fx1) and the P()/M() helpers are defined
 * outside this file.
 * NOTE(review): presumably x87 FPU stack instructions -- confirm in
 * camm_util.h.
 */


#if NDPM > 2
#error Max NDPM is 2 for DCPLX
#endif

#define TREG           2

/*
 * srr/sri/sir/sii select the source operand and trr/tri/tir/tii the target
 * operand of the four partial products in ddp()/dp().
 * NOTE(review): presumably real*real, real*imag, imag*real and imag*imag --
 * confirm.  Either the sources or the targets are the R register passed in;
 * the other side is derived from TREG.
 */
#ifdef NO_TRANSPOSE

#define w(a_)          fp(a_ ## 0,si) fp(a_ ## 8,si)
#define plax           fx1
#define srr(a_)        a_
#define sri(a_)        a_
#define sir(a_)        a_
#define sii(a_)        a_
#define trr(a_)        P(TREG,1)
#define tri(a_)        M(TREG,1)
#define tir(a_)        TREG
#define tii(a_)        TREG
#define mpx(a_)        fl(0,si) fl(8,si) fc(M(a_,2)) fc(M(a_,2)) 
#define madd(a_,b_,c_) faa(a_,b_)
#define ulfa(a_)       fc(0) fc(0)

#else

#define srr(a_)       P(TREG,1)
#define sri(a_)       M(TREG,1)
#define sir(a_)       TREG
#define sii(a_)       TREG
#define trr(a_)       a_
#define tri(a_)       a_
#define tir(a_)       a_
#define tii(a_)       a_
#define w(a_)           
#define plax  
#define mpx(a_)        fz fz
/* With BETA0 the madd() steps inside ulfa() are dropped. */
#ifdef BETA0
#define madd(a_,b_,c_)
#else
#define madd(a_,b_,c_) faa(a_,b_)
#endif
#define ulfa(a_)       madd(0,si,a_) fp(0,si) madd(8,si,a_) fp(8,si)

#endif



/* Conj_ swaps the roles of fapi() and fspi(): one expands to fap(a_,b_) and
   the other to fsp(b_). */
#ifdef Conj_
#define fapi(a_,b_)   fsp(b_)
#define fspi(a_,b_)   fap(a_,b_)
#else
#define fapi(a_,b_)   fap(a_,b_)
#define fspi(a_,b_)   fsp(b_)
#endif

#ifndef GER


/* Non-GER: load both halves of chunk a_ from si, then plax. */
#define plaa(a_)             fl(a_ ## 0,si) fl(a_ ## 8,si) plax
#define wa(a_)               w(a_)
/* ddp()/ddpp() duplicate each loaded matrix value with fd(0) so that it can
   feed two partial products; ddpp() additionally issues a prefetch. */
#define ddp(a_,b_,c_)        fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
                                                  fm(sri(c_),0) fap(0,tri(c_))\
                             fl(a_ ## 8,b_) fd(0) fm(sir(c_),0) fspi(0,tir(c_)) \
                                                  fm(sii(c_),0) fapi(0,tii(c_))
#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
                                                  fm(sri(c_),0) fap(0,tri(c_))\
                             fl(a_ ## 8,b_) fd(0) pf(d_,e_) fm(sir(c_),0) fspi(0,tir(c_))\
                                                  fm(sii(c_),0) fapi(0,tii(c_))



#ifdef NO_TRANSPOSE



/* No separate last-column form in this variant. */
#define dp(a_,b_,c_)         ddp(a_,b_,c_)
#define dpp(a_,b_,c_,d_,e_)  ddpp(a_,b_,c_,d_,e_)



#else

/* Last-column forms: the second half uses fm(0,..)/fmp(0,..) instead of
   fd(0)/fm(..,0) and adjusts the target operands with M(..,2). */
#define dp(a_,b_,c_)        fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
                                                 fm(sri(c_),0) fap(0,tri(c_))\
                            fl(a_ ## 8,b_)       fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \
                                                 fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))

#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
                                                 pf(d_,e_) fm(sri(c_),0) fap(0,tri(c_))\
                            fl(a_ ## 8,b_)       fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \
                                                 fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))


#endif

#else

/* GER: each helper loads one matrix value from (a_##0,b_) or (a_##8,b_),
   updates it with two partial products and stores it back with fp(). */
#define plaa(a_)            fl(a_ ## 0,si) fl(a_ ## 8,si) plax
#define wa(a_)

#define ddprr(a_,b_,c_)     fl(a_ ## 0,b_) \
                                              fd(tri(c_))           fm(P(sri(c_),1),0)      fap(0,1) \
                                              fd(M(trr(c_),1))      fm(srr(c_),0)           fspi(0,1) \
                            fp(a_ ## 0,b_) 
#define ddpri(a_,b_,c_)     fl(a_ ## 8,b_) \
                                              fd(tii(c_))           fm(P(sii(c_),1),0)      fap(0,1) \
                                              fd(M(tir(c_),1))      fm(sir(c_),0)           fapi(0,1) \
                            fp(a_ ## 8,b_) 
#define dpri(a_,b_,c_)      fl(a_ ## 8,b_) \
                                              fx(2)                 fm(sir(c_),0)           fap(0,2) \
                                                                    fm(M(sii(c_),2),0)      fapi(0,1) \
                            fp(a_ ## 8,b_)


/* ddp/ddpp use ddpri() for the second half; dp/dpp (last column) use
   dpri().  The *pp forms place a prefetch between the two halves. */
#define ddpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) ddpri(a_,b_,c_)
#define ddp(a_,b_,c_)        ddprr(a_,b_,c_)           ddpri(a_,b_,c_)
#define dpp(a_,b_,c_,d_,e_)  ddprr(a_,b_,c_) pf(d_,e_) dpri(a_,b_,c_)
#define dp(a_,b_,c_)         ddprr(a_,b_,c_)           dpri(a_,b_,c_)

#endif


/* Operand numbers handed to dp()/ddp() by bla1..bla4 and blb1..blb4.
   R2..R4 are all 6; NDPM is limited to 2 by the check above. */
#define R1 4
#define R2 6
#define R3 6
#define R4 6


#endif


/******************************************************************************
 *  General Macros
 ******************************************************************************/  




/*
 * Block macros: each one processes chunk a_ of the si-relative vector
 * against NDP column pointers held in ax, bx, cx, dx, using R1..R4 as the
 * per-column operand.
 *
 *   a_  hex chunk index; the type sections paste a trailing 0 (or 8) onto
 *       it, so 0x1 becomes byte offset 0x10 (or 0x18).
 *   b_  offset passed to pf() for prefetching.
 *
 * The "a" and "b" flavours run the same arithmetic and differ only in which
 * pointer registers get prefetched.  The digit in the name is the number of
 * columns; bla()/blb() select the variant matching NDP via Mjoin.
 * NOTE(review): Mjoin is defined elsewhere; presumably it token-pastes its
 * arguments after expansion -- confirm.
 */
#define bla1(a_,b_)          plaa(a_) dpp(a_,ax,R1,b_,si) wa(a_) 
#define blb1(a_,b_)          plaa(a_) dpp(a_,ax,R1,b_,ax) wa(a_)
			     
#define bla2(a_,b_)          pf(b_,si) plaa(a_) ddp(a_,ax,R1)        pf(b_,ax) dp(a_,bx,R2) wa(a_)
#define blb2(a_,b_)                    plaa(a_) ddpp(a_,ax,R1,b_,bx)           dp(a_,bx,R2) wa(a_) 
			     
#define bla3(a_,b_)          plaa(a_) ddpp(a_,ax,R1,b_,si) ddp(a_,bx,R2) \
                             dpp(a_,cx,R3,b_,ax) wa(a_)
#define blb3(a_,b_)          plaa(a_) ddpp(a_,ax,R1,b_,bx) ddp(a_,bx,R2) \
                             dpp(a_,cx,R3,b_,cx) wa(a_)
			     
#define bla4(a_,b_)          plaa(a_) ddpp(a_,ax,R1,b_,si) ddpp(a_,bx,R2,b_,ax) \
                             ddp(a_,cx,R3) dpp(a_,dx,R4,b_,bx) wa(a_)
#define blb4(a_,b_)          plaa(a_) ddp(a_,ax,R1)        ddpp(a_,bx,R2,b_,cx) \
                             ddp(a_,cx,R3) dpp(a_,dx,R4,b_,dx) wa(a_)

#define bla(a_,b_)      Mjoin(bla,NDP)(a_,b_)
#define blb(a_,b_)      Mjoin(blb,NDP)(a_,b_)



/* Half-width blocks: same column structure built on the _1_2 helpers,
   without prefetching. */
#define bla11_2(a_)    plaa1_2(a_) dp1_2(a_,ax,R1) wa1_2(a_) 
#define bla21_2(a_)    plaa1_2(a_) ddp1_2(a_,ax,R1) dp1_2(a_,bx,R2) wa1_2(a_)
#define bla31_2(a_)    plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \
                          dp1_2(a_,cx,R3) wa1_2(a_)
#define bla41_2(a_)    plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \
                          ddp1_2(a_,cx,R3) dp1_2(a_,dx,R4) wa1_2(a_)

#define bla1_2(a_)     Mjoin(Mjoin(bla,NDP),1_2)(a_)



/* Quarter-width blocks built on the _1_4 helpers (only the SREAL section of
   this file defines those helpers). */
#define bla11_4(a_)    plaa1_4(a_) dp1_4(a_,ax,R1) wa1_4(a_) 
#define bla21_4(a_)    plaa1_4(a_) ddp1_4(a_,ax,R1) dp1_4(a_,bx,R2) wa1_4(a_)
#define bla31_4(a_)    plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \
                          dp1_4(a_,cx,R3) wa1_4(a_)
#define bla41_4(a_)    plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \
                          ddp1_4(a_,cx,R3) dp1_4(a_,dx,R4) wa1_4(a_)

#define bla1_4(a_)     Mjoin(Mjoin(bla,NDP),1_4)(a_)



/* incN applies a(a_,reg) to si and to the first N column pointers.
   NOTE(review): a() is defined elsewhere; presumably it adds the immediate
   a_ to the register, i.e. advances every pointer by a_ bytes -- confirm. */
#define inc1(a_)        a(a_,si) a(a_,ax)
#define inc2(a_)        inc1(a_) a(a_,bx)
#define inc3(a_)        inc2(a_) a(a_,cx)
#define inc4(a_)        inc3(a_) a(a_,dx)

#define inc(a_)         Mjoin(inc,NDP)(a_)


/*
 * Prefetch offsets PF1..PF8.  Without PREFETCH they are the fixed values
 * 64, 96, ..., 288 (steps of 32).  With PREFETCH, PF1 is PREFETCH itself and
 * PF2..PF8 are formed from it with S() from camm_arith.h, using the
 * increments 32, 64, ..., 224.
 */
#ifndef PREFETCH
#define PF1 64
#define PF2 96
#define PF3 128
#define PF4 160
#define PF5 192
#define PF6 224
#define PF7 256
#define PF8 288
#else
#include "camm_arith.h"
#define PF1 PREFETCH
#define PF2 S(PF1,32)
#define PF3 S(PF1,64)
#define PF4 S(PF1,96)
#define PF5 S(PF1,128)
#define PF6 S(PF1,160)
#define PF7 S(PF1,192)
#define PF8 S(PF1,224)
#endif


/*
 * pf(a_,b_) forwards to f() with hint "t0" only for NO_TRANSPOSE builds that
 * are neither SREAL nor GER; every other build uses hint "nta".
 */
#if !defined(NO_TRANSPOSE) || defined(SREAL) || defined(GER)
#define pf(a_,b_)  f(nta,a_,b_)
#else
#define pf(a_,b_)  f(t0,a_,b_)
#endif

/*
 * blN: fully unrolled body for N float-sized units of the moving vector.
 * Each bla/blb call handles one 16-byte chunk (hex index 0x0..0xf), and the
 * trailing inc() advances the pointers by 4*N bytes.  bl1 and bl2 use the
 * quarter- and half-width blocks.  Each pair of chunks shares one prefetch
 * offset (PF1..PF8).
 */
#define bl1            bla1_4(0x0) inc(4)
#define bl2            bla1_2(0x0) inc(8)
#define bl4            bla(0x0,PF1) inc(16)
#define bl8            bla(0x0,PF1) blb(0x1,PF1) inc(32) 
#define bl16           bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) inc(64)
#define bl32           bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \
                       bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) inc(128)
#define bl64           bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \
                       bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) \
                       bla(0x8,PF5) blb(0x9,PF5) bla(0xa,PF6) blb(0xb,PF6) \
                       bla(0xc,PF7) blb(0xd,PF7) bla(0xe,PF8) blb(0xf,PF8) inc(256)

/* #define in2           inc(8) */
/* #define in4           inc(16) */
/* #define in8           inc(32) */
/* #define in16          inc(64) */

/* in2..in16 are intentionally empty; the disabled inc()-based versions are
   kept above for reference. */
#define in2  
#define in4  
#define in8  
#define in16 

/* incf steps si by di between per-column loads in lf, only for NO_TRANSPOSE.
   NOTE(review): ra() is defined elsewhere; presumably a register-to-register
   add of di into si -- confirm. */
#ifdef NO_TRANSPOSE
#define incf           ra(di,si)
#else
#define incf
#endif

/* lfN: run mpx() on R1..RN before the main loop, with incf in between;
   lf selects the variant for NDP. */
#define lf1            mpx(R1)
#define lf2            lf1 incf mpx(R2)
#define lf3            lf2 incf mpx(R3)
#define lf4            lf3 incf mpx(R4)

#define lf             Mjoin(lf,NDP)


/* ulfN: run ulfa() on R1..RN after the loop, stepping si by di between
   them; ulf selects the variant for NDP. */
#define ulf1           ulfa(R1)
#define ulf2           ulf1 ra(di,si) ulfa(R2) 
#define ulf3           ulf2 ra(di,si) ulfa(R3) 
#define ulf4           ulf3 ra(di,si) ulfa(R4) 

#define ulf            Mjoin(ulf,NDP)

/* lpba(r) copies esi into e<r>.  lpbN fills eax, ebx, ecx, edx in turn,
   stepping si by di between copies, so each register ends up holding one
   column pointer. */
#define lpba(a_)      "movl %%esi,%%e" #a_ "\n\t"

#define lpb1          lpba(ax)
#define lpb2          lpb1 ra(di,si) lpba(bx)
#define lpb3          lpb2 ra(di,si) lpba(cx)
#define lpb4          lpb3 ra(di,si) lpba(dx)

#define lpb           Mjoin(lpb,NDP)

/* ipfN(a_): issue pf() at offset a_ from si and from each of the first N
   column pointers. */
#define ipf1(a_)   pf(a_,si) pf(a_,ax)
#define ipf2(a_)   ipf1(a_)  pf(a_,bx) 
#define ipf3(a_)   ipf2(a_)  pf(a_,cx) 
#define ipf4(a_)   ipf3(a_)  pf(a_,dx) 

#define ipf(a_)     Mjoin(ipf,NDP)(a_)

/*
 * Pick the main-loop unroll factor.  UNROLL defaults to 16 when LUNROLL is
 * not given; otherwise it is LUNROLL scaled by 1 (SREAL), 2 (DREAL or SCPLX)
 * or 4 (DCPLX), matching the float-sized units in which the kernel counts
 * its length.
 */
#ifndef LUNROLL
#define UNROLL 16
#else
#undef UNROLL
#if defined(SREAL)
#define UNROLL LUNROLL
#elif defined(SCPLX) || defined(DREAL)
#define UNROLL LUNROLL*2
#elif defined(DCPLX)
#define UNROLL LUNROLL*4
#endif
#endif

/*
 * Map UNROLL onto the matching blN body (blUNROLL) and onto UNROLL1_2, the
 * label the main loop jumps to once fewer than UNROLL units remain
 * ("stop" when UNROLL is 1).  Any other UNROLL value is a build error.
 */
#undef UNROLL1_2
#if UNROLL == 1
#define blUNROLL bl1
#define UNROLL1_2 stop
#elif UNROLL == 2
#define blUNROLL bl2
#define UNROLL1_2 1
#elif UNROLL == 4
#define blUNROLL bl4
#define UNROLL1_2 2
#elif UNROLL == 8
#define blUNROLL bl8
#define UNROLL1_2 4
#elif UNROLL == 16
#define blUNROLL bl16
#define UNROLL1_2 8
#elif UNROLL == 32
#define blUNROLL bl32
#define UNROLL1_2 16
#elif UNROLL == 64
#define blUNROLL bl64
#define UNROLL1_2 32
#endif
#if !defined(UNROLL1_2)
#error UNROLL must be set to power of 2 < 128
#endif


/*
 * Const-qualifiers for the kernel's pointer parameters: the matrix `a` is
 * read-only unless GER is defined, in which case `c` is the read-only one.
 */
#ifndef GER
#define aconst const
#define cconst
#else
#define aconst
#define cconst const
#endif

/*
 * Mjoin(dp,EXT): hand-scheduled x86 (32-bit) kernel that processes NDP
 * columns of `a` against one "moving" vector in a single inline-asm
 * statement.
 *
 * Parameters
 *   a       first column of the matrix block (written only when GER is
 *           defined, see aconst)
 *   lda     distance between consecutive columns of a, in elements
 *   b, c    the two vectors; with NO_TRANSPOSE c is the moving vector
 *           (walked in the loop) and b the fixed one, otherwise the roles
 *           are swapped
 *   stride  distance between consecutive elements of the fixed vector,
 *           in elements
 *   len     number of elements of the moving vector to process
 *
 * Asm operands
 *   %0 movm   %1 fixm   %2 stride in bytes   %3 a   %4 lda in bytes
 *   %5 len expressed in float-sized units    %6 w (SCPLX only)
 *
 * Register use inside the asm: esi is the current pointer, edi a byte step
 * or the remaining length, eax/ebx/ecx/edx the column pointers set up by
 * lpb.
 *
 * NOTE(review): ASM, NO_INLINE, a(), test(), je(), jmp(), sub(), lab() and
 * Mjoin are defined outside this file (presumably camm_util.h); the comments
 * below describe them by how they are used here -- confirm there.
 * NOTE(review): several "m" inputs are computed rvalues (for example
 * stride*sizeof(*fixm)); some compilers reject non-addressable memory
 * inputs -- confirm against the supported toolchain.
 */
static void
Mjoin(dp,EXT)(aconst TYPE *a,int lda,
	      const TYPE *b,
	      cconst TYPE *c,int stride,int len) {

#ifdef SCPLX
    /* +/-1 constant loaded into SREG and used by sign(); the pattern is
       flipped for conjugated GER. */
#if defined(GER) && defined(Conj_)
    const TYPE w[2]={{-1.0,1.0},{-1.0,1.0}};
#else
    const TYPE w[2]={{1.0,-1.0},{1.0,-1.0}};
#endif
#endif
    /* movm: vector walked by the main loop; fixm: vector read before the
       loop (NO_TRANSPOSE) or written after it (transpose). */
#ifdef NO_TRANSPOSE
#define movm c
#define fixm b
#else
#define movm b
#define fixm c
#endif    
    NO_INLINE;

    ASM (

	 /* ebx is saved by hand; esp is then moved back so that any
	    esp-relative "m" operands keep their offsets. */
	 "pushl %%ebx\n\t"
	 a(4,sp)

#ifdef SCPLX
	 "movl %6,%%esi\n\t"
	 pl(0,si,SREG)
#endif
	 
#ifdef NO_TRANSPOSE
	 "movl %1,%%esi\n\t"  /* fixm */
	 "movl %2,%%edi\n\t"  /* fixm2fixm */
#endif

	 /* Initialise the per-column registers R1..R(NDP). */
	 lf

	 "movl %3,%%esi\n\t"  /* a */
	 "movl %4,%%edi\n\t"  /* a2a */

	 /* Column pointers into eax, ebx, ecx, edx. */
	 lpb

	 ipf(0)

	 "movl %0,%%esi\n\t"  /* movm */
	 "movl %5,%%edi\n\t"  /* len */

#if defined(ALIGN) && defined (SREAL) && ( ( STRIDE % 4  == 0 ) || ( NDPM == 1 ) )

	 /* Alignment prologue: if bit 2 of eax is set and work remains,
	    process one unit; then if bit 3 is set and at least two units
	    remain, process two. */
	 test(4,ax)
	 je(Mjoin(a1,EXT))
	 test(-1,di)
	 je(Mjoin(a1,EXT))
	 sub(1,di)
	 bl1

	 lab(Mjoin(a1,EXT))

	 test(8,ax)
	 je(Mjoin(as,EXT))
	 test(-2,di)
	 je(Mjoin(as,EXT))
	 sub(2,di)
	 bl2

	 lab(Mjoin(as,EXT))

#endif
	      

	 ipf(32)

	 /* Main loop: run the UNROLL-unit body while at least UNROLL units
	    remain in edi. */
	 lab(Mjoin(loop,EXT))

	 test(-UNROLL,di)
	 je(Mjoin(UNROLL1_2,EXT))
	 sub(UNROLL,di)

	 blUNROLL
	 
	 jmp(Mjoin(loop,EXT))

	 /* Remainder ladder: one body per set bit of the remaining count,
	    from UNROLL/2 downwards.  edi is only tested, not decremented. */
#if UNROLL > 32
	 lab(Mjoin(32,EXT))
	 test(32,di)
	 je(Mjoin(16,EXT))
	 bl32
#endif	 

#if UNROLL > 16
	 lab(Mjoin(16,EXT))
	 test(16,di)
	 je(Mjoin(8,EXT))
	 bl16
#endif	 

#if UNROLL > 8
	 lab(Mjoin(8,EXT))
	 test(8,di)
	 je(Mjoin(4,EXT))
	 bl8
#endif	 

#if UNROLL > 4
	 lab(Mjoin(4,EXT))
	 test(4,di)
	 je(Mjoin(2,EXT))
	 bl4
#endif

	 /* A DCPLX element spans four float units, so the 2-unit and 1-unit
	    steps are skipped for it; only SREAL has a 1-unit step. */
#if UNROLL > 2	 
	 lab(Mjoin(2,EXT))
#ifndef DCPLX
	 test(2,di)
	 je(Mjoin(1,EXT))
	 bl2
#endif
#endif

#if UNROLL > 1
	 lab(Mjoin(1,EXT))
#ifdef SREAL
	 test(1,di)
	 je(Mjoin(stop,EXT))
	 bl1
#endif
#endif

	 lab(Mjoin(stop,EXT))

#ifndef NO_TRANSPOSE
	 "movl %1,%%esi\n\t"  /* fixm */
	 "movl %2,%%edi\n\t"  /* fixm2fixm */
#endif

	 /* Flush the per-column registers (no-op in some variants). */
	 ulf

	 /* Undo the esp adjustment and restore ebx. */
	 a(-4,sp)
	 "popl %%ebx\n\t"


	 ::"m" (movm),"m" (fixm),"m" (stride*sizeof(*fixm)),"m" (a),"m" (lda*sizeof(*a)),
	   "m" (len*sizeof(*movm)/sizeof(float))

#ifdef SCPLX
	 ,"m" (w)
#endif
	 /* The asm stores through pointers held in registers and changes the
	    flags; "memory" and "cc" tell the compiler not to keep memory
	    values cached across this statement. */
	 :"ax","bx","cx","dx","si","di","memory","cc");


}

