#include "camm_util.h"
#include "camm_arith.h"


/*
 * pf(a_,b_): hook used throughout the kernels below.  It currently expands
 * to nothing, so every pf(...) call in this file is a no-op; the
 * commented-out body shows the expansion it would otherwise have,
 * f(nta,a_,b_) (presumably a non-temporal prefetch -- confirm in the camm
 * headers).
 */
#define pf(a_,b_) /*   f(nta,a_,b_) */

/*
 * Kernel variant 1 (selected when N == 1).
 *
 * p1_1(a_,b_): steady-state step.  It interleaves pmm/pla reads at offsets
 * built from a_, KB4 and RS4 (via S(), presumably a sum -- confirm in the
 * camm headers) with pa() updates into registers 4..7.  It starts by
 * consuming registers 0 and 1, which lp1 (or the previous p1_1) must have
 * set up, and ends by refilling them for the next step at S(a_,RS4).
 * b_ only appears in pf(), which is a no-op in this file.
 * The instruction order is a hand-written schedule; do not reorder.
 */
#define p1_1(a_,b_)  \
      pmm(S(S(a_,KB4),KB4),ax,2) \
      pa(0,4) \
      pla(S(a_,RS4),bx,0) \
      pf(S(b_,a_),ax) \
      pmm(S(S(S(a_,KB4),KB4),KB4),ax,3) \
      pa(1,5) \
      pa(2,6) \
      pc(0,2) \
      pmm(S(a_,RS4),ax,0) \
      pa(3,7) \
      pla(S(a_,RS4),bx,1) \
      pf(S(S(b_,KB4),a_),ax) \
      pc(1,3) \
      pmm(S(S(a_,RS4),KB4),ax,1) 
/* lp1(a_): prologue -- sets up registers 0..3 from bx/ax at a_ so the first
 * p1_1 finds registers 0 and 1 ready. */
#define lp1(a_) \
      pla(a_,bx,0) \
      pla(a_,bx,1) \
      pc(0,2) \
      pc(1,3) \
      pmm(a_,ax,0) \
      pmm(S(a_,KB4),ax,1)      
/* dp1(a_): epilogue -- the first half of a p1_1 step without the refill
 * for a following step; folds registers 0..3 into 4..7 via pa(). */
#define dp1(a_) \
      pmm(S(S(a_,KB4),KB4),ax,2) \
      pa(0,4) \
      pmm(S(S(S(a_,KB4),KB4),KB4),ax,3) \
      pa(1,5) \
      pa(2,6) \
      pa(3,7) 
/* pl1: value used as pipe_len for this variant (RS comes from the camm
 * headers). */
#define pl1 RS

      
/*
 * Kernel variant 4 (selected when N == 4).
 *
 * p1_4(a_,b_): steady-state step built from pm()/pa() on registers 0..3 with
 * results folded into 4..7, interleaved with pla() loads for the current
 * and the next (S(a_,RS4)) position.  It expects lp4 or a previous p1_4 to
 * have left registers 0, 1 and 2 loaded.  b_ only appears in pf(), which is
 * a no-op in this file.  Instruction order is deliberate; do not reorder.
 */
#define p1_4(a_,b_) \
      pm(0,2) \
      pa(1,4) \
      pla(S(S(a_,KB4),KB4),ax,1) \
      pf(S(a_,b_),ax) \
      pla(S(S(S(a_,KB4),KB4),KB4),ax,3) \
      pm(0,1) \
      pa(2,5) \
      pm(0,3) \
      pa(1,6) \
      pla(S(a_,RS4),bx,0) \
      pla(S(a_,RS4),ax,1) \
      pf(S(S(a_,b_),KB4),ax) \
      pla(S(S(a_,KB4),RS4),ax,2) \
      pm(0,1) \
      pa(3,7) 
/* lp4(a_): prologue -- loads registers 0 (from bx), 1 and 2 (from ax) and
 * issues the first pm(0,1). */
#define lp4(a_) \
      pla(a_,bx,0) \
      pla(a_,ax,1) \
      pla(S(a_,KB4),ax,2) \
      pm(0,1) 
/* dp4(a_): epilogue -- same arithmetic as p1_4 without the loads for a
 * following step. */
#define dp4(a_) \
      pm(0,2) \
      pa(1,4) \
      pla(S(S(a_,KB4),KB4),ax,1) \
      pla(S(S(S(a_,KB4),KB4),KB4),ax,3) \
      pm(0,1) \
      pa(2,5) \
      pm(0,3) \
      pa(1,6) \
      pa(3,7) 
/* pl4: value used as pipe_len for this variant. */
#define pl4 RS


/*
 * Kernel variant 6 (selected when N == 6).
 *
 * p1_6(a_,b_): steady-state step; like variant 4 it combines pm()/pa() on
 * registers 0..3 with pla() loads for the current and next (RS4-offset)
 * position, but with a different register/target rotation (pa targets are
 * 5, 6, 7, 4 in that order).  b_ only appears in pf(), which is a no-op in
 * this file.  Instruction order is deliberate; do not reorder.
 */
#define p1_6(a_,b_) \
      pm(0,2) \
      pa(1,5) \
      pla(S(S(S(a_,KB4),KB4),KB4),ax,1) \
      pf(S(a_,b_),ax) \
      pla(S(a_,RS4),ax,3) \
      pm(0,1) \
      pa(2,6) \
      pla(S(S(S(a_,KB4),KB4),RS4),ax,2) \
      pla(S(a_,RS4),bx,0) \
      pm(0,3) \
      pa(1,7) \
      pla(S(S(a_,KB4),RS4),ax,1) \
      pf(S(S(a_,b_),KB4),ax) \
      pm(0,1) \
      pa(3,4) 
/* lp6(a_): prologue -- loads registers 0..3, performs the first
 * pm(0,3)/pa(3,4) and leaves pm(0,1) issued for the first p1_6. */
#define lp6(a_) \
      pla(a_,bx,0) \
      pla(a_,ax,3) \
      pla(S(a_,KB4),ax,1) \
      pla(S(S(a_,KB4),KB4),ax,2) \
      pm(0,3) \
      pa(3,4) \
      pm(0,1)
/* dp6(a_): epilogue -- finishes the pending work into registers 5, 6 and 7
 * without loading for a following step. */
#define dp6(a_) \
      pm(0,2) \
      pa(1,5) \
      pla(S(S(S(a_,KB4),KB4),KB4),ax,1) \
      pm(0,1) \
      pa(2,6) \
      pa(1,7) 
/* pl6: value used as pipe_len for this variant. */
#define pl6 RS




/*
 * Kernel variant 7.
 *
 * Unlike the other variants, the scratch register numbers are macro
 * parameters (c_, d_, e_) instead of literals.  b_ is accepted by p1_7 but
 * not used in its body.
 *
 * NOTE(review): the dispatch macros p1(a_,b_), load_pipe(a_) and
 * drain_pipe(a_) pass 2, 1 and 1 arguments respectively, whereas p1_7 takes
 * 5 and lp7/dp7 take 4, so setting N to 7 would not expand correctly as the
 * file stands.
 */
#define p1_7(a_,b_,c_,d_,e_) \
      pla(S(S(a_,KB4),KB4),ax,c_) \
      pm(0,d_) \
      pa(e_,4) \
      pm(0,c_) \
      pmm(S(S(S(a_,KB4),KB4),KB4),ax,0) \
      pa(d_,5) \
      pla(S(a_,RS4),ax,d_) \
      pa(0,7) \
      pla(S(a_,RS4),bx,0) \
      pa(c_,6) \
      pla(S(S(a_,KB4),RS4),ax,c_) \
      pm(0,d_) 
/* lp7: prologue -- loads register 0 from bx and registers d_ and c_ from
 * ax, then issues the first pm(0,d_).  e_ is accepted but unused. */
#define lp7(a_,c_,d_,e_) \
      pla(a_,bx,0) \
      pla(a_,ax,d_) \
      pla(S(a_,KB4),ax,c_) \
      pm(0,d_)

/* dp7: epilogue -- completes the pending products and folds them into
 * registers 4..7 without loading for a following step. */
#define dp7(a_,c_,d_,e_) \
      pla(S(S(a_,KB4),KB4),ax,c_) \
      pm(0,d_) \
      pa(e_,4) \
      pla(S(S(S(a_,KB4),KB4),KB4),ax,e_) \
      pm(0,c_) \
      pa(d_,5) \
      pm(0,e_) \
      pa(c_,6) \
      pa(e_,7)
/* pl7: value used as pipe_len for this variant. */
#define pl7 RS




/*
 * Kernel variant 5 (selected when N == 5).
 *
 * p1_5(a_,b_): self-contained step -- every operand it uses is loaded inside
 * the macro, so no prologue or epilogue is needed (lp5/dp5 are empty and
 * pl5 is 0).  Registers 0..3 are scratch; results are folded into 4..7 via
 * pa().  The last product is formed with pm(1,0), i.e. into register 0,
 * which is then added to register 7.  b_ is accepted but unused.
 */
#define p1_5(a_,b_) \
      pla(a_,bx,0) \
      pla(a_,ax,1) \
      pla(S(a_,KB4),ax,2) \
      pla(S(S(a_,KB4),KB4),ax,3) \
      pm(0,1) \
      pm(0,2) \
      pm(0,3) \
      pa(1,4) \
      pla(S(S(S(a_,KB4),KB4),KB4),ax,1) \
      pa(2,5) \
      pm(1,0) \
      pa(3,6) \
      pa(0,7)

/* No pipeline prologue/epilogue for this variant. */
#define lp5(a_)
#define dp5(a_) 
#define pl5 0

/*
 * Kernel variant 8 (selected when N == 8).
 *
 * p1_8(a_,b_): self-contained step issued as four load / pm / pa triples,
 * one per target register 4..7, all sharing register 0 loaded from bx.
 * Registers 1, 2, 3 (and 1 again) are the per-triple scratch registers.
 * b_ is accepted but unused.  lp8/dp8 are empty and pl8 is 0.
 */
#define p1_8(a_,b_) \
      pla(a_,bx,0) \
      pla(a_,ax,1) \
      pm(0,1) \
      pa(1,4) \
      pla(S(a_,KB4),ax,2) \
      pm(0,2) \
      pa(2,5) \
      pla(S(S(a_,KB4),KB4),ax,3) \
      pm(0,3) \
      pa(3,6) \
      pla(S(S(S(a_,KB4),KB4),KB4),ax,1) \
      pm(0,1) \
      pa(1,7)

/* No pipeline prologue/epilogue for this variant. */
#define lp8(a_)
#define dp8(a_) 
#define pl8 0

/*
 * Kernel variant 9 (selected when N == 9).
 *
 * p1_9(a_,b_): same as variant 8 for the first three triples; the fourth
 * uses pmm() with the ax operand directly into register 0 (register 0 is
 * not needed afterwards) and adds register 0 to register 7.
 * b_ is accepted but unused.  lp9/dp9 are empty and pl9 is 0.
 */
#define p1_9(a_,b_) \
      pla(a_,bx,0) \
      pla(a_,ax,1) \
      pm(0,1) \
      pa(1,4) \
      pla(S(a_,KB4),ax,2) \
      pm(0,2) \
      pa(2,5) \
      pla(S(S(a_,KB4),KB4),ax,3) \
      pm(0,3) \
      pa(3,6) \
      pmm(S(S(S(a_,KB4),KB4),KB4),ax,0) \
      pa(0,7)

/* No pipeline prologue/epilogue for this variant. */
#define lp9(a_) 
#define dp9(a_) 
#define pl9 0

/*
 * Kernel variant 10 -- the one selected by "#define N 10" in this file.
 *
 * p1_10(a_,b_): self-contained step.  Register 0 is loaded from bx at a_;
 * registers 1, 2 and 1 again are loaded from ax at a_, a_+KB4 and
 * a_+2*KB4 (offsets built with S(), presumably a sum -- confirm), each
 * combined with register 0 via pm() and folded into registers 4, 5 and 6.
 * The fourth term uses pmm() from ax straight into register 0, which is
 * then added to register 7.
 *
 * This variant only uses registers 0, 1, 2 as scratch and never references
 * register 3; the BETAX form of `w` uses register 3 as a pmsr() operand, so
 * presumably register 3 must stay intact across the kernel -- TODO confirm.
 * b_ only appears in pf(), which is a no-op in this file.
 * lp10/dp10 are empty and pl10 is 0.
 */
#define p1_10(a_,b_) \
      pla(a_,ax,1) \
      pla(a_,bx,0) \
      pf(S(a_,b_),ax) \
      pm(0,1) \
      pa(1,4) \
      pla(S(a_,KB4),ax,2) \
      pm(0,2) \
      pa(2,5) \
      pf(S(S(a_,b_),KB4),ax) \
      pla(S(S(a_,KB4),KB4),ax,1) \
      pm(0,1) \
      pa(1,6) \
      pmm(S(S(S(a_,KB4),KB4),KB4),ax,0) \
      pa(0,7) 

/* No pipeline prologue/epilogue for this variant. */
#define lp10(a_) 
#define dp10(a_)
#define pl10 0



/*
 * Kernel variant selection.  N picks which p1_<N>/lp<N>/dp<N>/pl<N> family
 * is used; Mjoin (from the camm headers, presumably token pasting after
 * expanding N) builds the macro name.  With N == 10 the step is p1_10, the
 * prologue and epilogue are empty and pipe_len is 0.
 */
#define N 10

#define p1(a_,b_)      Mjoin(p1_,N)(a_,b_)
#define load_pipe(a_)  Mjoin(lp,N)(a_)
#define drain_pipe(a_) Mjoin(dp,N)(a_)
#define pipe_len       Mjoin(pl,N)      


/*
 * Unrolled groups of the basic step p1, doubling each time:
 *   p2  = 2 x p1  at offsets a_ and a_+16
 *   p4  = 2 x p2  at offsets a_ and a_+32
 *   p8  = 2 x p4  at offsets a_ and a_+64
 *   p16 = 2 x p8  at offsets a_ and a_+128
 * (offsets are combined with S(), presumably a sum -- confirm in the camm
 * headers).  The second argument of p1 and the leading pf() only feed pf(),
 * which is a no-op in this file.
 */
#define p2(a_) pf(S(a_,80),bx) p1(a_,80) p1(S(a_,16),S(S(64,KB4),KB4))
#define p4(a_) p2(a_) p2(S(a_,32))
#define p8(a_) p4(a_) p4(S(a_,64))
#define p16(a_) p8(a_) p8(S(a_,128))

/* The unrolling chain below emits at most one p16 group (64 K-steps) plus
 * one each of the smaller groups, so it cannot cover KB >= 128. */
#if KB >= 128
#error KB must be less than 128
#endif

/* x1/o1: start of the generated code and of its running offset -- the
 * pipeline prologue at offset 0. */
#define x1 load_pipe(0)
#define o1 0

/* KBB: number of K-steps left for the unrolled groups after reserving
 * pipe_len steps for the prologue/epilogue pair. */
#define KBB ( KB - pipe_len )

/*
 * Build the fully unrolled K loop body (x8a) and its running offset (o8a)
 * in stages, using the binary decomposition of KBB:
 *
 *   stage  test on KBB   group appended   offset advance
 *     2      >= 64          p16              256
 *     3      bit 32         p8               128
 *     4      bit 16         p4                64
 *     5      bit 8          p2                32
 *     6      bit 4          p1                16
 *
 * After each stage a matching "a" step (x2a, x3a, ...) appends drain_pipe
 * and redefines KBB as KB when pipe_len equals that stage's group size.
 * With the variant selected in this file pipe_len is 0, so none of the "a"
 * steps fire and xNa == xN, oNa == oN throughout.
 *
 * NOTE(review): the taken branches extend the previous stage's xN, not
 * xNa (e.g. x3 is "x2 p8(o2a)"), so a drain emitted by the preceding "a"
 * step would not be carried forward on that path although its offset is.
 * This only matters for variants with a non-zero pipe_len -- confirm
 * before enabling one.
 *
 * NOTE(review): stages 7 and 8 test KB (not KBB) and reference p1_2, which
 * is not defined in this file, and p1_4 with a single argument although
 * p1_4 takes two.  Those branches are only selected when KB is not a
 * multiple of 4, which ATL_USERMM rejects with #error, so they are never
 * expanded.
 */
#if KBB >= 64
#define x2  x1 p16(o1)
#define o2 S(256,o1)
#else
#define x2 x1
#define o2 o1
#endif

#if pipe_len == 64
#define x2a x2 drain_pipe(o2)
#define o2a S(256,o2)
#undef KBB
#define KBB KB
#else
#define x2a x2
#define o2a o2
#endif

/* Stage 3: 32 K-steps. */
#if ( KBB / 32 ) % 2
#define x3  x2 p8(o2a)
#define o3 S(128,o2a)
#else
#define x3 x2a
#define o3 o2a
#endif

#if pipe_len == 32
#define x3a x3 drain_pipe(o3)
#define o3a S(128,o3)
#undef KBB
#define KBB KB
#else
#define x3a x3
#define o3a o3
#endif

/* Stage 4: 16 K-steps. */
#if ( KBB / 16 ) % 2
#define x4 x3 p4(o3a)
#define o4 S(64,o3a)
#else
#define x4 x3a
#define o4 o3a
#endif

#if pipe_len == 16
#define x4a x4 drain_pipe(o4)
#define o4a S(64,o4)
#undef KBB
#define KBB KB
#else
#define x4a x4
#define o4a o4
#endif

/* Stage 5: 8 K-steps. */
#if ( KBB / 8 ) % 2
#define x5  x4 p2(o4a)
#define o5 S(32,o4a)
#else
#define x5 x4a
#define o5 o4a
#endif

#if pipe_len == 8
#define x5a x5 drain_pipe(o5)
#define o5a S(32,o5)
#undef KBB
#define KBB KB
#else
#define x5a x5
#define o5a o5
#endif

/* Stage 6: 4 K-steps (a single p1). */
#if ( KBB / 4 ) % 2
#define x6  x5 p1(o5a,o5a)
#define o6 S(16,o5a)
#else
#define x6 x5a
#define o6 o5a
#endif

#if pipe_len == 4
#define x6a x6 drain_pipe(o6)
#define o6a S(16,o6)
#undef KBB
#define KBB KB
#else
#define x6a x6
#define o6a o6
#endif

/* Stage 7: 2 K-steps; the taken branch is dead when KB % 4 == 0. */
#if ( KB / 2 ) % 2
#define x7  x6 p1_2(o6a)
#define o7 S(8,o6a)
#else
#define x7 x6a
#define o7 o6a
#endif

#if pipe_len == 2
#define x7a x7 drain_pipe(o7)
#define o7a S(8,o7)
#undef KBB
#define KBB KB
#else
#define x7a x7
#define o7a o7
#endif

/* Stage 8: 1 K-step; the taken branch is dead when KB % 4 == 0. */
#if ( KB / 1 ) % 2 
#define x8 x7 p1_4(o7a)
#define o8 S(4,o7a)
#else
#define x8 x7a
#define o8 o7a
#endif

#if pipe_len == 1
#define x8a x8 drain_pipe(o8)
#define o8a S(4,o8)
#undef KBB
#define KBB KB
#else
#define x8a x8
#define o8a o8
#endif


/*
 * Layout constants for the C matrix, chosen by SREAL:
 *   CS   - offset step between the four values handled by `w` and `z`
 *          (used as 0, CS, 2*CS, 3*CS relative to cx)
 *   LDCM - multiplier applied to (ldc - MB) when ATL_USERMM computes the
 *          column-advance value passed in edx
 *   CINC - amount added to cx after each inner-loop iteration (4 * CS)
 * The non-SREAL values are twice the SREAL ones, presumably because C then
 * holds interleaved complex entries -- TODO confirm.
 */
#ifdef SREAL
#define CS 4
#define LDCM 1
#define CINC 16
#else
#define CS 8
#define LDCM 2
#define CINC 32
#endif


/*
 * z: code emitted after the unrolled K loop to write the contents of
 * registers 4..7 back through cx.
 *
 * SREAL: registers 4..7 are combined with pc/pul/puh/pa/ps into register 2
 * and written with a single pu(2,0,cx) (presumably a transpose-and-add that
 * reduces each accumulator to one element -- confirm against the camm
 * macro definitions).  f(t0,0,cx) is issued first.
 *
 * Otherwise: z1(a_,b_) processes accumulator a_ using b_ as scratch
 * (presumably a horizontal sum), and each result is written with pus() at
 * offsets 0, CS, 2*CS and 3*CS from cx.
 */
#ifdef SREAL
#define z f(t0,0,cx) pc(4,0) pul(5,4) pc(6,1) puh(5,0) pul(7,6)  \
          pa(0,4) puh(7,1) pc(4,2) pa(1,6) ps(68,6,4) ps(238,6,2) pa(4,2) pu(2,0,cx)
#else
#define z1(a_,b_) phl(a_,b_) pa(b_,a_) pc(a_,b_) ps(1,b_,b_) pasr(b_,a_)
#define z    z1(4,0) pus(4,0,cx) z1(5,1) pus(5,CS,cx) \
             z1(6,2) pus(6,S(CS,CS),cx) z1(7,0) pus(7,S(S(CS,CS),CS),cx)
#endif

/*
 * w: code emitted before the unrolled K loop to initialise registers 4..7,
 * depending on which beta case is being compiled:
 *   BETA0 - px() on each of registers 4..7 (presumably clears them)
 *   BETA1 - pls() into registers 4..7 from cx at offsets 0, CS, 2*CS, 3*CS
 *   BETAX - the same four pls() loads, each followed by pmsr(3, reg); in
 *           BETAX builds ATL_USERMM issues pl(0,di,3) before the loops,
 *           with edi bound to the address of beta
 * If none of BETA0/BETA1/BETAX is defined, w is left undefined here and its
 * use in ATL_USERMM will not expand to assembly.
 */
#ifdef BETA0
#define w    px(4) px(5) px(6) px(7)
#endif
#ifdef BETA1
#define w    pls(0,cx,4) pls(CS,cx,5) pls(S(CS,CS),cx,6) pls(S(S(CS,CS),CS),cx,7)
#endif
#ifdef BETAX
#define w    pls(0,cx,4) pls(CS,cx,5) pls(S(CS,CS),cx,6)  \
             pls(S(S(CS,CS),CS),cx,7) pmsr(3,4) pmsr(3,5) pmsr(3,6) pmsr(3,7)
#endif



/*
 * ATL_USERMM -- matrix-multiply kernel emitted as a single inline-assembly
 * statement built from the macros above.
 *
 * Parameters read by the body:
 *   a, b  - input operand pointers (passed in eax and esi)
 *   c     - output pointer (passed in ecx)
 *   ldc   - leading dimension of c; (ldc - MB) * LDCM * sizeof(*c) is passed
 *           in edx and added to ecx after each pass of the inner loop
 *   beta  - only read in BETAX builds, through its address in edi
 * m, n, k, alpha, lda and ldb are accepted for the caller's calling
 * convention but are not referenced; the loop extents come from the
 * compile-time constants MB, NB and KB.
 *
 * Register use inside the asm: ebp and ebx are saved and restored with
 * pushl/popl; eax is changed inside the inner loop but restored by the
 * pushl/popl pair around it.  esi and ecx, however, are overwritten by the
 * asm (esi by the mm/m/ra sequence that sets up the inner-loop bound, ecx
 * by the a()/ra() updates that advance the output pointer) and are not
 * restored, so they are declared as read-write operands instead of
 * input-only ones, as GCC's extended-asm rules require.
 *
 * NOTE(review): the vector registers used by the kernel macros are not
 * listed as clobbers; whether that is needed depends on the register set
 * the camm macros emit -- confirm before adding them.
 */
void
ATL_USERMM (int m, int n, int k, float alpha, const float *a,
	    int lda,const float *b, int ldb, float beta, float *c,
	    int ldc) {

  /* beta is handed to the asm by address (edi, BETAX builds only). */
  float *bbp=&beta;

  ASM ( 

#if KB % 4
#error KB must be divisible by 4
#endif
/*  #if KB != MB */
/*  #error KB must be equal to MB */
/*  #endif */
/*  #if KB != NB */
/*  #error KB must be equal to NB */
/*  #endif */

#ifdef BETAX
       /* Fetch beta through edi into register 3 for the pmsr() in `w`. */
       pl(0,di,3)
#endif

       /* ebp <- outer-loop bound derived from KB4, NB and esi (b). */
       "pushl %%ebp\n\t"
       mm(KB4,bp)
       m(NB,bp)
       ra(si,bp)
       
       /* ebx <- b; esi is then reused for the inner-loop bound. */
       "pushl %%ebx\n\t"
       "movl  %%esi,%%ebx\n\t"

       mm(KB4,si)
       m(MB,si)
       ra(ax,si)
       
       lab(loopb)
       
       /* Preserve the inner-loop start (eax) and bound (esi). */
       "pushl %%eax\n\t"
       "pushl %%esi\n\t"

       lab(loopa)

       /* Initialise registers 4..7 according to the beta case. */
       w

       /* Fully unrolled K loop. */
       x8a

       /* Write registers 4..7 back through ecx. */
       z

       a(S(KB8,KB8),ax)
       a(CINC,cx)

       cmp(ax,si)
       jne(loopa)

       "popl %%esi\n\t"
       "popl %%eax\n\t"

       /* Advance ecx by the value in edx and ebx by KB4 for the next pass. */
       ra(dx,cx)
       a(KB4,bx)
       
       cmp(bx,bp)
       jne(loopb)
       
       "popl %%ebx\n\t"
       "popl %%ebp\n\t"

       /* esi and ecx are modified by the code above: read-write operands. */
       :"+S" (b),"+c" (c)
       :"a" (a),"d" ((ldc-MB)*LDCM*sizeof(*c))
#ifdef BETAX
       ,"D" (bbp)
#endif
       :"memory","cc");
  
}
