/*
    libfame - Fast Assembly MPEG Encoder Library
    Copyright (C) 2000-2001 Vivien Chappelier
                            Damien Vincent

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

/*
 * MAE8x8_withmask - masked sum of absolute differences over an 8x8 block (MMX).
 *
 *   ref    top-left byte of the reference block; successive rows are
 *          (pitch + 32) bytes apart
 *   input  top-left byte of the input block; successive rows are pitch
 *          bytes apart
 *   shape  top-left byte of the mask block; successive rows are pitch
 *          bytes apart
 *   pitch  row stride in bytes (see above for how it applies to each plane)
 *
 * Returns the sum of |input - ref| over the pixels selected by the mask,
 * reduced to 16 bits (at most 64 * 255 = 16320, so nothing is lost).
 *
 * A pixel is selected when its shape byte is greater than zero under a
 * SIGNED byte compare (pcmpgtb), i.e. for values 0x01..0x7F; bytes
 * 0x80..0xFF are treated as negative and mask the pixel out.
 * NOTE(review): confirm callers never store 0xFF for "opaque".
 *
 * The routine does not execute emms; the MMX/x87 state is left in MMX mode
 * on return, so x87 code must not run before an emms is issued.
 */
static unsigned int MAE8x8_withmask(unsigned char *ref,
				    unsigned char *input,
				    unsigned char *shape,
				    int pitch)
{
  /* The stride is added to pointers inside the asm, so it has to live in a
     register of pointer width (32 bits on i386, 64 bits on x86-64). */
  __PTRDIFF_TYPE__ stride = pitch;
  unsigned int retval;

  /* One row: mm0 = |input - ref| (two saturating subtractions OR-ed
     together), ANDed with the byte mask built from the shape row, then
     widened to 16-bit words and accumulated into the four lanes of mm6.
     mm7 must be zero on entry.  The add instructions carry no size suffix
     so the assembler derives the width from the register operands. */
#define MAE_STEP_MASK() 			\
    "movq (%[inp]), %%mm0\n"  			\
    "movq (%[ref]), %%mm1\n"			\
    "movq (%[shp]), %%mm5\n"			\
    "pcmpgtb %%mm7, %%mm5\n"			\
    "movq %%mm0, %%mm2\n"			\
    "add %[stride], %[inp]\n"			\
    "add %[stride], %[ref]\n"			\
    "add %[stride], %[shp]\n"			\
    "add $32, %[ref]\n"				\
    "psubusb %%mm1, %%mm0\n"			\
    "psubusb %%mm2, %%mm1\n"			\
    "por %%mm1, %%mm0\n"			\
    "pand %%mm5,%%mm0\n"			\
    "movq %%mm0, %%mm2\n"			\
    "punpcklbw %%mm7, %%mm0\n"			\
    "punpckhbw %%mm7, %%mm2\n"			\
    "paddw %%mm0, %%mm6\n"			\
    "paddw %%mm2, %%mm6\n"

  __asm__ __volatile__ (
		"pxor %%mm7, %%mm7\n"	/* mm7 = 0: unpack filler / compare base */
		"pxor %%mm6, %%mm6\n"	/* mm6 = 4 x 16-bit accumulators */
		MAE_STEP_MASK()
		MAE_STEP_MASK()
		MAE_STEP_MASK()
		MAE_STEP_MASK()
		MAE_STEP_MASK()
		MAE_STEP_MASK()
		MAE_STEP_MASK()
		MAE_STEP_MASK()
		/* horizontal add: fold the four word lanes into the low word */
		"movq %%mm6, %%mm7\n"
		"psrlq $0x20, %%mm7\n"
		"paddw %%mm7, %%mm6\n"
		"movq %%mm6, %%mm7\n"
		"psrlq $0x10, %%mm7\n"
		"paddw %%mm7, %%mm6\n"
		"movd %%mm6, %[sum]\n"
		/* The pointers are advanced by the asm, hence read-write.
		   sum is written only by the final instruction, so it may
		   share a register with the input-only stride. */
		: [inp] "+r"(input), [ref] "+r"(ref), [shp] "+r"(shape),
		  [sum] "=r"(retval)
		: [stride] "r"(stride)
		: "mm0", "mm1", "mm2", "mm5", "mm6", "mm7", "cc", "memory");

  /* Only the low word holds the total; the upper word is a partial sum. */
  return (retval&65535);
}

/*
 * MAE8x8_withoutmask - sum of absolute differences over an 8x8 block (MMX).
 *
 *   ref    top-left byte of the reference block; successive rows are
 *          (pitch + 32) bytes apart
 *   input  top-left byte of the input block; successive rows are pitch
 *          bytes apart
 *   shape  unused; present so the signature matches MAE8x8_withmask
 *   pitch  row stride in bytes (see above for how it applies to each plane)
 *
 * Returns the sum of |input - ref| over all 64 pixels, reduced to 16 bits
 * (at most 64 * 255 = 16320, so nothing is lost).
 *
 * The routine does not execute emms; the MMX/x87 state is left in MMX mode
 * on return, so x87 code must not run before an emms is issued.
 */
static unsigned int MAE8x8_withoutmask(unsigned char *ref,
				       unsigned char *input,
				       unsigned char *shape,
				       int pitch)
{
  /* The stride is added to pointers inside the asm, so it has to live in a
     register of pointer width (32 bits on i386, 64 bits on x86-64). */
  __PTRDIFF_TYPE__ stride = pitch;
  unsigned int retval;

  (void) shape;

  /* Two rows per step: mm0/mm2 end up holding |input - ref| for the two
     rows (two saturating subtractions OR-ed together); both are widened
     to 16-bit words and accumulated into the four lanes of mm6.
     mm7 must be zero on entry.  The add instructions carry no size suffix
     so the assembler derives the width from the register operands. */
#define MAE_STEP_NOMASK()			\
    "movq (%[inp]), %%mm0\n"			\
    "movq (%[ref]), %%mm1\n"			\
    "add %[stride], %[ref]\n"			\
    "add %[stride], %[inp]\n"			\
    "add $32, %[ref]\n"				\
    "movq (%[inp]), %%mm2\n"			\
    "movq %%mm0, %%mm4\n"			\
    "movq (%[ref]), %%mm3\n"			\
    "movq %%mm2, %%mm5\n"			\
    "add %[stride], %[ref]\n"			\
    "add %[stride], %[inp]\n"			\
    "add $32, %[ref]\n"				\
    "psubusb %%mm1, %%mm0\n"			\
    "psubusb %%mm3, %%mm2\n"			\
    "psubusb %%mm4, %%mm1\n"			\
    "psubusb %%mm5, %%mm3\n"			\
    "por %%mm1, %%mm0\n"			\
    "por %%mm3, %%mm2\n"			\
    "movq %%mm0, %%mm4\n"			\
    "movq %%mm2, %%mm5\n"			\
    "punpcklbw %%mm7, %%mm0\n"			\
    "punpcklbw %%mm7, %%mm2\n"			\
    "punpckhbw %%mm7, %%mm4\n"			\
    "punpckhbw %%mm7, %%mm5\n"			\
    "paddw %%mm0, %%mm4\n"			\
    "paddw %%mm2, %%mm5\n"			\
    "paddw %%mm4, %%mm6\n"			\
    "paddw %%mm5, %%mm6\n"

  __asm__ __volatile__ (
		"pxor %%mm7, %%mm7\n"	/* mm7 = 0: unpack filler */
		"pxor %%mm6, %%mm6\n"	/* mm6 = 4 x 16-bit accumulators */
		MAE_STEP_NOMASK()
		MAE_STEP_NOMASK()
		MAE_STEP_NOMASK()
		MAE_STEP_NOMASK()
		/* horizontal add: fold the four word lanes into the low word */
		"movq %%mm6, %%mm7\n"
		"psrlq $0x20, %%mm7\n"
		"paddw %%mm7, %%mm6\n"
		"movq %%mm6, %%mm7\n"
		"psrlq $0x10, %%mm7\n"
		"paddw %%mm7, %%mm6\n"
		"movd %%mm6, %[sum]\n"
		/* The pointers are advanced by the asm, hence read-write.
		   sum is written only by the final instruction, so it may
		   share a register with the input-only stride. */
		: [inp] "+r"(input), [ref] "+r"(ref), [sum] "=r"(retval)
		: [stride] "r"(stride)
		: "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
		  "cc", "memory");

  /* Only the low word holds the total; the upper word is a partial sum. */
  return (retval&65535);
}

