/* ***** BEGIN LICENSE BLOCK *****
 * Source last modified: $Id: mcomp.c,v 1.2.42.1 2004/07/09 01:56:22 hubbe Exp $
 * 
 * Portions Copyright (c) 1995-2004 RealNetworks, Inc. All Rights Reserved.
 * 
 * The contents of this file, and the files included with this file,
 * are subject to the current version of the RealNetworks Public
 * Source License (the "RPSL") available at
 * http://www.helixcommunity.org/content/rpsl unless you have licensed
 * the file under the current version of the RealNetworks Community
 * Source License (the "RCSL") available at
 * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
 * will apply. You may also obtain the license terms directly from
 * RealNetworks.  You may not use this file except in compliance with
 * the RPSL or, if you have a valid RCSL with RealNetworks applicable
 * to this file, the RCSL.  Please see the applicable RPSL or RCSL for
 * the rights, obligations and limitations governing use of the
 * contents of the file.
 * 
 * Alternatively, the contents of this file may be used under the
 * terms of the GNU General Public License Version 2 or later (the
 * "GPL") in which case the provisions of the GPL are applicable
 * instead of those above. If you wish to allow use of your version of
 * this file only under the terms of the GPL, and not to allow others
 * to use your version of this file under the terms of either the RPSL
 * or RCSL, indicate your decision by deleting the provisions above
 * and replace them with the notice and other provisions required by
 * the GPL. If you do not delete the provisions above, a recipient may
 * use your version of this file under the terms of any one of the
 * RPSL, the RCSL or the GPL.
 * 
 * This file is part of the Helix DNA Technology. RealNetworks is the
 * developer of the Original Code and owns the copyrights in the
 * portions it created.
 * 
 * This file, and the files included with this file, is distributed
 * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
 * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
 * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
 * ENJOYMENT OR NON-INFRINGEMENT.
 * 
 * Technology Compatibility Kit Test Suite(s) Location:
 *    http://www.helixcommunity.org/content/tck
 * 
 * Contributor(s):
 * 
 * ***** END LICENSE BLOCK ***** */

/*-----------------------------------------------------------------------------
 *  MCOMP.C
 *
 *  DESCRIPTION
 *      mcomp.c - Motion compensation for H.263 codec
 *          Performs motion compensation for a macroblock with half-pixel accuracy
 *          Supports Unrestricted Motion Vector mode, i.e., motion vectors are allowed
 *          to point outside picture boundaries; uses edge pixels when needed.
 *          Supports Advanced Prediction mode, i.e., four motion vectors per macroblock
 *          and overlapped motion compensation for luminance.
 *
 *          In Advanced Prediction mode, the prediction is formed in two steps.
 *          First, MotionComp263 is called to generate non-overlapped motion compensation.
 *          Then, OverlapMC is called to perform the overlapping on the luminance.
 *          OverlapMC requires mv's from left, top, and right neighbors.  Therefore,
 *          the encoder needs to decide on the coding mode for the next mb (i.e., to the
 *          right) before overlap can be done.
 *
 *          In PB-frame mode, the B-frame prediction is generated by PredBframe.  It uses
 *          previous P-frame and the corresponding macroblock from the next P-frame.
 *          Hence, the next P-frame macroblock needs to be reconstructed before calling 
 *          PredBframe.
 *
 *          This module also contains the motion compensation routine for 
 *          H.261 (MotionComp), which performs integer motion compensation.
 *          MotionComp was extracted from file PREDSEL.C and modified.
 *
 *  CALLING SEQUENCE
 *      MotionComp263(  MACROBLOCK_DESCR * mb,  // Describes block to be motion-compensated
 *                      PICTURE * prev_pic,     // Describes previous picture used to form MC
 *                      PICTURE * pic)          // Output picture where MC block is placed
 *      OverlapMC( MACROBLOCK_DESCR * mb,   // Describes block to be motion-compensated
 *                      PICTURE * prevPic,  // Describes previous picture used to form MC
 *                      PICTURE * pic,      // Output picture where MC block is placed
 *                      int     mbWidth,    // Macroblocks per row
 *                      int     mbOffset,   // Row offset; (mb-mbOffset) is neighbor on top
 *                      int     overlap[4]  // Returns YES or NO to indicate whether overlap
 *                      )                   // was done in each 8x8 subblock
 *      PredBframe( MACROBLOCK_DESCR * mb,  // Macroblock to be predicted
 *                      PICTURE * prevPic,      // Prev. picture (forward pred)
 *                      PICTURE * nextPic,      // Next P-picture (backward pred)
 *                      PICTURE * Bpic          // Output picture where pred is placed
 *                      )
 *
 *          The routines do not return a value.
 *
 *          Assumptions:
 *          1. "pic", "prev_pic", "nextPic", and "Bpic" have same line offset and size
 *          2. Chroma components Cb and Cr have same line offset and size
 *          3. All arrays are quad-aligned, i.e., start address is multiple of 4.
 *          4. Motion vectors are represented with one fractional bit
 *
 *
 *      MotionComp(     MACROBLOCK_DESCR * mb,  // Describes block to be motion-compensated
 *                      PICTURE * prev_pic,     // Describes previous picture used to form MC
 *                      PICTURE * pic )         // Output picture where MC block is placed
 *
 *          The routine returns H261_ERROR if motion vectors point outside picture, otherwise OK.
 *
 *          Assumptions:
 *          1-3. As above
 *          4. Motion vectors are represented by integers.
 *
 *
 -----------------------------------------------------------------------------*/ 

//#include <stdio.h>
//#include <stdlib.h>
#ifdef _MACINTOSH
#include <stdlib.h> // for exit()
#endif
#include "dllindex.h"
#include "h261defs.h"
#include "h261func.h"


#ifdef COMPILE_MMX
#include "mmxcpuid.h"
#endif

//#define VVPROFILER
#ifdef VVPROFILER
#include "hvdebtim.h"
extern struct CVvDebugTimer * pVvProf[];
extern unsigned long			pVvProfCount[];
#endif

// Declarations of local functions
static int chromaMVComp( int mvLuma );
static int chromaMvComp4V( S8 mvLuma[4] );
void mc( int hSize, int vSize, 
                    PIXEL in[], PIXEL out[], int hdim,
                    int mvX, int mvY    // Motion vector
                    );
void mcMMX( int hSize, int vSize, 
                    PIXEL in[], PIXEL out[], int hdim,
                    int mvX, int mvY    // Motion vector
                    );
static void limitMC( int hSize, int vSize, 
                    PIXEL const *in, PIXEL *out, int hdim,
                    int mvX, int mvY,   // Motion vector
                    int minX, int maxX, int minY, int maxY // Limits for hor/vert indices
                    );
static int mvDiff( int mvX, int mvY,    // motion vector
                MACROBLOCK_DESCR const *borderMB,   // adjacent macroblock
                int subBlk, // adjacent subblock (needed if borderMB has 4 motion vectors)
                int PBframe,    // If PBframe: do overlap also with INTRA neighbor
                int border[2]   // return motion vector components for adjacent block
                );
static void doOverlapMC( int subBlk,    // Indicates subblock to process (UL, UR, LL, LR)
                    MACROBLOCK_DESCR *mb,   // Used to determine (x,y) coordinates for block
                    PICTURE *prevPic,   // Previous picture; used to create overlapping MC
                    PICTURE *pic, // Contains non-overlapped MC on entry; returns overlapped MC
                    int borderMv[4][2], // Motion vectors for adjacent blocks (L,Top,R,Bottom)
                    int left,           // If YES, overlap using LEFT mv
                    int top,            // If YES, overlap using TOP mv
                    int right,          // If YES, overlap using RIGHT mv
                    int bottom          // If YES, overlap using BOTTOM mv
                    );
static void weigh8x4( PIXEL dest[], PIXEL const *p, int hdim, int vert, int left, int right );
static void weighLeft_Vert( PIXEL *dest, PIXEL const *p, int hdim );
static void weighNoLeft_Vert( PIXEL *dest, PIXEL const *p, int hdim );
static void weighLeft( PIXEL *dest, PIXEL const *p, int hdim );
static void weighRight_Vert( PIXEL *dest, PIXEL const *p, int hdim );
static void weighNoRight_Vert( PIXEL *dest, PIXEL const *p, int hdim );
static void weighRight( PIXEL *dest, PIXEL const *p, int hdim );
static void mc16pelsNoInterpol( PIXEL *in, PIXEL out[], int hdim, int vSize );
static void mc8pelsNoInterpol( PIXEL *in, PIXEL out[], int hdim, int vSize );
static void mc4pelsNoInterpol( PIXEL *in, PIXEL out[], int hdim, int vSize );
static void mc16pelsHorInterpol( PIXEL const *in, PIXEL out[], int hdim, int vSize );
static void mc8pelsHorInterpol( PIXEL const *in, PIXEL out[], int hdim, int vSize );
static void mc4pelsHorInterpol( PIXEL const *in, PIXEL out[], int hdim, int vSize );
static void mc16pelsVertInterpol( PIXEL const *in, PIXEL out[], int hdim, int vSize );
static void mc8pelsVertInterpol( PIXEL const *in, PIXEL out[], int hdim, int vSize );
static void mc4pelsVertInterpol( PIXEL const *in, PIXEL out[], int hdim, int vSize );
static void mc16pels2DInterpol( PIXEL const *in, PIXEL out[], int hdim, int vSize );
static void mc8pels2DInterpol( PIXEL const *in, PIXEL out[], int hdim, int vSize );
static void mc4pels2DInterpol( PIXEL const *in, PIXEL out[], int hdim, int vSize );
static void saveBackwardPred( MACROBLOCK_DESCR * mb,    // Describes block to be saved
                              PICTURE * pic // Picture with pixels to be saved
                              );
static void averageForBack( MACROBLOCK_DESCR * mb,  // Describes block to be averaged
                            PICTURE * pic   // Input: contains forward prediction
                                            // Output: contains B-frame prediction
                            );
static void copyBlock( S32 source[], int hSize, int vSize, int sourceOffset,
                       S32 dest[], int destOffset );
void averageBlock( PIXEL forPred[], int hSize, int vSize, int forOffset,
                          PIXEL backPred[], int backOffset );
void averageBlockMMX( PIXEL forPred[], int hSize, int vSize, int forOffset,
                          PIXEL backPred[], int backOffset );


// MotionComp - perform integer-pixel motion compensation for H.261
extern int  MotionComp( MACROBLOCK_DESCR *mb, PICTURE *prev_pic, PICTURE *pic )
{
    int     row, col, status, i, cx, cy, pic_offset, prev_offset;
    union {     // Copy words to speed up routine
        PIXEL   * pix;
        S32    * word;
    } pixel, prev;
    PIXEL   * source, * dest;

    status = OK;
    if (mb->mv_x == 0  &&  mb->mv_y == 0) {   // Copy macroblock
        col = 16 * mb->x;
        row = 16 * mb->y;
        pic_offset = col + row * pic->y.hoffset;
        pixel.pix = pic->y.ptr + pic_offset;
        prev.pix = prev_pic->y.ptr + pic_offset;
        for (i = 0; i < 16; i++) {
            *(pixel.word) = *(prev.word);
            *(pixel.word + 1) = *(prev.word + 1);
            *(pixel.word + 2) = *(prev.word + 2);
            *(pixel.word + 3) = *(prev.word + 3);
            pixel.pix += pic->y.hoffset;
            prev.pix += pic->y.hoffset;
        }
        if (pic->color) {
            col = 8 * mb->x;
            row = 8 * mb->y;
            // Assuming same offset for Cr and Cb
            pic_offset = col + row * pic->cb.hoffset;
            pixel.pix = pic->cb.ptr + pic_offset;
            prev.pix = prev_pic->cb.ptr + pic_offset;
            for (i = 0; i < 8; i++) {
                *(pixel.word) = *(prev.word);
                *(pixel.word + 1) = *(prev.word + 1);
                pixel.pix += pic->cb.hoffset;
                prev.pix += pic->cb.hoffset;
            }
            pixel.pix = pic->cr.ptr + pic_offset;
            prev.pix = prev_pic->cr.ptr + pic_offset;
            for (i = 0; i < 8; i++) {
                *(pixel.word) = *(prev.word);
                *(pixel.word + 1) = *(prev.word + 1);
                pixel.pix += pic->cr.hoffset;
                prev.pix += pic->cr.hoffset;
            }
        }
    } else {    // Non-zero motion vector
        // Wrap motion vectors to allowed range
        while (mb->mv_x < MV_MIN) {
            mb->mv_x += MV_WRAP;
        }
        while (mb->mv_x > MV_MAX) {
            mb->mv_x -= MV_WRAP;
        }
        while (mb->mv_y < MV_MIN) {
            mb->mv_y += MV_WRAP;
        }
        while (mb->mv_y > MV_MAX) {
            mb->mv_y -= MV_WRAP;
        }
        // Compute pointers
        col = 16 * mb->x;
        row = 16 * mb->y;
        if (col == 0  &&  mb->mv_x < 0) {    // Pointing left of first col?
            mb->mv_x = 0, status = H261_ERROR;
        }
        if (col == pic->y.nhor - 16  &&  mb->mv_x > 0) {  // Right of last col?
            mb->mv_x = 0, status = H261_ERROR;
        }
        if (row == 0  &&  mb->mv_y < 0) {    // Pointing above first row?
            mb->mv_y = 0, status = H261_ERROR;
        }
        if (row == pic->y.nvert - 16  &&  mb->mv_y > 0) {  // Below last row?
            mb->mv_y = 0, status = H261_ERROR;
        }
        // Copy displaced macroblock
        pic_offset = col + row * pic->y.hoffset;
        prev_offset = col + mb->mv_x + (row + mb->mv_y) * pic->y.hoffset;
        pixel.pix = pic->y.ptr + pic_offset;
        prev.pix = prev_pic->y.ptr + prev_offset;
        for (i = 0; i < 16; i++) {
            source = prev.pix;
            for (dest = pixel.pix; dest < pixel.pix + 16; dest++) {
                *dest = *(source++);
            }
            pixel.pix += pic->y.hoffset;
            prev.pix += pic->y.hoffset;
        }
        if (pic->color) {
            col = 8 * mb->x;
            row = 8 * mb->y;
            // Truncate motion vectors for chroma towards zero
            if (mb->mv_x < 0) {
                cx = (mb->mv_x + 1) >> 1;
            } else {
                cx = mb->mv_x >> 1;
            }
            if (mb->mv_y < 0) {
                cy = (mb->mv_y + 1) >> 1;
            } else {
                cy = mb->mv_y >> 1;
            }
            // Assuming same offset for Cr and Cb
            pic_offset = col + row * pic->cb.hoffset;
            prev_offset = col + cx + (row + cy) * pic->cb.hoffset;
            pixel.pix = pic->cb.ptr + pic_offset;
            prev.pix = prev_pic->cb.ptr + prev_offset;
            for (i = 0; i < 8; i++) {
                source = prev.pix;
                for (dest = pixel.pix; dest < pixel.pix + 8; dest++) {
                    *dest = *(source++);
                }
                pixel.pix += pic->cb.hoffset;
                prev.pix += pic->cb.hoffset;
            }
            pixel.pix = pic->cr.ptr + pic_offset;
            prev.pix = prev_pic->cr.ptr + prev_offset;
            for (i = 0; i < 8; i++) {
                source = prev.pix;
                for (dest = pixel.pix; dest < pixel.pix + 8; dest++) {
                    *dest = *(source++);
                }
                pixel.pix += pic->cr.hoffset;
                prev.pix += pic->cr.hoffset;
            }
        }
    }
    return (status);
}



// MotionComp263 - perform half-pixel motion compensation for H.263
extern void MotionComp263( MACROBLOCK_DESCR * mb, // Describes block to be motion-compensated
                            PICTURE * prevPic,  // Describes previous picture used to form MC
                            PICTURE * pic       // Output picture where MC block is placed
                            )
{
    int     row, col, cX, cY, picOffset, hdim, nhor, nvert;
    PIXEL   * source, * dest;
	void (*pMC) ( int hSize, int vSize, PIXEL *in, PIXEL *out, int hdim, int mvX, int mvY);
#if defined(COMPILE_MMX)
	if(cpuid_is_mmx_motion_on()) {
		//do mmx if compiler switch AND initialized AND detected
		pMC = mcMMX;
	} else 
#endif
	{
		pMC = mc;
	}

    // Compute luma pointers
    col = 16 * mb->x;
    row = 16 * mb->y;
    hdim = pic->y.hoffset;
    nhor = pic->y.nhor, nvert = pic->y.nvert;
    picOffset = col + row * hdim;
    dest = pic->y.ptr + picOffset;          // Point to output luma
    source = prevPic->y.ptr + picOffset;    // Point to input luma (without motion comp)
    // Do motion compensation for luma
    if (mb->mtype == MTYPE263_INTER4V) {    // 4 motion vectors
        // Upper left block
        if (PointingOutside( col, col+7, row, row+7, mb->blkMvX[UPPER_LEFT_BLK], 
                            mb->blkMvY[UPPER_LEFT_BLK], nhor, nvert )  ==  NO) {
            pMC( 8, 8, source, dest, hdim, 
                        mb->blkMvX[UPPER_LEFT_BLK], mb->blkMvY[UPPER_LEFT_BLK] );
        } else {
            limitMC( 8, 8, source, dest, hdim,
                        mb->blkMvX[UPPER_LEFT_BLK], mb->blkMvY[UPPER_LEFT_BLK],
                        -col, nhor - 1 - col, -row, nvert - 1 - row );
        }
        // Upper right block
        if (PointingOutside( col+8, col+15, row, row+7, mb->blkMvX[UPPER_RIGHT_BLK], 
                            mb->blkMvY[UPPER_RIGHT_BLK], nhor, nvert )  ==  NO) {
            pMC( 8, 8, source + 8, dest + 8, hdim,
                        mb->blkMvX[UPPER_RIGHT_BLK], mb->blkMvY[UPPER_RIGHT_BLK] );
        } else {
            limitMC( 8, 8, source + 8, dest + 8, hdim,
                        mb->blkMvX[UPPER_RIGHT_BLK], mb->blkMvY[UPPER_RIGHT_BLK],
                        -col - 8, nhor - 1 - col - 8, -row, nvert - 1 - row );
        }
        // Lower left block
        source += 8 * hdim;   // Advance 8 lines
        dest += 8 * hdim;
        if (PointingOutside( col, col+7, row+8, row+15, mb->blkMvX[LOWER_LEFT_BLK], 
                            mb->blkMvY[LOWER_LEFT_BLK], nhor, nvert )  ==  NO) {
            pMC( 8, 8, source, dest, hdim,
                        mb->blkMvX[LOWER_LEFT_BLK], mb->blkMvY[LOWER_LEFT_BLK] );
        } else {
            limitMC( 8, 8, source, dest, hdim,
                        mb->blkMvX[LOWER_LEFT_BLK], mb->blkMvY[LOWER_LEFT_BLK],
                        -col, nhor - 1 - col, -row - 8, nvert - 1 - row - 8 );
        }
        // Lower right block
        if (PointingOutside( col+8, col+15, row+8, row+15, mb->blkMvX[LOWER_RIGHT_BLK],
                            mb->blkMvY[LOWER_RIGHT_BLK], nhor, nvert )  ==  NO) {
            pMC( 8, 8, source + 8, dest + 8, hdim,
                        mb->blkMvX[LOWER_RIGHT_BLK], mb->blkMvY[LOWER_RIGHT_BLK] );
        } else {
            limitMC( 8, 8, source + 8, dest + 8, hdim,
                        mb->blkMvX[LOWER_RIGHT_BLK], mb->blkMvY[LOWER_RIGHT_BLK],
                        -col - 8, nhor - 1 - col - 8, -row - 8, nvert - 1 - row - 8 );
        }
        // Compute chroma mv
        cX = chromaMvComp4V( mb->blkMvX );
        cY = chromaMvComp4V( mb->blkMvY );
    
    } else {    // One motion vector
        if (PointingOutside( col, col+15, row, row+15, mb->mv_x, mb->mv_y,
                                nhor, nvert )  ==  NO) {
            pMC( 16, 16, source, dest, hdim, mb->mv_x, mb->mv_y );
        } else {    // Pointing outside
            limitMC( 16, 16, source, dest, hdim, mb->mv_x, mb->mv_y,
                        -col, nhor - 1 - col, -row, nvert - 1 - row );
        }
        // Compute chroma mv
        cX = chromaMVComp( mb->mv_x );
        cY = chromaMVComp( mb->mv_y );
    }
    
    // Do motion compensation for chroma
    if (pic->color) {
        // Compute chroma pointers
        col = 8 * mb->x;
        row = 8 * mb->y;
        hdim = pic->cb.hoffset;
        nhor = pic->cb.nhor, nvert = pic->cb.nvert;
        picOffset = col + row * hdim;
        if (PointingOutside( col, col+7, row, row+7, cX, cY, nhor, nvert )  ==  NO) {
            pMC( 8, 8, prevPic->cb.ptr + picOffset, pic->cb.ptr + picOffset, hdim, cX, cY );
            pMC( 8, 8, prevPic->cr.ptr + picOffset, pic->cr.ptr + picOffset, hdim, cX, cY );
        } else {
            limitMC( 8, 8, prevPic->cb.ptr + picOffset, pic->cb.ptr + picOffset,
                    hdim, cX, cY, -col, nhor - 1 - col, -row, nvert - 1 - row );
            limitMC( 8, 8, prevPic->cr.ptr + picOffset, pic->cr.ptr + picOffset,
                    hdim, cX, cY, -col, nhor - 1 - col, -row, nvert - 1 - row );
        }
    }
    return;
}


// printBlk - for debugging
/*static void printBlk( int hSize, int vSize, PIXEL *p, int hdim)
{
    int i,j;
    
    for (i = 0; i < vSize; ++i) {
        for (j = 0; j < hSize; ++j) {
            printf(" %3d", *(p + j));
        }
        printf("\n");
        p += hdim;
    }
    return;
}*/


// Enumerate array of motion vectors
#define LEFT        (0)
#define TOP         (1)
#define RIGHT       (2)
#define BOTTOM      (3)

// Do overlapped motion comp. for luma
extern void OverlapMC( MACROBLOCK_DESCR * mb,   // Describes block to be motion-compensated
                        int     PBframe,    // Non-zero if PB frame
                        PICTURE * prevPic,  // Describes previous picture used to form MC
                        PICTURE * pic,      // Output picture where MC block is placed
                        int     mbWidth,    // Macroblocks per row
                        int     mbOffset,   // Row offset; (mb-mbOffset) is neighbor on top
                        int     overlap[4]  // Returns YES or NO to indicate whether overlap
                                            // was done in each 8x8 subblock
                        )
{
    int left, top, right, bottom, mvX, mvY;
    int borderMv[4][2]; // motion vectors for neighbors (left, top, right, bottom)
    
    if (mb->mtype == MTYPE263_INTER4V) {
        // Upper left block
        mvX = mb->blkMvX[UPPER_LEFT_BLK];
        mvY = mb->blkMvY[UPPER_LEFT_BLK];
        if (mb->x == 0)  left = NO;
        else  left = mvDiff( mvX, mvY, mb-1, UPPER_RIGHT_BLK, PBframe, borderMv[LEFT] );
        if (mb->y == 0)  top = NO;
        else  top = mvDiff( mvX, mvY, mb-mbOffset, LOWER_LEFT_BLK, PBframe, borderMv[TOP] );
        right = mvDiff( mvX, mvY, mb, UPPER_RIGHT_BLK, PBframe, borderMv[RIGHT] );
        bottom = mvDiff( mvX, mvY, mb, LOWER_LEFT_BLK, PBframe, borderMv[BOTTOM] );
        if (left == YES || right == YES || top == YES || bottom == YES) {
            doOverlapMC( UPPER_LEFT_BLK, mb, prevPic, pic, borderMv, left,top,right,bottom);
            overlap[UPPER_LEFT_BLK] = YES;
        } else {
            overlap[UPPER_LEFT_BLK] = NO;
        }
        // Upper right block
        mvX = mb->blkMvX[UPPER_RIGHT_BLK];
        mvY = mb->blkMvY[UPPER_RIGHT_BLK];
        left = mvDiff( mvX, mvY, mb, UPPER_LEFT_BLK, PBframe, borderMv[LEFT] );
        if (mb->y == 0)  top = NO;
        else  top = mvDiff( mvX, mvY, mb-mbOffset, LOWER_RIGHT_BLK, PBframe, borderMv[TOP] );
        if (mb->x == mbWidth-1)  right = NO;
        else  right = mvDiff( mvX, mvY, mb+1, UPPER_LEFT_BLK, PBframe, borderMv[RIGHT] );
        bottom = mvDiff( mvX, mvY, mb, LOWER_RIGHT_BLK, PBframe, borderMv[BOTTOM] );
        if (left == YES || right == YES || top == YES || bottom == YES) {
            doOverlapMC( UPPER_RIGHT_BLK, mb, prevPic, pic, borderMv, left,top,right,bottom);
            overlap[UPPER_RIGHT_BLK] = YES;
        } else {
            overlap[UPPER_RIGHT_BLK] = NO;
        }
        // Lower left block
        mvX = mb->blkMvX[LOWER_LEFT_BLK];
        mvY = mb->blkMvY[LOWER_LEFT_BLK];
        if (mb->x == 0)  left = NO;
        else  left = mvDiff( mvX, mvY, mb-1, LOWER_RIGHT_BLK, PBframe, borderMv[LEFT] );
        top = mvDiff( mvX, mvY, mb, UPPER_LEFT_BLK, PBframe, borderMv[TOP] );
        right = mvDiff( mvX, mvY, mb, LOWER_RIGHT_BLK, PBframe, borderMv[RIGHT] );
        if (left == YES || right == YES || top == YES) {
            doOverlapMC( LOWER_LEFT_BLK, mb, prevPic, pic, borderMv, left,top,right,NO);
            overlap[LOWER_LEFT_BLK] = YES;
        } else {
            overlap[LOWER_LEFT_BLK] = NO;
        }
        // Lower right block
        mvX = mb->blkMvX[LOWER_RIGHT_BLK];
        mvY = mb->blkMvY[LOWER_RIGHT_BLK];
        left = mvDiff( mvX, mvY, mb, LOWER_LEFT_BLK, PBframe, borderMv[LEFT] );
        top = mvDiff( mvX, mvY, mb, UPPER_RIGHT_BLK, PBframe, borderMv[TOP] );
        if (mb->x == mbWidth-1)  right = NO;
        else  right = mvDiff( mvX, mvY, mb+1, LOWER_LEFT_BLK, PBframe, borderMv[RIGHT] );
        if (left == YES || right == YES || top == YES) {
            doOverlapMC( LOWER_RIGHT_BLK, mb, prevPic, pic, borderMv, left,top,right,NO);
            overlap[LOWER_RIGHT_BLK] = YES;
        } else {
            overlap[LOWER_RIGHT_BLK] = NO;
        }
        
    } else {    // One motion vector for current macroblock; neighbors can be INTER4V
        mvX = mb->mv_x;
        mvY = mb->mv_y;
        // Upper left block
        if (mb->x == 0)  left = NO;
        else  left = mvDiff( mvX, mvY, mb-1, UPPER_RIGHT_BLK, PBframe, borderMv[LEFT] );
        if (mb->y == 0)  top = NO;
        else  top = mvDiff( mvX, mvY, mb-mbOffset, LOWER_LEFT_BLK, PBframe, borderMv[TOP] );
        if (left == YES || top == YES) {
            doOverlapMC( UPPER_LEFT_BLK, mb, prevPic, pic, borderMv, left,top,NO,NO);
            overlap[UPPER_LEFT_BLK] = YES;
        } else {
            overlap[UPPER_LEFT_BLK] = NO;
        }
        // Upper right block
        if (mb->y == 0)  top = NO;
        else  top = mvDiff( mvX, mvY, mb-mbOffset, LOWER_RIGHT_BLK, PBframe, borderMv[TOP] );
        if (mb->x == mbWidth-1)  right = NO;
        else  right = mvDiff( mvX, mvY, mb+1, UPPER_LEFT_BLK, PBframe, borderMv[RIGHT] );
        if (right == YES || top == YES) {
            doOverlapMC( UPPER_RIGHT_BLK, mb, prevPic, pic, borderMv, NO,top,right,NO);
            overlap[UPPER_RIGHT_BLK] = YES;
        } else {
            overlap[UPPER_RIGHT_BLK] = NO;
        }
        // Lower left block
        if (mb->x == 0)  left = NO;
        else  left = mvDiff( mvX, mvY, mb-1, LOWER_RIGHT_BLK, PBframe, borderMv[LEFT] );
        if (left == YES) {
            doOverlapMC( LOWER_LEFT_BLK, mb, prevPic, pic, borderMv, left,NO,NO,NO);
            overlap[LOWER_LEFT_BLK] = YES;
        } else {
            overlap[LOWER_LEFT_BLK] = NO;
        }
        // Lower right block
        if (mb->x == mbWidth-1)  right = NO;
        else  right = mvDiff( mvX, mvY, mb+1, LOWER_LEFT_BLK, PBframe, borderMv[RIGHT] );
        if (right == YES) {
            doOverlapMC( LOWER_RIGHT_BLK, mb, prevPic, pic, borderMv, NO,NO,right,NO);
            overlap[LOWER_RIGHT_BLK] = YES;
        } else {
            overlap[LOWER_RIGHT_BLK] = NO;
        }
    }
    return;
}


// PointingOutside - determine whether motion-comp routine needs to worry about 
//  the borders of the previous picture (use edge pixels instead of non-existent
//  pixels "outside" the border of the previous picture).
//  Returns YES if the motion-compensated block needs pixels outside the previous
//  picture; NO if picture boundary is not crossed.
extern int PointingOutside( int col1, int col2, // First and last column of block
                            int row1, int row2, // First and last row of block
                            int mvX, int mvY,   // Motion vector; one fractional bit
                            int nCols, int nRows    // Picture size
                            )
{
    if (col1 + (mvX >> 1)  <  0)            // Check left border
        return( YES );
    if (col2 + ((mvX + 1) >> 1)  >=  nCols) // Check right border
        return( YES );
    if (row1 + (mvY >> 1)  <  0)            // Check top border
        return( YES );
    if (row2 + ((mvY + 1) >> 1)  >=  nRows) // Check bottom border
        return( YES );
    return( NO );
}


//  PredBframe - Form prediction for B-frame
extern void PredBframe( MACROBLOCK_DESCR * mb,  // Macroblock to be predicted
                        PICTURE * prevPic,      // Prev. picture (forward pred)
                        PICTURE * nextPic,      // Next P-picture (backward pred)
                        PICTURE * Bpic          // Output picture where pred is placed
                        )
{
    int     i;
    S8      saveMvX[4], saveMvY[4];

    // Perform backward prediction
    if (mb->mtype == MTYPE263_INTER4V) {
        for (i = 0; i < 4; ++i) {
            saveMvX[i] = mb->blkMvX[i];
            mb->blkMvX[i] = mb->blkMvBx[i];
            saveMvY[i] = mb->blkMvY[i];
            mb->blkMvY[i] = mb->blkMvBy[i];
        }
    } else {
        saveMvX[0] = mb->mv_x;
        mb->mv_x = mb->blkMvBx[0];
        saveMvY[0] = mb->mv_y;
        mb->mv_y = mb->blkMvBy[0];
    }
    MotionComp263( mb, nextPic, Bpic );
    // Save backward prediction in temporary area
    saveBackwardPred( mb, Bpic );
    // Perform forward prediction
    if (mb->mtype == MTYPE263_INTER4V) {
        for (i = 0; i < 4; ++i) {
            mb->blkMvX[i] = mb->blkMvFx[i];
            mb->blkMvY[i] = mb->blkMvFy[i];
        }
    } else {
        mb->mv_x = mb->blkMvFx[0];
        mb->mv_y = mb->blkMvFy[0];
    }
    MotionComp263( mb, prevPic, Bpic );
    // Restore parameters
    if (mb->mtype == MTYPE263_INTER4V) {
        for (i = 0; i < 4; ++i) {
            mb->blkMvX[i] = saveMvX[i];
            mb->blkMvY[i] = saveMvY[i];
        }
    } else {
        mb->mv_x = saveMvX[0];
        mb->mv_y = saveMvY[0];
    }
    // Average forward and backward prediction
    averageForBack( mb, Bpic );
}


//  PredBdist - Form prediction for subsampled error computation
extern void PredBdist( MACROBLOCK_DESCR * mb,  // Macroblock to be predicted
                        PICTURE * prevPic,      // Prev. picture (forward pred)
                        PICTURE * nextPic,      // Next P-picture (backward pred)
                        PICTURE * Bpic          // Output picture where pred is placed
                        )
{
    int     i;
    S8      saveMvX[4], saveMvY[4];

    // Perform backward prediction
    if (mb->mtype == MTYPE263_INTER4V) {
        for (i = 0; i < 4; ++i) {
            saveMvX[i] = mb->blkMvX[i];
            mb->blkMvX[i] = mb->blkMvBx[i];
            saveMvY[i] = mb->blkMvY[i];
            mb->blkMvY[i] = mb->blkMvBy[i];
        }
    } else {
        saveMvX[0] = mb->mv_x;
        mb->mv_x = mb->blkMvBx[0];
        saveMvY[0] = mb->mv_y;
        mb->mv_y = mb->blkMvBy[0];
    }
    MotionComp263( mb, nextPic, Bpic );
    // Save backward prediction in temporary area
    saveBackwardPred( mb, Bpic );
    // Perform forward prediction
    if (mb->mtype == MTYPE263_INTER4V) {
        for (i = 0; i < 4; ++i) {
            mb->blkMvX[i] = mb->blkMvFx[i];
            mb->blkMvY[i] = mb->blkMvFy[i];
        }
    } else {
        mb->mv_x = mb->blkMvFx[0];
        mb->mv_y = mb->blkMvFy[0];
    }
    MotionComp263( mb, prevPic, Bpic );
    // Restore parameters
    if (mb->mtype == MTYPE263_INTER4V) {
        for (i = 0; i < 4; ++i) {
            mb->blkMvX[i] = saveMvX[i];
            mb->blkMvY[i] = saveMvY[i];
        }
    } else {
        mb->mv_x = saveMvX[0];
        mb->mv_y = saveMvY[0];
    }
    // Average forward and backward prediction
    averageForBack( mb, Bpic );
}


// chromaMVComp - derive motion component for chroma from luma motion component
static int chromaMVComp( int mvLuma )
{
    int mvChroma, fraction;
    
    mvChroma = 2 * (mvLuma >> 2);    // Truncate fractional part
    fraction = mvLuma & 0x3;    // Two fractional bits
    if (fraction != 0) {
        ++mvChroma;     // Round towards half-pixel
    }
    return( mvChroma );
}


// chromaMvComp4V - derive motion component for chroma from 4 luma motion components
static int chromaMvComp4V( S8 mvLuma[4] )
{
    int sum, mvChroma, fraction;
    
    sum = mvLuma[0] + mvLuma[1] + mvLuma[2] + mvLuma[3];
    mvChroma = 2 * (sum >> 4);  // Truncate fractional part
    fraction = sum & 0xf;       // Four fractional bits
    if (fraction >= 14) {
        mvChroma += 2;  // Round up to next integer value
    } else if (fraction >= 3) {
        ++mvChroma;     // Round towards half-pixel
    } // else round down to integer pixel
    return( mvChroma );
}


// mc - Perform motion compensation for a hSize x vSize block
void mc( int hSize, int vSize, 
                    PIXEL *in, PIXEL *out, int hdim,
                    int mvX, int mvY    // Motion vector
                    )
{
    int intX, intY, fracX, fracY;
    
#ifdef VVPROFILER
	S32 nVvProfNb = 2;
	if(!pVvProf[nVvProfNb]) pVvProf[nVvProfNb] = newCVvDebugTimer();//memory leak on destruction
	pVvProfCount[nVvProfNb]++;
	StartTime(pVvProf[nVvProfNb]);
#endif


    intX = mvX >> 1;    // Integer part of motion vector
    intY = mvY >> 1;
    fracX = mvX & 0x1;  // Fractional part of motion vector
    fracY = mvY & 0x1;
    in += intX + intY * hdim;
    if (hSize != 16  &&  hSize != 8  &&  hSize != 4) {
        H261ErrMsg("mc -- hSize not supported");
        exit(0);
    }
    if (fracY == 0) {
        if (fracX == 0) {
            // No interpolation
            if (hSize == 8) {
                mc8pelsNoInterpol( in, out, hdim, vSize );
            } else if (hSize == 16) {
                mc16pelsNoInterpol( in, out, hdim, vSize );
            } else {
                mc4pelsNoInterpol( in, out, hdim, vSize );
            }
        } else {
            // Horizontal interpolation
            if (hSize == 8) {
                mc8pelsHorInterpol( in, out, hdim, vSize );
            } else if (hSize == 16) {
                mc16pelsHorInterpol( in, out, hdim, vSize );
            } else {
                mc4pelsHorInterpol( in, out, hdim, vSize );
            }
        }
    } else if (fracX == 0) {
        // Vertical interpolation
        if (hSize == 8) {
            mc8pelsVertInterpol( in, out, hdim, vSize );
        } else if (hSize == 16) {
            mc16pelsVertInterpol( in, out, hdim, vSize );
        } else {
            mc4pelsVertInterpol( in, out, hdim, vSize );
        }
    } else {    // Bilinear interpolation
        if (hSize == 8) {
            mc8pels2DInterpol( in, out, hdim, vSize );
        } else if (hSize == 16) {
            mc16pels2DInterpol( in, out, hdim, vSize );
        } else {
            mc4pels2DInterpol( in, out, hdim, vSize );
        }
    }

#ifdef VVPROFILER
	StopAndAccuTime(pVvProf[nVvProfNb]);
#endif

    return;
}


// Limit x to interval [low,high]
#define LIMIT( low, x, high )    max( low, min( x, high ))

// limitMC - Perform motion compensation; use edge pixels when referring to
//  pixels outside picture
static void limitMC( int hSize, int vSize, 
                    PIXEL const *in, PIXEL *out, int hdim,
                    int mvX, int mvY,   // Motion vector
                    int minX, int maxX, int minY, int maxY // Limits for hor/vert indices
                    )
{
#define MAX_HSIZE   (16)
    int intX, intY, fracX, fracY, outsideTop, outsideBot, repeatTop, repeatBot, x, y;
    static int  mapX[MAX_HSIZE + 1];
    PIXEL *outSave, *outBegin;
    union { // Copy words to speed up routine
        PIXEL   *pix;
        U32     *word;
    } pIn, pOut;
    
    if (hSize & 0x3) {
        H261ErrMsg("limitMC -- hSize must be multiple of 4");
        exit(0);
    }
    if (hSize > MAX_HSIZE) {
        H261ErrMsg("limitMC -- hSize too large");
        exit(0);
    }
    intX = mvX >> 1;    // Integer part of motion vector
    intY = mvY >> 1;
    fracX = mvX & 0x1;  // Fractional part of motion vector
    fracY = mvY & 0x1;
    // Create horizontal mapping vector
    for (x = 0; x <= hSize; ++x) {
        mapX[x] = LIMIT( minX, x + intX, maxX );
    }
    //repeatTop = max( 0, minY - intY);                     // Lines on top that are outside
    //repeatBot = max( 0, vSize - 1 + intY + fracY - maxY); // Lines at bottom that are outside
    outsideTop = max( 0, minY - intY);						// Lines on top that are outside
    outsideBot = max( 0, vSize - 1 + intY + fracY - maxY);  // Lines at bottom that are outside
	// Don't produce more lines than the blocksize (used to be a nasty bug hidden here)
	repeatTop = min( outsideTop, vSize );
    if (outsideBot < vSize) {
        repeatBot = outsideBot;
        in += (intY + outsideTop) * hdim;	// Apply vert motion comp. (hor MC thru mapping)
    } else {    // Whole block is "outside" bottom of picture
        repeatBot = vSize;
        in += (vSize - 1) * hdim;   // Point to last line of picture
    }
	// Output pointers
    outSave = out;						// Upper left corner of output block
    out += repeatTop * hdim;			// "Repeated" lines will be filled in later
    outBegin = out;						// Save address for first valid output line
    if (fracY == 0) {
		// Ensure that at least one output line gets written
		if (repeatTop == vSize) {
			--repeatTop;
			out -= hdim;
			outBegin = out;
		} else if (repeatBot == vSize) {
			--repeatBot;
		}
        if (fracX == 0) {
            // No interpolation
            for (y = repeatTop; y < vSize - repeatBot; ++y) {
                for (x = 0; x < hSize; x += 4) {
                    out[x+0] = in[ mapX[x+0] ];
                    out[x+1] = in[ mapX[x+1] ];
                    out[x+2] = in[ mapX[x+2] ];
                    out[x+3] = in[ mapX[x+3] ];
                }
                in += hdim;
                out += hdim;
            }
        } else {
            // Horizontal interpolation
            for (y = repeatTop; y < vSize - repeatBot; ++y) {
                for (x = 0; x < hSize; x += 4) {
                    out[x+0] = (in[mapX[x+0]] + in[mapX[x+1]] + 1) >> 1;
                    out[x+1] = (in[mapX[x+1]] + in[mapX[x+2]] + 1) >> 1;
                    out[x+2] = (in[mapX[x+2]] + in[mapX[x+3]] + 1) >> 1;
                    out[x+3] = (in[mapX[x+3]] + in[mapX[x+4]] + 1) >> 1;
                }
                in += hdim;
                out += hdim;
            }
        }
    } else if (fracX == 0) {
        // Vertical interpolation
        if (repeatTop > 0) {    // Produce line to repeat
            outBegin = out - hdim;
            for (x = 0; x < hSize; ++x) {
                outBegin[x] = in[ mapX[x] ];
            }
        }
        for (y = repeatTop; y < vSize - repeatBot; ++y) {
            for (x = 0; x < hSize; x += 4) {
                out[x+0] = (in[mapX[x+0]] + in[mapX[x+0] + hdim] + 1) >> 1;
                out[x+1] = (in[mapX[x+1]] + in[mapX[x+1] + hdim] + 1) >> 1;
                out[x+2] = (in[mapX[x+2]] + in[mapX[x+2] + hdim] + 1) >> 1;
                out[x+3] = (in[mapX[x+3]] + in[mapX[x+3] + hdim] + 1) >> 1;
            }
            in += hdim;
            out += hdim;
        }
        if (repeatBot > 0) {    // Produce line to repeat
            for (x = 0; x < hSize; ++x) {
                out[x] = in[ mapX[x] ];
            }
            out += hdim;
        }
    } else {    // Bilinear interpolation
        if (repeatTop > 0) {    // Produce line to repeat
            outBegin = out - hdim;
            for (x = 0; x < hSize; ++x) {
                outBegin[x] = (in[mapX[x]] + in[mapX[x+1]] + 1) >> 1;
            }
        }
        for (y = repeatTop; y < vSize - repeatBot; ++y) {
            for (x = 0; x < hSize; x += 4) {
                    out[x+0] = (in[mapX[x+0]] + in[mapX[x+0] + hdim]
                                + in[mapX[x+1]] + in[mapX[x+1] + hdim] + 2) >> 2;
                    out[x+1] = (in[mapX[x+1]] + in[mapX[x+1] + hdim]
                                + in[mapX[x+2]] + in[mapX[x+2] + hdim] + 2) >> 2;
                    out[x+2] = (in[mapX[x+2]] + in[mapX[x+2] + hdim]
                                + in[mapX[x+3]] + in[mapX[x+3] + hdim] + 2) >> 2;
                    out[x+3] = (in[mapX[x+3]] + in[mapX[x+3] + hdim]
                                + in[mapX[x+4]] + in[mapX[x+4] + hdim] + 2) >> 2;
            }
            in += hdim;
            out += hdim;
        }
        if (repeatBot > 0) {    // Produce line to repeat
            for (x = 0; x < hSize; ++x) {
                out[x] = (in[mapX[x]] + in[mapX[x+1]] + 1) >> 1;
            }
            out += hdim;
        }
    }
    if (fracY == 1) {
        --repeatTop;    // Already did one line
        --repeatBot;
    }
    // Repeat first line at top
    pIn.pix = outBegin;
    pOut.pix = outSave;
    for (y = 0; y < repeatTop; ++y) {
        for (x = 0; x < (hSize >> 2); ++x) {
            *(pOut.word + x) = *(pIn.word + x);
        }
        pOut.pix += hdim;
    }
    // Repeat last line at the bottom
    pIn.pix = out - hdim;
    pOut.pix = out;
    for (y = 0; y < repeatBot; ++y) {
        for (x = 0; x < (hSize >> 2); ++x) {
            *(pOut.word + x) = *(pIn.word + x);
        }
        pOut.pix += hdim;
    }
    return;
}


// mvDiff - Return YES if motion vector for adjacent block is different, otherwise NO
//  Return NO if adjacent block is INTRA (unless PBframe)
static int mvDiff( int mvX, int mvY,    // motion vector
                MACROBLOCK_DESCR const *borderMB,   // adjacent macroblock
                int subBlk, // adjacent subblock (needed if borderMB has 4 motion vectors)
                int PBframe,    // If PBframe: do overlap also with INTRA neighbor
                int border[2]   // return motion vector components for adjacent block
                )
{
    if (borderMB->mtype == MTYPE263_INTER4V) {
        border[0] = borderMB->blkMvX[ subBlk ];
        border[1] = borderMB->blkMvY[ subBlk ];
    } else {
        border[0] = borderMB->mv_x;
        border[1] = borderMB->mv_y;
        if (borderMB->mtype >= MTYPE263_INTRA  &&  PBframe == 0) {
            return( NO );   // No overlap with INTRA neighbor (except in PBframe)
        }
    }
    if (border[0] != mvX  ||  border[1] != mvY) {
        return( YES );
    }
    return( NO );
}

// median3 - return median of 3 values
static int  median3( int x[3] )
{
    int order[2];
    
    if (x[1] < x[0]) {
        order[0] = x[1];
        order[1] = x[0];
    } else {
        order[0] = x[0];
        order[1] = x[1];
    }
    if (x[2] < order[0]) {
        return( order[0] );
    } else if (x[2] < order[1]) {
        return( x[2] );
    }
    return( order[1] );
}

// MvPred - Compute mv predictor (mvX,mvY) for block "blk" of macroblock mb[0].
//  blk = UPPER_LEFT_BLK, UPPER_RIGHT_BLK, LOWER_LEFT_BLK, LOWER_RIGHT_BLK,
//          or WHOLE_MACROBLOCK.
//  If horPredOnly=YES: don't use previous row of macroblocks in prediction
extern void MvPred( MACROBLOCK_DESCR mb[],
                    int blk,        // specify block: UL, UR, LL, LR, or WHOLE 
                    int mbhor,      // offset from previous row of MBs
                    int horPredOnly, 
                    int *mvX, int *mvY )
{
    int x[3], y[3];
    
    switch (blk) {
    case UPPER_RIGHT_BLK:
        x[0] = mb[0].blkMvX[UPPER_LEFT_BLK];
        y[0] = mb[0].blkMvY[UPPER_LEFT_BLK];
        if (horPredOnly == YES) {   // Top border or sending GOB header to resync
            *mvX = x[0], *mvY = y[0];
        } else {
            if (mb[-mbhor].mtype == MTYPE263_INTER4V) {
                x[1] = mb[-mbhor].blkMvX[LOWER_RIGHT_BLK];
                y[1] = mb[-mbhor].blkMvY[LOWER_RIGHT_BLK];
            } else {
                x[1] = mb[-mbhor].mv_x;
                y[1] = mb[-mbhor].mv_y;
            }
            if (mb[0].x == mbhor - 1) { // Right border
                x[2] = 0, y[2] = 0;
            } else if (mb[1-mbhor].mtype == MTYPE263_INTER4V) {
                x[2] = mb[1-mbhor].blkMvX[LOWER_LEFT_BLK];
                y[2] = mb[1-mbhor].blkMvY[LOWER_LEFT_BLK];
            } else {
                x[2] = mb[1-mbhor].mv_x;
                y[2] = mb[1-mbhor].mv_y;
            }
            *mvX = median3( x );
            *mvY = median3( y );
        }
        break;
        
    case LOWER_LEFT_BLK:
        if (mb[0].x == 0) {     // Left border
            x[0] = 0, y[0] = 0;
        } else if (mb[-1].mtype == MTYPE263_INTER4V) {
            x[0] = mb[-1].blkMvX[LOWER_RIGHT_BLK];
            y[0] = mb[-1].blkMvY[LOWER_RIGHT_BLK];
        } else {
            x[0] = mb[-1].mv_x;
            y[0] = mb[-1].mv_y;
        }
        x[1] = mb[0].blkMvX[UPPER_LEFT_BLK];
        y[1] = mb[0].blkMvY[UPPER_LEFT_BLK];
        x[2] = mb[0].blkMvX[UPPER_RIGHT_BLK];
        y[2] = mb[0].blkMvY[UPPER_RIGHT_BLK];
        *mvX = median3( x );
        *mvY = median3( y );
        break;
        
    case LOWER_RIGHT_BLK:
        x[0] = mb[0].blkMvX[LOWER_LEFT_BLK];
        y[0] = mb[0].blkMvY[LOWER_LEFT_BLK];
        x[1] = mb[0].blkMvX[UPPER_LEFT_BLK];
        y[1] = mb[0].blkMvY[UPPER_LEFT_BLK];
        x[2] = mb[0].blkMvX[UPPER_RIGHT_BLK];
        y[2] = mb[0].blkMvY[UPPER_RIGHT_BLK];
        *mvX = median3( x );
        *mvY = median3( y );
        break;
        
    case WHOLE_MACROBLOCK:
    case UPPER_LEFT_BLK:
    default:
        if (mb[0].x == 0) {     // Left border
            x[0] = 0, y[0] = 0;
        } else if (mb[-1].mtype == MTYPE263_INTER4V) {
            x[0] = mb[-1].blkMvX[UPPER_RIGHT_BLK];
            y[0] = mb[-1].blkMvY[UPPER_RIGHT_BLK];
        } else {
            x[0] = mb[-1].mv_x;
            y[0] = mb[-1].mv_y;
        }
        if (horPredOnly == YES) {   // Top border or sending GOB header to resync
            *mvX = x[0], *mvY = y[0];
        } else {
            if (mb[-mbhor].mtype == MTYPE263_INTER4V) {
                x[1] = mb[-mbhor].blkMvX[LOWER_LEFT_BLK];
                y[1] = mb[-mbhor].blkMvY[LOWER_LEFT_BLK];
            } else {
                x[1] = mb[-mbhor].mv_x;
                y[1] = mb[-mbhor].mv_y;
            }
            if (mb[0].x == mbhor - 1) { // Right border
                x[2] = 0, y[2] = 0;
            } else if (mb[1-mbhor].mtype == MTYPE263_INTER4V) {
                x[2] = mb[1-mbhor].blkMvX[LOWER_LEFT_BLK];
                y[2] = mb[1-mbhor].blkMvY[LOWER_LEFT_BLK];
            } else {
                x[2] = mb[1-mbhor].mv_x;
                y[2] = mb[1-mbhor].mv_y;
            }
            *mvX = median3( x );
            *mvY = median3( y );
        }
        break;
    }
    return;
}


// doOverlapMC - Perform overlapped motion compensation on 8x8 block
static void doOverlapMC( int subBlk,    // Indicates subblock to process (UL, UR, LL, LR)
                    MACROBLOCK_DESCR *mb,   // Used to determine (x,y) coordinates for block
                    PICTURE *prevPic,   // Previous picture; used to create overlapping MC
                    PICTURE *pic, // Contains non-overlapped MC on entry; returns overlapped MC
                    int borderMv[4][2], // Motion vectors for adjacent blocks (L,Top,R,Bottom)
                    int left,           // If YES, overlap using LEFT mv
                    int top,            // If YES, overlap using TOP mv
                    int right,          // If YES, overlap using RIGHT mv
                    int bottom          // If YES, overlap using BOTTOM mv
                    )
{
#define LEFT_ADDR           (0)
#define RIGHT_ADDR          (LEFT_ADDR + 4)
#define TOP_OFFSET          (8)
#define TOP_ADDR            (LEFT_ADDR + TOP_OFFSET)
#define MAX_HDIM            (352)
    static PIXEL    p[8 * MAX_HDIM];    // MC predictions using neighboring vectors
        // Only using first 16 columns; line offset chosen to be same as for pictures
    int     row, col, picOffset, hdim, nhor, nvert, *mv;
    PIXEL   * source, * dest;

	void (*pMC) ( int hSize, int vSize, PIXEL *in, PIXEL *out, int hdim, int mvX, int mvY);
#if defined(COMPILE_MMX)
	if(cpuid_is_mmx_motion_on()) {
		//do mmx if compiler switch AND initialized AND detected
		pMC = mcMMX;
	} else 
#endif
	{
		pMC = mc;
	}
    // Compute luma pointers
    col = 16 * mb->x;
    row = 16 * mb->y;
    if (subBlk == UPPER_RIGHT_BLK  ||  subBlk == LOWER_RIGHT_BLK)  col += 8;
    if (subBlk == LOWER_LEFT_BLK  ||  subBlk == LOWER_RIGHT_BLK)  row += 8;
    hdim = pic->y.hoffset;
    nhor = pic->y.nhor, nvert = pic->y.nvert;
    picOffset = col + row * hdim;
    dest = pic->y.ptr + picOffset;          // Point to output luma (non-overlapped MC)
    source = prevPic->y.ptr + picOffset;    // Point to input luma (without motion comp)
    if (hdim > MAX_HDIM) {
        H261ErrMsg("doOverlapMC - Increase size of internal array");
        exit(0);
    }
    // Create motion compensated blocks using neighboring motion vectors
    if (left == YES) {  // Produce left 4 columns
        mv = borderMv[LEFT];
        if (PointingOutside( col, col+3, row, row+7, *mv, *(mv+1), nhor, nvert)  ==  NO) {
            pMC( 4,8, source, &p[LEFT_ADDR], hdim, *mv, *(mv+1) );
        } else {
            limitMC( 4,8, source, &p[LEFT_ADDR], hdim, *mv, *(mv+1),
                    -col, nhor - 1 - col, -row, nvert - 1 - row );
        }
    }
    if (right == YES) { // Produce right 4 columns
        mv = borderMv[RIGHT];
        if (PointingOutside( col+4, col+7, row, row+7, *mv, *(mv+1), nhor, nvert)  ==  NO) {
            pMC( 4,8, source + 4, &p[RIGHT_ADDR], hdim, *mv, *(mv+1) );
        } else {
            limitMC( 4,8, source + 4, &p[RIGHT_ADDR], hdim, *mv, *(mv+1),
                    -col - 4, nhor - 1 - col - 4, -row, nvert - 1 - row );
        }
    }
    if (top == YES) {   // Produce top 4 rows
        mv = borderMv[TOP];
        if (PointingOutside( col, col+7, row, row+3, *mv, *(mv+1), nhor, nvert)  ==  NO) {
            pMC( 8,4, source, &p[TOP_ADDR], hdim, *mv, *(mv+1) );
        } else {
            limitMC( 8,4, source, &p[TOP_ADDR], hdim, *mv, *(mv+1),
                    -col, nhor - 1 - col, -row, nvert - 1 - row );
        }
    }
    if (bottom == YES) {    // Produce bottom 4 rows
        mv = borderMv[BOTTOM];
        if (PointingOutside( col, col+7, row+4, row+7, *mv, *(mv+1), nhor, nvert)  ==  NO) {
            pMC( 8,4, source + 4*hdim, &p[TOP_ADDR + 4*hdim], hdim, *mv, *(mv+1) );
        } else {
            limitMC( 8,4, source + 4*hdim, &p[TOP_ADDR + 4*hdim], hdim, *mv, *(mv+1),
                    -col, nhor - 1 - col, -row - 4, nvert - 1 - row - 4 );
        }
    }
    // Produce weighted ("overlapped") MC prediction
    weigh8x4( dest, &p[LEFT_ADDR], hdim, top, left, right );    // Top 4 rows
    weigh8x4( dest + 7*hdim, &p[LEFT_ADDR + 7*hdim], -hdim, bottom, left, right );  // Bottom 4 rows
    return;
}


// weigh8x4
static void weigh8x4( PIXEL *dest, PIXEL const *p, int hdim, int vert, int left, int right )
{
    if (vert == YES) {  // Use top/bottom MV
        if (left == YES) {
            weighLeft_Vert( dest, &p[LEFT_ADDR], hdim );
        } else {
            weighNoLeft_Vert( dest, &p[LEFT_ADDR], hdim );
        }
        if (right == YES) {
            weighRight_Vert( dest + 4, &p[RIGHT_ADDR], hdim );
        } else {
            weighNoRight_Vert( dest + 4, &p[RIGHT_ADDR], hdim );
        }
    } else {
        if (left == YES) {
            weighLeft( dest, &p[LEFT_ADDR], hdim );
        }
        if (right == YES) {
            weighRight( dest + 4, &p[RIGHT_ADDR], hdim );
        }
    }
    return;
}


// weighLeft_Vert - Overlap three MC predictions for upper left 4x4 pixels
static void weighLeft_Vert( PIXEL *dest,    // Non-overlapped MC; overlapped by this routine
                            PIXEL const *p, // p: MC using left motion vector
                                            // &p[TOP_OFFSET]: MC using top/bottom MV
                            int hdim )      // Line offset
{
    // First row
    dest[0] = (2 * dest[0] +     p[0] +     p[TOP_OFFSET + 0] + 2) >> 2;
    dest[1] = (5 * dest[1] +     p[1] + 2 * p[TOP_OFFSET + 1] + 4) >> 3;
    dest[2] = (5 * dest[2] +     p[2] + 2 * p[TOP_OFFSET + 2] + 4) >> 3;
    dest[3] = (5 * dest[3] +     p[3] + 2 * p[TOP_OFFSET + 3] + 4) >> 3;
    // Second row
    dest += hdim, p += hdim;
    dest[0] = (5 * dest[0] + 2 * p[0] +     p[TOP_OFFSET + 0] + 4) >> 3;
    dest[1] = (5 * dest[1] + 2 * p[1] +     p[TOP_OFFSET + 1] + 4) >> 3;
    dest[2] = (5 * dest[2] +     p[2] + 2 * p[TOP_OFFSET + 2] + 4) >> 3;
    dest[3] = (5 * dest[3] +     p[3] + 2 * p[TOP_OFFSET + 3] + 4) >> 3;
    // Third row
    dest += hdim, p += hdim;
    dest[0] = (5 * dest[0] + 2 * p[0] +     p[TOP_OFFSET + 0] + 4) >> 3;
    dest[1] = (5 * dest[1] + 2 * p[1] +     p[TOP_OFFSET + 1] + 4) >> 3;
    dest[2] = (6 * dest[2] +     p[2] +     p[TOP_OFFSET + 2] + 4) >> 3;
    dest[3] = (6 * dest[3] +     p[3] +     p[TOP_OFFSET + 3] + 4) >> 3;
    // Fourth row (identical to third row)
    dest += hdim, p += hdim;
    dest[0] = (5 * dest[0] + 2 * p[0] +     p[TOP_OFFSET + 0] + 4) >> 3;
    dest[1] = (5 * dest[1] + 2 * p[1] +     p[TOP_OFFSET + 1] + 4) >> 3;
    dest[2] = (6 * dest[2] +     p[2] +     p[TOP_OFFSET + 2] + 4) >> 3;
    dest[3] = (6 * dest[3] +     p[3] +     p[TOP_OFFSET + 3] + 4) >> 3;
    return;
}


// weighNoLeft_Vert - Overlap with MV from vert. neighbor for upper left 4x4 pixels
static void weighNoLeft_Vert( PIXEL *dest, PIXEL const *p, int hdim )
{
    // First row
    dest[0] = (3 * dest[0] +  p[TOP_OFFSET + 0] + 2) >> 2;
    dest[1] = (3 * dest[1] +  p[TOP_OFFSET + 1] + 2) >> 2;
    dest[2] = (3 * dest[2] +  p[TOP_OFFSET + 2] + 2) >> 2;
    dest[3] = (3 * dest[3] +  p[TOP_OFFSET + 3] + 2) >> 2;
    // Second row
    dest += hdim, p += hdim;
    dest[0] = (7 * dest[0] +  p[TOP_OFFSET + 0] + 4) >> 3;
    dest[1] = (7 * dest[1] +  p[TOP_OFFSET + 1] + 4) >> 3;
    dest[2] = (3 * dest[2] +  p[TOP_OFFSET + 2] + 2) >> 2;
    dest[3] = (3 * dest[3] +  p[TOP_OFFSET + 3] + 2) >> 2;
    // Third row
    dest += hdim, p += hdim;
    dest[0] = (7 * dest[0] +  p[TOP_OFFSET + 0] + 4) >> 3;
    dest[1] = (7 * dest[1] +  p[TOP_OFFSET + 1] + 4) >> 3;
    dest[2] = (7 * dest[2] +  p[TOP_OFFSET + 2] + 4) >> 3;
    dest[3] = (7 * dest[3] +  p[TOP_OFFSET + 3] + 4) >> 3;
    // Fourth row (identical to third row)
    dest += hdim, p += hdim;
    dest[0] = (7 * dest[0] +  p[TOP_OFFSET + 0] + 4) >> 3;
    dest[1] = (7 * dest[1] +  p[TOP_OFFSET + 1] + 4) >> 3;
    dest[2] = (7 * dest[2] +  p[TOP_OFFSET + 2] + 4) >> 3;
    dest[3] = (7 * dest[3] +  p[TOP_OFFSET + 3] + 4) >> 3;
    return;
}


// weighLeft - Overlap with MV from left neighbor for upper left 4x4 pixels
static void weighLeft( PIXEL *dest, PIXEL const *p, int hdim )
{
    // First row
    dest[0] = (3 * dest[0] +  p[0] + 2) >> 2;
    dest[1] = (7 * dest[1] +  p[1] + 4) >> 3;
    dest[2] = (7 * dest[2] +  p[2] + 4) >> 3;
    dest[3] = (7 * dest[3] +  p[3] + 4) >> 3;
    // Second row
    dest += hdim, p += hdim;
    dest[0] = (3 * dest[0] +  p[0] + 2) >> 2;
    dest[1] = (3 * dest[1] +  p[1] + 2) >> 2;
    dest[2] = (7 * dest[2] +  p[2] + 4) >> 3;
    dest[3] = (7 * dest[3] +  p[3] + 4) >> 3;
    // Third row (identical to second row)
    dest += hdim, p += hdim;
    dest[0] = (3 * dest[0] +  p[0] + 2) >> 2;
    dest[1] = (3 * dest[1] +  p[1] + 2) >> 2;
    dest[2] = (7 * dest[2] +  p[2] + 4) >> 3;
    dest[3] = (7 * dest[3] +  p[3] + 4) >> 3;
    // Fourth row (identical to third row)
    dest += hdim, p += hdim;
    dest[0] = (3 * dest[0] +  p[0] + 2) >> 2;
    dest[1] = (3 * dest[1] +  p[1] + 2) >> 2;
    dest[2] = (7 * dest[2] +  p[2] + 4) >> 3;
    dest[3] = (7 * dest[3] +  p[3] + 4) >> 3;
    return;
}


// weighRight_Vert - Derived from weighLeft_Vert by reversing indices
//  This means that the assembler routine can be easily derived from the other
static void weighRight_Vert( PIXEL *dest,   // Non-overlapped MC; overlapped by this routine
                            PIXEL const *p, // p: MC using right motion vector
                                            // &p[TOP_OFFSET]: MC using top/bottom MV
                            int hdim )      // Line offset
{
    // First row
    dest[3] = (2 * dest[3] +     p[3] +     p[TOP_OFFSET + 3] + 2) >> 2;
    dest[2] = (5 * dest[2] +     p[2] + 2 * p[TOP_OFFSET + 2] + 4) >> 3;
    dest[1] = (5 * dest[1] +     p[1] + 2 * p[TOP_OFFSET + 1] + 4) >> 3;
    dest[0] = (5 * dest[0] +     p[0] + 2 * p[TOP_OFFSET + 0] + 4) >> 3;
    // Second row
    dest += hdim, p += hdim;
    dest[3] = (5 * dest[3] + 2 * p[3] +     p[TOP_OFFSET + 3] + 4) >> 3;
    dest[2] = (5 * dest[2] + 2 * p[2] +     p[TOP_OFFSET + 2] + 4) >> 3;
    dest[1] = (5 * dest[1] +     p[1] + 2 * p[TOP_OFFSET + 1] + 4) >> 3;
    dest[0] = (5 * dest[0] +     p[0] + 2 * p[TOP_OFFSET + 0] + 4) >> 3;
    // Third row
    dest += hdim, p += hdim;
    dest[3] = (5 * dest[3] + 2 * p[3] +     p[TOP_OFFSET + 3] + 4) >> 3;
    dest[2] = (5 * dest[2] + 2 * p[2] +     p[TOP_OFFSET + 2] + 4) >> 3;
    dest[1] = (6 * dest[1] +     p[1] +     p[TOP_OFFSET + 1] + 4) >> 3;
    dest[0] = (6 * dest[0] +     p[0] +     p[TOP_OFFSET + 0] + 4) >> 3;
    // Fourth row (identical to third row)
    dest += hdim, p += hdim;
    dest[3] = (5 * dest[3] + 2 * p[3] +     p[TOP_OFFSET + 3] + 4) >> 3;
    dest[2] = (5 * dest[2] + 2 * p[2] +     p[TOP_OFFSET + 2] + 4) >> 3;
    dest[1] = (6 * dest[1] +     p[1] +     p[TOP_OFFSET + 1] + 4) >> 3;
    dest[0] = (6 * dest[0] +     p[0] +     p[TOP_OFFSET + 0] + 4) >> 3;
    return;
}


// weighNoRight_Vert - Derived from weighNoLeft_Vert by reversing indices
//  This means that the assembler routine can be easily derived from the other
static void weighNoRight_Vert( PIXEL *dest, PIXEL const *p, int hdim )
{
    // First row
    dest[3] = (3 * dest[3] +  p[TOP_OFFSET + 3] + 2) >> 2;
    dest[2] = (3 * dest[2] +  p[TOP_OFFSET + 2] + 2) >> 2;
    dest[1] = (3 * dest[1] +  p[TOP_OFFSET + 1] + 2) >> 2;
    dest[0] = (3 * dest[0] +  p[TOP_OFFSET + 0] + 2) >> 2;
    // Second row
    dest += hdim, p += hdim;
    dest[3] = (7 * dest[3] +  p[TOP_OFFSET + 3] + 4) >> 3;
    dest[2] = (7 * dest[2] +  p[TOP_OFFSET + 2] + 4) >> 3;
    dest[1] = (3 * dest[1] +  p[TOP_OFFSET + 1] + 2) >> 2;
    dest[0] = (3 * dest[0] +  p[TOP_OFFSET + 0] + 2) >> 2;
    // Third row
    dest += hdim, p += hdim;
    dest[3] = (7 * dest[3] +  p[TOP_OFFSET + 3] + 4) >> 3;
    dest[2] = (7 * dest[2] +  p[TOP_OFFSET + 2] + 4) >> 3;
    dest[1] = (7 * dest[1] +  p[TOP_OFFSET + 1] + 4) >> 3;
    dest[0] = (7 * dest[0] +  p[TOP_OFFSET + 0] + 4) >> 3;
    // Fourth row (identical to third row)
    dest += hdim, p += hdim;
    dest[3] = (7 * dest[3] +  p[TOP_OFFSET + 3] + 4) >> 3;
    dest[2] = (7 * dest[2] +  p[TOP_OFFSET + 2] + 4) >> 3;
    dest[1] = (7 * dest[1] +  p[TOP_OFFSET + 1] + 4) >> 3;
    dest[0] = (7 * dest[0] +  p[TOP_OFFSET + 0] + 4) >> 3;
    return;
}


// weighRight - Derived from weighLeft by reversing indices
//  This means that the assembler routine can be easily derived from the other
static void weighRight( PIXEL *dest, PIXEL const *p, int hdim )
{
    // First row
    dest[3] = (3 * dest[3] +  p[3] + 2) >> 2;
    dest[2] = (7 * dest[2] +  p[2] + 4) >> 3;
    dest[1] = (7 * dest[1] +  p[1] + 4) >> 3;
    dest[0] = (7 * dest[0] +  p[0] + 4) >> 3;
    // Second row
    dest += hdim, p += hdim;
    dest[3] = (3 * dest[3] +  p[3] + 2) >> 2;
    dest[2] = (3 * dest[2] +  p[2] + 2) >> 2;
    dest[1] = (7 * dest[1] +  p[1] + 4) >> 3;
    dest[0] = (7 * dest[0] +  p[0] + 4) >> 3;
    // Third row (identical to second row)
    dest += hdim, p += hdim;
    dest[3] = (3 * dest[3] +  p[3] + 2) >> 2;
    dest[2] = (3 * dest[2] +  p[2] + 2) >> 2;
    dest[1] = (7 * dest[1] +  p[1] + 4) >> 3;
    dest[0] = (7 * dest[0] +  p[0] + 4) >> 3;
    // Fourth row (identical to third row)
    dest += hdim, p += hdim;
    dest[3] = (3 * dest[3] +  p[3] + 2) >> 2;
    dest[2] = (3 * dest[2] +  p[2] + 2) >> 2;
    dest[1] = (7 * dest[1] +  p[1] + 4) >> 3;
    dest[0] = (7 * dest[0] +  p[0] + 4) >> 3;
    return;
}


static void mc16pelsNoInterpol( PIXEL *in, PIXEL *out, int hdim, int vSize )
{
#ifndef FOR_UNIX
    union { // Copy words to speed up routine
        PIXEL   *pix;
        U32     *word;
    } pIn, pOut;

    pIn.pix = in;
    pOut.pix = out;
    while (vSize > 0) {
        *(pOut.word + 0) = *(pIn.word + 0);
        *(pOut.word + 1) = *(pIn.word + 1);
        *(pOut.word + 2) = *(pIn.word + 2);
        *(pOut.word + 3) = *(pIn.word + 3);
        pIn.pix += hdim;
        pOut.pix += hdim;
        --vSize;
    }
    return;
#else
    union { // PIXELs are not always word-aligned! Gotta copy bytes in UNIX
        PIXEL   *pix;
        BYTE    *byte;
    } pIn, pOut;

    BYTE    *pInbyte;
    BYTE    *pOutbyte;
    int     i;

    pIn.pix = in;
    pOut.pix = out;
    while (vSize > 0) {
        pInbyte = pIn.byte;
        pOutbyte = pOut.byte;
        for (i=0; i<16; i++)
	  *(pOutbyte++) = *(pInbyte++);

        pIn.pix += hdim;
        pOut.pix += hdim;
        --vSize;
    }
    return;
#endif
}


static void mc8pelsNoInterpol( PIXEL *in, PIXEL *out, int hdim, int vSize )
{
#ifndef FOR_UNIX
    union { // Copy words to speed up routine
        PIXEL   *pix;
        U32     *word;
    } pIn, pOut;

    pIn.pix = in;
    pOut.pix = out;
    while (vSize > 0) {
        *(pOut.word + 0) = *(pIn.word + 0);
        *(pOut.word + 1) = *(pIn.word + 1);
        pIn.pix += hdim;
        pOut.pix += hdim;
        --vSize;
    }
    return;
#else
    union { // PIXELs are not always word-aligned! Gotta copy bytes in UNIX
        PIXEL   *pix;
        BYTE    *byte;
    } pIn, pOut;

    pIn.pix = in;
    pOut.pix = out;
    while (vSize > 0) {
        *(pOut.byte + 0) = *(pIn.byte + 0);
        *(pOut.byte + 1) = *(pIn.byte + 1);
        *(pOut.byte + 2) = *(pIn.byte + 2);
        *(pOut.byte + 3) = *(pIn.byte + 3);
        *(pOut.byte + 4) = *(pIn.byte + 4);
        *(pOut.byte + 5) = *(pIn.byte + 5);
        *(pOut.byte + 6) = *(pIn.byte + 6);
        *(pOut.byte + 7) = *(pIn.byte + 7);
        pIn.pix += hdim;
        pOut.pix += hdim;
        --vSize;
    }
    return;
#endif
}


static void mc4pelsNoInterpol( PIXEL *in, PIXEL *out, int hdim, int vSize )
{
#ifndef FOR_UNIX
    union { // Copy words to speed up routine
        PIXEL   *pix;
        U32     *word;
    } pIn, pOut;

    pIn.pix = in;
    pOut.pix = out;
    while (vSize > 0) {
        *(pOut.word + 0) = *(pIn.word + 0);
        pIn.pix += hdim;
        pOut.pix += hdim;
        --vSize;
    }
    return;
#else
    union { // PIXELs are not always word-aligned! Gotta copy bytes in UNIX
        PIXEL   *pix;
        BYTE    *byte;
    } pIn, pOut;

    pIn.pix = in;
    pOut.pix = out;
    while (vSize > 0) {
        *(pOut.byte + 0) = *(pIn.byte + 0);
        *(pOut.byte + 1) = *(pIn.byte + 1);
        *(pOut.byte + 2) = *(pIn.byte + 2);
        *(pOut.byte + 3) = *(pIn.byte + 3);
        pIn.pix += hdim;
        pOut.pix += hdim;
        --vSize;
    }
    return;
#endif
}


static void mc16pelsHorInterpol( PIXEL const *in, PIXEL *out, int hdim, int vSize )
{
    while (vSize > 0) {
        out[0] = (in[0] + in[1] + 1) >> 1;
        out[1] = (in[1] + in[2] + 1) >> 1;
        out[2] = (in[2] + in[3] + 1) >> 1;
        out[3] = (in[3] + in[4] + 1) >> 1;
        out[4] = (in[4] + in[5] + 1) >> 1;
        out[5] = (in[5] + in[6] + 1) >> 1;
        out[6] = (in[6] + in[7] + 1) >> 1;
        out[7] = (in[7] + in[8] + 1) >> 1;
        out[8] = (in[8] + in[9] + 1) >> 1;
        out[9] = (in[9] + in[10] + 1) >> 1;
        out[10] = (in[10] + in[11] + 1) >> 1;
        out[11] = (in[11] + in[12] + 1) >> 1;
        out[12] = (in[12] + in[13] + 1) >> 1;
        out[13] = (in[13] + in[14] + 1) >> 1;
        out[14] = (in[14] + in[15] + 1) >> 1;
        out[15] = (in[15] + in[16] + 1) >> 1;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
}


static void mc8pelsHorInterpol( PIXEL const *in, PIXEL *out, int hdim, int vSize )
{
    while (vSize > 0) {
        out[0] = (in[0] + in[1] + 1) >> 1;
        out[1] = (in[1] + in[2] + 1) >> 1;
        out[2] = (in[2] + in[3] + 1) >> 1;
        out[3] = (in[3] + in[4] + 1) >> 1;
        out[4] = (in[4] + in[5] + 1) >> 1;
        out[5] = (in[5] + in[6] + 1) >> 1;
        out[6] = (in[6] + in[7] + 1) >> 1;
        out[7] = (in[7] + in[8] + 1) >> 1;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
}


static void mc4pelsHorInterpol( PIXEL const *in, PIXEL *out, int hdim, int vSize )
{
    while (vSize > 0) {
        out[0] = (in[0] + in[1] + 1) >> 1;
        out[1] = (in[1] + in[2] + 1) >> 1;
        out[2] = (in[2] + in[3] + 1) >> 1;
        out[3] = (in[3] + in[4] + 1) >> 1;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
}


static void mc16pelsVertInterpol( PIXEL const *in, PIXEL *out, int hdim, int vSize )
{
    while (vSize > 0) {
        out[0] = (in[0] + in[hdim+0] + 1) >> 1;
        out[1] = (in[1] + in[hdim+1] + 1) >> 1;
        out[2] = (in[2] + in[hdim+2] + 1) >> 1;
        out[3] = (in[3] + in[hdim+3] + 1) >> 1;
        out[4] = (in[4] + in[hdim+4] + 1) >> 1;
        out[5] = (in[5] + in[hdim+5] + 1) >> 1;
        out[6] = (in[6] + in[hdim+6] + 1) >> 1;
        out[7] = (in[7] + in[hdim+7] + 1) >> 1;
        out[8] = (in[8] + in[hdim+8] + 1) >> 1;
        out[9] = (in[9] + in[hdim+9] + 1) >> 1;
        out[10] = (in[10] + in[hdim+10] + 1) >> 1;
        out[11] = (in[11] + in[hdim+11] + 1) >> 1;
        out[12] = (in[12] + in[hdim+12] + 1) >> 1;
        out[13] = (in[13] + in[hdim+13] + 1) >> 1;
        out[14] = (in[14] + in[hdim+14] + 1) >> 1;
        out[15] = (in[15] + in[hdim+15] + 1) >> 1;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
}


static void mc8pelsVertInterpol( PIXEL const *in, PIXEL *out, int hdim, int vSize )
{
    while (vSize > 0) {
        out[0] = (in[0] + in[hdim+0] + 1) >> 1;
        out[1] = (in[1] + in[hdim+1] + 1) >> 1;
        out[2] = (in[2] + in[hdim+2] + 1) >> 1;
        out[3] = (in[3] + in[hdim+3] + 1) >> 1;
        out[4] = (in[4] + in[hdim+4] + 1) >> 1;
        out[5] = (in[5] + in[hdim+5] + 1) >> 1;
        out[6] = (in[6] + in[hdim+6] + 1) >> 1;
        out[7] = (in[7] + in[hdim+7] + 1) >> 1;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
}


static void mc4pelsVertInterpol( PIXEL const *in, PIXEL *out, int hdim, int vSize )
{
    while (vSize > 0) {
        out[0] = (in[0] + in[hdim+0] + 1) >> 1;
        out[1] = (in[1] + in[hdim+1] + 1) >> 1;
        out[2] = (in[2] + in[hdim+2] + 1) >> 1;
        out[3] = (in[3] + in[hdim+3] + 1) >> 1;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
}


static void mc16pels2DInterpol( PIXEL const *in, PIXEL *out, int hdim, int vSize )
{
    while (vSize > 0) {
        out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2;
        out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2;
        out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2;
        out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2;
        out[4] = (in[4] + in[5] + in[hdim+4] + in[hdim+5] + 2) >> 2;
        out[5] = (in[5] + in[6] + in[hdim+5] + in[hdim+6] + 2) >> 2;
        out[6] = (in[6] + in[7] + in[hdim+6] + in[hdim+7] + 2) >> 2;
        out[7] = (in[7] + in[8] + in[hdim+7] + in[hdim+8] + 2) >> 2;
        out[8] = (in[8] + in[9] + in[hdim+8] + in[hdim+9] + 2) >> 2;
        out[9] = (in[9] + in[10] + in[hdim+9] + in[hdim+10] + 2) >> 2;
        out[10] = (in[10] + in[11] + in[hdim+10] + in[hdim+11] + 2) >> 2;
        out[11] = (in[11] + in[12] + in[hdim+11] + in[hdim+12] + 2) >> 2;
        out[12] = (in[12] + in[13] + in[hdim+12] + in[hdim+13] + 2) >> 2;
        out[13] = (in[13] + in[14] + in[hdim+13] + in[hdim+14] + 2) >> 2;
        out[14] = (in[14] + in[15] + in[hdim+14] + in[hdim+15] + 2) >> 2;
        out[15] = (in[15] + in[16] + in[hdim+15] + in[hdim+16] + 2) >> 2;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
}


static void mc8pels2DInterpol( PIXEL const *in, PIXEL *out, int hdim, int vSize )
{
    while (vSize > 0) {
        out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2;
        out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2;
        out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2;
        out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2;
        out[4] = (in[4] + in[5] + in[hdim+4] + in[hdim+5] + 2) >> 2;
        out[5] = (in[5] + in[6] + in[hdim+5] + in[hdim+6] + 2) >> 2;
        out[6] = (in[6] + in[7] + in[hdim+6] + in[hdim+7] + 2) >> 2;
        out[7] = (in[7] + in[8] + in[hdim+7] + in[hdim+8] + 2) >> 2;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
}


static void mc4pels2DInterpol( PIXEL const *in, PIXEL *out, int hdim, int vSize )
{
    while (vSize > 0) {
        out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2;
        out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2;
        out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2;
        out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
}


// Temporary arrays for backwards prediction (using next P-picture)
static  PIXEL   yBack[16*16], cbBack[8*8], crBack[8*8];

// saveBackwardPred - save macroblock to temporary area
static void saveBackwardPred( MACROBLOCK_DESCR * mb,    // Describes block to be saved
                              PICTURE * pic // Picture with pixels to be saved
                              )
{
    int     row, col, picOffset, hdim;

    // Compute luma pointer
    col = 16 * mb->x;
    row = 16 * mb->y;
    hdim = pic->y.hoffset;
    picOffset = col + row * hdim;
    // Copy luma block
    copyBlock( (S32 *)(pic->y.ptr + picOffset), 16>>2, 16, hdim>>2, (S32 *)yBack, 16>>2 );
    // Compute chroma pointer
    col = 8 * mb->x;
    row = 8 * mb->y;
    hdim = pic->cb.hoffset;
    picOffset = col + row * hdim;
    // Copy chroma blocks
    copyBlock( (S32 *)(pic->cb.ptr + picOffset), 8>>2, 8, hdim>>2, (S32 *)cbBack, 8>>2 );
    copyBlock( (S32 *)(pic->cr.ptr + picOffset), 8>>2, 8, hdim>>2, (S32 *)crBack, 8>>2 );
}


// averageForBack - Compute B-frame prediction from forward and backward predictions
static void averageForBack( MACROBLOCK_DESCR * mb,  // Describes block to be averaged
                            PICTURE * pic   // Input: contains forward prediction
                                            // Output: contains B-frame prediction
                            )
{
    int row, col, hdim, blk, firstCol, lastCol, firstRow, lastRow, cX, cY;
	void (*pAverageBlockFunction)( PIXEL forPred[], int hSize, int vSize, int forOffset,
                          PIXEL backPred[], int backOffset );
#if defined(COMPILE_MMX)
	if(cpuid_is_mmx_motion_on()) {
		//do mmx if compiler switch AND initialized AND detected
		pAverageBlockFunction = averageBlockMMX;
	} else 
#endif
	{
		pAverageBlockFunction = averageBlock;
	}

    // Process luma
    col = 16 * mb->x;
    row = 16 * mb->y;
    hdim = pic->y.hoffset;
    if (mb->mtype == MTYPE263_INTER4V) {
        for (blk = 0; blk < 4; ++blk) { // Treat each 8x8 block separately
            // Only use pixels in current MB for backwards prediction
            if (blk & 0x1) {
                firstCol = 8, lastCol = 15; // Block 1 and 3
            } else {
                firstCol = 0, lastCol = 7;  // Block 0 and 2
            }
            if (blk & 0x2) {
                firstRow = 8, lastRow = 15; // Block 2 and 3
            } else {
                firstRow = 0, lastRow = 7;  // Block 0 and 1
            }
            firstCol = max( firstCol,     (-mb->blkMvBx[blk] + 1) >> 1 );
            lastCol  = min( lastCol, 15 - ((mb->blkMvBx[blk] + 1) >> 1));
            firstRow = max( firstRow,     (-mb->blkMvBy[blk] + 1) >> 1 );
            lastRow  = min( lastRow, 15 - ((mb->blkMvBy[blk] + 1) >> 1));
            pAverageBlockFunction( pic->y.ptr + col + firstCol + (row + firstRow) * hdim,
                          lastCol - firstCol + 1,
                          lastRow - firstRow + 1, hdim,
                          yBack + firstCol + firstRow * 16, 16 );
        }
        // Determine motion vector for chroma block
        cX = chromaMvComp4V( mb->blkMvBx );
        cY = chromaMvComp4V( mb->blkMvBy );
    } else {
        // Only use pixels in current MB for backwards prediction
        firstCol = max( 0,       (-mb->blkMvBx[0] + 1) >> 1 );
        lastCol  = min( 15, 15 - ((mb->blkMvBx[0] + 1) >> 1));
        firstRow = max( 0,       (-mb->blkMvBy[0] + 1) >> 1 );
        lastRow  = min( 15, 15 - ((mb->blkMvBy[0] + 1) >> 1));
//#define MMX_AVERAGE_TEST01
#ifdef MMX_AVERAGE_TEST01
	{
		int	crow, ccol;
		PIXEL *forPred = pic->y.ptr + col + firstCol + (row + firstRow) * hdim;
		int hSize = lastCol - firstCol + 1;
		int vSize = lastRow - firstRow + 1;
		int forOffset = hdim;
		PIXEL *backPred = yBack + firstCol + firstRow * 16;
		int backOffset = 16;
		PIXEL forPred_org[16*352];
		PIXEL backPred_org[16*16];
		PIXEL forPred_temp[16*352];
		PIXEL backPred_temp[16*16];
		int countForOffset; 
		int countBackOffset; 
		int	bError = 0;

		countForOffset = 0; 
		countBackOffset = 0; 
		for (crow = 0; crow < vSize; ++crow) {
			for (ccol = 0; ccol < hSize; ++ccol) {//from smaller multiple of 4
				forPred_org[ccol + countForOffset] = forPred[ccol + countForOffset];
				backPred_org[ccol + countBackOffset] = backPred[ccol + countBackOffset];
				forPred_temp[ccol + countForOffset] = forPred[ccol + countForOffset];
				backPred_temp[ccol + countBackOffset] = backPred[ccol + countBackOffset];
			}
			countForOffset  += forOffset;
			countBackOffset += backOffset;
		}

		averageBlock( forPred_org,
					  lastCol - firstCol + 1,
					  lastRow - firstRow + 1, hdim,
					  backPred_org, 16 );
#endif
        pAverageBlockFunction( pic->y.ptr + col + firstCol + (row + firstRow) * hdim,
                      lastCol - firstCol + 1,
                      lastRow - firstRow + 1, hdim,
                      yBack + firstCol + firstRow * 16, 16 );

#ifdef MMX_AVERAGE_TEST01
    countForOffset  =0;
    countBackOffset = 0;
    for (crow = 0; crow < vSize; ++crow) {
        for (ccol = 0; ccol < hSize; ++ccol) {//from smaller multiple of 4
            if( (forPred_org[ccol + countForOffset] != forPred[ccol + countForOffset])) {//((hSize==8) || (hSize==16)) &&
				bError = 1;
			}
        }
        countForOffset  += forOffset;
        countBackOffset += backOffset;
    }
	if(bError!=0) {	
		countForOffset = 0; 
		countBackOffset = 0; 
		for (crow = 0; crow < vSize; ++crow) {
			for (ccol = 0; ccol < hSize; ++ccol) {//from smaller multiple of 4
				forPred_org[ccol + countForOffset] = forPred[ccol + countForOffset] = forPred_temp[ccol + countForOffset];
				backPred_org[ccol + countBackOffset] = backPred[ccol + countBackOffset] = backPred_temp[ccol + countBackOffset];
			}
			countForOffset  += forOffset;
			countBackOffset += backOffset;
		}

		averageBlock( forPred_org,
							  lastCol - firstCol + 1,
							  lastRow - firstRow + 1, hdim,
							  backPred_org, 16 );
		pAverageBlockFunction( pic->y.ptr + col + firstCol + (row + firstRow) * hdim,
							  lastCol - firstCol + 1,
							  lastRow - firstRow + 1, hdim,
							  yBack + firstCol + firstRow * 16, 16 );
	}
	}
#endif
    


        // Determine motion vector for chroma block
        cX = chromaMVComp( mb->blkMvBx[0] );
        cY = chromaMVComp( mb->blkMvBy[0] );
    }
    // Process chroma
    col = 8 * mb->x;
    row = 8 * mb->y;
    hdim = pic->cb.hoffset;
    // Only use pixels in current MB for backwards prediction
    firstCol = max( 0,     (-cX + 1) >> 1 );
    lastCol  = min( 7, 7 - ((cX + 1) >> 1));
    firstRow = max( 0,     (-cY + 1) >> 1 );
    lastRow  = min( 7, 7 - ((cY + 1) >> 1));

//#define MMX_AVERAGE_TEST02
#ifdef MMX_AVERAGE_TEST02
	{
	int	crow, ccol;
	PIXEL *forPred = pic->cb.ptr + col + firstCol + (row + firstRow) * hdim;
	int hSize = lastCol - firstCol + 1;
	int vSize = lastRow - firstRow + 1;
	int forOffset = hdim;
	PIXEL *backPred = cbBack + firstCol + firstRow * 8;
	int backOffset = 8;
	PIXEL forPred_org[16*352];
	PIXEL backPred_org[16*8];
	int countForOffset = 0; 
	int countBackOffset = 0; 

    for (crow = 0; crow < vSize; ++crow) {
        for (ccol = 0; ccol < hSize; ++ccol) {//from smaller multiple of 4
            forPred_org[ccol + countForOffset] = forPred[ccol + countForOffset];
			backPred_org[ccol + countBackOffset] = backPred[ccol + countBackOffset];
        }
        countForOffset  += forOffset;
        countBackOffset += backOffset;
    }

    averageBlock( forPred_org,
                  lastCol - firstCol + 1,
                  lastRow - firstRow + 1, hdim,
                  backPred_org, 8 );
#endif

    pAverageBlockFunction( pic->cb.ptr + col + firstCol + (row + firstRow) * hdim,
                  lastCol - firstCol + 1,
                  lastRow - firstRow + 1, hdim,
                  cbBack + firstCol + firstRow * 8, 8 );

#ifdef MMX_AVERAGE_TEST02
    countForOffset  =0;
    countBackOffset = 0;
    for (crow = 0; crow < vSize; ++crow) {
        for (ccol = 0; ccol < hSize; ++ccol) {//from smaller multiple of 4
            if(((hSize==8) || (hSize==16)) && (forPred_org[ccol + countForOffset] != forPred[ccol + countForOffset])) {
				averageBlock( forPred_org,
							  lastCol - firstCol + 1,
							  lastRow - firstRow + 1, hdim,
							  backPred_org, 8 );
				pAverageBlockFunction( pic->cb.ptr + col + firstCol + (row + firstRow) * hdim,
							  lastCol - firstCol + 1,
							  lastRow - firstRow + 1, hdim,
							  cbBack + firstCol + firstRow * 8, 8 );
			}
        }
        countForOffset  += forOffset;
        countBackOffset += backOffset;
    }

	}
#endif
    
	
	pAverageBlockFunction( pic->cr.ptr + col + firstCol + (row + firstRow) * hdim,
                  lastCol - firstCol + 1,
                  lastRow - firstRow + 1, hdim,
                  crBack + firstCol + firstRow * 8, 8 );
}


// copyBlock - copy 2-D array of 32-bit integers
static void copyBlock( S32 source[], int hSize, int vSize, int sourceOffset,
                       S32 dest[], int destOffset )
{
    int row, col;

    for (row = 0; row < vSize; ++row) {
        for (col = 0; col < hSize; ++col) {
            dest[col] = source[col];
        }
        source += sourceOffset;
        dest   += destOffset;
    }
}


// averageBlock - compute average of two hSize*vSize pixel arrays
void averageBlock( PIXEL forPred[], int hSize, int vSize, int forOffset,
                          PIXEL backPred[], int backOffset )
{
    int row, col;
#ifdef VVPROFILER
	S32 nVvProfNb = 3;
	if(!pVvProf[nVvProfNb]) pVvProf[nVvProfNb] = newCVvDebugTimer();//memory leak on destruction
	pVvProfCount[nVvProfNb]++;
	StartTime(pVvProf[nVvProfNb]);
#endif

    for (row = 0; row < vSize; ++row) {
        for (col = 0; col < hSize; ++col) {
            forPred[col] = (forPred[col] + backPred[col]) >> 1;
        }
        forPred  += forOffset;
        backPred += backOffset;
    }

#ifdef VVPROFILER
	StopAndAccuTime(pVvProf[nVvProfNb]);
#endif


}
