Код
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Local frw_quant_mpeg_intra entry point
//
// Quantises 2*P0 packed 16-bit coefficients in place (read through I0, written
// back through I1), two coefficients (one 32-bit word) per loop iteration.
//
// Input parameters:
//   circular buffer 2: B2 = I2 = mq, L2 = 256
//   I0 = I1 = coefs, L0 = L1 = 0
//   R0 = quant_add, replicated in both 16-bit halves (used by the vector add)
//   R1 = quant_inv (only R1.l is read)
//   P0 = 32 (loop count = number of coefficient pairs)
// Function modifies:
//   I0 = I1 = next coef cell (coefs + 4*P0 bytes)
//   I2 = advanced by 8*P0 bytes inside circular buffer 2
//        (i.e. back at mq for P0 = 32, L2 = 256)
// Function destroys:
//   A0, A1, R7:4, LC0, LT0, LB0
// Function performs the following series of operations:
//   level[ii] = abs(coef[ii])
//   vq[ii]    = (16*level[ii] + mq[ii]/2) * mq_inv[ii] (1, 2)
//   vq[ii]  >>= 18
//   vq[ii]    = (vq[ii] + quant_add) * quant_inv (3, 4)
//   lq[ii]    = vq[ii] >> 16
//   coefq[ii] = sign(coef[ii]) * lq[ii]
// Remark:
//   (1) - since coefs are upscaled by 4 during fDCT we multiply them here by 4 only
//         (shift left by 2), which gives the 16x scaling used in the formula above
//   (2) - mq[] = { mq[0]/2, mq[1]/2, 4*65536/mq[0] + 1, 4*65536/mq[1] + 1, ... }
//   (3) - quant_add = (3 * quant + 2) / 4
//   (4) - quant_inv = 32768 / quant + 1
// Notes:
//   - The loop always loads one coefficient word and one mq[]/2 word ahead of the
//     pair being processed. The last look-ahead therefore reads the 32-bit word
//     that follows the processed block (the value is not used) and wraps around
//     in circular buffer 2.
//   - Both look-ahead post-increments are undone after the loop so that the exit
//     state matches the contract above.

    .align 8;

frw_quant_mpeg_intra:

    // Prologue: fetch the first coefficient pair, its sign mask and the first mq[]/2 pair
    R4 = [I0++];                                       // R4 = [              coef[1]              coef[0] ]

    R5 = R4 >>> 15 (V)                                 // R5 = sign(R4): 0 or -1 in each 16-bit half
    || R6 = [I2++];                                    // R6 = [              mq[1]/2              mq[0]/2 ]

    LSETUP(frw_quant_mpeg_intra_l0_start, frw_quant_mpeg_intra_l0_end) LC0 = P0;

frw_quant_mpeg_intra_l0_start:

    R4 = ABS R4 (V)                                    // R4 = [             level[1]             level[0] ]
    || R7 = [I2++];                                    // R7 = [            inv(mq[1])           inv(mq[0])]

    R4 = R4 << 2 (V);                                  // R4 = [          16*level[1]          16*level[0] ]

    A1 = R6.h * R7.h, A0 = R6.l * R7.l (FU);           // AA = [  (mq[1]/2)*mq_inv[1]  (mq[0]/2)*mq_inv[0] ]

    A1 += R4.h * R7.h, A0 += R4.l * R7.l (FU);         // AA = [            vq[1]<<18            vq[0]<<18 ]

    R6.l = A0, R6.h = A1 (T);                          // R6 = [             vq[1]<<2             vq[0]<<2 ]

    R6 = R6 >> 2 (V);                                  // R6 = [                vq[1]                vq[0] ]

    R6 = R6 +|+ R0;                                    // R6 = [      vq[1]+quant_add      vq[0]+quant_add ]

    R6.h = R6.h * R1.l, R6.l = R6.l * R1.l (TFU);      // R6 = [                lq[1]                lq[0] ]

    // Conditional negate with the sign mask: (x ^ m) - m, m = 0 or -1
    R6 = R6 ^ R5;                                      // R6 = [           lq[1]^sign           lq[0]^sign ]

    R7 = R6 -|- R5                                     // R7 = [             coefq[1]             coefq[0] ]
    || R4 = [I0++];                                    // R4 = [              coef[3]              coef[2] ]

frw_quant_mpeg_intra_l0_end:

    R5 = R4 >>> 15 (V)                                 // R5 = sign(R4)
    || R6 = [I2++]                                     // R6 = [              mq[3]/2              mq[2]/2 ]
    || [I1++] = R7;                                    // Save [             coefq[1]             coefq[0] ]

    // Undo the look-ahead loads issued by the last iteration; without this I0 ends
    // one word past I1 and I2 ends one word past the last mq[] pair.
    I0 -= 4;                                           // I0 = I1 = next coef cell
    I2 -= 4;                                           // I2 = first word after the last consumed mq[] pair

    RTS;
    NOP;

frw_quant_mpeg_intra.end:

Возникла задача портировать на BlackFin 535 код, реализующий прямое квантование a-la MPEG4. Приведенный выше фрагмент работает и выполняет все необходимые действия. Это наша первая попытка оптимизации кода для этой машинки, поэтому она, похоже, не оптимальна. Фактически это просто пошаговая трансляция существующей MMX-реализации на подмножество инструкций BlackFin.

Собственно, вопрос: не подскажет ли кто-нибудь методы усовершенствования вышеприведенного кода с учетом особенностей архитектуры именно процессоров BlackFin?