Библиотека DSP STM32F4xx_DSP_StdPeriph_Lib_V1.4.0
Файл arm_add_q31.c
Прошу разъяснить, какие отличия архитектуры Cortex-M3, M4 заставляют группировать по 4 операции в цикле, в отличии от одной операции в цикле для M0. Это для правильной работы конвеера?
Код
/**
* @brief Q31 vector addition.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
*/
void arm_add_q31(
q31_t * pSrcA,
q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
#ifndef ARM_MATH_CM0_FAMILY
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inA3, inA4;
q31_t inB1, inB2, inB3, inB4;
/*loop Unrolling */
blkCnt = blockSize >> 2u;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while(blkCnt > 0u)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
inA1 = *pSrcA++;
inA2 = *pSrcA++;
inB1 = *pSrcB++;
inB2 = *pSrcB++;
inA3 = *pSrcA++;
inA4 = *pSrcA++;
inB3 = *pSrcB++;
inB4 = *pSrcB++;
*pDst++ = __QADD(inA1, inB1);
*pDst++ = __QADD(inA2, inB2);
*pDst++ = __QADD(inA3, inB3);
*pDst++ = __QADD(inA4, inB4);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4u;
while(blkCnt > 0u)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while(blkCnt > 0u)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ + *pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* #ifndef ARM_MATH_CM0_FAMILY */
}