9.4 减法(Vector Sub) 这部分函数主要用于实现减法,公式描述如下: pDst[n] = pSrcA[n] - pSrcB[n], 0 <= n < blockSize.
9.4.1 arm_sub_f32 这个函数用于求32位浮点数的减法,源代码分析如下:
- /**
- * @brief Floating-point vector subtraction.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- */
-
- void arm_sub_f32(
- float32_t * pSrcA,
- float32_t * pSrcB,
- float32_t * pDst,
- uint32_t blockSize)
- {
- uint32_t blkCnt; /* loop counter */
-
- #ifndef ARM_MATH_CM0_FAMILY
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t inA1, inA2, inA3, inA4; /* temporary variables */
- float32_t inB1, inB2, inB3, inB4; /* temporary variables */
-
- /*loop Unrolling */
- blkCnt = blockSize >> 2u;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the results in the destination buffer. */
- /* Read 4 input samples from sourceA and sourceB */
- inA1 = *pSrcA;
- inB1 = *pSrcB;
- inA2 = *(pSrcA + 1);
- inB2 = *(pSrcB + 1);
- inA3 = *(pSrcA + 2);
- inB3 = *(pSrcB + 2);
- inA4 = *(pSrcA + 3);
- inB4 = *(pSrcB + 3);
-
- /* dst = srcA - srcB */
- /* subtract and store the result */ (1)
- *pDst = inA1 - inB1;
- *(pDst + 1) = inA2 - inB2;
- *(pDst + 2) = inA3 - inB3;
- *(pDst + 3) = inA4 - inB4;
-
-
- /* Update pointers to process next sampels */
- pSrcA += 4u;
- pSrcB += 4u;
- pDst += 4u;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4u;
-
- #else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- #endif /* #ifndef ARM_MATH_CM0_FAMILY */
-
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the results in the destination buffer. */
- *pDst++ = (*pSrcA++) - (*pSrcB++);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- }
复制代码
1. 浮点数的减法运算比较简单,直接两个数值相减即可。
9.4.2 arm_sub_q31 这个函数用于求32位定点数的减法,源代码分析如下:
- /**
- * @brief Q31 vector subtraction.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior: (1)
- * par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
- */
-
- void arm_sub_q31(
- q31_t * pSrcA,
- q31_t * pSrcB,
- q31_t * pDst,
- uint32_t blockSize)
- {
- uint32_t blkCnt; /* loop counter */
-
-
- #ifndef ARM_MATH_CM0_FAMILY
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t inA1, inA2, inA3, inA4;
- q31_t inB1, inB2, inB3, inB4;
-
- /*loop Unrolling */
- blkCnt = blockSize >> 2u;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the results in the destination buffer. */
- inA1 = *pSrcA++;
- inA2 = *pSrcA++;
- inB1 = *pSrcB++;
- inB2 = *pSrcB++;
-
- inA3 = *pSrcA++;
- inA4 = *pSrcA++;
- inB3 = *pSrcB++;
- inB4 = *pSrcB++;
-
- *pDst++ = __QSUB(inA1, inB1); (2)
- *pDst++ = __QSUB(inA2, inB2);
- *pDst++ = __QSUB(inA3, inB3);
- *pDst++ = __QSUB(inA4, inB4);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4u;
-
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- #else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ - *pSrcB++);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- #endif /* #ifndef ARM_MATH_CM0_FAMILY */
-
- }
复制代码
1. 这个函数使用了饱和运算:超出Q31范围[0x80000000, 0x7FFFFFFF]的结果会被钳位到最近的边界值,即正向溢出时结果为0x7FFFFFFF,负向溢出时结果为0x80000000。 2. __QSUB是饱和减法指令(CMSIS将其与SIMD内部函数归为一类,但它一次只处理一对32位数据),这里用它实现两个Q31格式数据的饱和减法。
9.4.3 arm_sub_q15 这个函数用于求16位定点数的减法,源代码分析如下:
- /**
- * @brief Q15 vector subtraction.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
- */
-
- void arm_sub_q15(
- q15_t * pSrcA,
- q15_t * pSrcB,
- q15_t * pDst,
- uint32_t blockSize)
- {
- uint32_t blkCnt; /* loop counter */
-
-
- #ifndef ARM_MATH_CM0_FAMILY
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t inA1, inA2;
- q31_t inB1, inB2;
-
- /*loop Unrolling */
- blkCnt = blockSize >> 2u;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the results in the destination buffer two samples at a time. */
- inA1 = *__SIMD32(pSrcA)++; (1)
- inA2 = *__SIMD32(pSrcA)++;
- inB1 = *__SIMD32(pSrcB)++;
- inB2 = *__SIMD32(pSrcB)++;
-
- *__SIMD32(pDst)++ = __QSUB16(inA1, inB1); (2)
- *__SIMD32(pDst)++ = __QSUB16(inA2, inB2);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4u;
-
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- #else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- #endif /* #ifndef ARM_MATH_CM0_FAMILY */
-
-
- }
复制代码
1. 这里一次读取两个Q15格式的数据。 2. 由于__QSUB16是SIMD指令,在这里调用一次__QSUB16可以实现两次减法运算。
9.4.4 arm_sub_q7 这个函数用于求8位定点数的减法,源代码分析如下:
- /**
- * @brief Q7 vector subtraction.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior:
- * par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
- */
-
- void arm_sub_q7(
- q7_t * pSrcA,
- q7_t * pSrcB,
- q7_t * pDst,
- uint32_t blockSize)
- {
- uint32_t blkCnt; /* loop counter */
-
- #ifndef ARM_MATH_CM0_FAMILY
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
-
- /*loop Unrolling */
- blkCnt = blockSize >> 2u;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the results in the destination buffer 4 samples at a time. */
- *__SIMD32(pDst)++ = __QSUB8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++); (1)
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4u;
-
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = __SSAT(*pSrcA++ - *pSrcB++, 8);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- #else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- while(blkCnt > 0u)
- {
- /* C = A - B */
- /* Subtract and then store the result in the destination buffer. */
- *pDst++ = (q7_t) __SSAT((q15_t) * pSrcA++ - *pSrcB++, 8);
-
- /* Decrement the loop counter */
- blkCnt--;
- }
-
- #endif /* #ifndef ARM_MATH_CM0_FAMILY */
-
-
- }
复制代码
1. __QSUB8也是SIMD指令,调用一次就能实现4个Q7格式数据的减法运算。
9.4.5 实例讲解 实验目的: 1. 四种类型数据的减法。 实验内容: 1. 按下按键UP,串口打印输出结果。 实验现象: 通过串口上位机软件SecureCRT(V5光盘里面有此软件)查看打印信息,现象如下:
程序设计:
- /*
- *********************************************************************************************************
- * 函 数 名: DSP_Sub
- * 功能说明: 减法
- * 形 参:无
- * 返 回 值: 无
- *********************************************************************************************************
- */
- static void DSP_Sub(void)
- {
- static float32_t pSrcA[5] = {1.0f,1.0f,1.0f,1.0f,1.0f};
- static float32_t pSrcB[5] = {1.0f,1.0f,1.0f,1.0f,1.0f};
- static float32_t pDst[5];
-
- static q31_t pSrcA1[5] = {1,1,1,1,1};
- static q31_t pSrcB1[5] = {1,1,1,1,1};
- static q31_t pDst1[5];
-
- static q15_t pSrcA2[5] = {1,1,1,1,1};
- static q15_t pSrcB2[5] = {1,1,1,1,1};
- static q15_t pDst2[5];
-
- static q7_t pSrcA3[5] = {0x70,1,1,1,1};
- static q7_t pSrcB3[5] = {0x7f,1,1,1,1};
- static q7_t pDst3[5];
-
-
- pSrcA[0] += 1.1f;
- arm_sub_f32(pSrcA, pSrcB, pDst, 5);
- printf("arm_sub_f32 = %frn", pDst[0]);
-
- pSrcA1[0] += 1;
- arm_sub_q31(pSrcA1, pSrcB1, pDst1, 5);
- printf("arm_sub_q31 = %drn", pDst1[0]);
-
- pSrcA2[0] += 1;
- arm_sub_q15(pSrcA2, pSrcB2, pDst2, 5);
- printf("arm_sub_q15 = %drn", pDst2[0]);
-
- pSrcA3[0] += 1;
- arm_sub_q7(pSrcA3, pSrcB3, pDst3, 5);
- printf("arm_sub_q7 = %drn", pDst3[0]);
- printf("***********************************rn");
- }
复制代码
|