8.4 乘法(Vector Multiplication) 这部分函数主要用于乘法,公式描述如下: pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize
8.4.1 arm_mult_f32 这个函数用于求32位浮点数的乘法,源代码分析如下:
- /**
- * @brief Floating-point vector multiplication.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- */
-
- void arm_mult_f32(
- float32_t * pSrcA,
- float32_t * pSrcB,
- float32_t * pDst,
- uint32_t blockSize)
- {
- uint32_t blkCnt; /* loop counters */
- #ifndef ARM_MATH_CM0_FAMILY
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- float32_t inA1, inA2, inA3, inA4; /* temporary input variables */
- float32_t inB1, inB2, inB3, inB4; /* temporary input variables */
- float32_t out1, out2, out3, out4; /* temporary output variables */
-
- /* loop Unrolling */
- blkCnt = blockSize >> 2u;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while(blkCnt > 0u)
- {
- /* C = A * B */
- /* Multiply the inputs and store the results in output buffer */ (1)
- /* read sample from sourceA */
- inA1 = *pSrcA;
- /* read sample from sourceB */
- inB1 = *pSrcB;
- /* read sample from sourceA */
- inA2 = *(pSrcA + 1);
- /* read sample from sourceB */
- inB2 = *(pSrcB + 1);
-
- /* out = sourceA * sourceB */
- out1 = inA1 * inB1;
-
- /* read sample from sourceA */
- inA3 = *(pSrcA + 2);
- /* read sample from sourceB */
- inB3 = *(pSrcB + 2);
-
- /* out = sourceA * sourceB */
- out2 = inA2 * inB2;
-
- /* read sample from sourceA */
- inA4 = *(pSrcA + 3);
-
- /* store result to destination buffer */
- *pDst = out1;
-
- /* read sample from sourceB */
- inB4 = *(pSrcB + 3);
-
- /* out = sourceA * sourceB */
- out3 = inA3 * inB3;
-
- /* store result to destination buffer */
- *(pDst + 1) = out2;
-
- /* out = sourceA * sourceB */
- out4 = inA4 * inB4;
- /* store result to destination buffer */
- *(pDst + 2) = out3;
- /* store result to destination buffer */
- *(pDst + 3) = out4;
-
-
- /* update pointers to process next samples */
- pSrcA += 4u;
- pSrcB += 4u;
- pDst += 4u;
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4u;
-
- #else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- #endif /* #ifndef ARM_MATH_CM0_FAMILY */
-
- while(blkCnt > 0u)
- {
- /* C = A * B */
- /* Multiply the inputs and store the results in output buffer */
- *pDst++ = (*pSrcA++) * (*pSrcB++);
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
- }
复制代码
1. 浮点的32位乘法比较简单,这里依然是以4次的计算为一组。
8.4.2 arm_mult_q31 这个函数用于求32位定点数的乘法,源代码分析如下:
- /**
- * @brief Q31 vector multiplication.
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior: (1)
- * par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
- */
-
- void arm_mult_q31(
- q31_t * pSrcA,
- q31_t * pSrcB,
- q31_t * pDst,
- uint32_t blockSize)
- {
- uint32_t blkCnt; /* loop counters */
-
- #ifndef ARM_MATH_CM0_FAMILY
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t inA1, inA2, inA3, inA4; /* temporary input variables */
- q31_t inB1, inB2, inB3, inB4; /* temporary input variables */
- q31_t out1, out2, out3, out4; /* temporary output variables */
-
- /* loop Unrolling */
- blkCnt = blockSize >> 2u;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while(blkCnt > 0u)
- {
- /* C = A * B */
- /* Multiply the inputs and then store the results in the destination buffer. */
- inA1 = *pSrcA++;
- inA2 = *pSrcA++;
- inA3 = *pSrcA++;
- inA4 = *pSrcA++;
- inB1 = *pSrcB++;
- inB2 = *pSrcB++;
- inB3 = *pSrcB++;
- inB4 = *pSrcB++;
-
- out1 = ((q63_t) inA1 * inB1) >> 32; (2)
- out2 = ((q63_t) inA2 * inB2) >> 32;
- out3 = ((q63_t) inA3 * inB3) >> 32;
- out4 = ((q63_t) inA4 * inB4) >> 32;
-
- out1 = __SSAT(out1, 31); (3)
- out2 = __SSAT(out2, 31);
- out3 = __SSAT(out3, 31);
- out4 = __SSAT(out4, 31);
-
- *pDst++ = out1 << 1u; (4)
- *pDst++ = out2 << 1u;
- *pDst++ = out3 << 1u;
- *pDst++ = out4 << 1u;
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4u;
-
- #else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- #endif /* #ifndef ARM_MATH_CM0_FAMILY */
-
- while(blkCnt > 0u)
- {
- /* C = A * B */
- /* Multiply the inputs and then store the results in the destination buffer. */
- *pDst++ =
- (q31_t) clip_q63_to_q31(((q63_t) (*pSrcA++) * (*pSrcB++)) >> 31);
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
- }
复制代码
1. 这个函数使用了饱和算法。 所得结果是Q31格式,范围 [0x80000000 0x7FFFFFFF]。 2. 所得64位乘积右移32位,即只保留乘积的高32位。 3. 实现31位精度的饱和运算。 4. 左移一位(相当于乘以2),保证所得结果是Q31格式。
8.4.3 arm_mult_q15 这个函数用于求16位定点数的乘法,源代码分析如下:
- /**
- * @brief Q15 vector multiplication
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior: (1)
- * par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
- */
-
- void arm_mult_q15(
- q15_t * pSrcA,
- q15_t * pSrcB,
- q15_t * pDst,
- uint32_t blockSize)
- {
- uint32_t blkCnt; /* loop counters */
-
- #ifndef ARM_MATH_CM0_FAMILY
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q31_t inA1, inA2, inB1, inB2; /* temporary input variables */
- q15_t out1, out2, out3, out4; /* temporary output variables */
- q31_t mul1, mul2, mul3, mul4; /* temporary variables */
-
- /* loop Unrolling */
- blkCnt = blockSize >> 2u;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while(blkCnt > 0u)
- {
- /* read two samples at a time from sourceA */
- inA1 = *__SIMD32(pSrcA)++; (2)
- /* read two samples at a time from sourceB */
- inB1 = *__SIMD32(pSrcB)++;
- /* read two samples at a time from sourceA */
- inA2 = *__SIMD32(pSrcA)++;
- /* read two samples at a time from sourceB */
- inB2 = *__SIMD32(pSrcB)++;
-
- /* multiply mul = sourceA * sourceB */
- mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16)); (3)
- mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
- mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
- mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
-
- /* saturate result to 16 bit */
- out1 = (q15_t) __SSAT(mul1 >> 15, 16); (4)
- out2 = (q15_t) __SSAT(mul2 >> 15, 16);
- out3 = (q15_t) __SSAT(mul3 >> 15, 16);
- out4 = (q15_t) __SSAT(mul4 >> 15, 16);
-
- /* store the result */
- #ifndef ARM_MATH_BIG_ENDIAN
-
- *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16); (5)
- *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
- #else
-
- *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
- *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
- #endif // #ifndef ARM_MATH_BIG_ENDIAN
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4u;
-
- #else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- #endif /* #ifndef ARM_MATH_CM0_FAMILY */
-
-
- while(blkCnt > 0u)
- {
- /* C = A * B */
- /* Multiply the inputs and store the result in the destination buffer */
- *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
- }
复制代码
1. 这个函数使用了饱和算法。 所得结果是Q15格式,范围 [0x8000 0x7FFF]。 2. 一次读取两个Q15格式的数据。 3. 将四组数的乘积保存到Q31格式的变量mul1,mul2,mul3,mul4。 4. 丢弃32位数据的低15位,并把最终结果饱和到16位精度。 5. 通过SIMD指令__PKHBT将两个Q15格式的结果打包成一个32位数据,再保存到结果数组中,从而一次存储操作就能完成两个数据的存储。
8.4.4 arm_mult_q7 这个函数用于求8位定点数的乘法,源代码分析如下:
- /**
- * @brief Q7 vector multiplication
- * @param[in] *pSrcA points to the first input vector
- * @param[in] *pSrcB points to the second input vector
- * @param[out] *pDst points to the output vector
- * @param[in] blockSize number of samples in each vector
- * @return none.
- *
- * Scaling and Overflow Behavior: (1)
- * par
- * The function uses saturating arithmetic.
- * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
- */
-
- void arm_mult_q7(
- q7_t * pSrcA,
- q7_t * pSrcB,
- q7_t * pDst,
- uint32_t blockSize)
- {
- uint32_t blkCnt; /* loop counters */
-
- #ifndef ARM_MATH_CM0_FAMILY
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
- q7_t out1, out2, out3, out4; /* Temporary variables to store the product */
-
- /* loop Unrolling */
- blkCnt = blockSize >> 2u;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while(blkCnt > 0u)
- {
- /* C = A * B */
- /* Multiply the inputs and store the results in temporary variables */ (2)
- out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
- out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
- out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
- out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
-
- /* Store the results of 4 inputs in the destination buffer in single cycle by packing */
- *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4); (3)
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
-
- /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- blkCnt = blockSize % 0x4u;
-
- #else
-
- /* Run the below code for Cortex-M0 */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- #endif /* #ifndef ARM_MATH_CM0_FAMILY */
-
-
- while(blkCnt > 0u)
- {
- /* C = A * B */
- /* Multiply the inputs and store the result in the destination buffer */
- *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
-
- /* Decrement the blockSize loop counter */
- blkCnt--;
- }
- }
复制代码
1. 这个函数使用了饱和算法。 所得结果是Q7格式,范围 [0x80 0x7F]。 2. 将两个Q7格式的数据乘积右移7位,也就是丢掉低7位的数据,并将所得结果饱和到8位精度。 3. __PACKq7将四个Q7格式的结果打包成一个32位数据,从而一次存储操作就能把四个结果写入目标缓冲区(源码注释称可在单周期内完成)。
8.4.5 实例讲解 实验目的: 1. 学习四种类型数据的乘法。 实验内容: 1. 按下摇杆的UP键,串口打印输出结果。 实验现象: 通过串口上位机软件SecureCRT(V5光盘里面有此软件)查看打印信息,现象如下:
程序设计:
- /*
- *********************************************************************************************************
- * 函 数 名: DSP_Multiplication
- * 功能说明: 乘法
- * 形 参:无
- * 返 回 值: 无
- *********************************************************************************************************
- */
- static void DSP_Multiplication(void)
- {
- static float32_t pSrcA[5] = {1.0f,1.0f,1.0f,1.0f,1.0f};
- static float32_t pSrcB[5] = {1.0f,1.0f,1.0f,1.0f,1.0f};
- static float32_t pDst[5];
-
- static q31_t pSrcA1[5] = {1,1,1,1,1};
- static q31_t pSrcB1[5] = {1,1,1,1,1};
- static q31_t pDst1[5];
-
- static q15_t pSrcA2[5] = {1,1,1,1,1};
- static q15_t pSrcB2[5] = {1,1,1,1,1};
- static q15_t pDst2[5];
-
- static q7_t pSrcA3[5] = {0x70,1,1,1,1};
- static q7_t pSrcB3[5] = {0x7f,1,1,1,1};
- static q7_t pDst3[5];
-
-
- pSrcA[0] += 1.1f;
- arm_mult_f32(pSrcA, pSrcB, pDst, 5);
- printf("arm_mult_f32 = %frn", pDst[0]);
-
- pSrcA1[0] += 1;
- arm_mult_q31(pSrcA1, pSrcB1, pDst1, 5);
- printf("arm_mult_q31 = %drn", pDst1[0]);
-
- pSrcA2[0] += 1;
- arm_mult_q15(pSrcA2, pSrcB2, pDst2, 5);
- printf("arm_mult_q15 = %drn", pDst2[0]);
-
- pSrcA3[0] += 1;
- arm_mult_q7(pSrcA3, pSrcB3, pDst3, 5);
- printf("arm_mult_q7 = %drn", pDst3[0]);
- printf("***********************************rn");
- }
复制代码
|