|
|
@@ -0,0 +1,411 @@
|
|
|
+/* ----------------------------------------------------------------------
|
|
|
+ * Project: CMSIS DSP Library
|
|
|
+ * Title: arm_correlate_f64.c
|
|
|
+ * Description: Correlation of floating-point sequences
|
|
|
+ *
|
|
|
+ * $Date: 13 September 2021
|
|
|
+ * $Revision: V1.10.0
|
|
|
+ *
|
|
|
+ * Target Processor: Cortex-M and Cortex-A cores
|
|
|
+ * -------------------------------------------------------------------- */
|
|
|
+/*
|
|
|
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
|
|
|
+ *
|
|
|
+ * SPDX-License-Identifier: Apache-2.0
|
|
|
+ *
|
|
|
+ * Licensed under the Apache License, Version 2.0 (the License); you may
|
|
|
+ * not use this file except in compliance with the License.
|
|
|
+ * You may obtain a copy of the License at
|
|
|
+ *
|
|
|
+ * www.apache.org/licenses/LICENSE-2.0
|
|
|
+ *
|
|
|
+ * Unless required by applicable law or agreed to in writing, software
|
|
|
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
|
|
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
+ * See the License for the specific language governing permissions and
|
|
|
+ * limitations under the License.
|
|
|
+ */
|
|
|
+
|
|
|
+#include "dsp/filtering_functions.h"
|
|
|
+
|
|
|
+/**
|
|
|
+ @ingroup groupFilters
|
|
|
+ */
|
|
|
+
|
|
|
+/**
|
|
|
+ @defgroup Corr Correlation
|
|
|
+
|
|
|
+ Correlation is a mathematical operation that is similar to convolution.
|
|
|
+ As with convolution, correlation uses two signals to produce a third signal.
|
|
|
+ The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
|
|
|
+ Correlation is commonly used to measure the similarity between two signals.
|
|
|
+ It has applications in pattern recognition, cryptanalysis, and searching.
|
|
|
+ The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
|
|
|
+ Fast versions of the Q15 and Q31 functions are also provided.
|
|
|
+
|
|
|
+ @par Algorithm
|
|
|
+ Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
|
|
|
+ The convolution of the two signals is denoted by
|
|
|
+ <pre>
|
|
|
+ c[n] = a[n] * b[n]
|
|
|
+ </pre>
|
|
|
+ In correlation, one of the signals is flipped in time
|
|
|
+ <pre>
|
|
|
+ c[n] = a[n] * b[-n]
|
|
|
+ </pre>
|
|
|
+ @par
|
|
|
+ and this is mathematically defined as
|
|
|
+ \image html CorrelateEquation.gif
|
|
|
+ @par
|
|
|
+ The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
|
|
|
+ The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
|
|
|
+ The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
|
|
|
+
|
|
|
+ @note
|
|
|
+ The <code>pDst</code> should be initialized to all zeros before being used.
|
|
|
+
|
|
|
+ @par Fixed-Point Behavior
|
|
|
+ Correlation requires summing up a large number of intermediate products.
|
|
|
+ As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
|
|
|
+ Refer to the function specific documentation below for further details of the particular algorithm used.
|
|
|
+
|
|
|
+ @par Fast Versions
|
|
|
+ Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires
|
|
|
+ the input signals should be scaled down to avoid intermediate overflows.
|
|
|
+
|
|
|
+ @par Opt Versions
|
|
|
+ Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
|
|
|
+ These versions are optimised in cycles and consumes more memory (Scratch memory) compared to Q15 and Q7 versions of correlate
|
|
|
+ */
|
|
|
+
|
|
|
+/**
|
|
|
+ @addtogroup Corr
|
|
|
+ @{
|
|
|
+ */
|
|
|
+
|
|
|
+/**
|
|
|
+ @brief Correlation of floating-point sequences.
|
|
|
+ @param[in] pSrcA points to the first input sequence
|
|
|
+ @param[in] srcALen length of the first input sequence
|
|
|
+ @param[in] pSrcB points to the second input sequence
|
|
|
+ @param[in] srcBLen length of the second input sequence
|
|
|
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
|
|
|
+ @return none
|
|
|
+ */
|
|
|
+
|
|
|
+void arm_correlate_f64(
|
|
|
+ const float64_t * pSrcA,
|
|
|
+ uint32_t srcALen,
|
|
|
+ const float64_t * pSrcB,
|
|
|
+ uint32_t srcBLen,
|
|
|
+ float64_t * pDst)
|
|
|
+{
|
|
|
+ const float64_t *pIn1; /* InputA pointer */
|
|
|
+ const float64_t *pIn2; /* InputB pointer */
|
|
|
+ float64_t *pOut = pDst; /* Output pointer */
|
|
|
+ const float64_t *px; /* Intermediate inputA pointer */
|
|
|
+ const float64_t *py; /* Intermediate inputB pointer */
|
|
|
+ const float64_t *pSrc1;
|
|
|
+ float64_t sum;
|
|
|
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
|
|
|
+ uint32_t j, k, count, blkCnt; /* Loop counters */
|
|
|
+ uint32_t outBlockSize; /* Loop counter */
|
|
|
+ int32_t inc = 1; /* Destination address modifier */
|
|
|
+
|
|
|
+ /* The algorithm implementation is based on the lengths of the inputs. */
|
|
|
+ /* srcB is always made to slide across srcA. */
|
|
|
+ /* So srcBLen is always considered as shorter or equal to srcALen */
|
|
|
+ /* But CORR(x, y) is reverse of CORR(y, x) */
|
|
|
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
|
|
|
+ /* and the destination pointer modifier, inc is set to -1 */
|
|
|
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
|
|
|
+ /* But to improve the performance,
|
|
|
+ * we assume zeroes in the output instead of zero padding either of the the inputs*/
|
|
|
+ /* If srcALen > srcBLen,
|
|
|
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
|
|
|
+ /* If srcALen < srcBLen,
|
|
|
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
|
|
|
+ if (srcALen >= srcBLen)
|
|
|
+ {
|
|
|
+ /* Initialization of inputA pointer */
|
|
|
+ pIn1 = pSrcA;
|
|
|
+
|
|
|
+ /* Initialization of inputB pointer */
|
|
|
+ pIn2 = pSrcB;
|
|
|
+
|
|
|
+ /* Number of output samples is calculated */
|
|
|
+ outBlockSize = (2U * srcALen) - 1U;
|
|
|
+
|
|
|
+ /* When srcALen > srcBLen, zero padding has to be done to srcB
|
|
|
+ * to make their lengths equal.
|
|
|
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
|
|
|
+ * number of output samples are made zero */
|
|
|
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
|
|
|
+
|
|
|
+ /* Updating the pointer position to non zero value */
|
|
|
+ pOut += j;
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ /* Initialization of inputA pointer */
|
|
|
+ pIn1 = pSrcB;
|
|
|
+
|
|
|
+ /* Initialization of inputB pointer */
|
|
|
+ pIn2 = pSrcA;
|
|
|
+
|
|
|
+ /* srcBLen is always considered as shorter or equal to srcALen */
|
|
|
+ j = srcBLen;
|
|
|
+ srcBLen = srcALen;
|
|
|
+ srcALen = j;
|
|
|
+
|
|
|
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
|
|
|
+ /* Hence set the destination pointer to point to the last output sample */
|
|
|
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
|
|
|
+
|
|
|
+ /* Destination address modifier is set to -1 */
|
|
|
+ inc = -1;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* The function is internally
|
|
|
+ * divided into three stages according to the number of multiplications that has to be
|
|
|
+ * taken place between inputA samples and inputB samples. In the first stage of the
|
|
|
+ * algorithm, the multiplications increase by one for every iteration.
|
|
|
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
|
|
|
+ * In the third stage of the algorithm, the multiplications decrease by one
|
|
|
+ * for every iteration. */
|
|
|
+
|
|
|
+ /* The algorithm is implemented in three stages.
|
|
|
+ The loop counters of each stage is initiated here. */
|
|
|
+ blockSize1 = srcBLen - 1U;
|
|
|
+ blockSize2 = srcALen - (srcBLen - 1U);
|
|
|
+ blockSize3 = blockSize1;
|
|
|
+
|
|
|
+ /* --------------------------
|
|
|
+ * Initializations of stage1
|
|
|
+ * -------------------------*/
|
|
|
+
|
|
|
+ /* sum = x[0] * y[srcBlen - 1]
|
|
|
+ * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
|
|
|
+ * ....
|
|
|
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
|
|
|
+ */
|
|
|
+
|
|
|
+ /* In this stage the MAC operations are increased by 1 for every iteration.
|
|
|
+ The count variable holds the number of MAC operations performed */
|
|
|
+ count = 1U;
|
|
|
+
|
|
|
+ /* Working pointer of inputA */
|
|
|
+ px = pIn1;
|
|
|
+
|
|
|
+ /* Working pointer of inputB */
|
|
|
+ pSrc1 = pIn2 + (srcBLen - 1U);
|
|
|
+ py = pSrc1;
|
|
|
+
|
|
|
+ /* ------------------------
|
|
|
+ * Stage1 process
|
|
|
+ * ----------------------*/
|
|
|
+
|
|
|
+ /* The first stage starts here */
|
|
|
+ while (blockSize1 > 0U)
|
|
|
+ {
|
|
|
+ /* Accumulator is made zero for every iteration */
|
|
|
+ sum = 0.0f;
|
|
|
+
|
|
|
+ /* Initialize k with number of samples */
|
|
|
+ k = count;
|
|
|
+
|
|
|
+ while (k > 0U)
|
|
|
+ {
|
|
|
+ /* Perform the multiply-accumulate */
|
|
|
+ /* x[0] * y[srcBLen - 1] */
|
|
|
+ sum += *px++ * *py++;
|
|
|
+
|
|
|
+ /* Decrement loop counter */
|
|
|
+ k--;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Store the result in the accumulator in the destination buffer. */
|
|
|
+ *pOut = sum;
|
|
|
+ /* Destination pointer is updated according to the address modifier, inc */
|
|
|
+ pOut += inc;
|
|
|
+
|
|
|
+ /* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
+ py = pSrc1 - count;
|
|
|
+ px = pIn1;
|
|
|
+
|
|
|
+ /* Increment MAC count */
|
|
|
+ count++;
|
|
|
+
|
|
|
+ /* Decrement loop counter */
|
|
|
+ blockSize1--;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* --------------------------
|
|
|
+ * Initializations of stage2
|
|
|
+ * ------------------------*/
|
|
|
+
|
|
|
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
|
|
|
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
|
|
|
+ * ....
|
|
|
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
|
|
|
+ */
|
|
|
+
|
|
|
+ /* Working pointer of inputA */
|
|
|
+ px = pIn1;
|
|
|
+
|
|
|
+ /* Working pointer of inputB */
|
|
|
+ py = pIn2;
|
|
|
+
|
|
|
+ /* count is index by which the pointer pIn1 to be incremented */
|
|
|
+ count = 0U;
|
|
|
+
|
|
|
+ /* -------------------
|
|
|
+ * Stage2 process
|
|
|
+ * ------------------*/
|
|
|
+
|
|
|
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
|
|
|
+ * So, to loop unroll over blockSize2,
|
|
|
+ * srcBLen should be greater than or equal to 4 */
|
|
|
+ if (srcBLen >= 4U)
|
|
|
+ {
|
|
|
+ /* Initialize blkCnt with number of samples */
|
|
|
+ blkCnt = blockSize2;
|
|
|
+
|
|
|
+ while (blkCnt > 0U)
|
|
|
+ {
|
|
|
+ /* Accumulator is made zero for every iteration */
|
|
|
+ sum = 0.0f;
|
|
|
+
|
|
|
+ /* Initialize blkCnt with number of samples */
|
|
|
+ k = srcBLen;
|
|
|
+
|
|
|
+ while (k > 0U)
|
|
|
+ {
|
|
|
+ /* Perform the multiply-accumulate */
|
|
|
+ sum += *px++ * *py++;
|
|
|
+
|
|
|
+ /* Decrement the loop counter */
|
|
|
+ k--;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Store the result in the accumulator in the destination buffer. */
|
|
|
+ *pOut = sum;
|
|
|
+
|
|
|
+ /* Destination pointer is updated according to the address modifier, inc */
|
|
|
+ pOut += inc;
|
|
|
+
|
|
|
+ /* Increment the pointer pIn1 index, count by 1 */
|
|
|
+ count++;
|
|
|
+
|
|
|
+ /* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
+ px = pIn1 + count;
|
|
|
+ py = pIn2;
|
|
|
+
|
|
|
+ /* Decrement the loop counter */
|
|
|
+ blkCnt--;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ /* If the srcBLen is not a multiple of 4,
|
|
|
+ * the blockSize2 loop cannot be unrolled by 4 */
|
|
|
+ blkCnt = blockSize2;
|
|
|
+
|
|
|
+ while (blkCnt > 0U)
|
|
|
+ {
|
|
|
+ /* Accumulator is made zero for every iteration */
|
|
|
+ sum = 0.0f;
|
|
|
+
|
|
|
+ /* Loop over srcBLen */
|
|
|
+ k = srcBLen;
|
|
|
+
|
|
|
+ while (k > 0U)
|
|
|
+ {
|
|
|
+ /* Perform the multiply-accumulate */
|
|
|
+ sum += *px++ * *py++;
|
|
|
+
|
|
|
+ /* Decrement the loop counter */
|
|
|
+ k--;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Store the result in the accumulator in the destination buffer. */
|
|
|
+ *pOut = sum;
|
|
|
+ /* Destination pointer is updated according to the address modifier, inc */
|
|
|
+ pOut += inc;
|
|
|
+
|
|
|
+ /* Increment the pointer pIn1 index, count by 1 */
|
|
|
+ count++;
|
|
|
+
|
|
|
+ /* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
+ px = pIn1 + count;
|
|
|
+ py = pIn2;
|
|
|
+
|
|
|
+ /* Decrement the loop counter */
|
|
|
+ blkCnt--;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ /* --------------------------
|
|
|
+ * Initializations of stage3
|
|
|
+ * -------------------------*/
|
|
|
+
|
|
|
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
|
|
|
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
|
|
|
+ * ....
|
|
|
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
|
|
|
+ * sum += x[srcALen-1] * y[0]
|
|
|
+ */
|
|
|
+
|
|
|
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
|
|
|
+ The count variable holds the number of MAC operations performed */
|
|
|
+ count = srcBLen - 1U;
|
|
|
+
|
|
|
+ /* Working pointer of inputA */
|
|
|
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
|
|
|
+ px = pSrc1;
|
|
|
+
|
|
|
+ /* Working pointer of inputB */
|
|
|
+ py = pIn2;
|
|
|
+
|
|
|
+ /* -------------------
|
|
|
+ * Stage3 process
|
|
|
+ * ------------------*/
|
|
|
+
|
|
|
+ while (blockSize3 > 0U)
|
|
|
+ {
|
|
|
+ /* Accumulator is made zero for every iteration */
|
|
|
+ sum = 0.0f;
|
|
|
+
|
|
|
+ /* Initialize blkCnt with number of samples */
|
|
|
+ k = count;
|
|
|
+
|
|
|
+ while (k > 0U)
|
|
|
+ {
|
|
|
+ /* Perform the multiply-accumulate */
|
|
|
+ sum += *px++ * *py++;
|
|
|
+
|
|
|
+ /* Decrement loop counter */
|
|
|
+ k--;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Store the result in the accumulator in the destination buffer. */
|
|
|
+ *pOut = sum;
|
|
|
+ /* Destination pointer is updated according to the address modifier, inc */
|
|
|
+ pOut += inc;
|
|
|
+
|
|
|
+ /* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
+ px = ++pSrc1;
|
|
|
+ py = pIn2;
|
|
|
+
|
|
|
+ /* Decrement MAC count */
|
|
|
+ count--;
|
|
|
+
|
|
|
+ /* Decrement the loop counter */
|
|
|
+ blockSize3--;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ @} end of Corr group
|
|
|
+ */
|