| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039 |
- /* ----------------------------------------------------------------------
- * Project: CMSIS DSP Library
- * Title: arm_bitonic_sort_f32.c
- * Description: Floating point bitonic sort
- *
- * $Date: 23 April 2021
- * $Revision: V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
- /*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "dsp/support_functions.h"
- #include "arm_sorting.h"
- #if !defined(ARM_MATH_NEON)
- static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
- {
- uint32_t step;
- uint32_t k, j;
- float32_t *leftPtr, *rightPtr;
- float32_t temp;
- step = n>>1;
- leftPtr = pSrc;
- rightPtr = pSrc+n-1;
- for(k=0; k<step; k++)
- {
- if(dir == (*leftPtr > *rightPtr))
- {
- // Swap
- temp=*leftPtr;
- *leftPtr=*rightPtr;
- *rightPtr=temp;
- }
- leftPtr++; // Move right
- rightPtr--; // Move left
- }
- // Merge
- for(step=(n>>2); step>0; step/=2)
- {
- for(j=0; j<n; j=j+step*2)
- {
- leftPtr = pSrc+j;
- rightPtr = pSrc+j+step;
- for(k=0; k<step; k++)
- {
- if(*leftPtr > *rightPtr)
- {
- // Swap
- temp=*leftPtr;
- *leftPtr=*rightPtr;
- *rightPtr=temp;
- }
- leftPtr++;
- rightPtr++;
- }
- }
- }
- }
- #endif
- #if defined(ARM_MATH_NEON)
- static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
- {
- /* Start with two vectors:
- * +---+---+---+---+
- * | a | b | c | d |
- * +---+---+---+---+
- * +---+---+---+---+
- * | e | f | g | h |
- * +---+---+---+---+
- * All the elements of the first are guaranteed to be less than or equal to
- * all of the elements in the second, and both vectors are bitonic.
- * We need to perform these operations to completely sort both lists:
- * vminmax([abcd],[efgh])
- * vminmax([acbd],[egfh])
- */
- vtrn128_64q(a, b);
- /* +---+---+---+---+
- * | a | b | e | f |
- * +---+---+---+---+
- * +---+---+---+---+
- * | c | d | g | h |
- * +---+---+---+---+
- */
- if(dir)
- vminmaxq(a, b);
- else
- vminmaxq(b, a);
-
- vtrn128_32q(a, b);
- /* +---+---+---+---+
- * | a | c | e | g |
- * +---+---+---+---+
- * +---+---+---+---+
- * | b | d | f | h |
- * +---+---+---+---+
- */
- if(dir)
- vminmaxq(a, b);
- else
- vminmaxq(b, a);
-
- return vzipq_f32(a, b);
- }
- static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
- {
- /* a and b are guaranteed to be bitonic */
- // Reverse the element of the second vector
- b = vrev128q_f32(b);
- // Compare the two vectors
- if(dir)
- vminmaxq(a, b);
- else
- vminmaxq(b, a);
- // Merge the two vectors
- float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir);
- return ab;
- }
- static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
- {
- /* Start with two vectors:
- * +---+---+---+---+---+---+---+---+
- * | a | b | c | d | e | f | g | h |
- * +---+---+---+---+---+---+---+---+
- * +---+---+---+---+---+---+---+---+
- * | i | j | k | l | m | n | o | p |
- * +---+---+---+---+---+---+---+---+
- * All the elements of the first are guaranteed to be less than or equal to
- * all of the elements in the second, and both vectors are bitonic.
- * We need to perform these operations to completely sort both lists:
- * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
- * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
- * vminmax([acef],[bdfh]) vminmax([ikmo],[jlmp])
- */
- vtrn256_128q(a, b);
- /* +---+---+---+---+---+---+---+---+
- * | a | b | c | d | i | j | k | l |
- * +---+---+---+---+---+---+---+---+
- * +---+---+---+---+---+---+---+---+
- * | e | f | g | h | m | n | o | p |
- * +---+---+---+---+---+---+---+---+
- */
- if(dir)
- vminmax256q(a, b);
- else
- vminmax256q(b, a);
-
- vtrn256_64q(a, b);
-
- /* +---+---+---+---+---+---+---+---+
- * | a | b | e | f | i | j | m | n |
- * +---+---+---+---+---+---+---+---+
- * +---+---+---+---+---+---+---+---+
- * | c | d | g | h | k | l | o | p |
- * +---+---+---+---+---+---+---+---+
- */
- if(dir)
- vminmax256q(a, b);
- else
- vminmax256q(b, a);
-
- vtrn256_32q(a, b);
- /* We now have:
- * +---+---+---+---+---+---+---+---+
- * | a | c | e | g | i | k | m | o |
- * +---+---+---+---+---+---+---+---+
- * +---+---+---+---+---+---+---+---+
- * | b | d | f | h | j | l | n | p |
- * +---+---+---+---+---+---+---+---+
- */
- if(dir)
- vminmax256q(a, b);
- else
- vminmax256q(b, a);
-
- float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
- float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);
-
- vst1q_f32(pOut, out1.val[0]);
- vst1q_f32(pOut+4, out1.val[1]);
- vst1q_f32(pOut+8, out2.val[0]);
- vst1q_f32(pOut+12, out2.val[1]);
- }
- static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
- {
- // Merge two preordered float32x4x2_t
- vrev256q_f32(b);
- if(dir)
- vminmax256q(a, b);
- else
- vminmax256q(b, a);
- arm_bitonic_resort_16_f32(pOut, a, b, dir);
- }
- static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
- {
- float32x4_t a;
- float32x4_t b;
- float32x4_t c;
- float32x4_t d;
- // Load 16 samples
- a = vld1q_f32(pSrc);
- b = vld1q_f32(pSrc+4);
- c = vld1q_f32(pSrc+8);
- d = vld1q_f32(pSrc+12);
-
- // Bitonic sorting network for 4 samples x 4 times
- if(dir)
- {
- vminmaxq(a, b);
- vminmaxq(c, d);
-
- vminmaxq(a, d);
- vminmaxq(b, c);
-
- vminmaxq(a, b);
- vminmaxq(c, d);
- }
- else
- {
- vminmaxq(b, a);
- vminmaxq(d, c);
-
- vminmaxq(d, a);
- vminmaxq(c, b);
-
- vminmaxq(b, a);
- vminmaxq(d, c);
- }
- float32x4x2_t ab = vtrnq_f32 (a, b);
- float32x4x2_t cd = vtrnq_f32 (c, d);
-
- // Transpose 4 ordered arrays of 4 samples
- a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0]));
- b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1]));
- c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0]));
- d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1]));
- // Merge pairs of arrays of 4 samples
- ab = arm_bitonic_merge_8_f32(a, b, dir);
- cd = arm_bitonic_merge_8_f32(c, d, dir);
-
- // Merge arrays of 8 samples
- arm_bitonic_merge_16_f32(pDst, ab, cd, dir);
- }
- static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
- {
- //Compare
- if(dir)
- {
- vminmax256q(ab1, cd1);
- vminmax256q(ab2, cd2);
- }
- else
- {
- vminmax256q(cd1, ab1);
- vminmax256q(cd2, ab2);
- }
- //Transpose 256
- float32x4_t temp;
- temp = ab2.val[0];
- ab2.val[0] = cd1.val[0];
- cd1.val[0] = temp;
- temp = ab2.val[1];
- ab2.val[1] = cd1.val[1];
- cd1.val[1] = temp;
- //Compare
- if(dir)
- {
- vminmax256q(ab1, cd1);
- vminmax256q(ab2, cd2);
- }
- else
- {
- vminmax256q(cd1, ab1);
- vminmax256q(cd2, ab2);
- }
-
- //Transpose 128
- arm_bitonic_merge_16_f32(pSrc+0, ab1, cd1, dir);
- arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir);
- }
- static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
- {
- float32x4x2_t ab1, ab2, ab3, ab4;
- float32x4x2_t cd1, cd2, cd3, cd4;
- //Load and reverse second array
- ab1.val[0] = vld1q_f32(pSrc+0 );
- ab1.val[1] = vld1q_f32(pSrc+4 );
- ab2.val[0] = vld1q_f32(pSrc+8 );
- ab2.val[1] = vld1q_f32(pSrc+12);
- ab3.val[0] = vld1q_f32(pSrc+16);
- ab3.val[1] = vld1q_f32(pSrc+20);
- ab4.val[0] = vld1q_f32(pSrc+24);
- ab4.val[1] = vld1q_f32(pSrc+28);
- vldrev128q_f32(cd4.val[1], pSrc+32);
- vldrev128q_f32(cd4.val[0], pSrc+36);
- vldrev128q_f32(cd3.val[1], pSrc+40);
- vldrev128q_f32(cd3.val[0], pSrc+44);
- vldrev128q_f32(cd2.val[1], pSrc+48);
- vldrev128q_f32(cd2.val[0], pSrc+52);
- vldrev128q_f32(cd1.val[1], pSrc+56);
- vldrev128q_f32(cd1.val[0], pSrc+60);
-
- //Compare
- if(dir)
- {
- vminmax256q(ab1, cd1);
- vminmax256q(ab2, cd2);
- vminmax256q(ab3, cd3);
- vminmax256q(ab4, cd4);
- }
- else
- {
- vminmax256q(cd1, ab1);
- vminmax256q(cd2, ab2);
- vminmax256q(cd3, ab3);
- vminmax256q(cd4, ab4);
- }
- //Transpose 512
- float32x4_t temp;
- temp = ab3.val[0];
- ab3.val[0] = cd1.val[0];
- cd1.val[0] = temp;
- temp = ab3.val[1];
- ab3.val[1] = cd1.val[1];
- cd1.val[1] = temp;
- temp = ab4.val[0];
- ab4.val[0] = cd2.val[0];
- cd2.val[0] = temp;
- temp = ab4.val[1];
- ab4.val[1] = cd2.val[1];
- cd2.val[1] = temp;
- //Compare
- if(dir)
- {
- vminmax256q(ab1, cd1);
- vminmax256q(ab2, cd2);
- vminmax256q(ab3, cd3);
- vminmax256q(ab4, cd4);
- }
- else
- {
- vminmax256q(cd1, ab1);
- vminmax256q(cd2, ab2);
- vminmax256q(cd3, ab3);
- vminmax256q(cd4, ab4);
- }
-
- //Transpose 256
- arm_bitonic_merge_32_f32(pSrc+0, ab1, ab2, cd1, cd2, dir);
- arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir);
- }
- static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
- {
- float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
- float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
- //Load and reverse second array
- ab1.val[0] = vld1q_f32(pSrc+0 );
- ab1.val[1] = vld1q_f32(pSrc+4 );
- ab2.val[0] = vld1q_f32(pSrc+8 );
- ab2.val[1] = vld1q_f32(pSrc+12);
- ab3.val[0] = vld1q_f32(pSrc+16);
- ab3.val[1] = vld1q_f32(pSrc+20);
- ab4.val[0] = vld1q_f32(pSrc+24);
- ab4.val[1] = vld1q_f32(pSrc+28);
- ab5.val[0] = vld1q_f32(pSrc+32);
- ab5.val[1] = vld1q_f32(pSrc+36);
- ab6.val[0] = vld1q_f32(pSrc+40);
- ab6.val[1] = vld1q_f32(pSrc+44);
- ab7.val[0] = vld1q_f32(pSrc+48);
- ab7.val[1] = vld1q_f32(pSrc+52);
- ab8.val[0] = vld1q_f32(pSrc+56);
- ab8.val[1] = vld1q_f32(pSrc+60);
- vldrev128q_f32(cd8.val[1], pSrc+64);
- vldrev128q_f32(cd8.val[0], pSrc+68);
- vldrev128q_f32(cd7.val[1], pSrc+72);
- vldrev128q_f32(cd7.val[0], pSrc+76);
- vldrev128q_f32(cd6.val[1], pSrc+80);
- vldrev128q_f32(cd6.val[0], pSrc+84);
- vldrev128q_f32(cd5.val[1], pSrc+88);
- vldrev128q_f32(cd5.val[0], pSrc+92);
- vldrev128q_f32(cd4.val[1], pSrc+96);
- vldrev128q_f32(cd4.val[0], pSrc+100);
- vldrev128q_f32(cd3.val[1], pSrc+104);
- vldrev128q_f32(cd3.val[0], pSrc+108);
- vldrev128q_f32(cd2.val[1], pSrc+112);
- vldrev128q_f32(cd2.val[0], pSrc+116);
- vldrev128q_f32(cd1.val[1], pSrc+120);
- vldrev128q_f32(cd1.val[0], pSrc+124);
-
- //Compare
- if(dir)
- {
- vminmax256q(ab1, cd1);
- vminmax256q(ab2, cd2);
- vminmax256q(ab3, cd3);
- vminmax256q(ab4, cd4);
- vminmax256q(ab5, cd5);
- vminmax256q(ab6, cd6);
- vminmax256q(ab7, cd7);
- vminmax256q(ab8, cd8);
- }
- else
- {
- vminmax256q(cd1, ab1);
- vminmax256q(cd2, ab2);
- vminmax256q(cd3, ab3);
- vminmax256q(cd4, ab4);
- vminmax256q(cd5, ab5);
- vminmax256q(cd6, ab6);
- vminmax256q(cd7, ab7);
- vminmax256q(cd8, ab8);
- }
-
- //Transpose
- float32x4_t temp;
- temp = ab5.val[0];
- ab5.val[0] = cd1.val[0];
- cd1.val[0] = temp;
- temp = ab5.val[1];
- ab5.val[1] = cd1.val[1];
- cd1.val[1] = temp;
- temp = ab6.val[0];
- ab6.val[0] = cd2.val[0];
- cd2.val[0] = temp;
- temp = ab6.val[1];
- ab6.val[1] = cd2.val[1];
- cd2.val[1] = temp;
- temp = ab7.val[0];
- ab7.val[0] = cd3.val[0];
- cd3.val[0] = temp;
- temp = ab7.val[1];
- ab7.val[1] = cd3.val[1];
- cd3.val[1] = temp;
- temp = ab8.val[0];
- ab8.val[0] = cd4.val[0];
- cd4.val[0] = temp;
- temp = ab8.val[1];
- ab8.val[1] = cd4.val[1];
- cd4.val[1] = temp;
- //Compare
- if(dir)
- {
- vminmax256q(ab1, cd1);
- vminmax256q(ab2, cd2);
- vminmax256q(ab3, cd3);
- vminmax256q(ab4, cd4);
- vminmax256q(ab5, cd5);
- vminmax256q(ab6, cd6);
- vminmax256q(ab7, cd7);
- vminmax256q(ab8, cd8);
- }
- else
- {
- vminmax256q(cd1, ab1);
- vminmax256q(cd2, ab2);
- vminmax256q(cd3, ab3);
- vminmax256q(cd4, ab4);
- vminmax256q(cd5, ab5);
- vminmax256q(cd6, ab6);
- vminmax256q(cd7, ab7);
- vminmax256q(cd8, ab8);
- }
- vst1q_f32(pSrc, ab1.val[0]);
- vst1q_f32(pSrc+4, ab1.val[1]);
- vst1q_f32(pSrc+8, ab2.val[0]);
- vst1q_f32(pSrc+12, ab2.val[1]);
- vst1q_f32(pSrc+16, ab3.val[0]);
- vst1q_f32(pSrc+20, ab3.val[1]);
- vst1q_f32(pSrc+24, ab4.val[0]);
- vst1q_f32(pSrc+28, ab4.val[1]);
- vst1q_f32(pSrc+32, cd1.val[0]);
- vst1q_f32(pSrc+36, cd1.val[1]);
- vst1q_f32(pSrc+40, cd2.val[0]);
- vst1q_f32(pSrc+44, cd2.val[1]);
- vst1q_f32(pSrc+48, cd3.val[0]);
- vst1q_f32(pSrc+52, cd3.val[1]);
- vst1q_f32(pSrc+56, cd4.val[0]);
- vst1q_f32(pSrc+60, cd4.val[1]);
- vst1q_f32(pSrc+64, ab5.val[0]);
- vst1q_f32(pSrc+68, ab5.val[1]);
- vst1q_f32(pSrc+72, ab6.val[0]);
- vst1q_f32(pSrc+76, ab6.val[1]);
- vst1q_f32(pSrc+80, ab7.val[0]);
- vst1q_f32(pSrc+84, ab7.val[1]);
- vst1q_f32(pSrc+88, ab8.val[0]);
- vst1q_f32(pSrc+92, ab8.val[1]);
- vst1q_f32(pSrc+96, cd5.val[0]);
- vst1q_f32(pSrc+100, cd5.val[1]);
- vst1q_f32(pSrc+104, cd6.val[0]);
- vst1q_f32(pSrc+108, cd6.val[1]);
- vst1q_f32(pSrc+112, cd7.val[0]);
- vst1q_f32(pSrc+116, cd7.val[1]);
- vst1q_f32(pSrc+120, cd8.val[0]);
- vst1q_f32(pSrc+124, cd8.val[1]);
- //Transpose
- arm_bitonic_merge_64_f32(pSrc+0 , dir);
- arm_bitonic_merge_64_f32(pSrc+64, dir);
- }
- static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
- {
- float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
- float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
- float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
- float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;
- //Load and reverse second array
- ab1.val[0] = vld1q_f32(pSrc+0 );
- ab1.val[1] = vld1q_f32(pSrc+4 );
- ab2.val[0] = vld1q_f32(pSrc+8 );
- ab2.val[1] = vld1q_f32(pSrc+12 );
- ab3.val[0] = vld1q_f32(pSrc+16 );
- ab3.val[1] = vld1q_f32(pSrc+20 );
- ab4.val[0] = vld1q_f32(pSrc+24 );
- ab4.val[1] = vld1q_f32(pSrc+28 );
- ab5.val[0] = vld1q_f32(pSrc+32 );
- ab5.val[1] = vld1q_f32(pSrc+36 );
- ab6.val[0] = vld1q_f32(pSrc+40 );
- ab6.val[1] = vld1q_f32(pSrc+44 );
- ab7.val[0] = vld1q_f32(pSrc+48 );
- ab7.val[1] = vld1q_f32(pSrc+52 );
- ab8.val[0] = vld1q_f32(pSrc+56 );
- ab8.val[1] = vld1q_f32(pSrc+60 );
- ab9.val[0] = vld1q_f32(pSrc+64 );
- ab9.val[1] = vld1q_f32(pSrc+68 );
- ab10.val[0] = vld1q_f32(pSrc+72 );
- ab10.val[1] = vld1q_f32(pSrc+76 );
- ab11.val[0] = vld1q_f32(pSrc+80 );
- ab11.val[1] = vld1q_f32(pSrc+84 );
- ab12.val[0] = vld1q_f32(pSrc+88 );
- ab12.val[1] = vld1q_f32(pSrc+92 );
- ab13.val[0] = vld1q_f32(pSrc+96 );
- ab13.val[1] = vld1q_f32(pSrc+100);
- ab14.val[0] = vld1q_f32(pSrc+104);
- ab14.val[1] = vld1q_f32(pSrc+108);
- ab15.val[0] = vld1q_f32(pSrc+112);
- ab15.val[1] = vld1q_f32(pSrc+116);
- ab16.val[0] = vld1q_f32(pSrc+120);
- ab16.val[1] = vld1q_f32(pSrc+124);
- vldrev128q_f32(cd16.val[1], pSrc+128);
- vldrev128q_f32(cd16.val[0], pSrc+132);
- vldrev128q_f32(cd15.val[1], pSrc+136);
- vldrev128q_f32(cd15.val[0], pSrc+140);
- vldrev128q_f32(cd14.val[1], pSrc+144);
- vldrev128q_f32(cd14.val[0], pSrc+148);
- vldrev128q_f32(cd13.val[1], pSrc+152);
- vldrev128q_f32(cd13.val[0], pSrc+156);
- vldrev128q_f32(cd12.val[1], pSrc+160);
- vldrev128q_f32(cd12.val[0], pSrc+164);
- vldrev128q_f32(cd11.val[1], pSrc+168);
- vldrev128q_f32(cd11.val[0], pSrc+172);
- vldrev128q_f32(cd10.val[1], pSrc+176);
- vldrev128q_f32(cd10.val[0], pSrc+180);
- vldrev128q_f32(cd9.val[1] , pSrc+184);
- vldrev128q_f32(cd9.val[0] , pSrc+188);
- vldrev128q_f32(cd8.val[1] , pSrc+192);
- vldrev128q_f32(cd8.val[0] , pSrc+196);
- vldrev128q_f32(cd7.val[1] , pSrc+200);
- vldrev128q_f32(cd7.val[0] , pSrc+204);
- vldrev128q_f32(cd6.val[1] , pSrc+208);
- vldrev128q_f32(cd6.val[0] , pSrc+212);
- vldrev128q_f32(cd5.val[1] , pSrc+216);
- vldrev128q_f32(cd5.val[0] , pSrc+220);
- vldrev128q_f32(cd4.val[1] , pSrc+224);
- vldrev128q_f32(cd4.val[0] , pSrc+228);
- vldrev128q_f32(cd3.val[1] , pSrc+232);
- vldrev128q_f32(cd3.val[0] , pSrc+236);
- vldrev128q_f32(cd2.val[1] , pSrc+240);
- vldrev128q_f32(cd2.val[0] , pSrc+244);
- vldrev128q_f32(cd1.val[1] , pSrc+248);
- vldrev128q_f32(cd1.val[0] , pSrc+252);
-
- //Compare
- if(dir)
- {
- vminmax256q(ab1 , cd1 );
- vminmax256q(ab2 , cd2 );
- vminmax256q(ab3 , cd3 );
- vminmax256q(ab4 , cd4 );
- vminmax256q(ab5 , cd5 );
- vminmax256q(ab6 , cd6 );
- vminmax256q(ab7 , cd7 );
- vminmax256q(ab8 , cd8 );
- vminmax256q(ab9 , cd9 );
- vminmax256q(ab10, cd10);
- vminmax256q(ab11, cd11);
- vminmax256q(ab12, cd12);
- vminmax256q(ab13, cd13);
- vminmax256q(ab14, cd14);
- vminmax256q(ab15, cd15);
- vminmax256q(ab16, cd16);
- }
- else
- {
- vminmax256q(cd1 , ab1 );
- vminmax256q(cd2 , ab2 );
- vminmax256q(cd3 , ab3 );
- vminmax256q(cd4 , ab4 );
- vminmax256q(cd5 , ab5 );
- vminmax256q(cd6 , ab6 );
- vminmax256q(cd7 , ab7 );
- vminmax256q(cd8 , ab8 );
- vminmax256q(cd9 , ab9 );
- vminmax256q(cd10, ab10);
- vminmax256q(cd11, ab11);
- vminmax256q(cd12, ab12);
- vminmax256q(cd13, ab13);
- vminmax256q(cd14, ab14);
- vminmax256q(cd15, ab15);
- vminmax256q(cd16, ab16);
- }
- //Transpose
- float32x4_t temp;
- temp = ab9.val[0];
- ab9.val[0] = cd1.val[0];
- cd1.val[0] = temp;
- temp = ab9.val[1];
- ab9.val[1] = cd1.val[1];
- cd1.val[1] = temp;
- temp = ab10.val[0];
- ab10.val[0] = cd2.val[0];
- cd2.val[0] = temp;
- temp = ab10.val[1];
- ab10.val[1] = cd2.val[1];
- cd2.val[1] = temp;
- temp = ab11.val[0];
- ab11.val[0] = cd3.val[0];
- cd3.val[0] = temp;
- temp = ab11.val[1];
- ab11.val[1] = cd3.val[1];
- cd3.val[1] = temp;
- temp = ab12.val[0];
- ab12.val[0] = cd4.val[0];
- cd4.val[0] = temp;
- temp = ab12.val[1];
- ab12.val[1] = cd4.val[1];
- cd4.val[1] = temp;
- temp = ab13.val[0];
- ab13.val[0] = cd5.val[0];
- cd5.val[0] = temp;
- temp = ab13.val[1];
- ab13.val[1] = cd5.val[1];
- cd5.val[1] = temp;
- temp = ab14.val[0];
- ab14.val[0] = cd6.val[0];
- cd6.val[0] = temp;
- temp = ab14.val[1];
- ab14.val[1] = cd6.val[1];
- cd6.val[1] = temp;
- temp = ab15.val[0];
- ab15.val[0] = cd7.val[0];
- cd7.val[0] = temp;
- temp = ab15.val[1];
- ab15.val[1] = cd7.val[1];
- cd7.val[1] = temp;
- temp = ab16.val[0];
- ab16.val[0] = cd8.val[0];
- cd8.val[0] = temp;
- temp = ab16.val[1];
- ab16.val[1] = cd8.val[1];
- cd8.val[1] = temp;
- //Compare
- if(dir)
- {
- vminmax256q(ab1 , cd1 );
- vminmax256q(ab2 , cd2 );
- vminmax256q(ab3 , cd3 );
- vminmax256q(ab4 , cd4 );
- vminmax256q(ab5 , cd5 );
- vminmax256q(ab6 , cd6 );
- vminmax256q(ab7 , cd7 );
- vminmax256q(ab8 , cd8 );
- vminmax256q(ab9 , cd9 );
- vminmax256q(ab10, cd10);
- vminmax256q(ab11, cd11);
- vminmax256q(ab12, cd12);
- vminmax256q(ab13, cd13);
- vminmax256q(ab14, cd14);
- vminmax256q(ab15, cd15);
- vminmax256q(ab16, cd16);
- }
- else
- {
- vminmax256q(cd1 , ab1 );
- vminmax256q(cd2 , ab2 );
- vminmax256q(cd3 , ab3 );
- vminmax256q(cd4 , ab4 );
- vminmax256q(cd5 , ab5 );
- vminmax256q(cd6 , ab6 );
- vminmax256q(cd7 , ab7 );
- vminmax256q(cd8 , ab8 );
- vminmax256q(cd9 , ab9 );
- vminmax256q(cd10, ab10);
- vminmax256q(cd11, ab11);
- vminmax256q(cd12, ab12);
- vminmax256q(cd13, ab13);
- vminmax256q(cd14, ab14);
- vminmax256q(cd15, ab15);
- vminmax256q(cd16, ab16);
- }
- vst1q_f32(pSrc, ab1.val[0] );
- vst1q_f32(pSrc+4, ab1.val[1] );
- vst1q_f32(pSrc+8, ab2.val[0] );
- vst1q_f32(pSrc+12, ab2.val[1] );
- vst1q_f32(pSrc+16, ab3.val[0] );
- vst1q_f32(pSrc+20, ab3.val[1] );
- vst1q_f32(pSrc+24, ab4.val[0] );
- vst1q_f32(pSrc+28, ab4.val[1] );
- vst1q_f32(pSrc+32, ab5.val[0] );
- vst1q_f32(pSrc+36, ab5.val[1] );
- vst1q_f32(pSrc+40, ab6.val[0] );
- vst1q_f32(pSrc+44, ab6.val[1] );
- vst1q_f32(pSrc+48, ab7.val[0] );
- vst1q_f32(pSrc+52, ab7.val[1] );
- vst1q_f32(pSrc+56, ab8.val[0] );
- vst1q_f32(pSrc+60, ab8.val[1] );
- vst1q_f32(pSrc+64, cd1.val[0] );
- vst1q_f32(pSrc+68, cd1.val[1] );
- vst1q_f32(pSrc+72, cd2.val[0] );
- vst1q_f32(pSrc+76, cd2.val[1] );
- vst1q_f32(pSrc+80, cd3.val[0] );
- vst1q_f32(pSrc+84, cd3.val[1] );
- vst1q_f32(pSrc+88, cd4.val[0] );
- vst1q_f32(pSrc+92, cd4.val[1] );
- vst1q_f32(pSrc+96, cd5.val[0] );
- vst1q_f32(pSrc+100, cd5.val[1] );
- vst1q_f32(pSrc+104, cd6.val[0] );
- vst1q_f32(pSrc+108, cd6.val[1] );
- vst1q_f32(pSrc+112, cd7.val[0] );
- vst1q_f32(pSrc+116, cd7.val[1] );
- vst1q_f32(pSrc+120, cd8.val[0] );
- vst1q_f32(pSrc+124, cd8.val[1] );
- vst1q_f32(pSrc+128, ab9.val[0] );
- vst1q_f32(pSrc+132, ab9.val[1] );
- vst1q_f32(pSrc+136, ab10.val[0]);
- vst1q_f32(pSrc+140, ab10.val[1]);
- vst1q_f32(pSrc+144, ab11.val[0]);
- vst1q_f32(pSrc+148, ab11.val[1]);
- vst1q_f32(pSrc+152, ab12.val[0]);
- vst1q_f32(pSrc+156, ab12.val[1]);
- vst1q_f32(pSrc+160, ab13.val[0]);
- vst1q_f32(pSrc+164, ab13.val[1]);
- vst1q_f32(pSrc+168, ab14.val[0]);
- vst1q_f32(pSrc+172, ab14.val[1]);
- vst1q_f32(pSrc+176, ab15.val[0]);
- vst1q_f32(pSrc+180, ab15.val[1]);
- vst1q_f32(pSrc+184, ab16.val[0]);
- vst1q_f32(pSrc+188, ab16.val[1]);
- vst1q_f32(pSrc+192, cd9.val[0] );
- vst1q_f32(pSrc+196, cd9.val[1] );
- vst1q_f32(pSrc+200, cd10.val[0]);
- vst1q_f32(pSrc+204, cd10.val[1]);
- vst1q_f32(pSrc+208, cd11.val[0]);
- vst1q_f32(pSrc+212, cd11.val[1]);
- vst1q_f32(pSrc+216, cd12.val[0]);
- vst1q_f32(pSrc+220, cd12.val[1]);
- vst1q_f32(pSrc+224, cd13.val[0]);
- vst1q_f32(pSrc+228, cd13.val[1]);
- vst1q_f32(pSrc+232, cd14.val[0]);
- vst1q_f32(pSrc+236, cd14.val[1]);
- vst1q_f32(pSrc+240, cd15.val[0]);
- vst1q_f32(pSrc+244, cd15.val[1]);
- vst1q_f32(pSrc+248, cd16.val[0]);
- vst1q_f32(pSrc+252, cd16.val[1]);
- //Transpose
- arm_bitonic_merge_128_f32(pSrc+0 , dir);
- arm_bitonic_merge_128_f32(pSrc+128, dir);
- }
- #define SWAP(a,i,j) \
- temp = vgetq_lane_f32(a, j); \
- a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j);\
- a = vsetq_lane_f32(temp, a, i);
- static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
- {
- float32_t temp;
- if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
- {
- SWAP(a,0,1);
- }
- if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
- {
- SWAP(a,2,3);
- }
- if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) )
- {
- SWAP(a,0,3);
- }
- if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) )
- {
- SWAP(a,1,2);
- }
- if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
- {
- SWAP(a,0,1);
- }
- if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) )
- {
- SWAP(a,2,3);
- }
- return a;
- }
- static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
- {
- a = arm_bitonic_sort_4_f32(a, dir);
- b = arm_bitonic_sort_4_f32(b, dir);
- return arm_bitonic_merge_8_f32(a, b, dir);
- }
- #endif
- /**
- @ingroup groupSupport
- */
- /**
- @defgroup Sorting Vector sorting algorithms
- Sort the elements of a vector
- There are separate functions for floating-point, Q31, Q15, and Q7 data types.
- */
- /**
- @addtogroup Sorting
- @{
- */
- /**
- * @private
- * @param[in] S points to an instance of the sorting structure.
- * @param[in] pSrc points to the block of input data.
- * @param[out] pDst points to the block of output data
- * @param[in] blockSize number of samples to process.
- */
- void arm_bitonic_sort_f32(
- const arm_sort_instance_f32 * S,
- float32_t * pSrc,
- float32_t * pDst,
- uint32_t blockSize)
- {
- uint16_t s, i;
- uint8_t dir = S->dir;
- #ifdef ARM_MATH_NEON
- (void)s;
- float32_t * pOut;
- uint16_t counter = blockSize>>5;
- if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
- {
- if(pSrc == pDst) // in-place
- pOut = pSrc;
- else
- pOut = pDst;
-
- float32x4x2_t ab1, ab2;
- float32x4x2_t cd1, cd2;
- if(blockSize == 1)
- pOut = pSrc;
- else if(blockSize == 2)
- {
- float32_t temp;
-
- if( dir==(pSrc[0]>pSrc[1]) )
- {
- temp = pSrc[1];
- pOut[1] = pSrc[0];
- pOut[0] = temp;
- }
- else
- pOut = pSrc;
- }
- else if(blockSize == 4)
- {
- float32x4_t a = vld1q_f32(pSrc);
- a = arm_bitonic_sort_4_f32(a, dir);
- vst1q_f32(pOut, a);
- }
- else if(blockSize == 8)
- {
- float32x4_t a;
- float32x4_t b;
- float32x4x2_t ab;
-
- a = vld1q_f32(pSrc);
- b = vld1q_f32(pSrc+4);
-
- ab = arm_bitonic_sort_8_f32(a, b, dir);
- vst1q_f32(pOut, ab.val[0]);
- vst1q_f32(pOut+4, ab.val[1]);
- }
- else if(blockSize >=16)
- {
- // Order 16 bits long vectors
- for(i=0; i<blockSize; i=i+16)
- arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir);
-
- // Merge
- for(i=0; i<counter; i++)
- {
- // Load and reverse second vector
- ab1.val[0] = vld1q_f32(pOut+32*i+0 );
- ab1.val[1] = vld1q_f32(pOut+32*i+4 );
- ab2.val[0] = vld1q_f32(pOut+32*i+8 );
- ab2.val[1] = vld1q_f32(pOut+32*i+12);
- vldrev128q_f32(cd2.val[1], pOut+32*i+16);
- vldrev128q_f32(cd2.val[0], pOut+32*i+20);
- vldrev128q_f32(cd1.val[1], pOut+32*i+24);
- vldrev128q_f32(cd1.val[0], pOut+32*i+28);
- arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir);
- }
-
- counter = counter>>1;
- for(i=0; i<counter; i++)
- arm_bitonic_merge_64_f32(pOut+64*i, dir);
-
- counter = counter>>1;
- for(i=0; i<counter; i++)
- arm_bitonic_merge_128_f32(pOut+128*i, dir);
-
- counter = counter>>1;
- for(i=0; i<counter; i++)
- arm_bitonic_merge_256_f32(pOut+256*i, dir);
- // Etc...
- }
- }
- #else
- float32_t * pA;
- if(pSrc != pDst) // out-of-place
- {
- memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
- pA = pDst;
- }
- else
- pA = pSrc;
- if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
- {
- for(s=2; s<=blockSize; s=s*2)
- {
- for(i=0; i<blockSize; i=i+s)
- arm_bitonic_sort_core_f32(pA+i, s, dir);
- }
- }
- #endif
- }
- /**
- @} end of Sorting group
- */
|