arm_bitonic_sort_f32.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_bitonic_sort_f32.c
  4. * Description: Floating point bitonic sort
  5. *
  6. * $Date: 23 April 2021
  7. * $Revision: V1.9.0
  8. *
  9. * Target Processor: Cortex-M and Cortex-A cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "dsp/support_functions.h"
  29. #include "arm_sorting.h"
  30. #if !defined(ARM_MATH_NEON)
  31. static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
  32. {
  33. uint32_t step;
  34. uint32_t k, j;
  35. float32_t *leftPtr, *rightPtr;
  36. float32_t temp;
  37. step = n>>1;
  38. leftPtr = pSrc;
  39. rightPtr = pSrc+n-1;
  40. for(k=0; k<step; k++)
  41. {
  42. if(dir == (*leftPtr > *rightPtr))
  43. {
  44. // Swap
  45. temp=*leftPtr;
  46. *leftPtr=*rightPtr;
  47. *rightPtr=temp;
  48. }
  49. leftPtr++; // Move right
  50. rightPtr--; // Move left
  51. }
  52. // Merge
  53. for(step=(n>>2); step>0; step/=2)
  54. {
  55. for(j=0; j<n; j=j+step*2)
  56. {
  57. leftPtr = pSrc+j;
  58. rightPtr = pSrc+j+step;
  59. for(k=0; k<step; k++)
  60. {
  61. if(*leftPtr > *rightPtr)
  62. {
  63. // Swap
  64. temp=*leftPtr;
  65. *leftPtr=*rightPtr;
  66. *rightPtr=temp;
  67. }
  68. leftPtr++;
  69. rightPtr++;
  70. }
  71. }
  72. }
  73. }
  74. #endif
  75. #if defined(ARM_MATH_NEON)
/**
 * Final "resort" stage of an 8-element bitonic merge.
 * On entry every element of a is already on the correct side of every
 * element of b (see the comment below); two transpose + compare rounds
 * finish sorting both vectors, which are then re-interleaved.
 *
 * @param a    lower half (4 lanes)
 * @param b    upper half (4 lanes)
 * @param dir  non-zero selects one compare direction, 0 the opposite
 *             (ascending vs descending — matches the callers' convention)
 * @return the 8 sorted values as a zipped vector pair
 *
 * NOTE(review): vtrn128_64q / vtrn128_32q / vminmaxq appear to be statement
 * macros from arm_sorting.h that update both arguments in place (a and b
 * are reused after each call) — confirm in that header.
 */
static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    /* Start with two vectors:
     * +---+---+---+---+
     * | a | b | c | d |
     * +---+---+---+---+
     * +---+---+---+---+
     * | e | f | g | h |
     * +---+---+---+---+
     * All the elements of the first are guaranteed to be less than or equal to
     * all of the elements in the second, and both vectors are bitonic.
     * We need to perform these operations to completely sort both lists:
     * vminmax([abcd],[efgh])
     * vminmax([acbd],[egfh])
     */
    vtrn128_64q(a, b);
    /* +---+---+---+---+
     * | a | b | e | f |
     * +---+---+---+---+
     * +---+---+---+---+
     * | c | d | g | h |
     * +---+---+---+---+
     */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);   /* swapped operands invert the compare direction */
    vtrn128_32q(a, b);
    /* +---+---+---+---+
     * | a | c | e | g |
     * +---+---+---+---+
     * +---+---+---+---+
     * | b | d | f | h |
     * +---+---+---+---+
     */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);
    /* Re-interleave so the sorted sequence is in positional order. */
    return vzipq_f32(a, b);
}
/**
 * Merge two sorted 4-lane vectors into one sorted 8-element sequence.
 * Reversing b turns the concatenation (a, b) into a bitonic sequence,
 * one compare/exchange splits it into low/high halves, and the resort
 * step finishes the sort.
 *
 * @param a, b  each sorted in direction dir
 * @param dir   non-zero/zero selects the sort direction (callers'
 *              convention: non-zero == ascending)
 * @return the 8 sorted values as a vector pair
 */
static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    /* a and b are guaranteed to be bitonic */
    // Reverse the element of the second vector
    b = vrev128q_f32(b);
    // Compare the two vectors
    if(dir)
        vminmaxq(a, b);   /* NOTE(review): macro — appears to update a and b in place */
    else
        vminmaxq(b, a);
    // Merge the two vectors
    float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir);
    return ab;
}
/**
 * Final "resort" stage of a 16-element bitonic merge: three transpose +
 * 256-bit compare rounds finish sorting the two 8-element halves, then the
 * 16 sorted values are interleaved and stored to pOut.
 *
 * @param[out] pOut  destination for the 16 sorted floats
 * @param a, b       8-element halves (see precondition in comment below)
 * @param dir        non-zero/zero selects the sort direction
 *
 * NOTE(review): vtrn256_* / vminmax256q appear to be statement macros from
 * arm_sorting.h operating on float32x4x2_t pairs in place — confirm there.
 */
static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
{
    /* Start with two vectors:
     * +---+---+---+---+---+---+---+---+
     * | a | b | c | d | e | f | g | h |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | i | j | k | l | m | n | o | p |
     * +---+---+---+---+---+---+---+---+
     * All the elements of the first are guaranteed to be less than or equal to
     * all of the elements in the second, and both vectors are bitonic.
     * We need to perform these operations to completely sort both lists:
     * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
     * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
     * vminmax([acef],[bdfh]) vminmax([ikmo],[jlmp])
     */
    vtrn256_128q(a, b);
    /* +---+---+---+---+---+---+---+---+
     * | a | b | c | d | i | j | k | l |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | e | f | g | h | m | n | o | p |
     * +---+---+---+---+---+---+---+---+
     */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);
    vtrn256_64q(a, b);
    /* +---+---+---+---+---+---+---+---+
     * | a | b | e | f | i | j | m | n |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | c | d | g | h | k | l | o | p |
     * +---+---+---+---+---+---+---+---+
     */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);
    vtrn256_32q(a, b);
    /* We now have:
     * +---+---+---+---+---+---+---+---+
     * | a | c | e | g | i | k | m | o |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | b | d | f | h | j | l | n | p |
     * +---+---+---+---+---+---+---+---+
     */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);
    /* Interleave back to positional order and store the 16 results. */
    float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
    float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);
    vst1q_f32(pOut, out1.val[0]);
    vst1q_f32(pOut+4, out1.val[1]);
    vst1q_f32(pOut+8, out2.val[0]);
    vst1q_f32(pOut+12, out2.val[1]);
}
/**
 * Merge two sorted 8-element runs (a and b) into one sorted 16-element run,
 * stored to pOut. Reversing b makes (a, b) bitonic; one 256-bit
 * compare/exchange splits low/high halves, then the resort step finishes.
 *
 * @param[out] pOut  destination for the 16 sorted floats
 * @param a, b       8-element runs, each sorted in direction dir
 * @param dir        non-zero/zero selects the sort direction
 */
static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
{
    // Merge two preordered float32x4x2_t
    vrev256q_f32(b);   /* NOTE(review): statement macro — appears to reverse b in place */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);
    arm_bitonic_resort_16_f32(pOut, a, b, dir);
}
/**
 * Fully sort 16 floats from pSrc into pDst.
 * Strategy: run four 4-element sorting networks in parallel across lanes,
 * transpose so each vector holds one sorted run of 4, then merge 4→8→16.
 *
 * @param[in]  pSrc  16 input samples
 * @param[out] pDst  16 sorted samples (may alias pSrc — all loads happen first)
 * @param[in]  dir   non-zero/zero selects the sort direction
 */
static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
{
    float32x4_t a;
    float32x4_t b;
    float32x4_t c;
    float32x4_t d;

    // Load 16 samples
    a = vld1q_f32(pSrc);
    b = vld1q_f32(pSrc+4);
    c = vld1q_f32(pSrc+8);
    d = vld1q_f32(pSrc+12);

    // Bitonic sorting network for 4 samples x 4 times
    /* Each lane position across a,b,c,d is an independent 4-element
     * network; the dir==0 branch is the same network with the minmax
     * operands swapped to invert the compare direction. */
    if(dir)
    {
        vminmaxq(a, b);
        vminmaxq(c, d);
        vminmaxq(a, d);
        vminmaxq(b, c);
        vminmaxq(a, b);
        vminmaxq(c, d);
    }
    else
    {
        vminmaxq(b, a);
        vminmaxq(d, c);
        vminmaxq(d, a);
        vminmaxq(c, b);
        vminmaxq(b, a);
        vminmaxq(d, c);
    }

    float32x4x2_t ab = vtrnq_f32 (a, b);
    float32x4x2_t cd = vtrnq_f32 (c, d);

    // Transpose 4 ordered arrays of 4 samples
    /* After the trn/combine sequence each of a,b,c,d holds one sorted
     * 4-element run in its lanes. */
    a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0]));
    b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1]));
    c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0]));
    d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1]));

    // Merge pairs of arrays of 4 samples
    ab = arm_bitonic_merge_8_f32(a, b, dir);
    cd = arm_bitonic_merge_8_f32(c, d, dir);

    // Merge arrays of 8 samples
    arm_bitonic_merge_16_f32(pDst, ab, cd, dir);
}
/**
 * Merge step for 32 elements held in registers, storing results via two
 * 16-element merges to pSrc[0..15] and pSrc[16..31].
 * Precondition (set up by the callers): ab1:ab2 concatenated with the
 * already-reversed cd1:cd2 form a bitonic 32-element sequence.
 *
 * @param[out] pSrc     destination buffer (32 floats)
 * @param ab1, ab2      first 16 elements
 * @param cd1, cd2      second 16 elements, loaded in reversed order by callers
 * @param dir           non-zero/zero selects the sort direction
 */
static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
{
    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }
    //Transpose 256
    /* Swap ab2 <-> cd1 so each register pair again holds contiguous halves
     * for the next (distance-8) compare stage. */
    float32x4_t temp;
    temp = ab2.val[0];
    ab2.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab2.val[1];
    ab2.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }
    //Transpose 128
    /* Remaining distances (<= 8) are handled by the 16-element merges. */
    arm_bitonic_merge_16_f32(pSrc+0, ab1, cd1, dir);
    arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir);
}
/**
 * In-place merge of two sorted 32-element runs (pSrc[0..31], pSrc[32..63])
 * into one sorted 64-element run.
 * The second run is loaded reversed (vldrev128q_f32) so the whole 64
 * elements form a bitonic sequence; two compare stages (distance 32, then
 * 16 after the register swap) are done here, and the rest is delegated to
 * two 32-element merges.
 *
 * @param[in,out] pSrc  64 floats, both halves sorted in direction dir
 * @param[in]     dir   non-zero/zero selects the sort direction
 */
static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4;
    float32x4x2_t cd1, cd2, cd3, cd4;

    //Load and reverse second array
    ab1.val[0] = vld1q_f32(pSrc+0 );
    ab1.val[1] = vld1q_f32(pSrc+4 );
    ab2.val[0] = vld1q_f32(pSrc+8 );
    ab2.val[1] = vld1q_f32(pSrc+12);
    ab3.val[0] = vld1q_f32(pSrc+16);
    ab3.val[1] = vld1q_f32(pSrc+20);
    ab4.val[0] = vld1q_f32(pSrc+24);
    ab4.val[1] = vld1q_f32(pSrc+28);
    /* cd4..cd1 are filled back-to-front so cd1 ends up holding the last
     * 8 input elements, i.e. the second run fully reversed. */
    vldrev128q_f32(cd4.val[1], pSrc+32);
    vldrev128q_f32(cd4.val[0], pSrc+36);
    vldrev128q_f32(cd3.val[1], pSrc+40);
    vldrev128q_f32(cd3.val[0], pSrc+44);
    vldrev128q_f32(cd2.val[1], pSrc+48);
    vldrev128q_f32(cd2.val[0], pSrc+52);
    vldrev128q_f32(cd1.val[1], pSrc+56);
    vldrev128q_f32(cd1.val[0], pSrc+60);
    //Compare
    /* Distance-32 compare/exchange of the bitonic sequence. */
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
    }
    //Transpose 512
    /* Swap (ab3,ab4) <-> (cd1,cd2) so each variable group again holds a
     * contiguous 32-element half for the next stage. */
    float32x4_t temp;
    temp = ab3.val[0];
    ab3.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab3.val[1];
    ab3.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab4.val[0];
    ab4.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab4.val[1];
    ab4.val[1] = cd2.val[1];
    cd2.val[1] = temp;
    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
    }
    //Transpose 256
    arm_bitonic_merge_32_f32(pSrc+0, ab1, ab2, cd1, cd2, dir);
    arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir);
}
/**
 * In-place merge of two sorted 64-element runs (pSrc[0..63], pSrc[64..127])
 * into one sorted 128-element run.
 * Same scheme as arm_bitonic_merge_64_f32 one level up: load the second run
 * reversed to form a bitonic sequence, do the distance-64 and (after a
 * register swap) distance-32 compare stages in registers, store everything
 * back, then recurse into two 64-element merges.
 *
 * @param[in,out] pSrc  128 floats, both halves sorted in direction dir
 * @param[in]     dir   non-zero/zero selects the sort direction
 */
static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;

    //Load and reverse second array
    ab1.val[0] = vld1q_f32(pSrc+0 );
    ab1.val[1] = vld1q_f32(pSrc+4 );
    ab2.val[0] = vld1q_f32(pSrc+8 );
    ab2.val[1] = vld1q_f32(pSrc+12);
    ab3.val[0] = vld1q_f32(pSrc+16);
    ab3.val[1] = vld1q_f32(pSrc+20);
    ab4.val[0] = vld1q_f32(pSrc+24);
    ab4.val[1] = vld1q_f32(pSrc+28);
    ab5.val[0] = vld1q_f32(pSrc+32);
    ab5.val[1] = vld1q_f32(pSrc+36);
    ab6.val[0] = vld1q_f32(pSrc+40);
    ab6.val[1] = vld1q_f32(pSrc+44);
    ab7.val[0] = vld1q_f32(pSrc+48);
    ab7.val[1] = vld1q_f32(pSrc+52);
    ab8.val[0] = vld1q_f32(pSrc+56);
    ab8.val[1] = vld1q_f32(pSrc+60);
    /* cd8..cd1 filled back-to-front: cd1 holds the last 8 inputs, so the
     * second run is fully reversed and (ab, cd) is bitonic. */
    vldrev128q_f32(cd8.val[1], pSrc+64);
    vldrev128q_f32(cd8.val[0], pSrc+68);
    vldrev128q_f32(cd7.val[1], pSrc+72);
    vldrev128q_f32(cd7.val[0], pSrc+76);
    vldrev128q_f32(cd6.val[1], pSrc+80);
    vldrev128q_f32(cd6.val[0], pSrc+84);
    vldrev128q_f32(cd5.val[1], pSrc+88);
    vldrev128q_f32(cd5.val[0], pSrc+92);
    vldrev128q_f32(cd4.val[1], pSrc+96);
    vldrev128q_f32(cd4.val[0], pSrc+100);
    vldrev128q_f32(cd3.val[1], pSrc+104);
    vldrev128q_f32(cd3.val[0], pSrc+108);
    vldrev128q_f32(cd2.val[1], pSrc+112);
    vldrev128q_f32(cd2.val[0], pSrc+116);
    vldrev128q_f32(cd1.val[1], pSrc+120);
    vldrev128q_f32(cd1.val[0], pSrc+124);
    //Compare
    /* Distance-64 compare/exchange. */
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
        vminmax256q(ab5, cd5);
        vminmax256q(ab6, cd6);
        vminmax256q(ab7, cd7);
        vminmax256q(ab8, cd8);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
        vminmax256q(cd5, ab5);
        vminmax256q(cd6, ab6);
        vminmax256q(cd7, ab7);
        vminmax256q(cd8, ab8);
    }
    //Transpose
    /* Swap (ab5..ab8) <-> (cd1..cd4) so each group again holds a
     * contiguous 64-element half, then repeat the compare stage. */
    float32x4_t temp;
    temp = ab5.val[0];
    ab5.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab5.val[1];
    ab5.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab6.val[0];
    ab6.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab6.val[1];
    ab6.val[1] = cd2.val[1];
    cd2.val[1] = temp;
    temp = ab7.val[0];
    ab7.val[0] = cd3.val[0];
    cd3.val[0] = temp;
    temp = ab7.val[1];
    ab7.val[1] = cd3.val[1];
    cd3.val[1] = temp;
    temp = ab8.val[0];
    ab8.val[0] = cd4.val[0];
    cd4.val[0] = temp;
    temp = ab8.val[1];
    ab8.val[1] = cd4.val[1];
    cd4.val[1] = temp;
    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
        vminmax256q(ab5, cd5);
        vminmax256q(ab6, cd6);
        vminmax256q(ab7, cd7);
        vminmax256q(ab8, cd8);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
        vminmax256q(cd5, ab5);
        vminmax256q(cd6, ab6);
        vminmax256q(cd7, ab7);
        vminmax256q(cd8, ab8);
    }
    /* Spill registers back to memory in their post-swap order, then finish
     * with two recursive 64-element merges (which reload from pSrc). */
    vst1q_f32(pSrc, ab1.val[0]);
    vst1q_f32(pSrc+4, ab1.val[1]);
    vst1q_f32(pSrc+8, ab2.val[0]);
    vst1q_f32(pSrc+12, ab2.val[1]);
    vst1q_f32(pSrc+16, ab3.val[0]);
    vst1q_f32(pSrc+20, ab3.val[1]);
    vst1q_f32(pSrc+24, ab4.val[0]);
    vst1q_f32(pSrc+28, ab4.val[1]);
    vst1q_f32(pSrc+32, cd1.val[0]);
    vst1q_f32(pSrc+36, cd1.val[1]);
    vst1q_f32(pSrc+40, cd2.val[0]);
    vst1q_f32(pSrc+44, cd2.val[1]);
    vst1q_f32(pSrc+48, cd3.val[0]);
    vst1q_f32(pSrc+52, cd3.val[1]);
    vst1q_f32(pSrc+56, cd4.val[0]);
    vst1q_f32(pSrc+60, cd4.val[1]);
    vst1q_f32(pSrc+64, ab5.val[0]);
    vst1q_f32(pSrc+68, ab5.val[1]);
    vst1q_f32(pSrc+72, ab6.val[0]);
    vst1q_f32(pSrc+76, ab6.val[1]);
    vst1q_f32(pSrc+80, ab7.val[0]);
    vst1q_f32(pSrc+84, ab7.val[1]);
    vst1q_f32(pSrc+88, ab8.val[0]);
    vst1q_f32(pSrc+92, ab8.val[1]);
    vst1q_f32(pSrc+96, cd5.val[0]);
    vst1q_f32(pSrc+100, cd5.val[1]);
    vst1q_f32(pSrc+104, cd6.val[0]);
    vst1q_f32(pSrc+108, cd6.val[1]);
    vst1q_f32(pSrc+112, cd7.val[0]);
    vst1q_f32(pSrc+116, cd7.val[1]);
    vst1q_f32(pSrc+120, cd8.val[0]);
    vst1q_f32(pSrc+124, cd8.val[1]);
    //Transpose
    arm_bitonic_merge_64_f32(pSrc+0 , dir);
    arm_bitonic_merge_64_f32(pSrc+64, dir);
}
/**
 * In-place merge of two sorted 128-element runs (pSrc[0..127],
 * pSrc[128..255]) into one sorted 256-element run.
 * Same scheme as the smaller merges: load the second run reversed to make
 * the whole 256 elements bitonic, perform the distance-128 and (after a
 * register swap) distance-64 compare stages in registers, write everything
 * back, then recurse into two 128-element merges.
 *
 * @param[in,out] pSrc  256 floats, both halves sorted in direction dir
 * @param[in]     dir   non-zero/zero selects the sort direction
 */
static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
    float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
    float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;

    //Load and reverse second array
    ab1.val[0] = vld1q_f32(pSrc+0 );
    ab1.val[1] = vld1q_f32(pSrc+4 );
    ab2.val[0] = vld1q_f32(pSrc+8 );
    ab2.val[1] = vld1q_f32(pSrc+12 );
    ab3.val[0] = vld1q_f32(pSrc+16 );
    ab3.val[1] = vld1q_f32(pSrc+20 );
    ab4.val[0] = vld1q_f32(pSrc+24 );
    ab4.val[1] = vld1q_f32(pSrc+28 );
    ab5.val[0] = vld1q_f32(pSrc+32 );
    ab5.val[1] = vld1q_f32(pSrc+36 );
    ab6.val[0] = vld1q_f32(pSrc+40 );
    ab6.val[1] = vld1q_f32(pSrc+44 );
    ab7.val[0] = vld1q_f32(pSrc+48 );
    ab7.val[1] = vld1q_f32(pSrc+52 );
    ab8.val[0] = vld1q_f32(pSrc+56 );
    ab8.val[1] = vld1q_f32(pSrc+60 );
    ab9.val[0] = vld1q_f32(pSrc+64 );
    ab9.val[1] = vld1q_f32(pSrc+68 );
    ab10.val[0] = vld1q_f32(pSrc+72 );
    ab10.val[1] = vld1q_f32(pSrc+76 );
    ab11.val[0] = vld1q_f32(pSrc+80 );
    ab11.val[1] = vld1q_f32(pSrc+84 );
    ab12.val[0] = vld1q_f32(pSrc+88 );
    ab12.val[1] = vld1q_f32(pSrc+92 );
    ab13.val[0] = vld1q_f32(pSrc+96 );
    ab13.val[1] = vld1q_f32(pSrc+100);
    ab14.val[0] = vld1q_f32(pSrc+104);
    ab14.val[1] = vld1q_f32(pSrc+108);
    ab15.val[0] = vld1q_f32(pSrc+112);
    ab15.val[1] = vld1q_f32(pSrc+116);
    ab16.val[0] = vld1q_f32(pSrc+120);
    ab16.val[1] = vld1q_f32(pSrc+124);
    /* cd16..cd1 filled back-to-front: cd1 holds the last 8 inputs, so the
     * second run is fully reversed and the whole sequence is bitonic. */
    vldrev128q_f32(cd16.val[1], pSrc+128);
    vldrev128q_f32(cd16.val[0], pSrc+132);
    vldrev128q_f32(cd15.val[1], pSrc+136);
    vldrev128q_f32(cd15.val[0], pSrc+140);
    vldrev128q_f32(cd14.val[1], pSrc+144);
    vldrev128q_f32(cd14.val[0], pSrc+148);
    vldrev128q_f32(cd13.val[1], pSrc+152);
    vldrev128q_f32(cd13.val[0], pSrc+156);
    vldrev128q_f32(cd12.val[1], pSrc+160);
    vldrev128q_f32(cd12.val[0], pSrc+164);
    vldrev128q_f32(cd11.val[1], pSrc+168);
    vldrev128q_f32(cd11.val[0], pSrc+172);
    vldrev128q_f32(cd10.val[1], pSrc+176);
    vldrev128q_f32(cd10.val[0], pSrc+180);
    vldrev128q_f32(cd9.val[1] , pSrc+184);
    vldrev128q_f32(cd9.val[0] , pSrc+188);
    vldrev128q_f32(cd8.val[1] , pSrc+192);
    vldrev128q_f32(cd8.val[0] , pSrc+196);
    vldrev128q_f32(cd7.val[1] , pSrc+200);
    vldrev128q_f32(cd7.val[0] , pSrc+204);
    vldrev128q_f32(cd6.val[1] , pSrc+208);
    vldrev128q_f32(cd6.val[0] , pSrc+212);
    vldrev128q_f32(cd5.val[1] , pSrc+216);
    vldrev128q_f32(cd5.val[0] , pSrc+220);
    vldrev128q_f32(cd4.val[1] , pSrc+224);
    vldrev128q_f32(cd4.val[0] , pSrc+228);
    vldrev128q_f32(cd3.val[1] , pSrc+232);
    vldrev128q_f32(cd3.val[0] , pSrc+236);
    vldrev128q_f32(cd2.val[1] , pSrc+240);
    vldrev128q_f32(cd2.val[0] , pSrc+244);
    vldrev128q_f32(cd1.val[1] , pSrc+248);
    vldrev128q_f32(cd1.val[0] , pSrc+252);
    //Compare
    /* Distance-128 compare/exchange. */
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }
    //Transpose
    /* Swap (ab9..ab16) <-> (cd1..cd8) so each group again holds a
     * contiguous 128-element half, then repeat the compare stage. */
    float32x4_t temp;
    temp = ab9.val[0];
    ab9.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab9.val[1];
    ab9.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab10.val[0];
    ab10.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab10.val[1];
    ab10.val[1] = cd2.val[1];
    cd2.val[1] = temp;
    temp = ab11.val[0];
    ab11.val[0] = cd3.val[0];
    cd3.val[0] = temp;
    temp = ab11.val[1];
    ab11.val[1] = cd3.val[1];
    cd3.val[1] = temp;
    temp = ab12.val[0];
    ab12.val[0] = cd4.val[0];
    cd4.val[0] = temp;
    temp = ab12.val[1];
    ab12.val[1] = cd4.val[1];
    cd4.val[1] = temp;
    temp = ab13.val[0];
    ab13.val[0] = cd5.val[0];
    cd5.val[0] = temp;
    temp = ab13.val[1];
    ab13.val[1] = cd5.val[1];
    cd5.val[1] = temp;
    temp = ab14.val[0];
    ab14.val[0] = cd6.val[0];
    cd6.val[0] = temp;
    temp = ab14.val[1];
    ab14.val[1] = cd6.val[1];
    cd6.val[1] = temp;
    temp = ab15.val[0];
    ab15.val[0] = cd7.val[0];
    cd7.val[0] = temp;
    temp = ab15.val[1];
    ab15.val[1] = cd7.val[1];
    cd7.val[1] = temp;
    temp = ab16.val[0];
    ab16.val[0] = cd8.val[0];
    cd8.val[0] = temp;
    temp = ab16.val[1];
    ab16.val[1] = cd8.val[1];
    cd8.val[1] = temp;
    //Compare
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }
    /* Spill registers back to memory in post-swap order, then finish with
     * two recursive 128-element merges (which reload from pSrc). */
    vst1q_f32(pSrc, ab1.val[0] );
    vst1q_f32(pSrc+4, ab1.val[1] );
    vst1q_f32(pSrc+8, ab2.val[0] );
    vst1q_f32(pSrc+12, ab2.val[1] );
    vst1q_f32(pSrc+16, ab3.val[0] );
    vst1q_f32(pSrc+20, ab3.val[1] );
    vst1q_f32(pSrc+24, ab4.val[0] );
    vst1q_f32(pSrc+28, ab4.val[1] );
    vst1q_f32(pSrc+32, ab5.val[0] );
    vst1q_f32(pSrc+36, ab5.val[1] );
    vst1q_f32(pSrc+40, ab6.val[0] );
    vst1q_f32(pSrc+44, ab6.val[1] );
    vst1q_f32(pSrc+48, ab7.val[0] );
    vst1q_f32(pSrc+52, ab7.val[1] );
    vst1q_f32(pSrc+56, ab8.val[0] );
    vst1q_f32(pSrc+60, ab8.val[1] );
    vst1q_f32(pSrc+64, cd1.val[0] );
    vst1q_f32(pSrc+68, cd1.val[1] );
    vst1q_f32(pSrc+72, cd2.val[0] );
    vst1q_f32(pSrc+76, cd2.val[1] );
    vst1q_f32(pSrc+80, cd3.val[0] );
    vst1q_f32(pSrc+84, cd3.val[1] );
    vst1q_f32(pSrc+88, cd4.val[0] );
    vst1q_f32(pSrc+92, cd4.val[1] );
    vst1q_f32(pSrc+96, cd5.val[0] );
    vst1q_f32(pSrc+100, cd5.val[1] );
    vst1q_f32(pSrc+104, cd6.val[0] );
    vst1q_f32(pSrc+108, cd6.val[1] );
    vst1q_f32(pSrc+112, cd7.val[0] );
    vst1q_f32(pSrc+116, cd7.val[1] );
    vst1q_f32(pSrc+120, cd8.val[0] );
    vst1q_f32(pSrc+124, cd8.val[1] );
    vst1q_f32(pSrc+128, ab9.val[0] );
    vst1q_f32(pSrc+132, ab9.val[1] );
    vst1q_f32(pSrc+136, ab10.val[0]);
    vst1q_f32(pSrc+140, ab10.val[1]);
    vst1q_f32(pSrc+144, ab11.val[0]);
    vst1q_f32(pSrc+148, ab11.val[1]);
    vst1q_f32(pSrc+152, ab12.val[0]);
    vst1q_f32(pSrc+156, ab12.val[1]);
    vst1q_f32(pSrc+160, ab13.val[0]);
    vst1q_f32(pSrc+164, ab13.val[1]);
    vst1q_f32(pSrc+168, ab14.val[0]);
    vst1q_f32(pSrc+172, ab14.val[1]);
    vst1q_f32(pSrc+176, ab15.val[0]);
    vst1q_f32(pSrc+180, ab15.val[1]);
    vst1q_f32(pSrc+184, ab16.val[0]);
    vst1q_f32(pSrc+188, ab16.val[1]);
    vst1q_f32(pSrc+192, cd9.val[0] );
    vst1q_f32(pSrc+196, cd9.val[1] );
    vst1q_f32(pSrc+200, cd10.val[0]);
    vst1q_f32(pSrc+204, cd10.val[1]);
    vst1q_f32(pSrc+208, cd11.val[0]);
    vst1q_f32(pSrc+212, cd11.val[1]);
    vst1q_f32(pSrc+216, cd12.val[0]);
    vst1q_f32(pSrc+220, cd12.val[1]);
    vst1q_f32(pSrc+224, cd13.val[0]);
    vst1q_f32(pSrc+228, cd13.val[1]);
    vst1q_f32(pSrc+232, cd14.val[0]);
    vst1q_f32(pSrc+236, cd14.val[1]);
    vst1q_f32(pSrc+240, cd15.val[0]);
    vst1q_f32(pSrc+244, cd15.val[1]);
    vst1q_f32(pSrc+248, cd16.val[0]);
    vst1q_f32(pSrc+252, cd16.val[1]);
    //Transpose
    arm_bitonic_merge_128_f32(pSrc+0 , dir);
    arm_bitonic_merge_128_f32(pSrc+128, dir);
}
  761. #define SWAP(a,i,j) \
  762. temp = vgetq_lane_f32(a, j); \
  763. a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j);\
  764. a = vsetq_lane_f32(temp, a, i);
  765. static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
  766. {
  767. float32_t temp;
  768. if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
  769. {
  770. SWAP(a,0,1);
  771. }
  772. if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
  773. {
  774. SWAP(a,2,3);
  775. }
  776. if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) )
  777. {
  778. SWAP(a,0,3);
  779. }
  780. if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) )
  781. {
  782. SWAP(a,1,2);
  783. }
  784. if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
  785. {
  786. SWAP(a,0,1);
  787. }
  788. if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) )
  789. {
  790. SWAP(a,2,3);
  791. }
  792. return a;
  793. }
  794. static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
  795. {
  796. a = arm_bitonic_sort_4_f32(a, dir);
  797. b = arm_bitonic_sort_4_f32(b, dir);
  798. return arm_bitonic_merge_8_f32(a, b, dir);
  799. }
  800. #endif
  801. /**
  802. @ingroup groupSupport
  803. */
  804. /**
  805. @defgroup Sorting Vector sorting algorithms
  806. Sort the elements of a vector
  807. There are separate functions for floating-point, Q31, Q15, and Q7 data types.
  808. */
  809. /**
  810. @addtogroup Sorting
  811. @{
  812. */
  813. /**
  814. * @private
  815. * @param[in] S points to an instance of the sorting structure.
  816. * @param[in] pSrc points to the block of input data.
  817. * @param[out] pDst points to the block of output data
  818. * @param[in] blockSize number of samples to process.
  819. */
  820. void arm_bitonic_sort_f32(
  821. const arm_sort_instance_f32 * S,
  822. float32_t * pSrc,
  823. float32_t * pDst,
  824. uint32_t blockSize)
  825. {
  826. uint16_t s, i;
  827. uint8_t dir = S->dir;
  828. #ifdef ARM_MATH_NEON
  829. (void)s;
  830. float32_t * pOut;
  831. uint16_t counter = blockSize>>5;
  832. if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
  833. {
  834. if(pSrc == pDst) // in-place
  835. pOut = pSrc;
  836. else
  837. pOut = pDst;
  838. float32x4x2_t ab1, ab2;
  839. float32x4x2_t cd1, cd2;
  840. if(blockSize == 1)
  841. pOut = pSrc;
  842. else if(blockSize == 2)
  843. {
  844. float32_t temp;
  845. if( dir==(pSrc[0]>pSrc[1]) )
  846. {
  847. temp = pSrc[1];
  848. pOut[1] = pSrc[0];
  849. pOut[0] = temp;
  850. }
  851. else
  852. pOut = pSrc;
  853. }
  854. else if(blockSize == 4)
  855. {
  856. float32x4_t a = vld1q_f32(pSrc);
  857. a = arm_bitonic_sort_4_f32(a, dir);
  858. vst1q_f32(pOut, a);
  859. }
  860. else if(blockSize == 8)
  861. {
  862. float32x4_t a;
  863. float32x4_t b;
  864. float32x4x2_t ab;
  865. a = vld1q_f32(pSrc);
  866. b = vld1q_f32(pSrc+4);
  867. ab = arm_bitonic_sort_8_f32(a, b, dir);
  868. vst1q_f32(pOut, ab.val[0]);
  869. vst1q_f32(pOut+4, ab.val[1]);
  870. }
  871. else if(blockSize >=16)
  872. {
  873. // Order 16 bits long vectors
  874. for(i=0; i<blockSize; i=i+16)
  875. arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir);
  876. // Merge
  877. for(i=0; i<counter; i++)
  878. {
  879. // Load and reverse second vector
  880. ab1.val[0] = vld1q_f32(pOut+32*i+0 );
  881. ab1.val[1] = vld1q_f32(pOut+32*i+4 );
  882. ab2.val[0] = vld1q_f32(pOut+32*i+8 );
  883. ab2.val[1] = vld1q_f32(pOut+32*i+12);
  884. vldrev128q_f32(cd2.val[1], pOut+32*i+16);
  885. vldrev128q_f32(cd2.val[0], pOut+32*i+20);
  886. vldrev128q_f32(cd1.val[1], pOut+32*i+24);
  887. vldrev128q_f32(cd1.val[0], pOut+32*i+28);
  888. arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir);
  889. }
  890. counter = counter>>1;
  891. for(i=0; i<counter; i++)
  892. arm_bitonic_merge_64_f32(pOut+64*i, dir);
  893. counter = counter>>1;
  894. for(i=0; i<counter; i++)
  895. arm_bitonic_merge_128_f32(pOut+128*i, dir);
  896. counter = counter>>1;
  897. for(i=0; i<counter; i++)
  898. arm_bitonic_merge_256_f32(pOut+256*i, dir);
  899. // Etc...
  900. }
  901. }
  902. #else
  903. float32_t * pA;
  904. if(pSrc != pDst) // out-of-place
  905. {
  906. memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
  907. pA = pDst;
  908. }
  909. else
  910. pA = pSrc;
  911. if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
  912. {
  913. for(s=2; s<=blockSize; s=s*2)
  914. {
  915. for(i=0; i<blockSize; i=i+s)
  916. arm_bitonic_sort_core_f32(pA+i, s, dir);
  917. }
  918. }
  919. #endif
  920. }
  921. /**
  922. @} end of Sorting group
  923. */