/* File: arm_boolean_distance_template.h */
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_boolean_distance.c
  4. * Description: Templates for boolean distances
  5. *
  6. * $Date: 23 April 2021
  7. * $Revision: V1.9.0
  8. *
  9. * Target Processor: Cortex-M and Cortex-A cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. /**
  29. * @defgroup DISTANCEF Distance Functions
  30. *
  31. * Computes Distances between vectors.
  32. *
  33. * Distance functions are useful in a lot of algorithms.
  34. *
  35. */
  36. /**
  37. * @addtogroup DISTANCEF
  38. * @{
  39. */
  40. #define _FUNC(A,B) A##B
  41. #define FUNC(EXT) _FUNC(arm_boolean_distance, EXT)
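  /**
   * @brief Elements of boolean distances
   *
   * Counts, over the first numberOfBools positions of two packed boolean
   * vectors, how many positions fall into each enabled category. A category
   * and its output parameter exist only when the matching macro (TT, FF, TF
   * or FT) is defined before this template is included.
   *
   * Whole 32-bit words contribute all of their bits. When numberOfBools is
   * not a multiple of 32, the remaining booleans are taken from the most
   * significant bits of the last word.
   *
   * @param[in]  pA            First vector of packed booleans
   * @param[in]  pB            Second vector of packed booleans
   * @param[in]  numberOfBools Number of booleans
   * @param[out] cTT           Positions true in both pA and pB (only if TT is defined)
   * @param[out] cFF           Positions false in both pA and pB (only if FF is defined)
   * @param[out] cTF           Positions true in pA and false in pB (only if TF is defined)
   * @param[out] cFT           Positions false in pA and true in pB (only if FT is defined)
   * @return None
   *
   */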
  53. #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
  54. #include "arm_common_tables.h"
  55. void FUNC(EXT)(const uint32_t *pA
  56. , const uint32_t *pB
  57. , uint32_t numberOfBools
  58. #ifdef TT
  59. , uint32_t *cTT
  60. #endif
  61. #ifdef FF
  62. , uint32_t *cFF
  63. #endif
  64. #ifdef TF
  65. , uint32_t *cTF
  66. #endif
  67. #ifdef FT
  68. , uint32_t *cFT
  69. #endif
  70. )
  71. {
  72. #ifdef TT
  73. uint32_t _ctt=0;
  74. #endif
  75. #ifdef FF
  76. uint32_t _cff=0;
  77. #endif
  78. #ifdef TF
  79. uint32_t _ctf=0;
  80. #endif
  81. #ifdef FT
  82. uint32_t _cft=0;
  83. #endif
  84. uint32_t a, b, ba, bb;
  85. int shift;
  86. const uint8_t *pA8 = (const uint8_t *) pA;
  87. const uint8_t *pB8 = (const uint8_t *) pB;
  88. /* handle vector blocks */
  89. uint32_t blkCnt = numberOfBools / 128;
  90. while (blkCnt > 0U) {
  91. uint8x16_t vecA = vld1q((const uint8_t *) pA8);
  92. uint8x16_t vecB = vld1q((const uint8_t *) pB8);
  93. #ifdef TT
  94. uint8x16_t vecTT = vecA & vecB;
  95. vecTT = vldrbq_gather_offset_u8(hwLUT, vecTT);
  96. _ctt += vaddvq(vecTT);
  97. #endif
  98. #ifdef FF
  99. uint8x16_t vecFF = vmvnq(vecA) & vmvnq(vecB);
  100. vecFF = vldrbq_gather_offset_u8(hwLUT, vecFF);
  101. _cff += vaddvq(vecFF);
  102. #endif
  103. #ifdef TF
  104. uint8x16_t vecTF = vecA & vmvnq(vecB);
  105. vecTF = vldrbq_gather_offset_u8(hwLUT, vecTF);
  106. _ctf += vaddvq(vecTF);
  107. #endif
  108. #ifdef FT
  109. uint8x16_t vecFT = vmvnq(vecA) & vecB;
  110. vecFT = vldrbq_gather_offset_u8(hwLUT, vecFT);
  111. _cft += vaddvq(vecFT);
  112. #endif
  113. pA8 += 16;
  114. pB8 += 16;
  115. blkCnt--;
  116. }
  117. pA = (const uint32_t *)pA8;
  118. pB = (const uint32_t *)pB8;
  119. blkCnt = numberOfBools & 0x7F;
  120. while(blkCnt >= 32)
  121. {
  122. a = *pA++;
  123. b = *pB++;
  124. shift = 0;
  125. while(shift < 32)
  126. {
  127. ba = a & 1;
  128. bb = b & 1;
  129. a = a >> 1;
  130. b = b >> 1;
  131. #ifdef TT
  132. _ctt += (ba && bb);
  133. #endif
  134. #ifdef FF
  135. _cff += ((1 ^ ba) && (1 ^ bb));
  136. #endif
  137. #ifdef TF
  138. _ctf += (ba && (1 ^ bb));
  139. #endif
  140. #ifdef FT
  141. _cft += ((1 ^ ba) && bb);
  142. #endif
  143. shift ++;
  144. }
  145. blkCnt -= 32;
  146. }
  147. a = *pA++;
  148. b = *pB++;
  149. a = a >> (32 - blkCnt);
  150. b = b >> (32 - blkCnt);
  151. while(blkCnt > 0)
  152. {
  153. ba = a & 1;
  154. bb = b & 1;
  155. a = a >> 1;
  156. b = b >> 1;
  157. #ifdef TT
  158. _ctt += (ba && bb);
  159. #endif
  160. #ifdef FF
  161. _cff += ((1 ^ ba) && (1 ^ bb));
  162. #endif
  163. #ifdef TF
  164. _ctf += (ba && (1 ^ bb));
  165. #endif
  166. #ifdef FT
  167. _cft += ((1 ^ ba) && bb);
  168. #endif
  169. blkCnt --;
  170. }
  171. #ifdef TT
  172. *cTT = _ctt;
  173. #endif
  174. #ifdef FF
  175. *cFF = _cff;
  176. #endif
  177. #ifdef TF
  178. *cTF = _ctf;
  179. #endif
  180. #ifdef FT
  181. *cFT = _cft;
  182. #endif
  183. }
  184. #else
  185. #if defined(ARM_MATH_NEON)
  186. void FUNC(EXT)(const uint32_t *pA
  187. , const uint32_t *pB
  188. , uint32_t numberOfBools
  189. #ifdef TT
  190. , uint32_t *cTT
  191. #endif
  192. #ifdef FF
  193. , uint32_t *cFF
  194. #endif
  195. #ifdef TF
  196. , uint32_t *cTF
  197. #endif
  198. #ifdef FT
  199. , uint32_t *cFT
  200. #endif
  201. )
  202. {
  203. #ifdef TT
  204. uint32_t _ctt=0;
  205. #endif
  206. #ifdef FF
  207. uint32_t _cff=0;
  208. #endif
  209. #ifdef TF
  210. uint32_t _ctf=0;
  211. #endif
  212. #ifdef FT
  213. uint32_t _cft=0;
  214. #endif
  215. uint32_t nbBoolBlock;
  216. uint32_t a,b,ba,bb;
  217. int shift;
  218. uint32x4_t aV, bV;
  219. #ifdef TT
  220. uint32x4_t cttV;
  221. #endif
  222. #ifdef FF
  223. uint32x4_t cffV;
  224. #endif
  225. #ifdef TF
  226. uint32x4_t ctfV;
  227. #endif
  228. #ifdef FT
  229. uint32x4_t cftV;
  230. #endif
  231. uint8x16_t tmp;
  232. uint16x8_t tmp2;
  233. uint32x4_t tmp3;
  234. uint64x2_t tmp4;
  235. #ifdef TT
  236. uint64x2_t tmp4tt;
  237. #endif
  238. #ifdef FF
  239. uint64x2_t tmp4ff;
  240. #endif
  241. #ifdef TF
  242. uint64x2_t tmp4tf;
  243. #endif
  244. #ifdef FT
  245. uint64x2_t tmp4ft;
  246. #endif
  247. #ifdef TT
  248. tmp4tt = vdupq_n_u64(0);
  249. #endif
  250. #ifdef FF
  251. tmp4ff = vdupq_n_u64(0);
  252. #endif
  253. #ifdef TF
  254. tmp4tf = vdupq_n_u64(0);
  255. #endif
  256. #ifdef FT
  257. tmp4ft = vdupq_n_u64(0);
  258. #endif
  259. nbBoolBlock = numberOfBools >> 7;
  260. while(nbBoolBlock > 0)
  261. {
  262. aV = vld1q_u32(pA);
  263. bV = vld1q_u32(pB);
  264. pA += 4;
  265. pB += 4;
  266. #ifdef TT
  267. cttV = vandq_u32(aV,bV);
  268. #endif
  269. #ifdef FF
  270. cffV = vandq_u32(vmvnq_u32(aV),vmvnq_u32(bV));
  271. #endif
  272. #ifdef TF
  273. ctfV = vandq_u32(aV,vmvnq_u32(bV));
  274. #endif
  275. #ifdef FT
  276. cftV = vandq_u32(vmvnq_u32(aV),bV);
  277. #endif
  278. #ifdef TT
  279. tmp = vcntq_u8(vreinterpretq_u8_u32(cttV));
  280. tmp2 = vpaddlq_u8(tmp);
  281. tmp3 = vpaddlq_u16(tmp2);
  282. tmp4 = vpaddlq_u32(tmp3);
  283. tmp4tt = vaddq_u64(tmp4tt, tmp4);
  284. #endif
  285. #ifdef FF
  286. tmp = vcntq_u8(vreinterpretq_u8_u32(cffV));
  287. tmp2 = vpaddlq_u8(tmp);
  288. tmp3 = vpaddlq_u16(tmp2);
  289. tmp4 = vpaddlq_u32(tmp3);
  290. tmp4ff = vaddq_u64(tmp4ff, tmp4);
  291. #endif
  292. #ifdef TF
  293. tmp = vcntq_u8(vreinterpretq_u8_u32(ctfV));
  294. tmp2 = vpaddlq_u8(tmp);
  295. tmp3 = vpaddlq_u16(tmp2);
  296. tmp4 = vpaddlq_u32(tmp3);
  297. tmp4tf = vaddq_u64(tmp4tf, tmp4);
  298. #endif
  299. #ifdef FT
  300. tmp = vcntq_u8(vreinterpretq_u8_u32(cftV));
  301. tmp2 = vpaddlq_u8(tmp);
  302. tmp3 = vpaddlq_u16(tmp2);
  303. tmp4 = vpaddlq_u32(tmp3);
  304. tmp4ft = vaddq_u64(tmp4ft, tmp4);
  305. #endif
  306. nbBoolBlock --;
  307. }
  308. #ifdef TT
  309. _ctt += vgetq_lane_u64(tmp4tt, 0) + vgetq_lane_u64(tmp4tt, 1);
  310. #endif
  311. #ifdef FF
  312. _cff +=vgetq_lane_u64(tmp4ff, 0) + vgetq_lane_u64(tmp4ff, 1);
  313. #endif
  314. #ifdef TF
  315. _ctf += vgetq_lane_u64(tmp4tf, 0) + vgetq_lane_u64(tmp4tf, 1);
  316. #endif
  317. #ifdef FT
  318. _cft += vgetq_lane_u64(tmp4ft, 0) + vgetq_lane_u64(tmp4ft, 1);
  319. #endif
  320. nbBoolBlock = numberOfBools & 0x7F;
  321. while(nbBoolBlock >= 32)
  322. {
  323. a = *pA++;
  324. b = *pB++;
  325. shift = 0;
  326. while(shift < 32)
  327. {
  328. ba = a & 1;
  329. bb = b & 1;
  330. a = a >> 1;
  331. b = b >> 1;
  332. #ifdef TT
  333. _ctt += (ba && bb);
  334. #endif
  335. #ifdef FF
  336. _cff += ((1 ^ ba) && (1 ^ bb));
  337. #endif
  338. #ifdef TF
  339. _ctf += (ba && (1 ^ bb));
  340. #endif
  341. #ifdef FT
  342. _cft += ((1 ^ ba) && bb);
  343. #endif
  344. shift ++;
  345. }
  346. nbBoolBlock -= 32;
  347. }
  348. a = *pA++;
  349. b = *pB++;
  350. a = a >> (32 - nbBoolBlock);
  351. b = b >> (32 - nbBoolBlock);
  352. while(nbBoolBlock > 0)
  353. {
  354. ba = a & 1;
  355. bb = b & 1;
  356. a = a >> 1;
  357. b = b >> 1;
  358. #ifdef TT
  359. _ctt += (ba && bb);
  360. #endif
  361. #ifdef FF
  362. _cff += ((1 ^ ba) && (1 ^ bb));
  363. #endif
  364. #ifdef TF
  365. _ctf += (ba && (1 ^ bb));
  366. #endif
  367. #ifdef FT
  368. _cft += ((1 ^ ba) && bb);
  369. #endif
  370. nbBoolBlock --;
  371. }
  372. #ifdef TT
  373. *cTT = _ctt;
  374. #endif
  375. #ifdef FF
  376. *cFF = _cff;
  377. #endif
  378. #ifdef TF
  379. *cTF = _ctf;
  380. #endif
  381. #ifdef FT
  382. *cFT = _cft;
  383. #endif
  384. }
  385. #else
  386. void FUNC(EXT)(const uint32_t *pA
  387. , const uint32_t *pB
  388. , uint32_t numberOfBools
  389. #ifdef TT
  390. , uint32_t *cTT
  391. #endif
  392. #ifdef FF
  393. , uint32_t *cFF
  394. #endif
  395. #ifdef TF
  396. , uint32_t *cTF
  397. #endif
  398. #ifdef FT
  399. , uint32_t *cFT
  400. #endif
  401. )
  402. {
  403. #ifdef TT
  404. uint32_t _ctt=0;
  405. #endif
  406. #ifdef FF
  407. uint32_t _cff=0;
  408. #endif
  409. #ifdef TF
  410. uint32_t _ctf=0;
  411. #endif
  412. #ifdef FT
  413. uint32_t _cft=0;
  414. #endif
  415. uint32_t a,b,ba,bb;
  416. int shift;
  417. while(numberOfBools >= 32)
  418. {
  419. a = *pA++;
  420. b = *pB++;
  421. shift = 0;
  422. while(shift < 32)
  423. {
  424. ba = a & 1;
  425. bb = b & 1;
  426. a = a >> 1;
  427. b = b >> 1;
  428. #ifdef TT
  429. _ctt += (ba && bb);
  430. #endif
  431. #ifdef FF
  432. _cff += ((1 ^ ba) && (1 ^ bb));
  433. #endif
  434. #ifdef TF
  435. _ctf += (ba && (1 ^ bb));
  436. #endif
  437. #ifdef FT
  438. _cft += ((1 ^ ba) && bb);
  439. #endif
  440. shift ++;
  441. }
  442. numberOfBools -= 32;
  443. }
  444. a = *pA++;
  445. b = *pB++;
  446. a = a >> (32 - numberOfBools);
  447. b = b >> (32 - numberOfBools);
  448. while(numberOfBools > 0)
  449. {
  450. ba = a & 1;
  451. bb = b & 1;
  452. a = a >> 1;
  453. b = b >> 1;
  454. #ifdef TT
  455. _ctt += (ba && bb);
  456. #endif
  457. #ifdef FF
  458. _cff += ((1 ^ ba) && (1 ^ bb));
  459. #endif
  460. #ifdef TF
  461. _ctf += (ba && (1 ^ bb));
  462. #endif
  463. #ifdef FT
  464. _cft += ((1 ^ ba) && bb);
  465. #endif
  466. numberOfBools --;
  467. }
  468. #ifdef TT
  469. *cTT = _ctt;
  470. #endif
  471. #ifdef FF
  472. *cFF = _cff;
  473. #endif
  474. #ifdef TF
  475. *cTF = _ctf;
  476. #endif
  477. #ifdef FT
  478. *cFT = _cft;
  479. #endif
  480. }
  481. #endif
  482. #endif /* defined(ARM_MATH_MVEI) */
  483. /**
  484. * @} end of DISTANCEF group
  485. */