arm_boolean_distance_template.h 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_boolean_distance.c
  4. * Description: Templates for boolean distances
  5. *
  6. * $Date: 23 April 2021
  7. * $Revision: V1.9.0
  8. *
  9. * Target Processor: Cortex-M and Cortex-A cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. /**
  29. * @defgroup DISTANCEF Distance Functions
  30. *
  31. * Computes Distances between vectors.
  32. *
  33. * Distance functions are useful in a lot of algorithms.
  34. *
  35. */
  36. /**
  37. * @addtogroup DISTANCEF
  38. * @{
  39. */
  /*
   * Name construction for the generated function.
   * FUNC(suffix) yields the identifier arm_boolean_distance<suffix>.
   * The extra level of indirection lets a macro passed as the suffix be
   * expanded before the tokens are pasted together.
   */
  #define _FUNC(PREFIX, SUFFIX) PREFIX##SUFFIX
  #define FUNC(SUFFIX) _FUNC(arm_boolean_distance, SUFFIX)
  42. /**
  43. * @brief Elements of boolean distances
  44. *
  45. * Different values which are used to compute boolean distances
  46. *
  47. * @param[in] pA First vector of packed booleans
  48. * @param[in] pB Second vector of packed booleans
  49. * @param[in] numberOfBools Number of booleans
  50. *
  51. */
  52. #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
  53. #include "arm_common_tables.h"
  54. void FUNC(EXT)(const uint32_t *pA
  55. , const uint32_t *pB
  56. , uint32_t numberOfBools
  57. #ifdef TT
  58. , uint32_t *cTT
  59. #endif
  60. #ifdef FF
  61. , uint32_t *cFF
  62. #endif
  63. #ifdef TF
  64. , uint32_t *cTF
  65. #endif
  66. #ifdef FT
  67. , uint32_t *cFT
  68. #endif
  69. )
  70. {
  71. #ifdef TT
  72. uint32_t _ctt=0;
  73. #endif
  74. #ifdef FF
  75. uint32_t _cff=0;
  76. #endif
  77. #ifdef TF
  78. uint32_t _ctf=0;
  79. #endif
  80. #ifdef FT
  81. uint32_t _cft=0;
  82. #endif
  83. uint32_t a, b, ba, bb;
  84. int shift;
  85. const uint8_t *pA8 = (const uint8_t *) pA;
  86. const uint8_t *pB8 = (const uint8_t *) pB;
  87. /* handle vector blocks */
  88. uint32_t blkCnt = numberOfBools / 128;
  89. while (blkCnt > 0U) {
  90. uint8x16_t vecA = vld1q((const uint8_t *) pA8);
  91. uint8x16_t vecB = vld1q((const uint8_t *) pB8);
  92. #ifdef TT
  93. uint8x16_t vecTT = vecA & vecB;
  94. vecTT = vldrbq_gather_offset_u8(hwLUT, vecTT);
  95. _ctt += vaddvq(vecTT);
  96. #endif
  97. #ifdef FF
  98. uint8x16_t vecFF = vmvnq(vecA) & vmvnq(vecB);
  99. vecFF = vldrbq_gather_offset_u8(hwLUT, vecFF);
  100. _cff += vaddvq(vecFF);
  101. #endif
  102. #ifdef TF
  103. uint8x16_t vecTF = vecA & vmvnq(vecB);
  104. vecTF = vldrbq_gather_offset_u8(hwLUT, vecTF);
  105. _ctf += vaddvq(vecTF);
  106. #endif
  107. #ifdef FT
  108. uint8x16_t vecFT = vmvnq(vecA) & vecB;
  109. vecFT = vldrbq_gather_offset_u8(hwLUT, vecFT);
  110. _cft += vaddvq(vecFT);
  111. #endif
  112. pA8 += 16;
  113. pB8 += 16;
  114. blkCnt--;
  115. }
  116. pA = (const uint32_t *)pA8;
  117. pB = (const uint32_t *)pB8;
  118. blkCnt = numberOfBools & 0x7F;
  119. while(blkCnt >= 32)
  120. {
  121. a = *pA++;
  122. b = *pB++;
  123. shift = 0;
  124. while(shift < 32)
  125. {
  126. ba = a & 1;
  127. bb = b & 1;
  128. a = a >> 1;
  129. b = b >> 1;
  130. #ifdef TT
  131. _ctt += (ba && bb);
  132. #endif
  133. #ifdef FF
  134. _cff += ((1 ^ ba) && (1 ^ bb));
  135. #endif
  136. #ifdef TF
  137. _ctf += (ba && (1 ^ bb));
  138. #endif
  139. #ifdef FT
  140. _cft += ((1 ^ ba) && bb);
  141. #endif
  142. shift ++;
  143. }
  144. blkCnt -= 32;
  145. }
  146. a = *pA++;
  147. b = *pB++;
  148. a = a >> (32 - blkCnt);
  149. b = b >> (32 - blkCnt);
  150. while(blkCnt > 0)
  151. {
  152. ba = a & 1;
  153. bb = b & 1;
  154. a = a >> 1;
  155. b = b >> 1;
  156. #ifdef TT
  157. _ctt += (ba && bb);
  158. #endif
  159. #ifdef FF
  160. _cff += ((1 ^ ba) && (1 ^ bb));
  161. #endif
  162. #ifdef TF
  163. _ctf += (ba && (1 ^ bb));
  164. #endif
  165. #ifdef FT
  166. _cft += ((1 ^ ba) && bb);
  167. #endif
  168. blkCnt --;
  169. }
  170. #ifdef TT
  171. *cTT = _ctt;
  172. #endif
  173. #ifdef FF
  174. *cFF = _cff;
  175. #endif
  176. #ifdef TF
  177. *cTF = _ctf;
  178. #endif
  179. #ifdef FT
  180. *cFT = _cft;
  181. #endif
  182. }
  183. #else
  184. #if defined(ARM_MATH_NEON)
  185. void FUNC(EXT)(const uint32_t *pA
  186. , const uint32_t *pB
  187. , uint32_t numberOfBools
  188. #ifdef TT
  189. , uint32_t *cTT
  190. #endif
  191. #ifdef FF
  192. , uint32_t *cFF
  193. #endif
  194. #ifdef TF
  195. , uint32_t *cTF
  196. #endif
  197. #ifdef FT
  198. , uint32_t *cFT
  199. #endif
  200. )
  201. {
  202. #ifdef TT
  203. uint32_t _ctt=0;
  204. #endif
  205. #ifdef FF
  206. uint32_t _cff=0;
  207. #endif
  208. #ifdef TF
  209. uint32_t _ctf=0;
  210. #endif
  211. #ifdef FT
  212. uint32_t _cft=0;
  213. #endif
  214. uint32_t nbBoolBlock;
  215. uint32_t a,b,ba,bb;
  216. int shift;
  217. uint32x4_t aV, bV;
  218. #ifdef TT
  219. uint32x4_t cttV;
  220. #endif
  221. #ifdef FF
  222. uint32x4_t cffV;
  223. #endif
  224. #ifdef TF
  225. uint32x4_t ctfV;
  226. #endif
  227. #ifdef FT
  228. uint32x4_t cftV;
  229. #endif
  230. uint8x16_t tmp;
  231. uint16x8_t tmp2;
  232. uint32x4_t tmp3;
  233. uint64x2_t tmp4;
  234. #ifdef TT
  235. uint64x2_t tmp4tt;
  236. #endif
  237. #ifdef FF
  238. uint64x2_t tmp4ff;
  239. #endif
  240. #ifdef TF
  241. uint64x2_t tmp4tf;
  242. #endif
  243. #ifdef FT
  244. uint64x2_t tmp4ft;
  245. #endif
  246. #ifdef TT
  247. tmp4tt = vdupq_n_u64(0);
  248. #endif
  249. #ifdef FF
  250. tmp4ff = vdupq_n_u64(0);
  251. #endif
  252. #ifdef TF
  253. tmp4tf = vdupq_n_u64(0);
  254. #endif
  255. #ifdef FT
  256. tmp4ft = vdupq_n_u64(0);
  257. #endif
  258. nbBoolBlock = numberOfBools >> 7;
  259. while(nbBoolBlock > 0)
  260. {
  261. aV = vld1q_u32(pA);
  262. bV = vld1q_u32(pB);
  263. pA += 4;
  264. pB += 4;
  265. #ifdef TT
  266. cttV = vandq_u32(aV,bV);
  267. #endif
  268. #ifdef FF
  269. cffV = vandq_u32(vmvnq_u32(aV),vmvnq_u32(bV));
  270. #endif
  271. #ifdef TF
  272. ctfV = vandq_u32(aV,vmvnq_u32(bV));
  273. #endif
  274. #ifdef FT
  275. cftV = vandq_u32(vmvnq_u32(aV),bV);
  276. #endif
  277. #ifdef TT
  278. tmp = vcntq_u8(vreinterpretq_u8_u32(cttV));
  279. tmp2 = vpaddlq_u8(tmp);
  280. tmp3 = vpaddlq_u16(tmp2);
  281. tmp4 = vpaddlq_u32(tmp3);
  282. tmp4tt = vaddq_u64(tmp4tt, tmp4);
  283. #endif
  284. #ifdef FF
  285. tmp = vcntq_u8(vreinterpretq_u8_u32(cffV));
  286. tmp2 = vpaddlq_u8(tmp);
  287. tmp3 = vpaddlq_u16(tmp2);
  288. tmp4 = vpaddlq_u32(tmp3);
  289. tmp4ff = vaddq_u64(tmp4ff, tmp4);
  290. #endif
  291. #ifdef TF
  292. tmp = vcntq_u8(vreinterpretq_u8_u32(ctfV));
  293. tmp2 = vpaddlq_u8(tmp);
  294. tmp3 = vpaddlq_u16(tmp2);
  295. tmp4 = vpaddlq_u32(tmp3);
  296. tmp4tf = vaddq_u64(tmp4tf, tmp4);
  297. #endif
  298. #ifdef FT
  299. tmp = vcntq_u8(vreinterpretq_u8_u32(cftV));
  300. tmp2 = vpaddlq_u8(tmp);
  301. tmp3 = vpaddlq_u16(tmp2);
  302. tmp4 = vpaddlq_u32(tmp3);
  303. tmp4ft = vaddq_u64(tmp4ft, tmp4);
  304. #endif
  305. nbBoolBlock --;
  306. }
  307. #ifdef TT
  308. _ctt += vgetq_lane_u64(tmp4tt, 0) + vgetq_lane_u64(tmp4tt, 1);
  309. #endif
  310. #ifdef FF
  311. _cff +=vgetq_lane_u64(tmp4ff, 0) + vgetq_lane_u64(tmp4ff, 1);
  312. #endif
  313. #ifdef TF
  314. _ctf += vgetq_lane_u64(tmp4tf, 0) + vgetq_lane_u64(tmp4tf, 1);
  315. #endif
  316. #ifdef FT
  317. _cft += vgetq_lane_u64(tmp4ft, 0) + vgetq_lane_u64(tmp4ft, 1);
  318. #endif
  319. nbBoolBlock = numberOfBools & 0x7F;
  320. while(nbBoolBlock >= 32)
  321. {
  322. a = *pA++;
  323. b = *pB++;
  324. shift = 0;
  325. while(shift < 32)
  326. {
  327. ba = a & 1;
  328. bb = b & 1;
  329. a = a >> 1;
  330. b = b >> 1;
  331. #ifdef TT
  332. _ctt += (ba && bb);
  333. #endif
  334. #ifdef FF
  335. _cff += ((1 ^ ba) && (1 ^ bb));
  336. #endif
  337. #ifdef TF
  338. _ctf += (ba && (1 ^ bb));
  339. #endif
  340. #ifdef FT
  341. _cft += ((1 ^ ba) && bb);
  342. #endif
  343. shift ++;
  344. }
  345. nbBoolBlock -= 32;
  346. }
  347. a = *pA++;
  348. b = *pB++;
  349. a = a >> (32 - nbBoolBlock);
  350. b = b >> (32 - nbBoolBlock);
  351. while(nbBoolBlock > 0)
  352. {
  353. ba = a & 1;
  354. bb = b & 1;
  355. a = a >> 1;
  356. b = b >> 1;
  357. #ifdef TT
  358. _ctt += (ba && bb);
  359. #endif
  360. #ifdef FF
  361. _cff += ((1 ^ ba) && (1 ^ bb));
  362. #endif
  363. #ifdef TF
  364. _ctf += (ba && (1 ^ bb));
  365. #endif
  366. #ifdef FT
  367. _cft += ((1 ^ ba) && bb);
  368. #endif
  369. nbBoolBlock --;
  370. }
  371. #ifdef TT
  372. *cTT = _ctt;
  373. #endif
  374. #ifdef FF
  375. *cFF = _cff;
  376. #endif
  377. #ifdef TF
  378. *cTF = _ctf;
  379. #endif
  380. #ifdef FT
  381. *cFT = _cft;
  382. #endif
  383. }
  384. #else
  /*
   * Portable scalar variant.
   *
   * Counts, over numberOfBools packed booleans, how many positions fall in
   * each of the enabled categories:
   *   TT: set in A and set in B          FF: clear in A and clear in B
   *   TF: set in A and clear in B        FT: clear in A and set in B
   * Only the counters whose macro (TT/FF/TF/FT) is defined exist as
   * parameters and are written.
   *
   * Booleans are consumed 32 per word. A final partial word holds its
   * booleans in the most significant bits: it is shifted right by
   * (32 - remaining) before its low bits are counted.
   *
   * The partial word is read only when booleans remain, so the inputs need
   * exactly ceil(numberOfBools / 32) words and no shift by 32 can occur.
   * With numberOfBools == 0 the inputs are never dereferenced.
   */
  void FUNC(EXT)(const uint32_t *pA
         , const uint32_t *pB
         , uint32_t numberOfBools
  #ifdef TT
         , uint32_t *cTT
  #endif
  #ifdef FF
         , uint32_t *cFF
  #endif
  #ifdef TF
         , uint32_t *cTF
  #endif
  #ifdef FT
         , uint32_t *cFT
  #endif
         )
  {
  #ifdef TT
      uint32_t countTT = 0U;
  #endif
  #ifdef FF
      uint32_t countFF = 0U;
  #endif
  #ifdef TF
      uint32_t countTF = 0U;
  #endif
  #ifdef FT
      uint32_t countFT = 0U;
  #endif
      uint32_t a, b, ba, bb;
      uint32_t bit;

      /* Full words: all 32 bits are counted, so bit order does not matter. */
      while (numberOfBools >= 32U)
      {
          a = *pA++;
          b = *pB++;

          for (bit = 0U; bit < 32U; bit++)
          {
              ba = a & 1U;
              bb = b & 1U;
              a >>= 1;
              b >>= 1;
  #ifdef TT
              countTT += (ba && bb);
  #endif
  #ifdef FF
              countFF += ((1U ^ ba) && (1U ^ bb));
  #endif
  #ifdef TF
              countTF += (ba && (1U ^ bb));
  #endif
  #ifdef FT
              countFT += ((1U ^ ba) && bb);
  #endif
          }

          numberOfBools -= 32U;
      }

      /*
       * Partial last word. Guarded so that nothing is read past the end of
       * the inputs, and no 32-bit shift by 32 is evaluated, when no
       * booleans remain.
       */
      if (numberOfBools > 0U)
      {
          a = *pA >> (32U - numberOfBools);
          b = *pB >> (32U - numberOfBools);

          for (; numberOfBools > 0U; numberOfBools--)
          {
              ba = a & 1U;
              bb = b & 1U;
              a >>= 1;
              b >>= 1;
  #ifdef TT
              countTT += (ba && bb);
  #endif
  #ifdef FF
              countFF += ((1U ^ ba) && (1U ^ bb));
  #endif
  #ifdef TF
              countTF += (ba && (1U ^ bb));
  #endif
  #ifdef FT
              countFT += ((1U ^ ba) && bb);
  #endif
          }
      }

  #ifdef TT
      *cTT = countTT;
  #endif
  #ifdef FF
      *cFF = countFF;
  #endif
  #ifdef TF
      *cTF = countTF;
  #endif
  #ifdef FT
      *cFT = countFT;
  #endif
  }
  480. #endif
  481. #endif /* defined(ARM_MATH_MVEI) */
  482. /**
  483. * @} end of DISTANCEF group
  484. */