allocator.h

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef NCNN_ALLOCATOR_H
#define NCNN_ALLOCATOR_H

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

#include "platform.h"

#include <stdlib.h>

#if NCNN_VULKAN
#include <vulkan/vulkan.h>
#endif // NCNN_VULKAN

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
#include <android/hardware_buffer.h>
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API

namespace ncnn {

// the alignment of all the allocated buffers
#if NCNN_AVX512
#define NCNN_MALLOC_ALIGN 64
#elif NCNN_AVX
#define NCNN_MALLOC_ALIGN 32
#else
#define NCNN_MALLOC_ALIGN 16
#endif

// Some optimized kernels may over-read the buffer slightly inside a loop;
// it is common to interleave the next iteration's data load with arithmetic instructions.
// Allocating a few extra bytes keeps us safe from SEGV_ACCERR failures.
#define NCNN_MALLOC_OVERREAD 64

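// For illustration (not part of the API): on the posix_memalign path a request
// for 100 bytes reserves 100 + NCNN_MALLOC_OVERREAD = 164 bytes, so a vectorized
// loop that loads, say, 32 bytes per iteration can safely read a little past the
// logical end of the buffer.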

// Aligns a pointer to the specified number of bytes
// ptr Pointer to align
// n Alignment size that must be a power of two
template<typename _Tp>
static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
{
    return (_Tp*)(((size_t)ptr + n - 1) & -n);
}

// Aligns a buffer size to the specified number of bytes
// The function returns the minimum number that is greater than or equal to sz and is divisible by n
// sz Buffer size to align
// n Alignment size that must be a power of two
static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
{
    return (sz + n - 1) & -n;
}

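// Usage sketch (illustrative values, assuming NCNN_MALLOC_ALIGN == 16):
//     unsigned char* p = ...;               // e.g. p == (unsigned char*)0x1001
//     unsigned char* q = alignPtr(p, 16);   // q  == (unsigned char*)0x1010, next 16-byte boundary
//     size_t sz = alignSize(100, 16);       // sz == 112, next multiple of 16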

static NCNN_FORCEINLINE void* fastMalloc(size_t size)
{
#if _MSC_VER
    return _aligned_malloc(size, NCNN_MALLOC_ALIGN);
#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
    void* ptr = 0;
    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
        ptr = 0;
    return ptr;
#elif __ANDROID__ && __ANDROID_API__ < 17
    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
#else
    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD);
    if (!udata)
        return 0;
    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN);
    adata[-1] = udata;
    return adata;
#endif
}

static NCNN_FORCEINLINE void fastFree(void* ptr)
{
    if (ptr)
    {
#if _MSC_VER
        _aligned_free(ptr);
#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
        free(ptr);
#elif __ANDROID__ && __ANDROID_API__ < 17
        free(ptr);
#else
        unsigned char* udata = ((unsigned char**)ptr)[-1];
        free(udata);
#endif
    }
}

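// Usage sketch (a minimal example; memory from fastMalloc must go back through fastFree):
//     float* buf = (float*)ncnn::fastMalloc(256 * sizeof(float));
//     if (buf)
//     {
//         // buf is aligned to NCNN_MALLOC_ALIGN bytes and, on most branches,
//         // padded with NCNN_MALLOC_OVERREAD extra bytes for over-reading kernels
//         buf[0] = 1.f;
//         ncnn::fastFree(buf);
//     }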

#if NCNN_THREADS
// exchange-add operation for atomic operations on reference counters
#if defined __riscv && !defined __riscv_atomic
// riscv target without A extension
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
    int tmp = *addr;
    *addr += delta;
    return tmp;
}
#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
// atomic increment on the linux version of the Intel(tm) compiler
#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
#elif defined __GNUC__
#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
#ifdef __ATOMIC_ACQ_REL
#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
#else
#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
#endif
#else
#if defined __ATOMIC_ACQ_REL && !defined __clang__
// version for gcc >= 4.7
#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
#else
#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
#endif
#endif
#elif defined _MSC_VER && !defined RC_INVOKED
#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
#else
// thread-unsafe branch
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
    int tmp = *addr;
    *addr += delta;
    return tmp;
}
#endif
#else // NCNN_THREADS
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
    int tmp = *addr;
    *addr += delta;
    return tmp;
}
#endif // NCNN_THREADS

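// Usage sketch (illustrative reference-counting pattern, similar to how ncnn::Mat
// maintains its refcount):
//     int refcount = 1;
//     NCNN_XADD(&refcount, 1);              // add a reference; returns the old value
//     if (NCNN_XADD(&refcount, -1) == 1)    // drop a reference; old value 1 means this was the last one
//     {
//         // release the underlying buffer here
//     }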

class NCNN_EXPORT Allocator
{
public:
    virtual ~Allocator();
    virtual void* fastMalloc(size_t size) = 0;
    virtual void fastFree(void* ptr) = 0;
};

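// A minimal sketch of a user-defined allocator built on this interface
// (hypothetical class, shown only to illustrate the two pure virtual hooks):
//     class MyAllocator : public ncnn::Allocator
//     {
//     public:
//         virtual void* fastMalloc(size_t size) { return ncnn::fastMalloc(size); }
//         virtual void fastFree(void* ptr) { ncnn::fastFree(ptr); }
//     };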

class PoolAllocatorPrivate;
class NCNN_EXPORT PoolAllocator : public Allocator
{
public:
    PoolAllocator();
    ~PoolAllocator();

    // ratio range 0 ~ 1
    // default cr = 0
    void set_size_compare_ratio(float scr);

    // budget drop threshold
    // default threshold = 10
    void set_size_drop_threshold(size_t);

    // release all budgets immediately
    void clear();

    virtual void* fastMalloc(size_t size);
    virtual void fastFree(void* ptr);

private:
    PoolAllocator(const PoolAllocator&);
    PoolAllocator& operator=(const PoolAllocator&);

private:
    PoolAllocatorPrivate* const d;
};

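// Usage sketch (assumes the blob_allocator / workspace_allocator fields of
// ncnn::Option declared in option.h; adjust to your integration):
//     ncnn::PoolAllocator blob_pool;
//     ncnn::PoolAllocator workspace_pool;
//     blob_pool.set_size_compare_ratio(0.f);
//
//     ncnn::Option opt;
//     opt.blob_allocator = &blob_pool;
//     opt.workspace_allocator = &workspace_pool;
//     // ... run inference with opt, then optionally blob_pool.clear() to release budgets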

class UnlockedPoolAllocatorPrivate;
class NCNN_EXPORT UnlockedPoolAllocator : public Allocator
{
public:
    UnlockedPoolAllocator();
    ~UnlockedPoolAllocator();

    // ratio range 0 ~ 1
    // default cr = 0
    void set_size_compare_ratio(float scr);

    // budget drop threshold
    // default threshold = 10
    void set_size_drop_threshold(size_t);

    // release all budgets immediately
    void clear();

    virtual void* fastMalloc(size_t size);
    virtual void fastFree(void* ptr);

private:
    UnlockedPoolAllocator(const UnlockedPoolAllocator&);
    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);

private:
    UnlockedPoolAllocatorPrivate* const d;
};

#if NCNN_VULKAN

class VulkanDevice;

class NCNN_EXPORT VkBufferMemory
{
public:
    VkBuffer buffer;

    // the base offset assigned by allocator
    size_t offset;
    size_t capacity;

    VkDeviceMemory memory;
    void* mapped_ptr;

    // buffer state, modified by command functions internally
    mutable VkAccessFlags access_flags;
    mutable VkPipelineStageFlags stage_flags;

    // initialized and modified by mat
    int refcount;
};

class NCNN_EXPORT VkImageMemory
{
public:
    VkImage image;
    VkImageView imageview;

    // underlying info assigned by allocator
    int width;
    int height;
    int depth;
    VkFormat format;

    VkDeviceMemory memory;
    void* mapped_ptr;

    // the base offset assigned by allocator
    size_t bind_offset;
    size_t bind_capacity;

    // image state, modified by command functions internally
    mutable VkAccessFlags access_flags;
    mutable VkImageLayout image_layout;
    mutable VkPipelineStageFlags stage_flags;

    // in-execution state, modified by command functions internally
    mutable int command_refcount;

    // initialized and modified by mat
    int refcount;
};

class NCNN_EXPORT VkAllocator
{
public:
    explicit VkAllocator(const VulkanDevice* _vkdev);
    virtual ~VkAllocator();

    virtual void clear();

    virtual VkBufferMemory* fastMalloc(size_t size) = 0;
    virtual void fastFree(VkBufferMemory* ptr) = 0;
    virtual int flush(VkBufferMemory* ptr);
    virtual int invalidate(VkBufferMemory* ptr);

    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0;
    virtual void fastFree(VkImageMemory* ptr) = 0;

public:
    const VulkanDevice* vkdev;
    uint32_t buffer_memory_type_index;
    uint32_t image_memory_type_index;
    uint32_t reserved_type_index;
    bool mappable;
    bool coherent;

protected:
    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
    VkImageView create_imageview(VkImage image, VkFormat format);
};

class VkBlobAllocatorPrivate;
class NCNN_EXPORT VkBlobAllocator : public VkAllocator
{
public:
    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // 16M
    virtual ~VkBlobAllocator();

public:
    // release all budgets immediately
    virtual void clear();

    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    VkBlobAllocator(const VkBlobAllocator&);
    VkBlobAllocator& operator=(const VkBlobAllocator&);

private:
    VkBlobAllocatorPrivate* const d;
};

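// Usage sketch (assumes a valid VulkanDevice* vkdev and the blob_vkallocator /
// staging_vkallocator fields of ncnn::Option declared in option.h):
//     ncnn::VkBlobAllocator blob_vkallocator(vkdev);
//     ncnn::VkStagingAllocator staging_vkallocator(vkdev);
//
//     ncnn::Option opt;
//     opt.use_vulkan_compute = true;
//     opt.blob_vkallocator = &blob_vkallocator;
//     opt.staging_vkallocator = &staging_vkallocator;
//     // ... run GPU inference, then blob_vkallocator.clear() to release budgets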

class VkWeightAllocatorPrivate;
class NCNN_EXPORT VkWeightAllocator : public VkAllocator
{
public:
    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // 8M
    virtual ~VkWeightAllocator();

public:
    // release all blocks immediately
    virtual void clear();

public:
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    VkWeightAllocator(const VkWeightAllocator&);
    VkWeightAllocator& operator=(const VkWeightAllocator&);

private:
    VkWeightAllocatorPrivate* const d;
};

class VkStagingAllocatorPrivate;
class NCNN_EXPORT VkStagingAllocator : public VkAllocator
{
public:
    explicit VkStagingAllocator(const VulkanDevice* vkdev);
    virtual ~VkStagingAllocator();

public:
    // ratio range 0 ~ 1
    // default cr = 0.75
    void set_size_compare_ratio(float scr);

    // release all budgets immediately
    virtual void clear();

    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    VkStagingAllocator(const VkStagingAllocator&);
    VkStagingAllocator& operator=(const VkStagingAllocator&);

private:
    VkStagingAllocatorPrivate* const d;
};

class VkWeightStagingAllocatorPrivate;
class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
{
public:
    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
    virtual ~VkWeightStagingAllocator();

public:
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    VkWeightStagingAllocator(const VkWeightStagingAllocator&);
    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);

private:
    VkWeightStagingAllocatorPrivate* const d;
};

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
{
public:
    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
    virtual ~VkAndroidHardwareBufferImageAllocator();

public:
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&);
    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);

public:
    int init();
    int width() const;
    int height() const;
    uint64_t external_format() const;

public:
    AHardwareBuffer* hb;
    AHardwareBuffer_Desc bufferDesc;
    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
};
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API

#endif // NCNN_VULKAN

} // namespace ncnn

#endif // NCNN_ALLOCATOR_H