STL源码笔记1 —— allocators
简述
allocators是STL中很重要的一个幕后英雄的角色,STL中的容器在使用过程中需要不断的放元素进去和取元素出来,而在此过程中,如何更高效的申请和释放内存是十分影响STL容器的性能的。
operator new() 和 malloc()
首先内存的分配动作,一层层调用下去,最终到了CRT的层面上都是调用malloc()来分配,而malloc再根据所在的操作系统,调用不同的操作系统api才能真正的拿到内存。下面是vs2015和gcc 2.95的源码,里面的 operator new 最终调用 malloc 进行内存分配:
//VS2015中,Microsoft Visual Studio 14.0\VC\crt\src\vcruntime\new_debug.cpp中的源码
// Debug-mode operator new: allocates through _malloc_dbg.
// size is the number of bytes requested; block_use, file_name and line_number
// are passed through to _malloc_dbg unchanged.
// Returns the first non-null block that _malloc_dbg yields.
void* __CRTDECL operator new(
size_t const size,
int const block_use,
char const* file_name,
int const line_number
)
{
// Retry loop: the only visible exits are the return below and whatever the
// __scrt_throw_* helpers do (their definitions are not shown here).
for (;;)
{
if (void* const block = _malloc_dbg(size, block_use, file_name, line_number))
{
return block;
}
// Allocation failed: call _callnewh; a zero result selects the failure path,
// a non-zero result falls through and the allocation is attempted again.
if (_callnewh(size) == 0)
{
// A request of exactly SIZE_MAX is reported through the
// bad_array_new_length helper, every other size through the bad_alloc one.
if (size == SIZE_MAX)
{
__scrt_throw_std_bad_array_new_length();
}
else
{
__scrt_throw_std_bad_alloc();
}
}
}
}
//VS2015中,Microsoft Visual Studio 14.0\VC\crt\src\linkopts\nothrownew.cpp中的源码
// Non-throwing operator new: calls malloc directly.
// Returns the block obtained from malloc(size), or nullptr once
// _callnewh(size) returns 0 after a failed allocation.
void* operator new(size_t size)
{
for (;;)
{
// Success path: hand back whatever malloc produced.
if (void* const block = malloc(size))
return block;
// Failure path: a zero result from _callnewh ends the loop with nullptr
// instead of an exception.
if (_callnewh(size) == 0)
return nullptr;
// The new handler was successful; try to allocate again...
}
}
//在gcc 2.95里面 gcc-2.95.1\gcc\cp\new1.cc
// Nothrow form of operator new: allocates sz bytes with malloc.
// Returns the allocated block, or 0 when allocation fails and either no new
// handler is installed or the handler throws bad_alloc.
void * operator new (size_t sz, const std::nothrow_t&) throw()
{
void *p;
/* malloc (0) is unpredictable; avoid it. */
if (sz == 0)
sz = 1;
p = (void *) malloc (sz);
// Retry for as long as malloc keeps returning null.
while (p == 0)
{
// Re-read the handler on every iteration.
new_handler handler = __new_handler;
if (! handler)
return 0;
// A bad_alloc thrown by the handler is turned into a null return, which
// keeps this overload consistent with its throw() specification.
try
{
handler ();
}
catch (bad_alloc &)
{
return 0;
}
p = (void *) malloc (sz);
}
return p;
}
// Throwing form of operator new: allocates sz bytes with malloc.
// Returns the allocated block; throws bad_alloc when allocation fails and no
// new handler is installed.
void * operator new (size_t sz) throw (std::bad_alloc)
{
void *p;
/* malloc (0) is unpredictable; avoid it. */
if (sz == 0)
sz = 1;
p = (void *) malloc (sz);
// Retry for as long as malloc keeps returning null.
while (p == 0)
{
// Re-read the handler on every iteration.
new_handler handler = __new_handler;
if (! handler)
throw bad_alloc ();
// The handler is called without a try block, so anything it throws
// propagates to the caller.
handler ();
p = (void *) malloc (sz);
}
return p;
}
然而通过malloc分配的内存,在debug模式下会带有许多额外的信息(包括大小、前后块指针、使用情况等信息),而即使是在release模式下,也至少会有标识大小的字节被占用。也就是说,每次申请内存都会有额外的开销,如果申请的空间很小,额外开销的占比就会很大。因此,就产生了通过自行管理内存来减少这种开销的想法,这也是STL的allocators分配器最核心的功能。
VS2015 中的allocator
在VS中,几个容器的设计是这样的:
// vector in VS (excerpt; the class body is elided).
// The allocator template parameter defaults to allocator<_Ty>, and the class
// derives publicly from _Vector_alloc<_Vec_base_types<_Ty, _Alloc> >.
template<class _Ty,
class _Alloc = allocator<_Ty> >
class vector
: public _Vector_alloc<_Vec_base_types<_Ty, _Alloc> >
{ // varying size array of values
...
};
// list in VS (excerpt; the class body is elided).
// The allocator template parameter defaults to allocator<_Ty>, and the class
// derives publicly from _List_buy<_Ty, _Alloc>.
template<class _Ty,
class _Alloc = allocator<_Ty> >
class list
: public _List_buy<_Ty, _Alloc>
{ // bidirectional linked list
...
};
可以看到在VS的容器里,默认使用的是allocator这个class,那么再去 Microsoft Visual Studio 14.0\VC\include\xmemory0 观察allocator的实现:
// Excerpt of the VS allocator class template; only the two allocate overloads
// are shown, the remaining members are elided.
template<class _Ty>
class allocator
{ // generic allocator for objects of class _Ty
public:
...
// Requests storage for _Count objects by forwarding the count and
// sizeof (_Ty) to _Allocate, then casts the result to pointer.
_DECLSPEC_ALLOCATOR pointer allocate(size_type _Count)
{ // allocate array of _Count elements
return (static_cast<pointer>(_Allocate(_Count, sizeof (_Ty))));
}
// Overload taking a hint pointer; the hint is unnamed and unused, and the
// call is forwarded to the single-argument overload.
_DECLSPEC_ALLOCATOR pointer allocate(size_type _Count, const void *)
{ // allocate array of _Count elements, ignore hint
return (allocate(_Count));
}
};
allocate调用了_Allocate,再去看_Allocate的实现:
// Allocates raw storage for _Count elements of _Sz bytes each.
// _Try_aligned_allocation (default true) enables the over-aligned path for
// large requests on x86/x64 builds.
// Returns a null pointer when _Count is 0, otherwise the allocated storage.
// Calls _Xbad_alloc() when the byte count would overflow size_t.
_DECLSPEC_ALLOCATOR void *_Allocate(size_t _Count, size_t _Sz,
bool _Try_aligned_allocation = true)
{ // allocate storage for _Count elements of size _Sz
void *_Ptr = 0;
// A zero-element request never reaches operator new.
if (_Count == 0)
return (_Ptr);
// check overflow of multiply
if ((size_t)(-1) / _Sz < _Count)
_Xbad_alloc(); // report no memory
const size_t _User_size = _Count * _Sz;
#if defined(_M_IX86) || defined(_M_X64)
// Large-block path: taken only when requested and when the size reaches
// _BIG_ALLOCATION_THRESHOLD.
if (_Try_aligned_allocation
&& _BIG_ALLOCATION_THRESHOLD <= _User_size)
{ // allocate large block
static_assert(sizeof (void *) < _BIG_ALLOCATION_ALIGNMENT,
"Big allocations should at least match vector register size");
// Reserve _NON_USER_SIZE extra bytes; a result that is not larger than
// _User_size means the addition wrapped around.
const size_t _Block_size = _NON_USER_SIZE + _User_size;
if (_Block_size <= _User_size)
_Xbad_alloc(); // report no memory
const uintptr_t _Ptr_container =
reinterpret_cast<uintptr_t>(::operator new(_Block_size));
_SCL_SECURE_ALWAYS_VALIDATE(_Ptr_container != 0);
// Skip past the reserved bytes, then clear the low address bits so the
// user pointer lands on a _BIG_ALLOCATION_ALIGNMENT boundary
// (assumes the alignment constant is a power of two -- TODO confirm).
_Ptr = reinterpret_cast<void *>((_Ptr_container + _NON_USER_SIZE)
& ~(_BIG_ALLOCATION_ALIGNMENT - 1));
// Store the address returned by operator new in the slot just before the
// user pointer.
static_cast<uintptr_t *>(_Ptr)[-1] = _Ptr_container;
#ifdef _DEBUG
// Debug builds additionally write a sentinel value one slot further back.
static_cast<uintptr_t *>(_Ptr)[-2] = _BIG_ALLOCATION_SENTINEL;
#endif /* _DEBUG */
}
else
#endif /* defined(_M_IX86) || defined(_M_X64) */
{ // allocate normal block
_Ptr = ::operator new(_User_size);
_SCL_SECURE_ALWAYS_VALIDATE(_Ptr != 0);
}
return (_Ptr);
}
可以看到,里面除了在乘法溢出时调用 _Xbad_alloc() 报告内存不足之外,就是直接调用 ::operator new 来实现内存的分配,所以实际上VS并没有对STL的allocator做特别的优化。
同样的,通过查看 deallocate() 对应的源代码,也看出来VS在释放的时候也只是一个对 operator delete 的封装而已。因此,可以认为VS在这一方面并没有做特殊设计。
GCC2.9 中的allocator
同样,我们也去看GCC中容器的实现:
//gcc-2.95.1\libstdc++\stl\stl_vector.h
// gcc vector (excerpt; the class body is elided).
// The allocator parameter defaults to whatever __STL_DEFAULT_ALLOCATOR(_Tp)
// expands to, and the class derives (protected) from _Vector_base<_Tp, _Alloc>.
template <class _Tp, class _Alloc = __STL_DEFAULT_ALLOCATOR(_Tp) >
class vector : protected _Vector_base<_Tp, _Alloc>
{
···
};
//gcc-2.95.1\libstdc++\stl\stl_list.h
// gcc list (excerpt; the class body is elided).
// The allocator parameter defaults to whatever __STL_DEFAULT_ALLOCATOR(_Tp)
// expands to, and the class derives (protected) from _List_base<_Tp, _Alloc>.
template <class _Tp, class _Alloc = __STL_DEFAULT_ALLOCATOR(_Tp) >
class list : protected _List_base<_Tp, _Alloc> {
...
};
// In stl_config.h.
// Defines __STL_DEFAULT_ALLOCATOR(T) only if it is not already defined:
// it expands to allocator<T> when __STL_USE_STD_ALLOCATORS is defined, and to
// alloc (ignoring T) otherwise.
# ifndef __STL_DEFAULT_ALLOCATOR
# ifdef __STL_USE_STD_ALLOCATORS
# define __STL_DEFAULT_ALLOCATOR(T) allocator<T>
# else
# define __STL_DEFAULT_ALLOCATOR(T) alloc
# endif
# endif
可以看到,宏 __STL_DEFAULT_ALLOCATOR,如果没有特别说明的情况下,是使用alloc的,那么我们来看看alloc类的实现:
// When _NOTHREADS is defined, the lock/unlock macros and __VOLATILE expand to
// nothing and __NODE_ALLOCATOR_THREADS is false, i.e. the single-threaded
// configuration.
// NOTE(review): the definitions used when _NOTHREADS is not defined are not
// part of this excerpt -- confirm which branch is the default for the build.
# ifdef _NOTHREADS
# define __NODE_ALLOCATOR_LOCK
# define __NODE_ALLOCATOR_UNLOCK
# define __NODE_ALLOCATOR_THREADS false
# define __VOLATILE
# endif
// alloc is __default_alloc_template instantiated with the threads flag chosen
// above and instance number 0.
typedef __default_alloc_template<__NODE_ALLOCATOR_THREADS, 0> alloc;
// Excerpt of __default_alloc_template showing its constants, the free-list
// node type and the static state; other members are elided.
template <bool threads, int inst>
class __default_alloc_template {
...
private:
// Block sizes step by _ALIGN bytes up to _MAX_BYTES, which gives
// _MAX_BYTES/_ALIGN = 16 free lists.
enum {_ALIGN = 8};
enum {_MAX_BYTES = 128};
enum {_NFREELISTS = _MAX_BYTES/_ALIGN};
// Free-list node. Being a union, the link pointer shares storage with the
// client data, so the link costs no extra bytes per block.
union _Obj {
union _Obj* _M_free_list_link;
char _M_client_data[1]; /* The client sees this. */
};
// One list head per block size.
static _Obj* __VOLATILE _S_free_list[_NFREELISTS];
// Chunk allocation state.
// _S_start_free/_S_end_free delimit the unused part of the current chunk;
// _S_heap_size is a running byte total kept alongside them.
static char* _S_start_free;
static char* _S_end_free;
static size_t _S_heap_size;
...
};
借用侯捷老师在课程中用的图,其实在gcc的alloc中,设计了一个16条链表的数组 _S_free_list,每一条链表负责某个大小的特定区块的分配,分别负责从8个字节到128个字节的区块。当容器需要分配内存的时候,都是从这个分配器中去申请,然后大小向上调整到8的倍数(例如120字节会调整到128字节),然后到分配器里再去对应的链表中搜索是否有空闲的区块(例如128字节需要到 _S_free_list[15] 里面搜索),如果该链表中没有挂着空闲内存,才会先尝试从内存池的剩余空间中切割,内存池也不够时再通过malloc向系统申请一大块内存再切割。
同样,再往下看,看看该类中的 allocate 和 deallocate的实现:
// Excerpt of __default_alloc_template showing the free-list index helper and
// the allocate/deallocate pair; other members are elided.
template <bool threads, int inst>
class __default_alloc_template {
...
// Maps a byte count to a zero-based free-list index: the count is rounded up
// to the next multiple of _ALIGN, divided by _ALIGN, then reduced by one.
static size_t _S_freelist_index(size_t __bytes) {
return (((__bytes) + _ALIGN-1)/_ALIGN - 1);
}
public:
// Returns a block of at least __n bytes.
// Requests above _MAX_BYTES go to malloc_alloc::allocate; smaller ones are
// served from the matching free list, refilled on demand.
/* __n must be > 0 */
static void* allocate(size_t __n)
{
_Obj* __VOLATILE* __my_free_list;
_Obj* __RESTRICT __result;
if (__n > (size_t) _MAX_BYTES) {
return(malloc_alloc::allocate(__n));
}
__my_free_list = _S_free_list + _S_freelist_index(__n);
__result = *__my_free_list;
// Empty list: _S_refill supplies the block for this request.
if (__result == 0) {
void* __r = _S_refill(_S_round_up(__n));
return __r;
}
// Pop the head node: the list now starts at its successor.
*__my_free_list = __result -> _M_free_list_link;
return (__result);
};
// Gives back a block of __n bytes at __p.
// Blocks above _MAX_BYTES go to malloc_alloc::deallocate; smaller ones are
// pushed onto the front of the matching free list.
/* __p may not be 0 */
static void deallocate(void* __p, size_t __n)
{
_Obj* __q = (_Obj*)__p;
_Obj* __VOLATILE* __my_free_list;
if (__n > (size_t) _MAX_BYTES) {
malloc_alloc::deallocate(__p, __n);
return;
}
__my_free_list = _S_free_list + _S_freelist_index(__n);
// Push front: the returned block becomes the new list head.
__q -> _M_free_list_link = *__my_free_list;
*__my_free_list = __q;
// lock is released here
// NOTE(review): no lock acquisition is visible in this excerpt.
}
...
};
_S_freelist_index(__n) 这个函数可以计算出需要的空间会落在 _S_free_list 这个数组的哪个index上,再从对应的链表中取出一块空余内存分配。而如果该链表上没有空余内存,则会调用 _S_refill(_S_round_up(__n)) 重新为该链表填充一批区块,源代码如下:
// Obtains a chunk of blocks of __n bytes each from _S_chunk_alloc and returns
// the first one; any further blocks are linked into the free list for that
// size.
// __n is the block size; it is used as-is for the pointer arithmetic and the
// free-list lookup.
template <bool __threads, int __inst>
void*
__default_alloc_template<__threads, __inst>::_S_refill(size_t __n)
{
// Ask for 20 blocks; _S_chunk_alloc takes the count by reference and may
// lower it.
int __nobjs = 20;
char* __chunk = _S_chunk_alloc(__n, __nobjs);
_Obj* __VOLATILE* __my_free_list;
_Obj* __result;
_Obj* __current_obj;
_Obj* __next_obj;
int __i;
// Only one block was obtained: it goes straight to the caller and the free
// list is left untouched.
if (1 == __nobjs) return(__chunk);
__my_free_list = _S_free_list + _S_freelist_index(__n);
/* Build free list in chunk */
// Block 0 is the return value; the list head is set to block 1.
__result = (_Obj*)__chunk;
*__my_free_list = __next_obj = (_Obj*)(__chunk + __n);
// Chain blocks 1 .. __nobjs-1 together, __n bytes apart; the final block
// gets a null link.
for (__i = 1; ; __i++) {
__current_obj = __next_obj;
__next_obj = (_Obj*)((char*)__next_obj + __n);
if (__nobjs - 1 == __i) {
__current_obj -> _M_free_list_link = 0;
break;
} else {
__current_obj -> _M_free_list_link = __next_obj;
}
}
return(__result);
}
其中,真正申请空间的函数就是 _S_chunk_alloc(__n, __nobjs) ,其中 __nobjs = 20。阅读下面的代码可以看到,如果内存池不够而需要重新向系统申请内存,会申请一块 2 × 20 = 40 倍区块大小再多一点(附加量随 _S_heap_size 增长)的内存,其中20块交给当前的链表,其余的留在内存池中备用:
// Bytes needed for the blocks requested right now.
size_t __total_bytes = __size * __nobjs;
// Bytes actually requested: twice that amount, plus 1/16 of _S_heap_size
// passed through _S_round_up.
size_t __bytes_to_get = 2 * __total_bytes + _S_round_up(_S_heap_size >> 4);
然后再从该空间中分出一块作为当前分配,具体源码如下:
// Carves storage for up to __nobjs blocks of __size bytes out of the region
// between _S_start_free and _S_end_free, obtaining a new region when the
// current one cannot hold even one block.
// __size  : size in bytes of one block.
// __nobjs : in/out; requested block count on entry, lowered when only part of
//           the request fits in the current region.
// Returns a pointer to the start of the carved storage.
template <bool __threads, int __inst>
char*
__default_alloc_template<__threads, __inst>::_S_chunk_alloc(size_t __size,
int& __nobjs)
{
char* __result;
size_t __total_bytes = __size * __nobjs;
size_t __bytes_left = _S_end_free - _S_start_free;
// Case 1: the current region covers the whole request.
if (__bytes_left >= __total_bytes) {
__result = _S_start_free;
_S_start_free += __total_bytes;
return(__result);
// Case 2: at least one block fits; hand out as many whole blocks as fit and
// report that count back through __nobjs.
} else if (__bytes_left >= __size) {
__nobjs = (int)(__bytes_left/__size);
__total_bytes = __size * __nobjs;
__result = _S_start_free;
_S_start_free += __total_bytes;
return(__result);
// Case 3: not even one block fits; get a new region.
} else {
// Request twice the needed bytes plus 1/16 of _S_heap_size (rounded by
// _S_round_up).
size_t __bytes_to_get =
2 * __total_bytes + _S_round_up(_S_heap_size >> 4);
// Try to make use of the left-over piece.
// The remainder is pushed onto the front of the free list selected by its
// own size.
if (__bytes_left > 0) {
_Obj* __VOLATILE* __my_free_list =
_S_free_list + _S_freelist_index(__bytes_left);
((_Obj*)_S_start_free) -> _M_free_list_link = *__my_free_list;
*__my_free_list = (_Obj*)_S_start_free;
}
_S_start_free = (char*)malloc(__bytes_to_get);
// malloc failed: fall back to blocks already sitting on the free lists.
if (0 == _S_start_free) {
size_t __i;
_Obj* __VOLATILE* __my_free_list;
_Obj* __p;
// Try to make do with what we have. That can't
// hurt. We do not try smaller requests, since that tends
// to result in disaster on multi-process machines.
// Scan the lists for sizes __size, __size + _ALIGN, ... up to _MAX_BYTES.
for (__i = __size; __i <= _MAX_BYTES; __i += _ALIGN) {
__my_free_list = _S_free_list + _S_freelist_index(__i);
__p = *__my_free_list;
if (0 != __p) {
// Unlink the head block, make it the new region, and retry the request.
*__my_free_list = __p -> _M_free_list_link;
_S_start_free = (char*)__p;
_S_end_free = _S_start_free + __i;
return(_S_chunk_alloc(__size, __nobjs));
// Any leftover piece will eventually make it to the
// right free list.
}
}
_S_end_free = 0; // In case of exception.
_S_start_free = (char*)malloc_alloc::allocate(__bytes_to_get);
// This should either throw an
// exception or remedy the situation. Thus we assume it
// succeeded.
}
// Record the new region, then retry; the retry is served by case 1 or 2.
_S_heap_size += __bytes_to_get;
_S_end_free = _S_start_free + __bytes_to_get;
return(_S_chunk_alloc(__size, __nobjs));
}
}