zoukankan      html  css  js  c++  java
  • 机器学习之内存优化

    因为训练数据集往往比较大,而内存会出现不够用的情况,可以通过修改特征的数据类型,从而达到优化压缩的目的

    1、普通方法,直接复制调用就行

    参考网址:https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

    def reduce_mem_usage(df, verbose=True):
        """Downcast every numeric column of *df* to the smallest dtype that
        can represent its observed value range, to shrink memory usage.

        :param df: pandas DataFrame to compress in place (columns are
            reassigned on *df* itself; the same frame is also returned).
        :param verbose: if True, print the resulting memory footprint and
            the percentage reduction.
        :return: the (modified) DataFrame.

        NOTE: float columns may be downcast to float16, which keeps only
        ~3 significant decimal digits — acceptable for many ML features,
        but lossy for data that needs full precision.
        """
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        start_mem = df.memory_usage().sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    # Inclusive bounds: a value exactly equal to the dtype
                    # limit (e.g. 127 for int8) still fits that dtype.
                    if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
        end_mem = df.memory_usage().sum() / 1024**2
        if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
        return df

    2、封装完成的类

    参考网址:https://www.kaggle.com/wkirgsn/fail-safe-parallel-memory-reduction

    """reducing.py
    Author: Kirgsn, 2018
    
    Use like this:
    调用方式: >>> import reducing >>> df = reducing.Reducer().reduce(df)
    """ import numpy as np import pandas as pd import time import gc from joblib import Parallel, delayed def measure_time_mem(func): def wrapped_reduce(self, df, *args, **kwargs): # pre mem_usage_orig = df.memory_usage().sum() / self.memory_scale_factor start_time = time.time() # exec ret = func(self, df, *args, **kwargs) # post mem_usage_new = ret.memory_usage().sum() / self.memory_scale_factor end_time = time.time() print(f'reduced df from {mem_usage_orig:.4f} MB ' f'to {mem_usage_new:.4f} MB ' f'in {(end_time - start_time):.2f} seconds') gc.collect() return ret return wrapped_reduce class Reducer: """ Class that takes a dict of increasingly big numpy datatypes to transform the data of a pandas dataframe into, in order to save memory usage. """ memory_scale_factor = 1024**2 # memory in MB def __init__(self, conv_table=None, use_categoricals=True, n_jobs=-1): """ :param conv_table: dict with np.dtypes-strings as keys :param use_categoricals: Whether the new pandas dtype "Categoricals" shall be used :param n_jobs: Parallelization rate """ self.conversion_table = conv_table or {'int': [np.int8, np.int16, np.int32, np.int64], 'uint': [np.uint8, np.uint16, np.uint32, np.uint64], 'float': [np.float32, ]} self.use_categoricals = use_categoricals self.n_jobs = n_jobs def _type_candidates(self, k): for c in self.conversion_table[k]: i = np.iinfo(c) if 'int' in k else np.finfo(c) yield c, i @measure_time_mem def reduce(self, df, verbose=False): """Takes a dataframe and returns it with all data transformed to the smallest necessary types. 
:param df: pandas dataframe :param verbose: If True, outputs more information :return: pandas dataframe with reduced data types """ ret_list = Parallel(n_jobs=self.n_jobs)(delayed(self._reduce) (df[c], c, verbose) for c in df.columns) del df gc.collect() return pd.concat(ret_list, axis=1) def _reduce(self, s, colname, verbose): # skip NaNs if s.isnull().any(): if verbose: print(f'{colname} has NaNs - Skip..') return s # detect kind of type coltype = s.dtype if np.issubdtype(coltype, np.integer): conv_key = 'int' if s.min() < 0 else 'uint' elif np.issubdtype(coltype, np.floating): conv_key = 'float' else: if isinstance(coltype, object) and self.use_categoricals: # check for all-strings series if s.apply(lambda x: isinstance(x, str)).all(): if verbose: print(f'convert {colname} to categorical') return s.astype('category') if verbose: print(f'{colname} is {coltype} - Skip..') return s # find right candidate for cand, cand_info in self._type_candidates(conv_key): if s.max() <= cand_info.max and s.min() >= cand_info.min: if verbose: print(f'convert {colname} to {cand}') return s.astype(cand) # reaching this code is bad. Probably there are inf, or other high numbs print(f"WARNING: {colname} doesn't fit the grid with max: {s.max()} " f"and min: {s.min()}") print('Dropping it..')

     3、可以将其转换为feather格式,降低内存占用

         ps:转为feather格式的话需要配合pyarrow这个包,所以需要 pip install pyarrow

      用法很简单,结合1或者2方法先压缩数据类型,然后再生成feather文件

    # 生成一个feather文件
    your_df.to_feather(path)

    # 读取feather文件
    pd.read_feather(path)
  • 相关阅读:
    深入了解SQLServer系统数据库工作原理(转)
    什么是动态语言(转)
    ASP.NET 2.0客户端回调的实现分析
    什么是“分布式应用系统”
    SQLServer数据库安全管理机制详解
    什么是 CLR(转)
    docker容器下的asp.net core项目发布运维
    VLAN技术
    用getDrawingCache方法获取ImageView中的图像需要注意的问题
    交换机的工作原理
  • 原文地址:https://www.cnblogs.com/gambler/p/11988859.html
Copyright © 2011-2022 走看看