zoukankan      html  css  js  c++  java
  • 机器学习之内存优化

    因为训练数据集往往比较大,而内存会出现不够用的情况,可以通过修改特征的数据类型,从而达到优化压缩的目的

    1、普通方法,直接复制调用就行

    参考网址:https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

    def reduce_mem_usage(df, verbose=True):
        """Downcast every numeric column of *df* to the smallest dtype that
        can represent its observed value range, to shrink memory usage.

        :param df: pandas DataFrame to compress in place (columns are
            reassigned on *df* itself; the same frame is also returned).
        :param verbose: if True, print the resulting memory footprint and
            the percentage reduction.
        :return: the (modified) DataFrame.

        NOTE: float columns may be downcast to float16, which keeps only
        ~3 significant decimal digits — acceptable for many ML features,
        but lossy for data that needs full precision.
        """
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        start_mem = df.memory_usage().sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    # Inclusive bounds: a value exactly equal to the dtype
                    # limit (e.g. 127 for int8) still fits that dtype.
                    if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
        end_mem = df.memory_usage().sum() / 1024**2
        if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
        return df

    2、封装完成的类

    参考网址:https://www.kaggle.com/wkirgsn/fail-safe-parallel-memory-reduction

    """reducing.py
    Author: Kirgsn, 2018
    
    Use like this:
    调用方式: >>> import reducing >>> df = reducing.Reducer().reduce(df)
    """ import numpy as np import pandas as pd import time import gc from joblib import Parallel, delayed def measure_time_mem(func): def wrapped_reduce(self, df, *args, **kwargs): # pre mem_usage_orig = df.memory_usage().sum() / self.memory_scale_factor start_time = time.time() # exec ret = func(self, df, *args, **kwargs) # post mem_usage_new = ret.memory_usage().sum() / self.memory_scale_factor end_time = time.time() print(f'reduced df from {mem_usage_orig:.4f} MB ' f'to {mem_usage_new:.4f} MB ' f'in {(end_time - start_time):.2f} seconds') gc.collect() return ret return wrapped_reduce class Reducer: """ Class that takes a dict of increasingly big numpy datatypes to transform the data of a pandas dataframe into, in order to save memory usage. """ memory_scale_factor = 1024**2 # memory in MB def __init__(self, conv_table=None, use_categoricals=True, n_jobs=-1): """ :param conv_table: dict with np.dtypes-strings as keys :param use_categoricals: Whether the new pandas dtype "Categoricals" shall be used :param n_jobs: Parallelization rate """ self.conversion_table = conv_table or {'int': [np.int8, np.int16, np.int32, np.int64], 'uint': [np.uint8, np.uint16, np.uint32, np.uint64], 'float': [np.float32, ]} self.use_categoricals = use_categoricals self.n_jobs = n_jobs def _type_candidates(self, k): for c in self.conversion_table[k]: i = np.iinfo(c) if 'int' in k else np.finfo(c) yield c, i @measure_time_mem def reduce(self, df, verbose=False): """Takes a dataframe and returns it with all data transformed to the smallest necessary types. 
:param df: pandas dataframe :param verbose: If True, outputs more information :return: pandas dataframe with reduced data types """ ret_list = Parallel(n_jobs=self.n_jobs)(delayed(self._reduce) (df[c], c, verbose) for c in df.columns) del df gc.collect() return pd.concat(ret_list, axis=1) def _reduce(self, s, colname, verbose): # skip NaNs if s.isnull().any(): if verbose: print(f'{colname} has NaNs - Skip..') return s # detect kind of type coltype = s.dtype if np.issubdtype(coltype, np.integer): conv_key = 'int' if s.min() < 0 else 'uint' elif np.issubdtype(coltype, np.floating): conv_key = 'float' else: if isinstance(coltype, object) and self.use_categoricals: # check for all-strings series if s.apply(lambda x: isinstance(x, str)).all(): if verbose: print(f'convert {colname} to categorical') return s.astype('category') if verbose: print(f'{colname} is {coltype} - Skip..') return s # find right candidate for cand, cand_info in self._type_candidates(conv_key): if s.max() <= cand_info.max and s.min() >= cand_info.min: if verbose: print(f'convert {colname} to {cand}') return s.astype(cand) # reaching this code is bad. Probably there are inf, or other high numbs print(f"WARNING: {colname} doesn't fit the grid with max: {s.max()} " f"and min: {s.min()}") print('Dropping it..')

     3、可以将其转换为feather格式,降低内存占用

         ps:转为feather格式的话需要配合pyarrow这个包,所以需要 pip install pyarrow

      用法很简单,结合1或者2方法先压缩数据类型,然后再生成feather文件

    # 生成一个feather文件
    your_df.to_feather(path)

    # 读取feather文件
    pd.read_feather(path)
  • 相关阅读:
    深入了解SQLServer系统数据库工作原理(转)
    什么是动态语言(转)
    ASP.NET 2.0客户端回调的实现分析
    什么是“分布式应用系统”
    SQLServer数据库安全管理机制详解
    什么是 CLR(转)
    docker容器下的asp.net core项目发布运维
    VLAN技术
    用getDrawingCache方法获取ImageView中的图像需要注意的问题
    交换机的工作原理
  • 原文地址:https://www.cnblogs.com/gambler/p/11988859.html
Copyright © 2011-2022 走看看