zoukankan      html  css  js  c++  java
  • 机器学习之内存优化

    因为训练数据集往往比较大,而内存会出现不够用的情况,可以通过修改特征的数据类型,从而达到优化压缩的目的

    1、普通方法,直接复制调用就行

    参考网址:https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

    def reduce_mem_usage(df, verbose=True):
        """Downcast numeric columns to the smallest dtype that holds their
        value range, reducing the DataFrame's memory footprint.

        :param df: pandas DataFrame, modified in place and returned.
        :param verbose: if True, print the resulting memory usage and the
            percentage saved.
        :return: the DataFrame with downcast numeric columns.
        """
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        start_mem = df.memory_usage().sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    # Inclusive bounds (>=/<=): a column whose extreme equals a
                    # dtype boundary (e.g. max == 127) previously failed the
                    # strict '<' test and landed one size too big.
                    for int_type in (np.int8, np.int16, np.int32, np.int64):
                        type_info = np.iinfo(int_type)
                        if c_min >= type_info.min and c_max <= type_info.max:
                            df[col] = df[col].astype(int_type)
                            break
                else:
                    # NOTE: float16 keeps only ~3 significant digits; fine for
                    # many ML features, but check precision-sensitive columns.
                    if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    # else: leave the column untouched. The old fallback cast
                    # to float64, which *grew* float16/float32 columns whose
                    # min/max was NaN or +/-inf.
        end_mem = df.memory_usage().sum() / 1024**2
        if verbose:
            print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
                end_mem, 100 * (start_mem - end_mem) / start_mem))
        return df

    2、封装完成的类

    参考网址:https://www.kaggle.com/wkirgsn/fail-safe-parallel-memory-reduction

    """reducing.py
    Author: Kirgsn, 2018
    
    Use like this:
    调用方式: >>> import reducing >>> df = reducing.Reducer().reduce(df)
    """ import numpy as np import pandas as pd import time import gc from joblib import Parallel, delayed def measure_time_mem(func): def wrapped_reduce(self, df, *args, **kwargs): # pre mem_usage_orig = df.memory_usage().sum() / self.memory_scale_factor start_time = time.time() # exec ret = func(self, df, *args, **kwargs) # post mem_usage_new = ret.memory_usage().sum() / self.memory_scale_factor end_time = time.time() print(f'reduced df from {mem_usage_orig:.4f} MB ' f'to {mem_usage_new:.4f} MB ' f'in {(end_time - start_time):.2f} seconds') gc.collect() return ret return wrapped_reduce class Reducer: """ Class that takes a dict of increasingly big numpy datatypes to transform the data of a pandas dataframe into, in order to save memory usage. """ memory_scale_factor = 1024**2 # memory in MB def __init__(self, conv_table=None, use_categoricals=True, n_jobs=-1): """ :param conv_table: dict with np.dtypes-strings as keys :param use_categoricals: Whether the new pandas dtype "Categoricals" shall be used :param n_jobs: Parallelization rate """ self.conversion_table = conv_table or {'int': [np.int8, np.int16, np.int32, np.int64], 'uint': [np.uint8, np.uint16, np.uint32, np.uint64], 'float': [np.float32, ]} self.use_categoricals = use_categoricals self.n_jobs = n_jobs def _type_candidates(self, k): for c in self.conversion_table[k]: i = np.iinfo(c) if 'int' in k else np.finfo(c) yield c, i @measure_time_mem def reduce(self, df, verbose=False): """Takes a dataframe and returns it with all data transformed to the smallest necessary types. 
:param df: pandas dataframe :param verbose: If True, outputs more information :return: pandas dataframe with reduced data types """ ret_list = Parallel(n_jobs=self.n_jobs)(delayed(self._reduce) (df[c], c, verbose) for c in df.columns) del df gc.collect() return pd.concat(ret_list, axis=1) def _reduce(self, s, colname, verbose): # skip NaNs if s.isnull().any(): if verbose: print(f'{colname} has NaNs - Skip..') return s # detect kind of type coltype = s.dtype if np.issubdtype(coltype, np.integer): conv_key = 'int' if s.min() < 0 else 'uint' elif np.issubdtype(coltype, np.floating): conv_key = 'float' else: if isinstance(coltype, object) and self.use_categoricals: # check for all-strings series if s.apply(lambda x: isinstance(x, str)).all(): if verbose: print(f'convert {colname} to categorical') return s.astype('category') if verbose: print(f'{colname} is {coltype} - Skip..') return s # find right candidate for cand, cand_info in self._type_candidates(conv_key): if s.max() <= cand_info.max and s.min() >= cand_info.min: if verbose: print(f'convert {colname} to {cand}') return s.astype(cand) # reaching this code is bad. Probably there are inf, or other high numbs print(f"WARNING: {colname} doesn't fit the grid with max: {s.max()} " f"and min: {s.min()}") print('Dropping it..')

     3、可以将其转换为feather格式,降低内存占用

         ps:转为feather格式的话需要配合pyarrow这个包,所以 pip install pyarrow

      用法很简单,结合1或者2方法先压缩数据类型,然后再生成feather文件

    # 生成一个feather文件
    your_df.to_feather(path)

    # 读取feather文件
    pd.read_feather(path)
  • 相关阅读:
    在Centos 7下编译openwrt+njit-client
    开博随笔
    Chapter 6. Statements
    Chapter 4. Arrays and Pointers
    Chapter 3. Library Types
    Chapter 2.  Variables and Basic Types
    关于stm32不常用的中断,如何添加, 比如timer10 timer11等
    keil 报错 expected an identifier
    案例分析 串口的地不要接到电源上 会烧掉
    案例分析 CAN OPEN 调试记录 进度
  • 原文地址:https://www.cnblogs.com/gambler/p/11988859.html
Copyright © 2011-2022 走看看