
Commit

Memory optimization.
enjoysport2022 committed Jul 26, 2021
1 parent 0714e37 commit 5cda17a
Showing 2 changed files with 44 additions and 2 deletions.
5 changes: 4 additions & 1 deletion autox/autox.py
@@ -5,7 +5,7 @@
from .process_data import feature_combination, train_test_divide, clip_label
from .process_data import feature_filter
from .process_data.feature_type_recognition import Feature_type_recognition
from .util import log
from .util import log, reduce_mem_usage

class AutoX():
def __init__(self, target, train_name, test_name, path, feature_type = {}, id = []):
@@ -77,6 +77,9 @@ def get_submit(self):
df_list = [df, self.dfs_['FE_count'], self.dfs_['FE_stat']]
self.dfs_['FE_all'] = feature_combination(df_list)

        # Memory optimization
self.dfs_['FE_all'] = reduce_mem_usage(self.dfs_['FE_all'])

        # Split data into train and test
train_length = self.info_['shape_of_train']
train, test = train_test_divide(self.dfs_['FE_all'], train_length)
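
Not part of the commit, but for readers skimming the diff: the hunk above downcasts the combined feature frame once, right after feature_combination and before the train/test split, so both halves inherit the smaller dtypes. A minimal standalone sketch of that ordering in plain pandas follows; the frame contents, sizes, and the autox.util import path are assumptions for illustration, and plain slicing stands in for train_test_divide.

# Illustrative sketch only, not part of the commit.
import numpy as np
import pandas as pd

from autox.util import reduce_mem_usage  # helper added to autox/util.py in this commit

train_length = 800
fe_all = pd.DataFrame({
    'count_feat': np.random.randint(0, 50, size=1000),  # small ints -> int8
    'stat_feat': np.random.rand(1000),                   # float64 -> float16
})

fe_all = reduce_mem_usage(fe_all)      # downcast once, before splitting
train = fe_all.iloc[:train_length]     # both halves keep the reduced dtypes
test = fe_all.iloc[train_length:]
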
41 changes: 40 additions & 1 deletion autox/util.py
@@ -1,5 +1,6 @@
import warnings
warnings.filterwarnings('ignore')
import numpy as np

# log
import logging
@@ -22,4 +23,42 @@ def log(entry, level='info'):
global nesting_level
space = '-' * (4 * nesting_level)

getattr(LOGGER, level)(f"{space} {entry}")
getattr(LOGGER, level)(f"{space} {entry}")

def reduce_mem_usage(df):
""" iterate through all the columns of a dataframe and modify the data type
to reduce memory usage.
"""
start_mem = df.memory_usage().sum() / 1024 ** 2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

for col in df.columns:
col_type = df[col].dtype

if col_type != object:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
else:
df[col] = df[col].astype('category')

end_mem = df.memory_usage().sum() / 1024 ** 2
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

return df
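
For reference (not part of the commit): a quick way to see what the helper does is to run a frame with wide default dtypes through it and compare dtypes before and after. The column names, values, and the autox.util import path below are assumptions for the example. Note that float columns whose values fit the float16 range are downcast to float16, which saves memory but keeps only about 3 significant decimal digits.

# Illustrative usage only, not part of the commit; the example data is made up.
import numpy as np
import pandas as pd

from autox.util import reduce_mem_usage

df = pd.DataFrame({
    'small_int': np.arange(1000, dtype=np.int64),             # 0..999 -> int16
    'big_int': np.arange(1000, dtype=np.int64) * 10_000_000,  # exceeds int32 -> stays int64
    'ratio': np.random.rand(1000),                            # [0, 1) -> float16
    'city': ['beijing', 'shanghai'] * 500,                    # object -> category
})

print(df.dtypes)   # int64 / int64 / float64 / object
df = reduce_mem_usage(df)
print(df.dtypes)   # int16 / int64 / float16 / category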
