
Commit

Memory optimization.
enjoysport2022 committed Jul 26, 2021
1 parent 0714e37 commit 5cda17a
Showing 2 changed files with 44 additions and 2 deletions.
5 changes: 4 additions & 1 deletion autox/autox.py
@@ -5,7 +5,7 @@
from .process_data import feature_combination, train_test_divide, clip_label
from .process_data import feature_filter
from .process_data.feature_type_recognition import Feature_type_recognition
from .util import log
from .util import log, reduce_mem_usage

class AutoX():
def __init__(self, target, train_name, test_name, path, feature_type = {}, id = []):
@@ -77,6 +77,9 @@ def get_submit(self):
df_list = [df, self.dfs_['FE_count'], self.dfs_['FE_stat']]
self.dfs_['FE_all'] = feature_combination(df_list)

        # Memory optimization
self.dfs_['FE_all'] = reduce_mem_usage(self.dfs_['FE_all'])

        # Split data into train and test
train_length = self.info_['shape_of_train']
train, test = train_test_divide(self.dfs_['FE_all'], train_length)
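
Not part of the commit, but for readers skimming the diff: the hunk above downcasts the combined feature frame once, right after feature_combination and before the train/test split, so both halves inherit the smaller dtypes. A minimal standalone sketch of that ordering in plain pandas follows; the frame contents, sizes, and the autox.util import path are assumptions for illustration, and plain slicing stands in for train_test_divide.

# Illustrative sketch only, not part of the commit.
import numpy as np
import pandas as pd

from autox.util import reduce_mem_usage  # helper added to autox/util.py in this commit

train_length = 800
fe_all = pd.DataFrame({
    'count_feat': np.random.randint(0, 50, size=1000),  # small ints -> int8
    'stat_feat': np.random.rand(1000),                   # float64 -> float16
})

fe_all = reduce_mem_usage(fe_all)      # downcast once, before splitting
train = fe_all.iloc[:train_length]     # both halves keep the reduced dtypes
test = fe_all.iloc[train_length:]
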
41 changes: 40 additions & 1 deletion autox/util.py
@@ -1,5 +1,6 @@
import warnings
warnings.filterwarnings('ignore')
import numpy as np

# log
import logging
@@ -22,4 +23,42 @@ def log(entry, level='info'):
global nesting_level
space = '-' * (4 * nesting_level)

getattr(LOGGER, level)(f"{space} {entry}")
getattr(LOGGER, level)(f"{space} {entry}")

def reduce_mem_usage(df):
""" iterate through all the columns of a dataframe and modify the data type
to reduce memory usage.
"""
start_mem = df.memory_usage().sum() / 1024 ** 2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

for col in df.columns:
col_type = df[col].dtype

if col_type != object:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
else:
df[col] = df[col].astype('category')

end_mem = df.memory_usage().sum() / 1024 ** 2
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

return df
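
For reference (not part of the commit): a quick way to see what the helper does is to run a frame with wide default dtypes through it and compare dtypes before and after. The column names, values, and the autox.util import path below are assumptions for the example. Note that float columns whose values fit the float16 range are downcast to float16, which saves memory but keeps only about 3 significant decimal digits.

# Illustrative usage only, not part of the commit; the example data is made up.
import numpy as np
import pandas as pd

from autox.util import reduce_mem_usage

df = pd.DataFrame({
    'small_int': np.arange(1000, dtype=np.int64),             # 0..999 -> int16
    'big_int': np.arange(1000, dtype=np.int64) * 10_000_000,  # exceeds int32 -> stays int64
    'ratio': np.random.rand(1000),                            # [0, 1) -> float16
    'city': ['beijing', 'shanghai'] * 500,                    # object -> category
})

print(df.dtypes)   # int64 / int64 / float64 / object
df = reduce_mem_usage(df)
print(df.dtypes)   # int16 / int64 / float16 / category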
