update autoxserver.
enjoysport2022 committed Jan 18, 2022
1 parent 3a085a0 commit a29628a
Showing 16 changed files with 751 additions and 90 deletions.
45 changes: 45 additions & 0 deletions autox/autox_server/feature_engineer/fe_accumulate.py
@@ -0,0 +1,45 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')


def fe_accumulate(G_df_dict, G_data_info, G_hist, is_train, remain_time):
    # Derive extra features from the G_df_dict['BIG'] table

    start = time.time()
    log('[+] feature engineer, accumulate')
    time_col = G_data_info['target_time']

    if is_train:
        G_hist['FE_Accumulate'] = {}
        G_hist['FE_Accumulate']['normal'] = []
        G_hist['FE_Accumulate']['time'] = []

        for col in tqdm(G_hist['big_cols_cat']):
            G_hist['FE_Accumulate']['normal'].append(col)
        log("accumulate normal features: {}".format(G_hist['FE_Accumulate']['normal']))

        if G_data_info['time_series_data'] == 'true':
            if G_hist['big_data_type'][time_col] == 'Unix_timestamp':
                G_df_dict['BIG'] = G_df_dict['BIG'].sort_values(by=time_col)

                for col in tqdm(G_hist['big_cols_cat']):
                    G_hist['FE_Accumulate']['time'].append(col)
                log("window features: {}".format(G_hist['FE_Accumulate']['time']))

    G_df_dict['FE_Accumulate'] = pd.DataFrame()
    for col in tqdm(G_hist['FE_Accumulate']['normal']):
        G_df_dict['FE_Accumulate'][f'{col}_acc_cnt'] = G_df_dict['BIG'].groupby(col).cumcount()

    for col in tqdm(G_hist['FE_Accumulate']['time']):
        G_df_dict['FE_Accumulate'][f'{col}_min_{time_col}'] = G_df_dict['BIG'].groupby(col)[time_col].transform('min')
        G_df_dict['FE_Accumulate'][f'{col}_acc_cnt_div_delta_time'] = G_df_dict['FE_Accumulate'][f'{col}_acc_cnt'] / \
            (G_df_dict['BIG'][time_col] - G_df_dict['FE_Accumulate'][f'{col}_min_{time_col}'] + 1)

    end = time.time()
    remain_time -= (end - start)
    log("remain_time: {} s".format(remain_time))
    return remain_time
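A minimal sketch (toy data, not part of the commit) of the two features fe_accumulate emits per categorical column: a running count of prior occurrences, and that count divided by the time elapsed since the category first appeared.

import pandas as pd

big = pd.DataFrame({'user': ['a', 'a', 'b', 'a'], 'ts': [1, 3, 4, 9]})  # ts: Unix timestamps
acc = pd.DataFrame()
acc['user_acc_cnt'] = big.groupby('user').cumcount()                     # 0, 1, 0, 2
min_ts = big.groupby('user')['ts'].transform('min')                      # first time each user was seen
acc['user_acc_cnt_div_delta_time'] = acc['user_acc_cnt'] / (big['ts'] - min_ts + 1)  # 0, 1/3, 0, 2/9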
52 changes: 52 additions & 0 deletions autox/autox_server/feature_engineer/fe_count_map.py
@@ -0,0 +1,52 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')

def fe_count_map(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Derive extra features from the G_df_dict['BIG'] table

    start = time.time()
    log('[+] feature engineer, count')

    Id = G_data_info['target_id']
    target = G_data_info['target_label']

    if is_train:
        G_hist['FE_count'] = {}
        G_hist['FE_count']['feature_map'] = {}
        G_hist['FE_count']['cnt_features'] = []
        size_of_big = G_df_dict['BIG'].shape[0]

        cnt_features = []
        for col in G_df_dict['BIG'].columns:
            if col in [target] + Id:
                continue
            if '_in_' in col:
                continue
            if 'int' in str(G_df_dict['BIG'][col].dtype):
                if G_df_dict['BIG'][col].nunique() < size_of_big * 0.8 and G_df_dict['BIG'][col].nunique() < 200000:
                    cnt_features.append(col)
        G_hist['FE_count']['cnt_features'] = cnt_features
        log("count features: {}".format(cnt_features))

        for f in cnt_features:
            temp = pd.DataFrame(G_df_dict['BIG'][f])
            temp[f + '_cnt'] = temp.groupby([f])[f].transform('count')
            temp.index = temp[f]
            temp = temp.drop(f, axis=1)
            feature_map = temp.to_dict()[f + '_cnt']
            G_hist['FE_count']['feature_map'][f] = feature_map

    if not AMPERE:
        G_df_dict['FE_count'] = pd.DataFrame()
        for f in G_hist['FE_count']['cnt_features']:
            G_df_dict['FE_count'][f + "_cnt"] = G_df_dict['BIG'][f].map(G_hist['FE_count']['feature_map'][f])

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
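A hedged sketch (toy data, not from the repository) of the train/serve split in fe_count_map: the value-to-count map is built once on the training table, then new rows are scored with Series.map, so values unseen at train time become NaN instead of leaking test-time counts.

import pandas as pd

train = pd.DataFrame({'city': [1, 1, 2, 3]})
feature_map = train.groupby('city')['city'].count().to_dict()   # {1: 2, 2: 1, 3: 1}, same map the code builds
test = pd.DataFrame({'city': [2, 1, 4]})
test['city_cnt'] = test['city'].map(feature_map)                # 1.0, 2.0, NaN (value 4 unseen in train)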
33 changes: 33 additions & 0 deletions autox/autox_server/feature_engineer/fe_count_ratio.py
@@ -0,0 +1,33 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')

def fe_count_ratio(G_df_dict, G_data_info, G_hist, is_train, remain_time):
    # Derive extra features from the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, count ratio')

    if is_train:
        G_hist['FE_count_ratio'] = []
        size_of_big = G_df_dict['BIG'].shape[0]

        cnt_ratio_features = []
        for col in G_hist['big_cols_cat'] + G_hist['big_cols_num']:
            if G_df_dict['BIG'][col].nunique() < size_of_big * 0.8:
                cnt_ratio_features.append(col)
        G_hist['FE_count_ratio'] = cnt_ratio_features
        log("count ratio features: {}".format(cnt_ratio_features))

    G_df_dict['FE_count_ratio'] = pd.DataFrame()
    for col in tqdm(G_hist['FE_count_ratio']):
        G_df_dict['FE_count_ratio'][col + "_cnt_ratio"] = G_df_dict['BIG'].groupby(col)[col].transform('count') / \
            G_df_dict['BIG'].shape[0]

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
67 changes: 67 additions & 0 deletions autox/autox_server/feature_engineer/fe_frequency.py
@@ -0,0 +1,67 @@
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
import time
from autox.autox_server.util import log

def _groupby_agg_rolling(df, keys, col, op, k, col_time):
    name = 'WIN_{}_{}_({})_({})'.format(k, op.upper(), '_'.join(keys), col)
    if type(k) == int:
        s = df.groupby(keys)[[col]]
        df_gp = s.rolling(window=k).agg(op)  # rolling over a fixed number of rows
    else:
        closed = 'left'  # [left, right)
        # closed = 'both'  # [left, right]
        s = df.groupby(keys)[[col_time, col]]
        df_gp = s.rolling(window=k, on=col_time, closed=closed).agg(op).iloc[:, -1:]  # rolling over a time window
    df_gp.columns = [name]
    df_gp = df_gp.sort_index(level=1).reset_index(drop=True)
    return df_gp

def fe_frequency(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Derive extra features from the G_df_dict['BIG'] table

    start = time.time()
    log('[+] feature engineer, frequency')

    big_size = G_df_dict['BIG'].shape[0]
    time_col = G_data_info['target_time']

    if is_train:
        G_hist['FE_frequency'] = {}
        G_hist['FE_frequency']['keys'] = []
        G_hist['FE_frequency']['cols'] = []

        if G_data_info['time_series_data'] == 'true':
            # NOTE: the df must be sorted by time first
            G_df_dict['BIG'] = G_df_dict['BIG'].sort_values(by=time_col)

        keys_features = []
        for col in G_hist['big_cols_cat']:
            if big_size * 0.005 < G_df_dict['BIG'][col].nunique() < big_size * 0.01:
                keys_features.append(col)
        G_hist['FE_frequency']['keys'] = keys_features
        log("FE_frequency keys: {}".format(keys_features))

        cols_features = []
        for col in G_hist['big_cols_cat']:
            if big_size * 0.6 < G_df_dict['BIG'][col].nunique() < big_size * 0.8:
                cols_features.append(col)
        G_hist['FE_frequency']['cols'] = cols_features
        log("FE_frequency cols: {}".format(cols_features))

    if not AMPERE:
        G_df_dict['FE_frequency'] = pd.DataFrame()
        for col in G_hist['FE_frequency']['cols']:
            for key_ in G_hist['FE_frequency']['keys']:
                df = G_df_dict['BIG'][[key_, col]].copy()
                keys = [key_]
                df['x'] = df.groupby(keys + [col])[col].transform('count') / df.groupby(keys)[col].transform('count')
                df['y'] = df.groupby(keys)['x'].transform('max')
                G_df_dict['FE_frequency'][f'{key_}__with__{col}__frequency'] = df['y']

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
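A worked toy example (assumed data) of the frequency feature emitted above: x is each (key, col) pair's share within its key group, and the output y is the key's dominant share, i.e. how concentrated that key's col values are.

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'a', 'b'], 'col': ['u', 'u', 'v', 'u']})
df['x'] = df.groupby(['key', 'col'])['col'].transform('count') / \
          df.groupby(['key'])['col'].transform('count')   # 2/3, 2/3, 1/3, 1.0
df['y'] = df.groupby(['key'])['x'].transform('max')        # key 'a' -> 2/3, key 'b' -> 1.0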
37 changes: 37 additions & 0 deletions autox/autox_server/feature_engineer/fe_hash_discrete.py
@@ -0,0 +1,37 @@
import time
import warnings

import pandas as pd
from tqdm import tqdm
from autox.autox_server.util import log
warnings.filterwarnings('ignore')

def fe_hash_discrete(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Derive extra features from the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, hash_discrete')

    if is_train:
        G_hist['FE_hash_discrete'] = []
        col_hash_discrete = []
        if G_hist['super_big_data']:
            for col in G_hist['big_cols_cat']:
                # for features with nunique > 10000, truncate values and keep the last 4 characters
                if G_df_dict['BIG'][col].nunique() > 10000:
                    col_hash_discrete.append(col)

        G_hist['FE_hash_discrete'] = col_hash_discrete
        log("hash_discrete features: {}".format(G_hist['FE_hash_discrete']))

    if not AMPERE:
        G_df_dict['FE_hash_discrete'] = pd.DataFrame()
        for col in tqdm(G_hist['FE_hash_discrete']):
            G_df_dict['FE_hash_discrete'][f"{col}_hash_discrete"] = G_df_dict['BIG'][col].apply(lambda x: str(x)[-4:])

        # todo: map this to feql's direct discrete signature

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
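A two-line sanity check (hypothetical values) of the truncation applied above: only the last four characters of each high-cardinality value are kept, collapsing numeric ids into at most 10^4 buckets.

import pandas as pd

pd.Series([987654321, 42]).apply(lambda x: str(x)[-4:])   # -> '4321', '42'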
66 changes: 66 additions & 0 deletions autox/autox_server/feature_engineer/fe_kv.py
@@ -0,0 +1,66 @@
import warnings
import pandas as pd
import numpy as np
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from pypinyin import pinyin, lazy_pinyin, Style

def str2map(s):
    # parse a 'k1:v1,k2:v2' string into a dict; missing values become an empty dict
    if s is None or str(s) in ('None', 'nan'):
        return {}
    return {si.split(':')[0]: si.split(':')[1] for si in s.split(',')}

def get_keys(kv):
    return list(kv.keys())

def fe_kv(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Derive extra features from the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, kv')

    if is_train:
        G_hist['FE_kv'] = {}
        G_hist['FE_kv']['cols'] = []
        G_hist['FE_kv']['col_top_keys'] = {}

        cols_kv = [x for x in G_hist['big_cols_kv'] if x in G_df_dict['BIG'].columns]
        G_hist['FE_kv']['cols'] = cols_kv
        log("kv features: {}".format(G_hist['FE_kv']['cols']))

        for col in cols_kv:
            temp = G_df_dict['BIG'][[col]].copy()
            temp[col] = temp[col].apply(str2map)
            temp[col + '_keys'] = temp[col].apply(get_keys)

            vectorizer = CountVectorizer(max_features=100)
            vectorizer.fit_transform(temp[col + '_keys'].astype(str))
            G_hist['FE_kv']['col_top_keys'][col] = vectorizer.get_feature_names()

    if not AMPERE:
        G_df_dict['FE_kv'] = pd.DataFrame()
        for col in tqdm(G_hist['FE_kv']['cols']):
            temp = G_df_dict['BIG'][[col]].copy()
            temp[col] = temp[col].apply(str2map)  # parse once per column, not once per key
            for key_ in G_hist['FE_kv']['col_top_keys'][col]:
                try:
                    G_df_dict['FE_kv'][f"{col}__{key_}__kv"] = temp[col].apply(lambda x: float(x.get(key_, np.nan)))
                except:
                    pass

        G_hist['FE_kv']['rename'] = {}
        cols_name = []
        for i, col in enumerate(G_df_dict['FE_kv'].columns):
            col_rename = ''.join(lazy_pinyin(col)) + f'__idx{i}'
            cols_name.append(col_rename)
            G_hist['FE_kv']['rename'][col_rename] = col
        G_df_dict['FE_kv'].columns = cols_name

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
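A small end-to-end sketch (toy column, not from the repo) of the kv pipeline: parse 'k:v,k:v' strings with str2map, let CountVectorizer pick the most frequent keys, then expand each selected key into its own numeric column, as the loop above does.

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

s = pd.Series(['price:9.5,color:1', 'price:3.0', None])
maps = s.apply(str2map)                        # [{'price': '9.5', 'color': '1'}, {'price': '3.0'}, {}]
vec = CountVectorizer(max_features=100)
vec.fit(maps.apply(lambda m: str(list(m.keys()))))
top_keys = vec.get_feature_names()             # ['color', 'price'] (get_feature_names_out on sklearn >= 1.0)
out = pd.DataFrame({f'{k}__kv': maps.apply(lambda m, k=k: float(m.get(k, np.nan))) for k in top_keys})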
48 changes: 48 additions & 0 deletions autox/autox_server/feature_engineer/fe_stat_for_same_prefix.py
@@ -0,0 +1,48 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')
import re

def fe_stat_for_same_prefix(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Derive extra features from the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, stat_for_same_prefix')

    if is_train:
        G_hist['FE_stat_for_same_prefix'] = []

        cols_agg_list = []
        cols = G_df_dict['BIG'].columns
        c_1_list = [col for col in cols if bool(re.search(r'_1$', str(col)))]
        for c_1 in c_1_list:
            c_list = [c_1]
            for i in range(2, 20):
                c_i = re.sub(r'_1$', '_{}'.format(i), c_1)  # replace only the '_1' suffix
                if c_i in cols:
                    c_list.append(c_i)
            num_flag = True
            for item in c_list:
                if str(G_df_dict['BIG'][item].dtype) == 'object':
                    num_flag = False
            if num_flag and len(c_list) == 3:
                cols_agg_list.append(c_list)
        G_hist['FE_stat_for_same_prefix'] = cols_agg_list
        log("stat_for_same_prefix features: {}".format(G_hist['FE_stat_for_same_prefix']))

    if not AMPERE:
        G_df_dict['FE_stat_for_same_prefix'] = pd.DataFrame()
        for cols_agg in tqdm(G_hist['FE_stat_for_same_prefix']):
            G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__mean'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].mean(axis=1)
            # G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__median'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].median(axis=1)
            G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__min'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].min(axis=1)
            G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__max'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].max(axis=1)
            # G_df_dict['FE_stat_for_same_prefix']['{}__stat_for_same_prefix__std'.format('__col__'.join(cols_agg))] = G_df_dict['BIG'][cols_agg].std(axis=1)

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
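A toy illustration (assumed column names) of what the block above produces for one detected group: columns f_1, f_2, f_3 share a numeric prefix family, so row-wise mean/min/max are taken across them.

import pandas as pd

big = pd.DataFrame({'f_1': [1.0, 4.0], 'f_2': [2.0, 5.0], 'f_3': [3.0, 6.0]})
group = ['f_1', 'f_2', 'f_3']                  # found via the '_1' suffix scan
stats = pd.DataFrame()
stats['f__mean'] = big[group].mean(axis=1)     # 2.0, 5.0
stats['f__min'] = big[group].min(axis=1)       # 1.0, 4.0
stats['f__max'] = big[group].max(axis=1)       # 3.0, 6.0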
37 changes: 37 additions & 0 deletions autox/autox_server/feature_engineer/fe_time_count.py
@@ -0,0 +1,37 @@
import warnings
import pandas as pd
import time
from autox.autox_server.util import log
from tqdm import tqdm
warnings.filterwarnings('ignore')

def fe_time_count(G_df_dict, G_data_info, G_hist, is_train, remain_time, AMPERE):
    # Derive extra features from the G_df_dict['BIG'] table
    start = time.time()
    log('[+] feature engineer, time count')
    time_col = G_data_info['target_time']

    if is_train:
        G_hist['FE_time_count'] = []
        size_of_big = G_df_dict['BIG'].shape[0]
        if G_data_info['time_series_data'] == 'true':
            G_df_dict['BIG'] = G_df_dict['BIG'].sort_values(by=time_col)
            for col in G_hist['big_cols_cat']:
                if G_df_dict['BIG'][col].nunique() < size_of_big * 0.8:
                    G_hist['FE_time_count'].append(col)

        if G_hist['big_data']:
            G_hist['FE_time_count'] = []

        log("time count features: {}".format(G_hist['FE_time_count']))

    if not AMPERE:
        G_df_dict['FE_time_count'] = pd.DataFrame()
        for col in tqdm(G_hist['FE_time_count']):
            G_df_dict['FE_time_count'][f'{col}__time_count'] = G_df_dict['BIG'].groupby([col, time_col])[col].transform('count')

    end = time.time()
    remain_time -= (end - start)
    log("time consumption: {}".format(str(end - start)))
    log("remain_time: {} s".format(remain_time))
    return remain_time
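A brief sketch (toy data) of the time-count feature: for each row, how many rows share both its category value and its timestamp, which surfaces burst activity in time-series tables.

import pandas as pd

big = pd.DataFrame({'user': ['a', 'a', 'a', 'b'], 'ts': [1, 1, 2, 1]})
big['user__time_count'] = big.groupby(['user', 'ts'])['user'].transform('count')   # 2, 2, 1, 1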