1 回答
TA贡献1934条经验 获得超2个赞
考虑使用列表推导式,针对目标变量的每个类别分别构建 x 和 kde 的列表。另外,不要在每次迭代中打印结果,而是把每次迭代的结果收集起来,最后合并成一个数据框:
def intersection_area_new(data, bandwidth, margin, target_variable_name):
    """Measure, for every feature column, how much the per-class KDEs overlap.

    For each column other than ``target_variable_name`` a Gaussian KDE is
    fitted to the values of every class of the target variable. The pointwise
    minimum of those densities is then integrated with the trapezoidal rule
    over a 500-point grid.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Rows containing any missing value are dropped before processing.
        bandwidth: Forwarded unchanged to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the grid range added on both sides of the grid,
            since a KDE extends beyond the data it was fitted on.
        target_variable_name: Name of the column holding the class labels.

    Returns:
        A DataFrame with one row per feature column and the columns
        ``target`` (the target column name), ``column`` (the feature name)
        and ``intersection`` (the integrated overlap of the class KDEs).
    """
    # Discard incomplete rows so every KDE is fitted on finite values only.
    data = data.dropna()

    # Class labels come from the column the caller named, not a fixed 'target'.
    target_values = data[target_variable_name]
    classes = target_values.unique()

    # The row masks per class do not depend on the feature, so build them once.
    class_masks = [target_values == cls for cls in classes]

    # Independent variables: every column except the target, wherever it sits.
    feature_columns = [c for c in data.columns if c != target_variable_name]

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    kde_dicts = []
    for column_name in feature_columns:
        # One sample series and one fitted KDE per class.
        x_s = [data.loc[mask, column_name] for mask in class_masks]
        kde_s = [gaussian_kde(x, bw_method=bandwidth) for x in x_s]

        # Grid bounds: lowest per-class minimum and lowest per-class maximum.
        # NOTE(review): the upper bound is the smallest class maximum, not the
        # largest, so the grid is not the union of the class ranges - kept as
        # in the original; confirm this is intended.
        x_min = min(x.min() for x in x_s)
        x_max = min(x.max() for x in x_s)
        dx = margin * (x_max - x_min)  # widen the grid: the KDE is wider than the data
        x_array = np.linspace(x_min - dx, x_max + dx, 500)

        # Pointwise minimum across all class densities is their common overlap.
        kde_x_s = [kde(x_array) for kde in kde_s]
        inters_x = np.array(kde_x_s).min(axis=0)
        area_inters_x = integrate(inters_x, x_array)

        kde_dicts.append({'target': target_variable_name,
                          'column': column_name,
                          'intersection': area_inters_x})

    # Explicit column names keep the result shape stable even with no features.
    return pd.DataFrame(kde_dicts, columns=['target', 'column', 'intersection'])
输出
# Example run: score every feature of `sample_dataset` against the "target"
# column, with the default KDE bandwidth and a 50% margin on the grid.
output = intersection_area_new(
    sample_dataset,
    bandwidth=None,
    margin=0.5,
    target_variable_name="target",
)
print(output.head(10))
# Sample output shown with the original answer:
#    target  column  intersection
# 0  target    var1      0.842256
# 1  target    var2      0.757190
# 2  target    var3      0.676021
# 3  target    var4      0.873074
# 4  target    var5      0.763626
# 5  target    var6      0.868560
添加回答
举报