首页猿问如何在 python 中实现...

如何在 python 中实现 EM-GMM？

Python

森栏 2023-05-23 14:53:00

如下所示：import numpy as npdef PDF(data, means, variances): return 1/(np.sqrt(2 * np.pi * variances) + eps) * np.exp(-1/2 * (np.square(data - means) / (variances + eps)))def EM_GMM(data, k, iterations): weights = np.ones((k, 1)) / k # shape=(k, 1) means = np.random.choice(data, k)[:, np.newaxis] # shape=(k, 1) variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1) data = np.repeat(data[np.newaxis, :], k, 0) # shape=(k, n) for step in range(iterations): # Expectation step likelihood = PDF(data, means, np.sqrt(variances)) # shape=(k, n) # Maximization step b = likelihood * weights # shape=(k, n) b /= np.sum(b, axis=1)[:, np.newaxis] + eps # updage means, variances, and weights means = np.sum(b * data, axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps) variances = np.sum(b * np.square(data - means), axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps) weights = np.mean(b, axis=1)[:, np.newaxis] return means, variances我认为这是错误的，因为输出是两个向量，其中一个代表means值，另一个代表variances值。让我对实现产生怀疑的模糊点是它返回0.00000000e+000大部分可以看到的输出，并且不需要真正可视化这些输出。顺便说一句，输入数据是时间序列数据。我已经检查了所有内容并进行了多次跟踪，但没有出现错误。

查看完整描述

2 回答

心有法竹

TA贡献1866条经验获得超5个赞

我看到的关键点是means初始化。按照sklearn Gaussian Mixture的默认实现，我切换到 KMeans，而不是随机初始化。

import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

plt.style.use('seaborn')

eps=1e-8

def PDF(data, means, variances):

return 1/(np.sqrt(2 * np.pi * variances) + eps) * np.exp(-1/2 * (np.square(data - means) / (variances + eps)))

def EM_GMM(data, k=3, iterations=100, init_strategy='kmeans'):

weights = np.ones((k, 1)) / k # shape=(k, 1)

if init_strategy=='kmeans':

from sklearn.cluster import KMeans

km = KMeans(k).fit(data[:, None])

means = km.cluster_centers_ # shape=(k, 1)

else: # init_strategy=='random'

means = np.random.choice(data, k)[:, np.newaxis] # shape=(k, 1)

variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)

data = np.repeat(data[np.newaxis, :], k, 0) # shape=(k, n)

for step in range(iterations):

# Expectation step

likelihood = PDF(data, means, np.sqrt(variances)) # shape=(k, n)

# Maximization step

b = likelihood * weights # shape=(k, n)

b /= np.sum(b, axis=1)[:, np.newaxis] + eps

# updage means, variances, and weights

means = np.sum(b * data, axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)

variances = np.sum(b * np.square(data - means), axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)

weights = np.mean(b, axis=1)[:, np.newaxis]

return means, variances

这似乎更一致地产生所需的输出：

s = np.array([25.31 , 24.31 , 24.12 , 43.46 , 41.48666667,

41.48666667, 37.54 , 41.175 , 44.81 , 44.44571429,

44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,

44.44571429, 44.44571429, 39.71 , 26.69 , 34.15 ,

24.94 , 24.75 , 24.56 , 24.38 , 35.25 ,

44.62 , 44.94 , 44.815 , 44.69 , 42.31 ,

40.81 , 44.38 , 44.56 , 44.44 , 44.25 ,

43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,

40.75 , 32.31 , 36.08 , 30.135 , 24.19 ])

k=3

n_iter=100

means, variances = EM_GMM(s, k, n_iter)

print(means,variances)

[[44.42596231]

[24.509301 ]

[35.4137508 ]]

[[0.07568723]

[0.10583743]

[0.52125856]]

# Plotting the results

colors = ['green', 'red', 'blue', 'yellow']

bins = np.linspace(np.min(s)-2, np.max(s)+2, 100)

plt.figure(figsize=(10,7))

plt.xlabel('$x$')

plt.ylabel('pdf')

sns.scatterplot(s, [0.05] * len(s), color='navy', s=40, marker=2, label='Series data')

for i, (m, v) in enumerate(zip(means, variances)):

sns.lineplot(bins, PDF(bins, m, v), color=colors[i], label=f'Cluster {i+1}')

plt.legend()

plt.plot()

//img1.sycdn.imooc.com/646c63170001c9ba06070419.jpg

最后我们可以看到纯随机初始化产生了不同的结果；让我们看看结果means：

for _ in range(5):

print(EM_GMM(s, k, n_iter, init_strategy='random')[0], '\n')

[[44.42596231]

[44.42596231]

[44.42596231]]

[[44.42596231]

[24.509301 ]

[30.1349997 ]]

[[44.42596231]

[35.4137508 ]

[44.42596231]]

[[44.42596231]

[30.1349997 ]

[44.42596231]]

[[44.42596231]

[44.42596231]

[44.42596231]]

可以看出这些结果有多么不同，在某些情况下，结果均值是恒定的，这意味着初始化选择了 3 个相似的值并且在迭代时没有太大变化。在中添加一些打印语句EM_GMM将澄清这一点。

反对回复 2023-05-23

一只萌萌小番薯

TA贡献1795条经验获得超7个赞

# Expectation step

likelihood = PDF(data, means, np.sqrt(variances))

我们为什么要sqrt过去variances？pdf 函数接受差异。所以这应该是PDF(data, means, variances)。

另一个问题，

# Maximization step

b = likelihood * weights # shape=(k, n)

b /= np.sum(b, axis=1)[:, np.newaxis] + eps

上面第二行应该是b /= np.sum(b, axis=0)[:, np.newaxis] + eps

同样在的初始化中variances，

variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)

为什么我们要随机初始化方差？我们有data和means，为什么不像中那样计算当前估计方差vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1)？

通过这些更改，这是我的实现，

import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

plt.style.use('seaborn')

eps=1e-8

def pdf(data, means, vars):

denom = np.sqrt(2 * np.pi * vars) + eps

numer = np.exp(-0.5 * np.square(data - means) / (vars + eps))

return numer /denom

def em_gmm(data, k, n_iter, init_strategy='k_means'):

weights = np.ones((k, 1), dtype=np.float32) / k

if init_strategy == 'k_means':

from sklearn.cluster import KMeans

km = KMeans(k).fit(data[:, None])

means = km.cluster_centers_

else:

means = np.random.choice(data, k)[:, np.newaxis]

data = np.repeat(data[np.newaxis, :], k, 0)

vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1)

for step in range(n_iter):

p = pdf(data, means, vars)

b = p * weights

denom = np.expand_dims(np.sum(b, axis=0), 0) + eps

b = b / denom

means_n = np.sum(b * data, axis=1)

means_d = np.sum(b, axis=1) + eps

means = np.expand_dims(means_n / means_d, -1)

vars = np.sum(b * np.square(data - means), axis=1) / means_d

vars = np.expand_dims(vars, -1)

weights = np.expand_dims(np.mean(b, axis=1), -1)

return means, vars

def main():

s = np.array([25.31, 24.31, 24.12, 43.46, 41.48666667,

41.48666667, 37.54, 41.175, 44.81, 44.44571429,

44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,

44.44571429, 44.44571429, 39.71, 26.69, 34.15,

24.94, 24.75, 24.56, 24.38, 35.25,

44.62, 44.94, 44.815, 44.69, 42.31,

40.81, 44.38, 44.56, 44.44, 44.25,

43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,

40.75, 32.31, 36.08, 30.135, 24.19])

k = 3

n_iter = 100

means, vars = em_gmm(s, k, n_iter)

y = 0

colors = ['green', 'red', 'blue', 'yellow']

bins = np.linspace(np.min(s) - 2, np.max(s) + 2, 100)

plt.figure(figsize=(10, 7))

plt.xlabel('$x$')

plt.ylabel('pdf')

sns.scatterplot(s, [0.0] * len(s), color='navy', s=40, marker=2, label='Series data')

for i, (m, v) in enumerate(zip(means, vars)):

sns.lineplot(bins, pdf(bins, m, v), color=colors[i], label=f'Cluster {i + 1}')

plt.legend()

plt.plot()

plt.show()

pass

这是我的结果。

//img1.sycdn.imooc.com//646c633100012b5106580449.jpg

反对回复 2023-05-23

2 回答
0 关注
274 浏览

关注

添加回答

0/150

提交

取消

热搜

最近搜索清空

如何在 python 中实现 EM-GMM？

如何在 python 中实现 EM-GMM？

2 回答

添加回答