数据集地址 (Kaggle Instacart Market Basket Analysis): https://www.kaggle.com/c/instacart-market-basket-analysis

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
# Cluster Instacart users by the aisles they buy from:
# load -> merge -> user x aisle count table -> PCA -> KMeans -> plot -> score.

# Load the four tables.
prior = pd.read_csv('instacart-market-basket-analysis/order_products__prior.csv')
products = pd.read_csv('instacart-market-basket-analysis/products.csv')
orders = pd.read_csv('instacart-market-basket-analysis/orders.csv')
aisles = pd.read_csv('instacart-market-basket-analysis/aisles.csv')

# Merge the four tables into one, so each purchased product row carries both
# its user_id and its aisle name. A single key per merge is enough; the
# original passed the same column name twice.
_mg = pd.merge(prior, products, on='product_id')
_mg = pd.merge(_mg, orders, on='order_id')
mt = pd.merge(_mg, aisles, on='aisle_id')
# Debug aid: print(mt.head(10))

# Cross-tabulation (a specialised group-by): rows are users, columns are
# aisles, each cell counts how often that user bought from that aisle.
cross = pd.crosstab(mt['user_id'], mt['aisle'])
# Debug aid: print(cross.head(10))

# Principal component analysis. A float n_components in (0, 1) asks PCA to
# keep as many components as needed to explain that share of the variance.
pca = PCA(n_components=0.9)
data = pca.fit_transform(cross)
# Debug aid: print(data.shape)

# Reduce the sample count: only the first 500 rows are clustered and plotted.
x = data[:500]
print(x.shape)

# Assume the users fall into four groups.
km = KMeans(n_clusters=4)
km.fit(x)
predict = km.predict(x)
print(predict)

# Visualise the clusters.
plt.figure(figsize=(10, 10))
# One colour per cluster label (the specific colours carry no meaning).
colored = ['orange', 'green', 'blue', 'purple']
color = [colored[i] for i in predict]
# Indices of the two PCA components shown on the axes. The axis labels are
# derived from these so they always match what is plotted (the original
# labelled the y-axis '2' while plotting component 20).
# NOTE(review): x must have more than 20 columns for this to work; the
# component count depends on the data, so confirm before changing inputs.
x_component = 1
y_component = 20
plt.scatter(x[:, x_component], x[:, y_component], color=color)
plt.xlabel(str(x_component))
plt.ylabel(str(y_component))
plt.show()

# Evaluate clustering quality with the silhouette coefficient. Print it
# explicitly: as a bare expression the value is discarded when this runs as
# a script rather than as the last expression of a notebook cell.
score = silhouette_score(x, predict)
print(score)

代码运行结果

(500, 27)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0 0 0 1 0 0
 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 3 0 0 0 1 0 0 0 0 0 0 0 0
 2 0 0 0 1 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 3 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 3 0 0 0 0 0 0 0 1 0 2 0 0 0 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 3 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 3
 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 2 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0]
# 评判结果
0.6102387230295737

发表回复