数据挖掘流程简单示例10min
套路:
- 准备数据
- 实现算法
- 测试算法
任务1:亲和性分析
如果一个顾客买了商品X,那么他们可能愿意买商品Y衡量方法:
-
支持度support := 所有买X的人数
-
置信度confidence := 所有买X和Y的人数所有买X的人数\frac{ 所有买X和Y的人数 } { 所有买X的人数 }所有买X的人数所有买X和Y的人数
# 引入库
import numpy as np
from operator import itemgetter
# 准备数据# 创造随机生成的数据
X = np.zeros((100, 5), dtype='bool')
for i in range(X.shape[0]):if np.random.random() < 0.3:# A bread winnerX[i][0] = 1if np.random.random() < 0.5:# Who likes milkX[i][1] = 1if np.random.random() < 0.2:# Who likes cheeseX[i][2] = 1if np.random.random() < 0.25:# Who likes applesX[i][3] = 1if np.random.random() < 0.5:# Who likes bananasX[i][4] = 1else:# Not a bread winnerif np.random.random() < 0.5:# Who likes milkX[i][1] = 1if np.random.random() < 0.2:# Who likes cheeseX[i][2] = 1if np.random.random() < 0.25:# Who likes applesX[i][3] = 1if np.random.random() < 0.5:# Who likes bananasX[i][4] = 1else:if np.random.random() < 0.8:# Who likes cheeseX[i][2] = 1if np.random.random() < 0.6:# Who likes applesX[i][3] = 1if np.random.random() < 0.7:# Who likes bananasX[i][4] = 1if X[i].sum() == 0:X[i][4] = 1 # Must buy something, so gets bananas
np.savetxt("./data/affinity_dataset.txt", X, fmt='%d') # 保存# 读取数据
dataset_filename = "./data/affinity_dataset.txt"
X = np.loadtxt(dataset_filename) # 加载数据
n_samples, n_features = X.shape
print(X.shape)
print(X[:5])'''
(100, 5)
[[0. 0. 1. 1. 0.][1. 1. 0. 0. 0.][1. 0. 0. 1. 1.][0. 1. 1. 0. 1.][0. 1. 0. 0. 0.]]'''
我们定义的规则为:买了苹果又买香蕉
下面 rule_valid 表示买了苹果又买香蕉的有多少人
显然支持度 := 所有买X的人数,即支持度=rule_valid
所以置信度=支持度/总人数
# 文件affinity_dataset.txt是生成的数据,得我们来指定列
features = ["bread", "milk", "cheese", "apples", "bananas"]num_apple_purchases = 0 # 计数
for sample in X:if sample[3] == 1: # 记录买 Apples 的有多少人num_apple_purchases += 1
print("买苹果的有{0}人".format(num_apple_purchases))rule_valid = 0
rule_invalid = 0
for sample in X:if sample[3] == 1: # 买了苹果if sample[4] == 1:# 又买香蕉的rule_valid += 1else:# 不买香蕉的rule_invalid += 1
print("买了苹果又买香蕉的有{0}人".format(rule_valid))
print("买了苹果不买香蕉的有{0}人".format(rule_invalid))# 计算支持度support和置信度confidence
support = rule_valid # 支持度是符合“买了苹果又买香蕉”这个规则的人数
confidence = rule_valid / num_apple_purchases
print("支持度support = {0} 置信度confidence = {1:.3f}.".format(support, confidence))
# 置信度的百分比形式
print("置信度confidence的百分比形式为 {0:.1f}%.".format(100 * confidence))'''
买苹果的有39人
买了苹果又买香蕉的有23人
买了苹果不买香蕉的有16人
支持度support = 23 置信度confidence = 0.590.
置信度confidence的百分比形式为 59.0%.
'''
from collections import defaultdict
# 上面"买了苹果又买香蕉"是一种情况,现在把所有可能的情况都做一遍
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)for sample in X:for premise in range(n_features):if sample[premise] == 0: continue# 先买premise,premise代表一种食物,记做Xnum_occurences[premise] += 1for conclusion in range(n_features):if premise == conclusion: continue # 跳过买X又买X的情况if sample[conclusion] == 1: # 又买了conclusion,conclusion代表一种食物,记做Yvalid_rules[(premise, conclusion)] += 1 # 买X买Yelse: invalid_rules[(premise, conclusion)] += 1 # 买X没买Y
support = valid_rules
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]
for premise, conclusion in confidence:premise_name = features[premise]conclusion_name = features[conclusion]print("Rule: 买了{0},又买{1}".format(premise_name, conclusion_name))print(" - 置信度Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))print(" - 支持度Support: {0}".format(support[(premise, conclusion)]))print("")'''
Rule: 买了cheese,又买apples- 置信度Confidence: 0.553- 支持度Support: 26Rule: 买了apples,又买cheese- 置信度Confidence: 0.667- 支持度Support: 26Rule: 买了bread,又买milk- 置信度Confidence: 0.619- 支持度Support: 13Rule: 买了milk,又买bread- 置信度Confidence: 0.265- 支持度Support: 13Rule: 买了bread,又买apples- 置信度Confidence: 0.286- 支持度Support: 6Rule: 买了bread,又买bananas- 置信度Confidence: 0.476- 支持度Support: 10Rule: 买了apples,又买bread- 置信度Confidence: 0.154- 支持度Support: 6Rule: 买了apples,又买bananas- 置信度Confidence: 0.590- 支持度Support: 23Rule: 买了bananas,又买bread- 置信度Confidence: 0.185- 支持度Support: 10Rule: 买了bananas,又买apples- 置信度Confidence: 0.426- 支持度Support: 23Rule: 买了milk,又买cheese- 置信度Confidence: 0.204- 支持度Support: 10Rule: 买了milk,又买bananas- 置信度Confidence: 0.429- 支持度Support: 21Rule: 买了cheese,又买milk- 置信度Confidence: 0.213- 支持度Support: 10Rule: 买了cheese,又买bananas- 置信度Confidence: 0.532- 支持度Support: 25Rule: 买了bananas,又买milk- 置信度Confidence: 0.389- 支持度Support: 21Rule: 买了bananas,又买cheese- 置信度Confidence: 0.463- 支持度Support: 25Rule: 买了bread,又买cheese- 置信度Confidence: 0.238- 支持度Support: 5Rule: 买了cheese,又买bread- 置信度Confidence: 0.106- 支持度Support: 5Rule: 买了milk,又买apples- 置信度Confidence: 0.184- 支持度Support: 9Rule: 买了apples,又买milk- 置信度Confidence: 0.231- 支持度Support: 9
'''
# 封装一下方便调用
def print_rule(premise, conclusion, support, confidence, features):premise_name = features[premise]conclusion_name = features[conclusion]print("Rule: 买了{0},又买{1}".format(premise_name, conclusion_name))print(" - 置信度Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))print(" - 支持度Support: {0}".format(support[(premise, conclusion)]))print("")premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)'''
Rule: 买了milk,又买apples- 置信度Confidence: 0.184- 支持度Support: 9
'''
# 按支持度support排序
from pprint import pprint
pprint(list(support.items()))'''
[((2, 3), 26),((3, 2), 26),((0, 1), 13),((1, 0), 13),((0, 3), 6),((0, 4), 10),((3, 0), 6),((3, 4), 23),((4, 0), 10),((4, 3), 23),((1, 2), 10),((1, 4), 21),((2, 1), 10),((2, 4), 25),((4, 1), 21),((4, 2), 25),((0, 2), 5),((2, 0), 5),((1, 3), 9),((3, 1), 9)]
'''
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5): # 打印前5个print("Rule #{0}".format(index + 1))(premise, conclusion) = sorted_confidence[index][0]print_rule(premise, conclusion, support, confidence, features)'''
Rule #1
Rule: 买了apples,又买cheese- 置信度Confidence: 0.667- 支持度Support: 26Rule #2
Rule: 买了bread,又买milk- 置信度Confidence: 0.619- 支持度Support: 13Rule #3
Rule: 买了apples,又买bananas- 置信度Confidence: 0.590- 支持度Support: 23Rule #4
Rule: 买了cheese,又买apples- 置信度Confidence: 0.553- 支持度Support: 26Rule #5
Rule: 买了cheese,又买bananas- 置信度Confidence: 0.532- 支持度Support: 25
'''
任务2:Iris植物分类
给出某一植物部分特征,预测该植物属于哪一类
特征:
- 萼片长宽sepal width, sepal height
- 花瓣长宽petal width, petal height
算法:
- For 给定的每个特征
For 该特征对应的真值(即植物是哪一类) -
- 预测值:基于该特征预测的次数最多的类,即在所有样本里该特征 10 次有 6 次预测了 A 类,那我们对所有样本都预测为 A 类
-
- 计算预测值与真值的误差
- 对上面计算的误差求和
- 使用误差最小的特征作为最终模型
很显然,“在所有样本里该特征 10 次有 6 次预测了 A 类,那我们对所有样本都预测为 A 类”是基于大数据的
不过这样的规则过于简单,下面继续实验会发现准确率只有 60% 左右,当然还是比随机预测 50% 好!
from sklearn.datasets import load_iris
#X, y = np.loadtxt("X_classification.txt"), np.loadtxt("y_classification.txt") # 本地加载数据,我先下载好在 data 文件夹里了
dataset = load_iris() # 或者自己亲自下载数据再加载也行
X = dataset.data
y = dataset.target
print(dataset.DESCR) # 打印下数据集介绍
n_samples, n_features = X.shape
# Compute the mean for each attribute计算平均值
attribute_means = X.mean(axis=0)
assert attribute_means.shape == (n_features,)
X_d = np.array(X >= attribute_means, dtype='int')
# 划分训练集和测试集
from sklearn.cross_validation import train_test_split# 设置随机数种子以便复现书里的内容
random_state = 14X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)
print("训练集数据有 {} 条".format(y_train.shape))
print("测试集数据有 {} 条".format(y_test.shape))'''
训练集数据有 (112,) 条
测试集数据有 (38,) 条
'''
from collections import defaultdict
from operator import itemgetterdef train(X, y_true, feature):"""Computes the predictors and error for a given feature using the OneR algorithmParameters----------X: array [n_samples, n_features]The two dimensional array that holds the dataset. Each row is a sample, each columnis a feature.y_true: array [n_samples,]The one dimensional array that holds the class values. Corresponds to X, such thaty_true[i] is the class value for sample X[i].feature: intAn integer corresponding to the index of the variable we wish to test.0 <= variable < n_featuresReturns-------predictors: dictionary of tuples: (value, prediction)For each item in the array, if the variable has a given value, make the given prediction.error: floatThe ratio of training data that this rule incorrectly predicts."""# 1.一些等下要用的变量(数据的形状如上)n_samples, n_features = X.shapeassert 0 <= feature < n_featuresvalues = set(X[:,feature])predictors = dict()errors = []# 2.算法(对照上面的算法流程)# 已经给定特征 feature,作为函数参数传过来了for current_value in values: # For 该特征对应的真值(即植物是哪一类)most_frequent_class, error = train_feature_value(X, y_true, feature, current_value) # 预测值:基于该特征预测的次数最多的类,即在所有样本里该特征 10 次有 6 次预测了 A 类,那我们对所有样本都预测为 A 类predictors[current_value] = most_frequent_classerrors.append(error)# 计算预测值与真值的误差total_error = sum(errors)# 对上面计算的误差求和# python里求和函数 sum([1, 2, 3]) == 1 + 2 + 3 == 6return predictors, total_error# Compute what our predictors say each sample is based on its value
#y_predicted = np.array([predictors[sample[feature]] for sample in X])def train_feature_value(X, y_true, feature, value):# 预测值:基于该特征预测的次数最多的类,即在所有样本里该特征 10 次有 6 次预测了 A 类,那我们对所有样本都预测为 A 类# 我们需要一个字典型变量存每个变量预测正确的次数class_counts = defaultdict(int)# 对每个二元组(类别,真值)迭代计数for sample, y in zip(X, y_true):if sample[feature] == value:class_counts[y] += 1# 现在选被预测最多的类别,需要排序。(我们认为被预测最多的类别就是正确的)sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)most_frequent_class = sorted_class_counts[0][0]# 误差定义为分类“错误”的次数,这里“错误”指样本中没有分类为我们预测的值,即样本的真实类别不是“被预测最多的类别”n_samples = X.shape[1]error = sum([class_count for class_value, class_count in class_counts.items()if class_value != most_frequent_class])return most_frequent_class, error
# For 给定的每个特征,计算所有预测值(这里 for 写到 list 里面是 python 的语法糖)
all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}
# 现在选择最佳模型并保存为 "model"
# 按误差排序
best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("最佳模型基于第 {0} 个变量,误差为 {1:.2f}".format(best_variable, best_error))# 选最好的模型,也就是误差最小的模型
model = {'variable': best_variable,'predictor': all_predictors[best_variable][0]}
print(model)'''
最佳模型基于第 2 个变量,误差为 37.00
{'variable': 2, 'predictor': {0: 0, 1: 2}}
'''
def predict(X_test, model):variable = model['variable']predictor = model['predictor']y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])return y_predicted
y_predicted = predict(X_test, model)
print(y_predicted)
accuracy = np.mean(y_predicted == y_test) * 100
print("在测试集上的准确率 {:.1f}%".format(accuracy))
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))'''
[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 22]
在测试集上的准确率 65.8%precision recall f1-score support0 0.94 1.00 0.97 171 0.00 0.00 0.00 132 0.40 1.00 0.57 8avg / total 0.51 0.66 0.55 38
'''
在测试集上的准确率 65.8%,比完全随机预测 50% 好一点点。