本文主要是简单地构建了一个分类器。首先针对iris数据集,构建了一个只用阈值来分类的分类器。之后简单介绍了交叉验证。然后考虑了更实际的数据集(来自UCI),并结合特征处理,使用logistic回归进行分类。最后,简单地说了一下分析的思路和一些待思考的问题。这里的数据集和问题参考了书籍《Building Machine Learning Systems with Python》,建模过程和分析属于个人见解,请批判阅读。




import numpy as np
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
import pandas as pd
%matplotlib inline
import seaborn as sns
# styles = ["white", "dark", "whitegrid", "darkgrid", "ticks"]

# Load the iris example dataset through seaborn, then draw a scatter-plot
# matrix of its columns with the points coloured by the "species" column.
iris = sns.load_dataset(name="iris")
sns.pairplot(data=iris, hue="species", size=3.0)



这里使用了新的数据集,是小麦种子数据。共有7个特征:area A(面积), perimeter P(周长), compactness $C = 4\pi A / P^2$(紧密度), length of kernel(胚长度), width of kernel(胚宽度), asymmetry coefficient(偏度系数), length of kernel groove(胚槽长度)。

# Read the wheat-seed measurements from the Excel workbook.  header=None is
# passed, so no row is used as column labels; the seven feature columns and
# the trailing class column are named explicitly instead.
seeds = pd.read_excel(
    'seeds_dataset.xlsx',
    header=None,
    names=['A', 'P', 'C', 'length', 'width', 'asy', 'gro', 'class'],
)

A P C length width asy gro class
count 210 210 210 210 210 210 210 210
mean 14.84 14.55 0.87 5.62 3.25 3.70 5.40 2.00
std 2.90 1.30 0.02 0.44 0.37 1.50 0.49 0.81
min 10.59 12.41 0.80 4.89 2.63 0.76 4.51 1.00
25% 12.27 13.45 0.85 5.26 2.94 2.56 5.04 1.00
50% 14.35 14.32 0.87 5.52 3.23 3.59 5.22 2.00
75% 17.30 15.71 0.88 5.97 3.56 4.76 5.87 3.00
max 21.18 17.25 0.91 6.67 4.03 8.45 6.55 3.00

# Scatter-plot matrix of the seed columns, coloured by the "class" column.
sns.pairplot(data=seeds, hue="class", size=2.5)

# Correlation plot of the same frame on a 7x7-inch figure, drawn with a
# diverging colormap.  The figure/axes/colormap names are kept at module
# level because later cells rebind `f`.
f, ax = plt.subplots(figsize=(7, 7))
cmap = sns.diverging_palette(h_neg=220, h_pos=10, as_cmap=True)
sns.corrplot(
    seeds,
    annot=True,
    sig_stars=False,
    diag_names=False,
    cmap=cmap,
    ax=ax,
)

原始数据中,部分分隔符错乱,所以我们转成了Excel。通过数据描述,我们发现各特征都是数值型的,且没有缺失值,但单位不一。另外,各个变量之间的相关关系很明显。所以,我们考虑先使用一些模型来做分类,使用交叉验证,看看结果如何。如果效果不好,再使用归一化并做一些特征选择,然后看看效果。此外,这里我们使用两类模型,一类是可解释的,一类是注重分类精度的。

from sklearn import cross_validation as c_v
from sklearn import svm
from sklearn import linear_model

# SVM baseline: 10-fold cross-validation for C = 0.1, 0.2, ..., 1.0.
# All seven features are used in this part; judging from the correlation
# plot, "P" would be a candidate to drop from the list below.
feature = ['A', 'P', 'C', 'length', 'width', 'asy', 'gro']
svm_report = 'svm: with the C={c}, the score is {s:.3}, var is {v:.3}'
for tenths in range(1, 11):
    i = tenths / 10.0  # regularisation strength C for this round
    model_svm = svm.SVC(C=i)
    score_svm = c_v.cross_val_score(
        model_svm, seeds[feature], seeds['class'], cv=10)
    print(svm_report.format(c=i, s=score_svm.mean(), v=score_svm.var()))

# Logistic regression with default settings, evaluated with the same
# 10-fold cross-validation on the same feature columns.
lr_report = 'LogisticRegression: the score is {s:.3}, var is {v:.3}'
model_lr = linear_model.LogisticRegression()
score_lr = c_v.cross_val_score(
    model_lr, seeds[feature], seeds['class'], cv=10)
print(lr_report.format(s=score_lr.mean(), v=score_lr.var()))



from sklearn import preprocessing as pps
# Hold out 20% of the samples (fixed seed), then compare logistic regression
# trained on the raw features against the same model trained on standardised
# features.
X_train, X_test, y_train, y_test = c_v.train_test_split(
    seeds.iloc[:, :-1], seeds['class'], test_size=0.2, random_state=1234)

# The scaler is fitted on the training split only and reused for the test
# split.  Alternatives to try: MinMaxScaler(), Normalizer().
scaler = pps.StandardScaler().fit(X_train)

# Model 1: raw features.  It is scored on the raw splits, so it is fitted on
# the raw training split.
model_lr1 = linear_model.LogisticRegression().fit(X_train, y_train)
print('{0} {1}'.format(model_lr1.score(X_train, y_train),
                       model_lr1.score(X_test, y_test)))

# Model 2: standardised features, fitted and scored on scaler output.
model_lr2 = linear_model.LogisticRegression().fit(
    scaler.transform(X_train), y_train)
print('{0} {1}'.format(model_lr2.score(scaler.transform(X_train), y_train),
                       model_lr2.score(scaler.transform(X_test), y_test)))

# just for a test: 10-fold cross-validation on raw vs. standardised data.
# NOTE(review): `scaler` is rebound here and fitted on the full data set
# before cross-validation, so the held-out folds influence the scaling.
model_lr = linear_model.LogisticRegression()
scaler = pps.StandardScaler().fit(seeds.iloc[:, :-1])
score_lr1 = c_v.cross_val_score(
    model_lr, seeds.iloc[:, :-1], seeds['class'], cv=10)
score_lr2 = c_v.cross_val_score(
    model_lr, scaler.transform(seeds.iloc[:, :-1]), seeds['class'], cv=10)

cv_report = 'the score is {s:.3f}, var is {v:.5f}'
print(cv_report.format(s=score_lr1.mean(), v=score_lr1.var()))
print(cv_report.format(s=score_lr2.mean(), v=score_lr2.var()))

# Per-class weights of the model trained on standardised features (first
# seven coefficients of each row), followed by the column-wise sum and
# variance of the coefficient matrix.
weight_report = ('weight for class{cls}:{0[0]:.3}, {0[1]:.3}, {0[2]:.3}, '
                 '{0[3]:.3}, {0[4]:.3}, {0[5]:.3}, {0[6]:.3}')
for cls in range(3):
    print(weight_report.format(model_lr2.coef_[cls], cls=cls))
print('{0} {1}'.format(model_lr2.coef_.sum(axis=0),
                       model_lr2.coef_.var(axis=0)))





# Feature-engineering configuration.  All values are column indices into the
# seven raw features:
#   0: area A, 1: perimeter P, 2: compactness C = 4*pi*A/P^2,
#   3: length of kernel, 4: width of kernel, 5: asymmetry coefficient,
#   6: length of kernel groove
k = [1, 3, 6]  # columns to append with the sqrt transform (alternative: [3, 6])
j = [3, 6]     # columns to append with the square transform
r = [2, 3, 6]  # raw columns to delete after the new ones are appended
q = [[1, 6]]   # column pairs whose element-wise product is appended


def _expand_features(data, sqrt_cols, square_cols, product_pairs, drop_cols):
    """Return `data` with derived columns appended and `drop_cols` removed.

    The result is laid out as: the raw columns, sqrt of `sqrt_cols`, square
    of `square_cols`, then one product column per pair in `product_pairs`;
    finally the columns indexed by `drop_cols` are deleted.  `data` may be
    an ndarray or a DataFrame; it is converted with np.asarray and is not
    modified.
    """
    base = np.asarray(data)
    parts = [base,
             np.sqrt(base[:, sqrt_cols]),
             np.square(base[:, square_cols])]
    # Multiply the two *different* columns of each pair.  Indexing with a
    # one-element list keeps each product as an (n, 1) column.
    for first, second in product_pairs:
        parts.append(base[:, [first]] * base[:, [second]])
    return np.delete(np.concatenate(parts, axis=1), drop_cols, axis=1)


X_train1 = _expand_features(X_train, k, j, q, r)
X_test1 = _expand_features(X_test, k, j, q, r)
print('{0} {1}'.format(X_train1.shape, X_test1.shape))

# Scaler fitted on the engineered training features only.
# Alternatives to try: MinMaxScaler(), Normalizer().
scaler = pps.StandardScaler().fit(X_train1)

# Model 1: engineered features without scaling.
model_lr1 = linear_model.LogisticRegression().fit(X_train1, y_train)
print('{0} {1}'.format(model_lr1.score(X_train1, y_train),
                       model_lr1.score(X_test1, y_test)))

# Model 2: engineered features after standardisation.
model_lr2 = linear_model.LogisticRegression().fit(
    scaler.transform(X_train1), y_train)
print('{0} {1}'.format(model_lr2.score(scaler.transform(X_train1), y_train),
                       model_lr2.score(scaler.transform(X_test1), y_test)))

# just for a test: the same comparison with 10-fold cross-validation on the
# full data set.
# NOTE(review): the scaler below is fitted on all rows before
# cross-validation, so the held-out folds influence the scaling.
X = _expand_features(seeds.iloc[:, :-1], k, j, q, r)
print(X.shape)

model_lr = linear_model.LogisticRegression()
scaler = pps.StandardScaler().fit(X)
score_lr1 = c_v.cross_val_score(model_lr, X, seeds['class'], cv=10)
score_lr2 = c_v.cross_val_score(
    model_lr, scaler.transform(X), seeds['class'], cv=10)
cv_report = 'the score is {s:.3f}, var is {v:.5f}'
print(cv_report.format(s=score_lr1.mean(), v=score_lr1.var()))
print(cv_report.format(s=score_lr2.mean(), v=score_lr2.var()))




# Refit logistic regression on the raw training features, report train/test
# accuracy and the per-class weights, then plot the misclassified training
# samples.
model_lr = linear_model.LogisticRegression().fit(X_train, y_train)
print('{0} {1}'.format(model_lr.score(X_train, y_train),
                       model_lr.score(X_test, y_test)))

weight_report = ('weight for class{cls}:{0[0]:.3}, {0[1]:.3}, {0[2]:.3}, '
                 '{0[3]:.3}, {0[4]:.3}, {0[5]:.3}, {0[6]:.3}')
for cls in range(3):
    print(weight_report.format(model_lr.coef_[cls], cls=cls))
print('{0} {1}'.format(model_lr.coef_.sum(axis=0),
                       model_lr.coef_.var(axis=0)))

# Collect the training rows the model gets wrong.  np.asarray makes the
# positional indexing and reshape below work whether the split produced
# ndarrays or pandas objects.
y_pred = model_lr.predict(X_train)
error_choose = y_pred != np.asarray(y_train)
error = np.asarray(X_train)[error_choose, :]
c_pred = y_pred[error_choose].reshape(-1, 1)
c_train = np.asarray(y_train)[error_choose].reshape(-1, 1)
# Append the predicted and true labels as the last two columns.
error = np.concatenate((error, c_pred, c_train), axis=1)

# Keep P, length, gro plus the true (-1) and predicted (-2) label columns;
# negative indices address the two label columns appended above.
f = [1, 3, 6, -1, -2]
name = ['A', 'P', 'C', 'length', 'width', 'asy', 'gro', 'c_pred', 'c_train']
error = pd.DataFrame(error[:, f], columns=[name[i] for i in f])
sns.pairplot(error, hue="c_train", size=2.5)


最后,我们看一下整体。首先,我们并没有完整地按照数据挖掘流程进行分析。这里主要集中在建模的讨论上,而且毕竟是练习,所以也比较粗糙。我们简单地可视化了数据的情况,之后直接套用模型看看效果,然后根据结果进行归一化和特征调整。有很多细节可以深入思考,比如得到的回归系数,我们如何解释?如何根据这些信息进行调参呢?最后增加的特征,也只是简单地增加了几个,没有深入探讨,当然也参考了数据的分布(pairplot图)。另外,在编程过程中,也需要注意内存的释放,这里因为数据集比较小,我没有使用delete variable来释放内存。还有很多东西值得进一步思考,不再赘述!
