# python创建分类器小结

``X = np.array([[3, 1], [2, 5], [1, 8], [6, 4], [5, 2], [3, 5], [4, 7], [4, -1]])``

``````y = [0, 1, 1, 0, 0, 1, 1, 0]
class_0 = np.array([X[i] for i in range(len(X)) if y[i] == 0])
class_1 = np.array([X[i] for i in range(len(X)) if y[i] == 1])``````

``````plt.figure()
# 画散点图 (scatterplot)
plt.scatter(class_0[:, 0], class_0[:, 1], color='black', marker='s')
plt.scatter(class_1[:, 0], class_1[:, 1], color='black', marker='x')
plt.show()``````

``````line_x = range(10)
line_y = line_x
plt.plot(line_x, line_y, color='black', linewidth=3)
plt.show()``````

Sigmoid函数是一个S型的函数，当自变量z趋近正无穷时，因变量g(z)趋近于1，而当z趋近负无穷时，g(z)趋近于0，它能够将任何实数映射到(0,1)区间，使其可用于将任意值函数转换为更适合二分类的函数。

``````import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn import linear_model``````

``````iris = datasets.load_iris()
x = iris.data[:, :2]
y = iris.target
print('x前10行:\n', x[:10])
print('y前10行:\n', y[:10])``````

``````plt.figure()
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()``````

``clf = linear_model.LogisticRegression(solver='liblinear', C=1000)``

``clf.fit(X, y)``

np.c_ 方法把两个矩阵沿列方向左右拼接（即把列并排放在一起），要求两个矩阵的行数相等。（扩展一下，同理，np.r_ 方法是沿行方向上下拼接两个矩阵，要求两个矩阵的列数相等。）

``````x_min, x_max = min(X[:, 0]) - 1.0, max(X[:, 0]) + 1.0
y_min, y_max = min(X[:, 1]) - 1.0, max(X[:, 1]) + 1.0
print('x_min:', x_min, 'x_max:', x_max)
print('y_min:', y_min, 'y_max:', y_max)
# 设置网格步长
step_size = 0.05
# 定义网格
x_values, y_values = np.meshgrid(np.arange(x_min, x_max, step_size), np.arange(y_min, y_max, step_size))
# 展平，连接
x_, y_ = np.c_[x_values.ravel(), y_values.ravel()][:, 0], np.c_[x_values.ravel(), y_values.ravel()][:, 1]
print('x_: \n', x_)
print('y_: \n', y_)``````

``````y_pred = clf.predict(np.c_[x_.ravel(), y_.ravel()]).reshape(x_.shape)
print(y_pred)``````

``````cmap_light = ListedColormap(['#AAAAFF','#AAFFAA','#FFAAAA'])
plt.figure()
plt.pcolormesh(x_, y_, y_pred, cmap=cmap_light)
plt.xlim(x_.min(), x_.max())
plt.ylim(y_.min(), y_.max())
plt.show()``````

``plt.scatter(x[:, 0], x[:, 1], c=y)``

我们把参数C（对错误的惩罚值）调整一下，设置成1，看看效果

``clf = linear_model.LogisticRegression(solver='liblinear', C=1)``

``````from sklearn.naive_bayes import GaussianNB
from utils.views import plot_classifier, plot_confusion_matrix
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report``````

``````X, y = make_classification(n_samples=2000, n_features=2, n_redundant=0, n_classes=4, n_clusters_per_class=1, random_state=0)
print('X前10行数据： \n', X[: 10])
print('y前10行数据， \n', y[:10])``````

n_samples： 2000个样本

n_features：2个特征

n_redundant：冗余特征数0个

n_classes： 4个类别

n_clusters_per_class：每个簇1个类

random_state:  随机数种子，随便定义，确定随机数种子后，多次反复执行该语句，生成的数据结果是一样的。如果不确定的话，每次生成的数据随机。

``````plt.Figure()
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()``````

``X_train, X_test, y_tran, y_test = train_test_split(X, y, test_size=0.25)``

``clf = GaussianNB()``

``clf.fit(X_train, y_tran)``

预测结果，传入测试集：

``y_pred = clf.predict(X_test)``

``````plot_classifier(clf, X_test, y_test)
def plot_classifier(clf, X, y):
# 定义图形取值范围
x_min, x_max = min(X[:, 0]) - 1.0, max(X[:, 0]) + 1.0
y_min, y_max = min(X[:, 1]) - 1.0, max(X[:, 1]) + 1.0
print('x_min:', round(x_min, 2), 'x_max:', round(x_max, 2))
print('y_min:', round(y_min, 2), 'y_max:', round(y_max, 2))
# 网格(grid) 数据求解方程的值，画出边界
# 设置网格步长
step_size = 0.01
# 定义网格
x_values, y_values = np.meshgrid(np.arange(x_min, x_max, step_size), np.arange(y_min, y_max, step_size))

# 展平，连接
x_, y_ = np.c_[x_values.ravel(), y_values.ravel()][:, 0], np.c_[x_values.ravel(), y_values.ravel()][:, 1]

# 预测结果
mesh_output = clf.predict(np.c_[x_values.ravel(), y_values.ravel()])

# 数组维度变形
mesh_output = mesh_output.reshape(x_values.shape)
plt.figure()
# 选择配色方案‘
plt.pcolormesh(x_values, y_values, mesh_output, cmap=plt.cm.gray)
plt.scatter(X[:, 0], X[:, 1], c=y, s=80, edgecolors='black', linewidths=1) # cmap=plt.cm.Paired

# 设置图形的取值范围
plt.xlim(x_values.min(), x_values.max())
plt.ylim(y_values.min(), y_values.max())
# 设置x轴与y轴
plt.xticks((np.arange(int(min(X[:, 0]) - 1), int(max(X[:, 0]) + 1), 1.0)))
plt.yticks((np.arange(int(min(X[:, 1]) - 1), int(max(X[:, 1]) + 1), 1.0)))
plt.show()``````

``````accuracy = clf.score(X_test, y_test)
print('accuracy:---', accuracy)``````

``````accuracy_cv = cross_val_score(clf, X, y, scoring='accuracy', cv=10)
print('accuracy_cv:---', round(accuracy_cv.mean(), 2))

f1 = cross_val_score(clf, X, y, scoring='f1_weighted', cv=10)
print('f1:', round(f1.mean(), 4))

precision = cross_val_score(clf, X, y, scoring='precision_weighted', cv=10)
print('precision:', round(precision.mean(), 4))

recall = cross_val_score(clf, X, y, scoring='recall_weighted', cv=10)
print('recall:', round(recall.mean(), 4))``````

``````confusion_mat = confusion_matrix(y_test, y_pred)
print('confusion_mat: \n', confusion_mat)
plot_confusion_matrix(confusion_mat)``````

对于混淆矩阵，还可以进行可视化

``````plt.imshow(confusion_mat, interpolation='nearest', cmap='gray')   # 亮色： cmap=plt.cm.Paired
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(4)
plt.xticks(tick_marks, tick_marks)
plt.yticks(tick_marks, tick_marks)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()``````

sklearn类还内置了性能报告，我们可以直接用classification_report方法进行提取查看算法的分类效果。

``````target_names = ['Class-0', 'Class-1', 'Class-2', 'Class-3']
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)``````

6个属性变量分别为：

「购买价」buying：取值范围是vhigh，high，med，low

「维护费」maint：取值范围是vhigh，high，med，low

「车门数」doors：取值范围 2,3,4,5more

「可容纳人数」persons：取值范围2,4, more

「后备箱大小」lug_boot: 取值范围 small，med，big

「安全性」safety：取值范围low，med，high

``````import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, validation_curve
import numpy as np
from utils.views import plot_curve
import pandas as pd``````

加载数据：

``````input_file = 'data/car.data.txt'
df.rename(columns={0:'buying', 1:'maint', 2:'doors', 3:'persons', 4:'lug_boot', 5:'safety', 6:'quality'}, inplace=True)

``````label_encoder = []
for i in range(df.shape[1]):
label_encoder.append(preprocessing.LabelEncoder())
df.iloc[:, i] = label_encoder[-1].fit_transform(df.iloc[:, i])

``````X = df.iloc[:, :-1]
y = df.iloc[:, -1]``````

``````params = {
'n_estimators': 200,
'max_depth': 8,
'random_state': 7
}
clf = RandomForestClassifier(**params)
clf.fit(X, y)``````

``````accuracy = cross_val_score(clf, X, y, scoring='accuracy', cv=10)
print('accuracy:', round(accuracy.mean(), 3))``````

``````input_data = ['low', 'vhigh', '2', '2', 'small', 'low']
input_data_encoded = [-1] * len(input_data)
for i, item in enumerate(input_data):
input_data_encoded[i] = int(label_encoder[i].transform([input_data[i]]))
input_data_encoded = np.array(input_data_encoded)
print(input_data_encoded)``````

``````output_class = clf.predict(input_data_encoded.reshape(1, -1))
print('output class:', label_encoder[-1].inverse_transform(output_class)[0])``````

``````
parameter_grid = np.linspace(25, 200, 8).astype(int)
train_scores, validation_scores = validation_curve(clf, X, y, param_name='n_estimators',
param_range=parameter_grid, cv=5)
print('\n ##### VALIDATION CURVES #####')
print('\nParam: n_estimators \n Training scores: \n', train_scores)
print('\nParam: n_estimators \n Validation scores:\n', validation_scores)``````

验证曲线画图：

``````plt.figure()
plt.plot(parameter_grid, 100 * np.average(train_scores, axis=1), color='black')
plt.title('Training curve')
plt.xlabel( 'Number of estimators')
plt.ylabel('Accuracy')
plt.show()``````

由图可以看出，n_estimators 在100附近时，达到最大的准确率。

``````max_depth_grid = np.linspace(2, 10, 5).astype(int)
train_scores, validation_scores = validation_curve(clf, X, y, param_name='max_depth',
param_range=max_depth_grid, cv=5)
plot_curve(max_depth_grid, train_scores, 'Validation curve', 'Maximum depth of the tree')``````

学习曲线可以帮助我们理解训练数据集的大小对机器学习模型的影响。当计算能力限制的时候，这点非常有用。下面改变训练数据集的大小，绘制学习曲线。

``````parameter_grid = np.array([200, 500, 800, 1100])
train_size, train_scores, validation_scores = learning_curve(clf, X, y, train_sizes=parameter_grid, cv=10)
print('\n ##### LEARNING CURVES #####')
print('\n Training scores: \n', train_scores)
print('\n Validation scores:\n', validation_scores)
plot_curve(parameter_grid, train_scores, 'Learning curve', 'Number of training samples')``````

car.data.txt:    https://url87.ctfile.com/f/21704187-595799592-6f0749?p=7287 (访问密码: 7287)

原文作者：wangpengcufe
原文地址: https://www.cnblogs.com/wangpengcufe/p/16345610.html
本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。