Machine Learning Summary: Fundamentals, Part 1
Machine Learning Summary: Fundamentals, Part 1 is a detailed overview of the core concepts, algorithms, and applications of machine learning. It explains foundational algorithms such as linear regression, logistic regression, decision trees, and support vector machines in accessible language, and shows how they are applied to real problems. It also covers key steps such as data preprocessing, feature engineering, and model evaluation, along with how to practice machine learning in Python. Whether you are a beginner or already have some background, it is a valuable resource for understanding and mastering machine learning.
numpy
numpy.genfromtxt
genfromtxt("world_alcohol.txt", delimiter=",", dtype=str)
Creation
arr = np.random.random((4, 6))
arr.shape[0]
arr.shape[1]
arr.ndim
arr.size
arr.reshape(3, 8)
arr.reshape(-1)
score = np.random.randint(80, 100, 30)
score[score >= 95]
score[score <= 85]
arr2 = np.random.randint(0, 10, (4, 4))
arr3 = np.random.randint(0, 10, (4, 4))
arr4 = np.zeros((16, 16))
arr4[0] = 1
arr4[15] = 1
arr4[:, 0] = 1
arr4[:, 15] = 1
arr4 = np.ones((2, 3, 4), dtype = np.int32)
A 3-D array: 2 blocks, each with 3 rows and 4 columns
arr5 = np.random.randint(0, 100, 10000)
arr.view()
A new array object that shares the same underlying data; modifying one changes the other
arr.copy()
A new array object with its own independent copy of the data; modifications do not propagate
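A minimal sketch contrasting the two (array values assumed):
base = np.arange(4)
v = base.view()    # new array object, shared buffer
v[0] = 99
print(base[0])     # 99 -- writes through the view are visible
c = base.copy()    # independent buffer
c[1] = -1
print(base[1])     # 1 -- the copy does not write back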
np.tile(a, (3, 5))
Tile a: repeat it 3 times along the rows and 5 times along the columns
np.sort(a, axis = 1)
Sorts each row (along axis 1); axis = 0 sorts each column
i = np.argsort(a)
Returns the indices that would sort a in ascending order
a[i]
The data in sorted order
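A small worked example (values assumed):
a = np.array([3, 1, 2])
i = np.argsort(a)  # [1 2 0] -- positions of the elements in ascending order
print(a[i])        # [1 2 3]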
Selection
ver = np.array([5, 6, 7, 8])
import numpy as np
x = np.array([[1,2,3],[4,5,6]])
world[1, 4]
Indexing starts at 0: row 2, column 5
world[2, 2]
Row 3, column 3
world[0 : 3]
The first 3 rows
world[:, 1]
All rows, column 2
world[:, 0:2]
All rows, first 2 columns
world[1:3, 0:2]
Rows 2 and 3, first 2 columns
Computation
min
max
sum(axis = 0)
axis = 0 sums down each column; axis = 1 sums across each row
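For instance (values assumed):
m = np.array([[1, 2], [3, 4]])
print(m.sum(axis = 0))  # [4 6] -- column totals
print(m.sum(axis = 1))  # [3 7] -- row totals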
A = np.array([[1,1],[0,1]])
B = np.array([[2,0],[3,4]])
# element-wise product
print(A * B)
# matrix multiplication
print(A.dot(B))
print(np.dot(A, B))
np.exp(B)
e raised to the power of each element of B
np.sqrt(B)
Square root of each element of B
np.floor
Round down (floor)
a.ravel()
Flattens the matrix into a 1-D vector
Feature standardization
mu = np.average(X, axis = 0)
Column means
sigma = np.std(X, axis = 0, ddof = 1)
Sample standard deviation (ddof = 1), not the variance
X = (X - mu) / sigma
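Putting the three lines together on toy data (X assumed):
import numpy as np
X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
mu = np.average(X, axis = 0)            # column means: [2. 20.]
sigma = np.std(X, axis = 0, ddof = 1)   # sample std: [1. 10.]
X = (X - mu) / sigma                    # each column now has mean 0 and unit sample std
print(X)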
Reshaping
np.arange(15)
Creates 15 values, 0 through 14
reshape(3, 5)
Reshape into a 3 × 5 matrix
reshape(3, -1)
# create values with a fixed step (from 10 up to, but not including, 30, step 2)
x = np.arange(10,30,2)
# create evenly spaced values (100 points from 0 to 20)
x = np.linspace(0, 20, 100)
astype(float)
Type conversion
Concatenation
np.hstack([arr2, arr3])
np.vstack([arr2, arr3])
Splitting
np.hsplit(a, 3)
Splits along columns into 3 equal parts
np.hsplit(a, (3, 4))
Splits along columns at positions 3 and 4 (three pieces: columns 0-2, column 3, columns 4 onward)
np.vsplit(a, 3)
Splits along rows into 3 equal parts
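A small demo of both (array assumed):
a = np.arange(12).reshape(3, 4)
print(np.hsplit(a, 2))  # two (3, 2) blocks -- split along columns
print(np.vsplit(a, 3))  # three (1, 4) rows -- split along rows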
ndarray
# least squares: solve for the line coefficients
# define the error function
# param: the line coefficients [a, b] to solve for; the line is y = ax + b
# x: observed x data
# y: observed y data
import numpy as np
from scipy import optimize
def error_fun(param, x, y):
    expect_y = param[0] * x + param[1]
    err = y - expect_y
    return err
x = np.array([1,2,3,4])
y = np.array([6,5,7,10])
# initial guess for the coefficients
init_param = np.array([0.1, 0.1])
# solve
res, k = optimize.leastsq(error_fun, init_param, args=(x, y))
print(res[0], res[1], k)
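As a sanity check, np.polyfit fits the same line directly; its slope and intercept should agree with res:
k_fit, b_fit = np.polyfit(x, y, 1)  # degree-1 least-squares fit; highest power first
print(k_fit, b_fit)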
pandas
Series
s1 = pd.Series({'a':11, 'b':22, 'c':33})
s2 = pd.Series([11, 22, 33], index = ['a', 'b', 'c'])
DataFrame
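A minimal construction sketch (column and index names assumed):
import pandas as pd
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index = ['x', 'y', 'z'])
df.dtypes  # per-column types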
Loading
pd.read_excel
pd.read_csv
Inspection
dt.head()
Show the first 5 rows
dt.head(10)
dt.tail()
Show the last 5 rows
dt.tail(10)
dt.columns
dt.shape
Filtering
Filter data
Slice rows 1-2 by position (the end of the slice is exclusive)
df[1:3]
Select specific rows and columns
df.loc[1:3, ['star']]
Select specific columns
df[["star","new_star"]]
df['star'] == '力荐'
df [ df['star'] == '力荐' ]
loc
df.loc[0]
The row with label 0 (the first row)
df.loc[3:6]
Rows with labels 3 through 6, inclusive
df.loc[83, "Age"]
The value at row label 83, column 'Age'
Column access
df['ID']
Get the column named 'ID'
df["ID", "NAME"]
a_is_null = pd.isnull(a)
Boolean mask marking missing values
len(a)
Number of elements
df.apply(func)
Apply func to the data
iloc: position-based indexing (integer positions rather than labels)
Cleaning
df.fillna(0)  # fillna needs a fill value (or method); axis alone is not enough
df.dropna(axis = 0, subset=["age", "name"])  # subset belongs to dropna: drop rows with NaN in these columns
df.dropna()
Sorting
df.sort_values("Age", ascending = False)
Sort by Age in descending order (ascending = False)
df.reset_index(drop = True)
Reset the index
pd.to_datetime(a)
Convert to datetime
Computation
df['EK'].max()
min()
mean()
df.sort_values("X", inplace = True)
np.add(a, b)
np.sin(a)
Display
Set the column names
df.columns = ['star', 'vote', 'shorts']
Deletion
Drop rows with missing data
df.dropna()
Aggregation
df.groupby('star').sum()
Create a new column
df['new_star'] = df['star'].map(star_to_number)
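star_to_number is not defined in these notes; a hypothetical mapping for the Douban rating labels might look like:
star_to_number = {'力荐': 5, '推荐': 4, '还行': 3, '较差': 2, '很差': 1}  # hypothetical mapping
df['new_star'] = df['star'].map(star_to_number)  # unmapped values become NaN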
Binning
Custom-interval discretization (binning)
# custom binning of radius_mean into hand-picked intervals
cut_points = [0, 13, 14, 15, 16]
RFM['radius_mean_bin'] = pd.cut(RFM['radius_mean'], bins = cut_points)
RFM.head()
Equal-width binning
RFM['radius_mean_width_bin'] = pd.cut(RFM.radius_mean, 20)
RFM.head()
Equal-depth (quantile) binning
RFM['radius_mean_depth_bin'] = pd.qcut(RFM.radius_mean, 5)
RFM.head()
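The difference between the two on toy data: pd.cut makes equal-width bins, pd.qcut makes equal-frequency bins (values assumed):
s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 100])
print(pd.cut(s, 3).value_counts())   # equal-width: the outlier leaves two bins almost empty
print(pd.qcut(s, 3).value_counts())  # equal-frequency: roughly the same count per bin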
Plotting
Scatter plot
data.plot.scatter(x = var, y= 'SalePrice', ylim = (0, 800000))
plt.scatter(y_test, y_test_preds) # scatter of actual vs. predicted values
plt.plot([0, max(y_test)], [0, max(y_test_preds)], lw = 1, linestyle='--')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()
Bar chart
# Age distribution
train_src.hist(column='Age', bins = 50)
# Sex distribution
train_src["Sex"].value_counts().plot(kind = "bar")
train_src["Sex"].value_counts().plot(kind = "bar")
# relationship between Sex and the Survived target
pd.crosstab(train_src.Sex, train_src.Survived).plot(kind = "bar")
# relationship between Age and Survived
train_src['age'] = pd.cut(train_src.Age, [0, 5, 15, 20, 25, 35, 50, 60, 100])
pd.crosstab(train_src.age, train_src["Survived"]).plot(kind = "bar")
Generating sample data
classes = ["03020" + str(i) for i in range(1, 10)] + ["0302" + str(i) for i in range(10,31)]
subjects = ["Chinese", "Math", "English", "Physics", "Chemistry", "Computer"]
df = pd.DataFrame(np.random.randint(70, 100, (30, 6)), classes, subjects)
gender = ['M' if np.random.random() < 0.5 else 'F' for i in range(30)]
df2 = pd.DataFrame(gender, index = classes, columns = ['Gender'])
Merging
c = pd.concat([df_cp, df2], axis = 1)
matplotlib
Basic setup
import matplotlib.pyplot as plt
plt.plot(df['a'], df['b'])
x-axis and y-axis data
plt.show()
Render the chart
plt.xticks(rotation=45)
Rotate the x-axis tick labels by 45 degrees
plt.xlabel('xxxx')
plt.ylabel('yyyy')
plt.title("标题")
Subplots
fig = plt.figure(figsize=(3, 6))
Set the figure size; figsize = (width, height)
ax1= fig.add_subplot(2, 2, 1)
2 rows, 2 columns, first panel
ax2=fig.add_subplot(2, 2, 2)
2 rows, 2 columns, second panel
ax4=fig.add_subplot(2, 2, 4)
2 rows, 2 columns, fourth panel
ax.plot(df)
Show the legend
plt.legend(loc = 'best')
Requires label = ... in the plot call
plt.legend(loc = 'upper left')
Bar chart
fig, ax = plt.subplots()
ax.bar(x, y, 0.3)
plt.show()
Vertical by default
ax.barh(x, y, 0.5)
Horizontal
fig, ax = plt.subplots()
ax.hist(x, range=(4, 5), bins = 20)
plt.show()
range sets the interval; bins = 20 bars
ax.set_xlim(0, 50)
Set the axis limits
ax.set_ylim(0, 50)
Scatter plot
fig, ax = plt.subplots()
ax.scatter(x, y, 0.5)
plt.show()
Box plot
fig, ax = plt.subplots()
ax.boxplot(x)
plt.show()
# plot histograms of the first three features
first_three_features = features[:3]
fig, axes = plt.subplots(nrows = 3, ncols = 1, figsize = (8, 10))
for i, feature in enumerate(first_three_features):
    axes[i].hist(X[feature], bins = 30, color='skyblue', edgecolor='black')
    axes[i].set_title(f"{feature} - histogram")
    axes[i].set_xlabel('Value')
    axes[i].set_ylabel('Frequency')
    axes[i].grid(True, axis = 'y', linestyle='--', linewidth = 0.7, alpha = 0.6)
plt.tight_layout()
plt.show()
Histogram
df_user['R值_x'].plot(kind = 'hist', bins = 20, title = 'Recency (R) distribution')
plt.hist(user_id_itemcnt.values)
Scatter plot
import seaborn as sns
plt.scatter(df.index, df['A'])
plt.show()
Line chart
import matplotlib.pyplot as plt
plt.plot(df.index, df['A'])
plt.show()
plt.plot(df.index, df['A'],
         color='#FFAA00',  # color
         linestyle='--',   # line style
         linewidth=3,      # line width
         marker='D')       # point marker
Density plot
# draw density plots
def density_plot(data):
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    p = data.plot(kind = 'kde', linewidth = 2, subplots = True, sharex = False, figsize = (10, 9))
    for i in range(len(data.columns)):  # the original referenced data3 and an undefined k here
        p[i].set_ylabel(data.columns[i], fontproperties = 'SimHei')
    plt.legend()
    return plt
density_plot(data3)
Elbow plot
from sklearn.cluster import KMeans  # import KMeans
def show_elbow(df, ax, title):
    distance_list = []
    K = range(1, 9)
    for k in K:
        kmeans = KMeans(n_clusters = k, max_iter = 100)
        kmeans = kmeans.fit(df)
        distance_list.append(kmeans.inertia_)
    ax.plot(K, distance_list, 'bx-')
    ax.set_xlabel('k')
    ax.set_ylabel('Sum of squared distances (inertia)')
    ax.set_title(title)
fig, axes = plt.subplots(1, 3, figsize = (18, 6))
show_elbow(df_user[['R值_x']], axes[0], 'Elbow plot for R-value clustering')
show_elbow(df_user[['F值_x']], axes[1], 'Elbow plot for F-value clustering')
show_elbow(df_user[['M值_x']], axes[2], 'Elbow plot for M-value clustering')
plt.tight_layout()
plt.show()
Plotting a confusion matrix
# confusion matrix
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(train_y, train_y_pred)
# plot the confusion matrix
def show_confusion_matrix(cnf_matrix, class_labels):
    plt.matshow(cnf_matrix, cmap=plt.cm.YlGn, alpha = 0.7)
    ax = plt.gca()
    ax.set_xlabel('Predicted Label', fontsize = 16)
    ax.set_xticks(range(0, len(class_labels)))
    ax.set_xticklabels(class_labels, rotation=45)
    ax.set_ylabel('Actual Label', fontsize=16, rotation = 90)
    ax.set_yticks(range(0, len(class_labels)))
    ax.set_yticklabels(class_labels)
    ax.xaxis.set_label_position('top')
    ax.xaxis.tick_top()
    for row in range(len(cnf_matrix)):
        for col in range(len(cnf_matrix[row])):
            ax.text(col, row, cnf_matrix[row][col], va = 'center', ha = 'center', fontsize = 16)
class_labels = [0, 1]
show_confusion_matrix(cnf_matrix, class_labels)
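A quick smoke test of the helper above on dummy labels (the arrays are assumed, not from the original notes):
from sklearn import metrics
y_true = [0, 0, 1, 1, 1, 0]  # dummy ground truth
y_hat = [0, 1, 1, 1, 0, 0]   # dummy predictions
show_confusion_matrix(metrics.confusion_matrix(y_true, y_hat), [0, 1])
plt.show()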
Plotting a validation curve
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
from sklearn.model_selection import validation_curve
train_scores, test_scores = validation_curve(
    estimator = lr,  # the model to evaluate
    X = train_X,
    y = train_y,
    param_name = 'C',
    param_range = param_range,
    cv = 10)
# compute the mean and standard deviation of train- and test-set accuracy
train_mean = np.mean(train_scores, axis = 1)
train_std = np.std(train_scores, axis = 1)
test_mean = np.mean(test_scores, axis = 1)
test_std = np.std(test_scores, axis = 1)
# plot training-set accuracy
plt.plot(param_range, train_mean, color = 'blue', marker='o', markersize = 5, label = 'training accuracy')
# shade the confidence band: train_mean + train_std down to train_mean - train_std
plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha = 0.15, color = 'blue')
# plot test-set accuracy (green, to match its shaded band below)
plt.plot(param_range, test_mean, color = 'green', marker='o', markersize = 5, label = 'test accuracy')
# shade the confidence band: test_mean + test_std down to test_mean - test_std
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha = 0.15, color = 'green')
plt.grid()
plt.xscale('log')
plt.legend(loc= 'lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.6, 1.0])
plt.tight_layout()
plt.show()
seaborn
sns.set()
Apply the seaborn default theme
Theme styles
sns.set_style("dark")
darkgrid
whitegrid
dark
white
ticks
Color palettes
color_palette()
Accepts any color supported by matplotlib
With no arguments, returns the current default palette
set_palette()
Sets the palette for all plots
hls_palette()
Control lightness and saturation
lightness
Brightness
saturation
Color intensity
sns.palplot(sns.hls_palette(8, l=.3, s =.8))
sns.palplot(sns.color_palette("Paired", 10))
Sequential palettes
sns.palplot(sns.color_palette("Blues"))
sns.palplot(sns.color_palette("BuGn_r"))
Append _r to reverse the gradient
sns.palplot(sns.color_palette("cubehelix", 8))
sns.palplot(sns.cubehelix_palette(8, start = .5, rot = -.75))
cubehelix_palette()
Linearly varying hue and brightness
Custom sequential palettes
light_palette()
sns.palplot(sns.light_palette("green", reverse = True))
dark_palette()
Categorical palettes
palette = sns.color_palette()
sns.palplot(palette)
Six-color cycling themes
deep
muted
pastel
bright
dark
colorblind
Circular color systems
pal = sns.palplot(sns.color_palette("hls", 8))
sns.boxplot(data = df, palette = pal )
Plotting
Style
Offset between the plot and the axis spines
sns.despine(offset = 30)
left = True removes the left spine
sns.despine(left = True)
Set the context
sns.set_context('paper', font_scale=1.5, rc={"lines.linewidth": 2.5})
Other contexts
notebook
poster
talk
paper
Set colors
palette = {"male": "g", "female": "m"}
Set markers
markers = ["^", "o"]
Set line styles
linestyles = ["-", "--"]
Orientation
orient = "h"
Set transparency
alpha=0.7
distplot
Histogram
from scipy import stats
sns.distplot(df, bins = 20, kde = False, fit = stats.gamma)  # distplot expects a 1-D array or Series
fit overlays a fitted distribution curve
jointplot
sns.jointplot(x = "X", y = "y", data = df )
sns.jointplot(x = "X", y = "y", kind = "hex", color = "k", data = df )
sns.pairplot(iris)
Plots each feature's distribution and all pairwise relationships
Box plot
sns.boxplot(data=tips, x = "total_bill", y = "day", hue = "sex")
Violin plot
sns.violinplot(data=tips, x = "total_bill", y = "day", hue = "sex")
regplot()
sns.regplot(x = "total_bill", y = "tip", data = tips)
sns.lmplot(x = "total_bill", y = "tip", data = tips)
# x_jitter adds random horizontal jitter
sns.regplot(x="size", y = "tip", data = tips, x_jitter=0.5)
sns.regplot(x="size", y = "tip", data = tips, x_jitter=0.5)
stripplot()
sns.stripplot(x = "day", y = "total_bill", data = tips)
# jitter spreads the points to avoid overplotting
sns.stripplot(x = "day", y = "total_bill", data = tips, jitter = True)
sns.stripplot(x = "day", y = "total_bill", data = tips, jitter = True)
swarmplot()
sns.swarmplt(x = "day", y = "total_bill", data = tips)
boxplot
sns.boxplot(x = "day", y = "total_bill", hue = "sex", data = tips)
barplot
sns.barplot(x="sex", y = "survived", hue = "class", data = titanic)
pointplot
sns.pointplot(x="sex", y = "survived", hue = "class", data = titanic)
Point plot, for comparing differences
factorplot
sns.factorplot(kind = "bar", x = "day", y = "total_bill", hue = "smoker", data = tips)
Parameters
FacetGrid
g = sns.FacetGrid(tips, col = "time", hue = "sex", palette="set1", size = 5, hue_kws={"marker": ["^", "o"]})
g.map(plt.hist, "total_bill", "tip", s = 100, linewidth=.5, edgecolor = "white")
g.set_axis_labels("xxxxx", "yyyy")
g.set(xticks=[10, 30, 50], yticks=[2, 6, 10])
g.fig.subplots_adjust(wspace=.5, hspace=.5) # set spacing between panels
g.add_legend()
Adds a legend
add_legend()
size = 4
aspect = .5
Aspect ratio
row_order
row_order = Categorical(["x", "y"])
Specify the ordering
s
Marker size
s = 50
linewidth
linewidth = .5
Line width
edgecolor
edgecolor = "white"
Edge color
set_axis_labels
g.set_axis_labels("x", "y")
Set the axis labels
vars
vars = ["sex", "Age"]
Specify which variables to plot
Combining plots
sns.violinplot(x = "day", y = "total_bill", data = tips, inner= None)
sns.swarmplot(x = "day", y = "total_bill", data = tips, color= "w", alpha = 0.5)
sns.swarmplot(x = "day", y = "total_bill", data = tips, color= "w", alpha = 0.5)
Normal distribution
import seaborn as sns
from scipy.stats import norm
# the black line is the fitted normal, blue the actual distribution; SalePrice is not normally distributed
sns.distplot(train['SalePrice'], fit = norm)
Heatmap
sns.heatmap(corr, vmin = -1, vmax = 1, square = True, center = 0)  # the original repeated vmax and set vmin above vmax
center = 0
Center the colormap at 0
Box plot
plt.figure(figsize = (12, 6))
sns.boxplot(x = 'Neighborhood', y = 'SalePrice', data = train)
plt.axis(ymin = 0 , ymax = 800000)
plt.xticks(rotation=90)
plt.show()
# box plot of the target variable
train.SalePrice.plot(kind = 'box', sym = 'b*')
corr_cols = corr.loc[:, corr.loc['SalePrice', :].abs() > corr_threshvalue].columns
number_para[corr_cols].plot(sym='b*', kind = 'box', subplots = True, figsize = (20, 8))
Bar chart
count_classes.plot(kind = 'bar')
Violin plot
# violin plot
import seaborn as sns
plt.figure(figsize = (12, 8))
sns.violinplot(data = pd.DataFrame(X_selected_standardized, columns = selected_features), palette = "Set3")
plt.title('Violin plot of standardized data')
plt.ylabel('Standardized value')
plt.xticks(rotation = 45)
plt.tight_layout()
plt.show()
scipy
Q-Q plot
from scipy import stats
# use a Q-Q plot to check whether the data deviates from a normal distribution
stats.probplot(train['SalePrice'], plot = plt)
Dendrogram (hierarchical clustering)
# cluster the data
from scipy.cluster.hierarchy import linkage, dendrogram
Z = linkage(data3, method = 'ward', metric = 'euclidean')
# draw the dendrogram
P = dendrogram(Z, 0)
plt.show()
sklearn
Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
svm = SVC()
parameters = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}  # example grid; the original left parameters undefined
grid_search = GridSearchCV(svm, parameters, scoring = 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
Algorithms
XGBoost
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)
watchlist = [(dtest, 'val'), (dtrain, 'train')]
params = {
    'booster': 'gbtree',
    'objective': 'reg:linear',
    'early_stopping_rounds': 50,  # usually passed to xgb.train() directly rather than inside params
    'eval_metric': 'rmse',
    'gamma': 0,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.9,
    'min_child_weight': 1,
    'eta': 0.02,
    'seed': 123456,
    'nthread': 3,
    'silent': 0
}
model = xgb.train(params, dtrain, num_boost_round=200, evals = watchlist)
predict_y = model.predict(dtest, ntree_limit = model.best_ntree_limit)
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'early_stopping_rounds': 50,
    'eval_metric': 'auc',
    'gamma': 0,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.9,
    'min_child_weight': 1,
    'eta': 0.02,
    'seed': 123456,
    'nthread': 3,
    'silent': 0
}
watchlist = [(dtest, 'val'), (dtrain, 'train')]
model = xgb.train(params, dtrain, num_boost_round=200, evals = watchlist)
predict_y = model.predict(dtest, ntree_limit= model.best_ntree_limit)
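The same model can also be trained through XGBoost's scikit-learn wrapper, which plugs into the GridSearchCV pattern used elsewhere in these notes; a sketch with assumed parameter values:
from xgboost import XGBClassifier
clf = XGBClassifier(max_depth = 5, learning_rate = 0.02, n_estimators = 200,
                    subsample = 0.6, colsample_bytree = 0.9)
clf.fit(X_train, y_train)
pred_proba = clf.predict_proba(X_test)[:, 1]  # probability of the positive class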
SVM
from sklearn.svm import SVC
from sklearn.metrics import classification_report
svm = SVC()
svm.fit(X_train, y_train)
svm.support_
svm.support_vectors_
svm.n_support_
svm.dual_coef_
svm.intercept_
pred = svm.predict(X_test)
print(classification_report(y_test, pred))
K-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print(classification_report(y_test, pred))
parameters = {
    'n_neighbors': [3, 5, 10, 15, 20, 30],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, parameters, scoring='accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
KMeans
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 3)
kmeans.fit(features)
kmeans.cluster_centers_
kmeans.labels_
kmeans.fit_predict(features)
cluster_label = kmeans.predict(features)
# compute the mean distortion for K from 1 to 9
# use scipy to compute the distances
from scipy.spatial.distance import cdist
K = range(1, 10)
meandistortions = []
for k in K:
    kmeans = KMeans(n_clusters = k)
    kmeans.fit(features)
    meandistortions.append(sum(np.min(cdist(features, kmeans.cluster_centers_, 'euclidean'), axis = 1)) / features.shape[0])
plt.plot(K, meandistortions, 'bx--')
plt.xlabel('k')
plt.ylabel('loss')
plt.title('find the best k value')
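A complementary way to choose K is the silhouette score (higher is better); a sketch reusing the same features:
from sklearn.metrics import silhouette_score
for k in range(2, 10):  # the silhouette score needs at least 2 clusters
    labels = KMeans(n_clusters = k).fit_predict(features)
    print(k, silhouette_score(features, labels))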
GBDT
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
gbdt = GradientBoostingRegressor()
gbdt.fit(X_train, y_train)
gbdt.score(X_test, y_test)
print(gbdt.feature_importances_)
parameters = {
    'n_estimators': [30, 50, 80, 100],
    'learning_rate': [0.1, 0.2, 0.5, 1]
}
gbdt = GradientBoostingRegressor()
grid_search = GridSearchCV(gbdt, parameters, scoring = 'r2', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
gbdt = GradientBoostingClassifier()
gbdt.fit(X_train, y_train)
gbdt.score(X_test, y_test)
print(gbdt.feature_importances_)
parameters = {
    'n_estimators': [30, 50, 80, 100],
    'learning_rate': [0.1, 0.2, 0.5, 1]
}
gbdt = GradientBoostingClassifier()
grid_search = GridSearchCV(gbdt, parameters, scoring = 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
AdaBoost
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
dtree = DecisionTreeRegressor()
adaboost = AdaBoostRegressor(base_estimator = dtree)
adaboost.fit(X_train, y_train)
adaboost.score(X_test, y_test)
print(adaboost.feature_importances_)
parameters = {
    'n_estimators': [30, 50, 80, 100],
    'learning_rate': [0.01, 0.1, 0.2]
}
dtree = DecisionTreeRegressor()
adaboost = AdaBoostRegressor(base_estimator = dtree)
grid_search = GridSearchCV(adaboost, parameters, scoring = 'r2', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
dtree = DecisionTreeClassifier()
adaboost = AdaBoostClassifier(base_estimator = dtree)
adaboost.fit(X_train, y_train)
adaboost.score(X_test, y_test)
print(adaboost.feature_importances_)
parameters = {
    'n_estimators': [30, 50, 80, 100],
    'learning_rate': [0.1, 0.2, 0.5, 1]
}
dtree = DecisionTreeClassifier()
adaboost = AdaBoostClassifier(base_estimator = dtree)
grid_search = GridSearchCV(adaboost, parameters, scoring = 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
Linear regression
Simple (one-variable) linear regression
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)
print(regr.coef_)
print(regr.intercept_)
y_pred = regr.predict(train_x)
plt.scatter(train_x, train_y, color='black')
plt.plot(train_x, y_pred, color='blue', linewidth=3)  # draw the fitted line, not the raw targets
plt.show()
Multiple linear regression
Ordinary least squares
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
regr.score(X_train, y_train)
print(regr.coef_)
print(regr.intercept_)
Ridge regression (L2 regularization)
ridgereg = linear_model.Ridge()
ridgereg.fit(X_train, y_train)
ridgereg.score(X_train, y_train)
print(ridgereg.coef_)
print(ridgereg.intercept_)
Lasso regression (L1 regularization)
lassoreg = linear_model.Lasso()
lassoreg.fit(X_train, y_train)
lassoreg.score(X_train, y_train)
print(lassoreg.coef_)
print(lassoreg.intercept_)
Prediction
pred_y = regr.predict(X_test)
y_test['pred_y'] = pred_y
pred = pd.concat([X_test, y_test], axis = 1)
Nonlinear regression
Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
x = data[['radius_mean']]
y = data[['concavity_worst']]
quadratic_featurizer = PolynomialFeatures(degree = 5, interaction_only=False, include_bias = False)
x_quadratic = quadratic_featurizer.fit_transform(x)
x_quadratic
regressor_quadratic = linear_model.LinearRegression()
regressor_quadratic.fit(x_quadratic, y)
regressor_quadratic.score(x_quadratic, y)
print(regressor_quadratic.coef_)
print(regressor_quadratic.intercept_)
Prediction
y_pred = regressor_quadratic.predict(x_quadratic)
plt.scatter(x, y, color='black')
plt.plot(x, y_pred, color='blue', linewidth=3)
plt.show()
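If x is not sorted, the plt.plot call above draws a jagged zig-zag; sorting by x first gives a smooth curve (a sketch, assuming x is the single-column DataFrame defined above):
order = x.values.ravel().argsort()
plt.plot(x.values[order], y_pred[order], color='blue', linewidth=3)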
Random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)
print(iris.feature_names)
print(rf.feature_importances_)
parameters = {
    'n_estimators': [5, 10, 15, 20, 30],
    'max_features': [1, 2, 3, 4],
    'criterion': ['mse', 'mae']
}
rf = RandomForestRegressor()
grid_search = GridSearchCV(rf, parameters, scoring = 'r2', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)
print(iris.feature_names)
print(rf.feature_importances_)
parameters = {
    'n_estimators': [5, 10, 15, 20, 30],
    'max_features': [1, 2, 3, 4],
    'criterion': ['gini', 'entropy']
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, parameters, scoring = 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
classification_report(y_train, y_train_pred)
y_test_pred = lr.predict(X_test)
classification_report(y_test, y_test_pred)
lr.intercept_
lr.coef_
pd.DataFrame(list(zip(np.transpose(lr.coef_), train_X.columns)), columns = ['coef', 'columns'])
Multiclass classification
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class = 'ovr')
lr.fit(x, y)
pred = lr.predict(x)
classification_report(y, pred)
matrix = confusion_matrix(y, pred)
matrix
Decision tree
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
dtree = tree.DecisionTreeClassifier(max_depth = 3)
dtree.fit(X_train, y_train)
dtree.classes_
dtree.feature_importances_
dtree.max_features_
dtree.n_classes_
dtree.n_features_in_
dtree.n_outputs_
dtree.tree_
pred = dtree.predict(X_test)
print(classification_report(y_test, pred))
# display the tree image directly in Jupyter
from IPython.display import Image
import pydotplus
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names = iris.feature_names,
                                class_names = iris.target_names,
                                filled = True, rounded = True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
}
dtree = tree.DecisionTreeClassifier()
grid_search = GridSearchCV(dtree, parameters, scoring= 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
Regression tree
from sklearn import tree
dtree = tree.DecisionTreeRegressor(max_depth=3)
dtree.fit(X_train, y_train)
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_pred = dtree.predict(X_test)
# explained variance
explained_variance_score(y_test, y_pred)
# mean absolute error
mean_absolute_error(y_test, y_pred)
# mean squared error
mean_squared_error(y_test, y_pred)
# coefficient of determination R²
r2_score(y_test, y_pred)
# visualize the decision tree
from IPython.display import Image
import pydotplus
dot_data = tree.export_graphviz(dtree, out_file= None,
                                filled=True, rounded=True,
                                special_characters = True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Naive Bayes
GaussianNB
from sklearn import naive_bayes
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
nb1 = naive_bayes.GaussianNB()
nb1.fit(X_train, y_train)
pred = nb1.predict(X_test)
print(classification_report(y_test, pred))
MultinomialNB
nb2 = naive_bayes.MultinomialNB()
nb2.fit(X, y)
print(nb2.predict(X))
BernoulliNB
nb3 = naive_bayes.BernoulliNB()
nb3.fit(X, y)
print(nb3.predict(X))
anaconda
conda list
conda install numpy
anaconda search -t conda tensorflow
anaconda show jjhelmus/tensorflow