Machine Learning Summary: Fundamentals, Part 1
Machine Learning Summary: Fundamentals, Part 1 is a detailed overview of the core concepts, algorithms, and applications of machine learning. It explains foundational algorithms such as linear regression, logistic regression, decision trees, and support vector machines in accessible language, and shows how they are applied to real problems. It also covers key steps such as data preprocessing, feature engineering, and model evaluation, along with how to practice machine learning in Python. Whether you are a beginner or already have some background, it is a valuable resource for understanding and mastering machine learning.
numpy
numpy.genfromtxt
genfromtxt("world_alcohol.txt", delimiter=",", dtype=str)
Creation
arr = np.random.random((4, 6))
arr.shape[0]
arr.shape[1]
arr.ndim
arr.size
arr.reshape(3, 8)
arr.reshape(-1)
score = np.random.randint(80, 100, 30)
score[score >= 95]
score[score <= 85]
arr2 = np.random.randint(0, 10, (4, 4))
arr3 = np.random.randint(0, 10, (4, 4))
arr4 = np.zeros((16, 16))
arr4[0] = 1
arr4[15] = 1
arr4[:, 0] = 1
arr4[:, 15] = 1
arr4 = np.ones((2, 3, 4), dtype = np.int32)
A 3-D array: 2 blocks, each with 3 rows and 4 columns
arr5 = np.random.randint(0, 100, 10000)
arr.view()
A new array object that shares the same underlying data; modifying one changes the other
arr.copy()
A new array object with its own independent copy of the data; modifications do not propagate
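A minimal sketch contrasting the two (array values assumed):
base = np.arange(4)
v = base.view()    # new array object, shared buffer
v[0] = 99
print(base[0])     # 99 -- writes through the view are visible
c = base.copy()    # independent buffer
c[1] = -1
print(base[1])     # 1 -- the copy does not write back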
np.tile(a, (3, 5))
Tile a: repeat it 3 times along the rows and 5 times along the columns
np.sort(a, axis = 1)
Sorts each row (along axis 1); axis = 0 sorts each column
i = np.argsort(a)
Returns the indices that would sort a in ascending order
a[i]
The data in sorted order
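A small worked example (values assumed):
a = np.array([3, 1, 2])
i = np.argsort(a)  # [1 2 0] -- positions of the elements in ascending order
print(a[i])        # [1 2 3]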
Selection
ver = np.array([5, 6, 7, 8])
import numpy as np
x = np.array([[1,2,3],[4,5,6]])
world[1, 4]
Indexing starts at 0: row 2, column 5
world[2, 2]
Row 3, column 3
world[0 : 3]
The first 3 rows
world[:, 1]
All rows, column 2
world[:, 0:2]
All rows, first 2 columns
world[1:3, 0:2]
Rows 2 and 3, first 2 columns
Computation
min
max
sum(axis = 0)
axis = 0 sums down each column; axis = 1 sums across each row
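For instance (values assumed):
m = np.array([[1, 2], [3, 4]])
print(m.sum(axis = 0))  # [4 6] -- column totals
print(m.sum(axis = 1))  # [3 7] -- row totals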
A = np.array([[1,1],[0,1]])
B = np.array([[2,0],[3,4]])
# element-wise product
print(A * B)
# matrix multiplication
print(A.dot(B))
print(np.dot(A, B))
np.exp(B)
e raised to the power of each element of B
np.sqrt(B)
Square root of each element of B
np.floor
Round down (floor)
a.ravel()
Flattens the matrix into a 1-D vector
Feature standardization
mu = np.average(X, axis = 0)
Column means
sigma = np.std(X, axis = 0, ddof = 1)
Sample standard deviation (ddof = 1), not the variance
X = (X - mu) / sigma
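Putting the three lines together on toy data (X assumed):
import numpy as np
X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
mu = np.average(X, axis = 0)            # column means: [2. 20.]
sigma = np.std(X, axis = 0, ddof = 1)   # sample std: [1. 10.]
X = (X - mu) / sigma                    # each column now has mean 0 and unit sample std
print(X)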
Reshaping
np.arange(15)
Creates 15 values, 0 through 14
reshape(3, 5)
Reshape into a 3 × 5 matrix
reshape(3, -1)
# create values with a fixed step (from 10 up to, but not including, 30, step 2)
x = np.arange(10,30,2)
# create evenly spaced values (100 points from 0 to 20)
x = np.linspace(0, 20, 100)
astype(float)
Type conversion
Concatenation
np.hstack([arr2, arr3])
np.vstack([arr2, arr3])
Splitting
np.hsplit(a, 3)
Splits along columns into 3 equal parts
np.hsplit(a, (3, 4))
Splits along columns at positions 3 and 4 (three pieces: columns 0-2, column 3, columns 4 onward)
np.vsplit(a, 3)
Splits along rows into 3 equal parts
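A small demo of both (array assumed):
a = np.arange(12).reshape(3, 4)
print(np.hsplit(a, 2))  # two (3, 2) blocks -- split along columns
print(np.vsplit(a, 3))  # three (1, 4) rows -- split along rows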
ndarray
# least squares: solve for the line coefficients
# define the error function
# param: the line coefficients [a, b] to solve for; the line is y = ax + b
# x: observed x data
# y: observed y data
import numpy as np
from scipy import optimize
def error_fun(param, x, y):
    expect_y = param[0] * x + param[1]
    err = y - expect_y
    return err
x = np.array([1,2,3,4])
y = np.array([6,5,7,10])
# initial guess for the coefficients
init_param = np.array([0.1, 0.1])
# solve
res, k = optimize.leastsq(error_fun, init_param, args=(x, y))
print(res[0], res[1], k)
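As a sanity check, np.polyfit fits the same line directly; its slope and intercept should agree with res:
k_fit, b_fit = np.polyfit(x, y, 1)  # degree-1 least-squares fit; highest power first
print(k_fit, b_fit)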
pandas
Series
s1 = pd.Series({'a':11, 'b':22, 'c':33})
s2 = pd.Series([11, 22, 33], index = ['a', 'b', 'c'])
DataFrame
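A minimal construction sketch (column and index names assumed):
import pandas as pd
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index = ['x', 'y', 'z'])
df.dtypes  # per-column types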
Loading
pd.read_excel
pd.read_csv
Inspection
dt.head()
Show the first 5 rows
dt.head(10)
dt.tail()
Show the last 5 rows
dt.tail(10)
dt.columns
dt.shape
Filtering
Filter data
Slice rows 1-2 by position (the end of the slice is exclusive)
df[1:3]
Select specific rows and columns
df.loc[1:3, ['star']]
Select specific columns
df[["star","new_star"]]
df['star'] == '力荐'
df [ df['star'] == '力荐' ]
loc
df.loc[0]
The row with label 0 (the first row)
df.loc[3:6]
Rows with labels 3 through 6, inclusive
df.loc[83, "Age"]
The value at row label 83, column 'Age'
Column access
df['ID']
Get the column named 'ID'
df["ID", "NAME"]
a_is_null = pd.isnull(a)
Boolean mask marking missing values
len(a)
Number of elements
df.apply(func)
Apply func to the data
iloc: position-based indexing (integer positions rather than labels)
Cleaning
df.fillna(0)  # fillna needs a fill value (or method); axis alone is not enough
df.dropna(axis = 0, subset=["age", "name"])  # subset belongs to dropna: drop rows with NaN in these columns
df.dropna()
Sorting
df.sort_values("Age", ascending = False)
Sort by Age in descending order (ascending = False)
df.reset_index(drop = True)
Reset the index
pd.to_datetime(a)
Convert to datetime
Computation
df['EK'].max()
min()
mean()
df.sort_values("X", inplace = True)
np.add(a, b)
np.sin(a)
Display
Set the column names
df.columns = ['star', 'vote', 'shorts']
Deletion
Drop rows with missing data
df.dropna()
Aggregation
df.groupby('star').sum()
Create a new column
df['new_star'] = df['star'].map(star_to_number)
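star_to_number is not defined in these notes; a hypothetical mapping for the Douban rating labels might look like:
star_to_number = {'力荐': 5, '推荐': 4, '还行': 3, '较差': 2, '很差': 1}  # hypothetical mapping
df['new_star'] = df['star'].map(star_to_number)  # unmapped values become NaN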
Binning
Custom-interval discretization (binning)
# custom binning of radius_mean into hand-picked intervals
cut_points = [0, 13, 14, 15, 16]
RFM['radius_mean_bin'] = pd.cut(RFM['radius_mean'], bins = cut_points)
RFM.head()
Equal-width binning
RFM['radius_mean_width_bin'] = pd.cut(RFM.radius_mean, 20)
RFM.head()
Equal-depth (quantile) binning
RFM['radius_mean_depth_bin'] = pd.qcut(RFM.radius_mean, 5)
RFM.head()
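The difference between the two on toy data: pd.cut makes equal-width bins, pd.qcut makes equal-frequency bins (values assumed):
s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 100])
print(pd.cut(s, 3).value_counts())   # equal-width: the outlier leaves two bins almost empty
print(pd.qcut(s, 3).value_counts())  # equal-frequency: roughly the same count per bin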
Plotting
Scatter plot
data.plot.scatter(x = var, y= 'SalePrice', ylim = (0, 800000))
plt.scatter(y_test, y_test_preds) # scatter of actual vs. predicted values
plt.plot([0, max(y_test)], [0, max(y_test_preds)], lw = 1, linestyle='--')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()
Bar chart
# Age distribution
train_src.hist(column='Age', bins = 50)
# Sex distribution
train_src["Sex"].value_counts().plot(kind = "bar")
train_src["Sex"].value_counts().plot(kind = "bar")
# relationship between Sex and the Survived target
pd.crosstab(train_src.Sex, train_src.Survived).plot(kind = "bar")
# relationship between Age and Survived
train_src['age'] = pd.cut(train_src.Age, [0, 5, 15, 20, 25, 35, 50, 60, 100])
pd.crosstab(train_src.age, train_src["Survived"]).plot(kind = "bar")
Generating sample data
classes = ["03020" + str(i) for i in range(1, 10)] + ["0302" + str(i) for i in range(10,31)]
subjects = ["Chinese", "Math", "English", "Physics", "Chemistry", "Computer"]
df = pd.DataFrame(np.random.randint(70, 100, (30, 6)), classes, subjects)
gender = ['M' if np.random.random() < 0.5 else 'F' for i in range(30)]
df2 = pd.DataFrame(gender, index = classes, columns = ['Gender'])
Merging
c = pd.concat([df_cp, df2], axis = 1)
matplotlib
Basic setup
import matplotlib.pyplot as plt
plt.plot(df['a'], df['b'])
x-axis and y-axis data
plt.show()
Render the chart
plt.xticks(rotation=45)
Rotate the x-axis tick labels by 45 degrees
plt.xlabel('xxxx')
plt.ylabel('yyyy')
plt.title("标题")
Subplots
fig = plt.figure(figsize=(3, 6))
Set the figure size; figsize = (width, height)
ax1= fig.add_subplot(2, 2, 1)
2 rows, 2 columns, first panel
ax2=fig.add_subplot(2, 2, 2)
2 rows, 2 columns, second panel
ax4=fig.add_subplot(2, 2, 4)
2 rows, 2 columns, fourth panel
ax.plot(df)
Show the legend
plt.legend(loc = 'best')
Requires label = ... in the plot call
plt.legend(loc = 'upper left')
Bar chart
fig, ax = plt.subplots()
ax.bar(x, y, 0.3)
plt.show()
Vertical by default
ax.barh(x, y, 0.5)
Horizontal
fig, ax = plt.subplots()
ax.hist(x, range=(4, 5), bins = 20)
plt.show()
range sets the interval; bins = 20 bars
ax.set_xlim(0, 50)
Set the axis limits
ax.set_ylim(0, 50)
Scatter plot
fig, ax = plt.subplots()
ax.scatter(x, y, 0.5)
plt.show()
Box plot
fig, ax = plt.subplots()
ax.boxplot(x)
plt.show()
# plot histograms of the first three features
first_three_features = features[:3]
fig, axes = plt.subplots(nrows = 3, ncols = 1, figsize = (8, 10))
for i, feature in enumerate(first_three_features):
    axes[i].hist(X[feature], bins = 30, color='skyblue', edgecolor='black')
    axes[i].set_title(f"{feature} - histogram")
    axes[i].set_xlabel('Value')
    axes[i].set_ylabel('Frequency')
    axes[i].grid(True, axis = 'y', linestyle='--', linewidth = 0.7, alpha = 0.6)
plt.tight_layout()
plt.show()
Histogram
df_user['R值_x'].plot(kind = 'hist', bins = 20, title = 'Recency (R) distribution')
plt.hist(user_id_itemcnt.values)
Scatter plot
import seaborn as sns
plt.scatter(df.index, df['A'])
plt.show()
Line chart
import matplotlib.pyplot as plt
plt.plot(df.index, df['A'])
plt.show()
plt.plot(df.index, df['A'],
         color='#FFAA00',  # color
         linestyle='--',   # line style
         linewidth=3,      # line width
         marker='D')       # point marker
Density plot
# draw density plots
def density_plot(data):
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    p = data.plot(kind = 'kde', linewidth = 2, subplots = True, sharex = False, figsize = (10, 9))
    for i in range(len(data.columns)):  # the original referenced data3 and an undefined k here
        p[i].set_ylabel(data.columns[i], fontproperties = 'SimHei')
    plt.legend()
    return plt
density_plot(data3)
Elbow plot
from sklearn.cluster import KMeans  # import KMeans
def show_elbow(df, ax, title):
    distance_list = []
    K = range(1, 9)
    for k in K:
        kmeans = KMeans(n_clusters = k, max_iter = 100)
        kmeans = kmeans.fit(df)
        distance_list.append(kmeans.inertia_)
    ax.plot(K, distance_list, 'bx-')
    ax.set_xlabel('k')
    ax.set_ylabel('Sum of squared distances (inertia)')
    ax.set_title(title)
fig, axes = plt.subplots(1, 3, figsize = (18, 6))
show_elbow(df_user[['R值_x']], axes[0], 'Elbow plot for R-value clustering')
show_elbow(df_user[['F值_x']], axes[1], 'Elbow plot for F-value clustering')
show_elbow(df_user[['M值_x']], axes[2], 'Elbow plot for M-value clustering')
plt.tight_layout()
plt.show()
Plotting a confusion matrix
# confusion matrix
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(train_y, train_y_pred)
# plot the confusion matrix
def show_confusion_matrix(cnf_matrix, class_labels):
    plt.matshow(cnf_matrix, cmap=plt.cm.YlGn, alpha = 0.7)
    ax = plt.gca()
    ax.set_xlabel('Predicted Label', fontsize = 16)
    ax.set_xticks(range(0, len(class_labels)))
    ax.set_xticklabels(class_labels, rotation=45)
    ax.set_ylabel('Actual Label', fontsize=16, rotation = 90)
    ax.set_yticks(range(0, len(class_labels)))
    ax.set_yticklabels(class_labels)
    ax.xaxis.set_label_position('top')
    ax.xaxis.tick_top()
    for row in range(len(cnf_matrix)):
        for col in range(len(cnf_matrix[row])):
            ax.text(col, row, cnf_matrix[row][col], va = 'center', ha = 'center', fontsize = 16)
class_labels = [0, 1]
show_confusion_matrix(cnf_matrix, class_labels)
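A quick smoke test of the helper above on dummy labels (the arrays are assumed, not from the original notes):
from sklearn import metrics
y_true = [0, 0, 1, 1, 1, 0]  # dummy ground truth
y_hat = [0, 1, 1, 1, 0, 0]   # dummy predictions
show_confusion_matrix(metrics.confusion_matrix(y_true, y_hat), [0, 1])
plt.show()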
Plotting a validation curve
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
from sklearn.model_selection import validation_curve
train_scores, test_scores = validation_curve(
    estimator = lr,  # the model to evaluate
    X = train_X,
    y = train_y,
    param_name = 'C',
    param_range = param_range,
    cv = 10)
# compute the mean and standard deviation of train- and test-set accuracy
train_mean = np.mean(train_scores, axis = 1)
train_std = np.std(train_scores, axis = 1)
test_mean = np.mean(test_scores, axis = 1)
test_std = np.std(test_scores, axis = 1)
# plot training-set accuracy
plt.plot(param_range, train_mean, color = 'blue', marker='o', markersize = 5, label = 'training accuracy')
# shade the confidence band: train_mean + train_std down to train_mean - train_std
plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha = 0.15, color = 'blue')
# plot test-set accuracy (green, to match its shaded band below)
plt.plot(param_range, test_mean, color = 'green', marker='o', markersize = 5, label = 'test accuracy')
# shade the confidence band: test_mean + test_std down to test_mean - test_std
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha = 0.15, color = 'green')
plt.grid()
plt.xscale('log')
plt.legend(loc= 'lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.6, 1.0])
plt.tight_layout()
plt.show()
seaborn
sns.set()
Apply the seaborn default theme
Theme styles
sns.set_style("dark")
darkgrid
whitegrid
dark
white
ticks
Color palettes
color_palette()
Accepts any color supported by matplotlib
With no arguments, returns the current default palette
set_palette()
Sets the palette for all plots
hls_palette()
Control lightness and saturation
lightness
Brightness
saturation
Color intensity
sns.palplot(sns.hls_palette(8, l=.3, s =.8))
sns.palplot(sns.color_palette("Paired", 10))
Sequential palettes
sns.palplot(sns.color_palette("Blues"))
sns.palplot(sns.color_palette("BuGn_r"))
Append _r to reverse the gradient
sns.palplot(sns.color_palette("cubehelix", 8))
sns.palplot(sns.cubehelix_palette(8, start = .5, rot = -.75))
cubehelix_palette()
Linearly varying hue and brightness
Custom sequential palettes
light_palette()
sns.palplot(sns.light_palette("green", reverse = True))
dark_palette()
Categorical palettes
palette = sns.color_palette()
sns.palplot(palette)
Six-color cycling themes
deep
muted
pastel
bright
dark
colorblind
Circular color systems
pal = sns.palplot(sns.color_palette("hls", 8))
sns.boxplot(data = df, palette = pal )
Plotting
Style
Offset between the plot and the axis spines
sns.despine(offset = 30)
left = True removes the left spine
sns.despine(left = True)
Set the context
sns.set_context('paper', font_scale=1.5, rc={"lines.linewidth": 2.5})
Other contexts
notebook
poster
talk
paper
Set colors
palette = {"male": "g", "female": "m"}
Set markers
markers = ["^", "o"]
Set line styles
linestyles = ["-", "--"]
Orientation
orient = "h"
Set transparency
alpha=0.7
distplot
Histogram
from scipy import stats
sns.distplot(df, bins = 20, kde = False, fit = stats.gamma)  # distplot expects a 1-D array or Series
fit overlays a fitted distribution curve
jointplot
sns.jointplot(x = "X", y = "y", data = df )
sns.jointplot(x = "X", y = "y", kind = "hex", color = "k", data = df )
sns.pairplot(iris)
Plots each feature's distribution and all pairwise relationships
Box plot
sns.boxplot(data=tips, x = "total_bill", y = "day", hue = "sex")
Violin plot
sns.violinplot(data=tips, x = "total_bill", y = "day", hue = "sex")
regplot()
sns.regplot(x = "total_bill", y = "tip", data = tips)
sns.lmplot(x = "total_bill", y = "tip", data = tips)
# x_jitter adds random horizontal jitter
sns.regplot(x="size", y = "tip", data = tips, x_jitter=0.5)
sns.regplot(x="size", y = "tip", data = tips, x_jitter=0.5)
stripplot()
sns.stripplot(x = "day", y = "total_bill", data = tips)
# jitter spreads the points to avoid overplotting
sns.stripplot(x = "day", y = "total_bill", data = tips, jitter = True)
sns.stripplot(x = "day", y = "total_bill", data = tips, jitter = True)
swarmplot()
sns.swarmplt(x = "day", y = "total_bill", data = tips)
boxplot
sns.boxplot(x = "day", y = "total_bill", hue = "sex", data = tips)
barplot
sns.barplot(x="sex", y = "survived", hue = "class", data = titanic)
pointplot
sns.pointplot(x="sex", y = "survived", hue = "class", data = titanic)
Point plot, for comparing differences
factorplot
sns.factorplot(kind = "bar", x = "day", y = "total_bill", hue = "smoker", data = tips)
Parameters
FacetGrid
g = sns.FacetGrid(tips, col = "time", hue = "sex", palette="set1", size = 5, hue_kws={"marker": ["^", "o"]})
g.map(plt.hist, "total_bill", "tip", s = 100, linewidth=.5, edgecolor = "white")
g.set_axis_labels("xxxxx", "yyyy")
g.set(xticks=[10, 30, 50], yticks=[2, 6, 10])
g.fig.subplots_adjust(wspace=.5, hspace=.5) # set spacing between panels
g.add_legend()
Adds a legend
add_legend()
size = 4
aspect = .5
Aspect ratio
row_order
row_order = Categorical(["x", "y"])
Specify the ordering
s
Marker size
s = 50
linewidth
linewidth = .5
Line width
edgecolor
edgecolor = "white"
Edge color
set_axis_labels
g.set_axis_labels("x", "y")
Set the axis labels
vars
vars = ["sex", "Age"]
Specify which variables to plot
Combining plots
sns.violinplot(x = "day", y = "total_bill", data = tips, inner= None)
sns.swarmplot(x = "day", y = "total_bill", data = tips, color= "w", alpha = 0.5)
sns.swarmplot(x = "day", y = "total_bill", data = tips, color= "w", alpha = 0.5)
Normal distribution
import seaborn as sns
from scipy.stats import norm
# the black line is the fitted normal, blue the actual distribution; SalePrice is not normally distributed
sns.distplot(train['SalePrice'], fit = norm)
Heatmap
sns.heatmap(corr, vmin = -1, vmax = 1, square = True, center = 0)  # the original repeated vmax and set vmin above vmax
center = 0
Center the colormap at 0
Box plot
plt.figure(figsize = (12, 6))
sns.boxplot(x = 'Neighborhood', y = 'SalePrice', data = train)
plt.axis(ymin = 0 , ymax = 800000)
plt.xticks(rotation=90)
plt.show()
# box plot of the target variable
train.SalePrice.plot(kind = 'box', sym = 'b*')
corr_cols = corr.loc[:, corr.loc['SalePrice', :].abs() > corr_threshvalue].columns
number_para[corr_cols].plot(sym='b*', kind = 'box', subplots = True, figsize = (20, 8))
Bar chart
count_classes.plot(kind = 'bar')
Violin plot
# violin plot
import seaborn as sns
plt.figure(figsize = (12, 8))
sns.violinplot(data = pd.DataFrame(X_selected_standardized, columns = selected_features), palette = "Set3")
plt.title('Violin plot of standardized data')
plt.ylabel('Standardized value')
plt.xticks(rotation = 45)
plt.tight_layout()
plt.show()
scipy
Q-Q plot
from scipy import stats
# use a Q-Q plot to check whether the data deviates from a normal distribution
stats.probplot(train['SalePrice'], plot = plt)
Dendrogram (hierarchical clustering)
# cluster the data
from scipy.cluster.hierarchy import linkage, dendrogram
Z = linkage(data3, method = 'ward', metric = 'euclidean')
# draw the dendrogram
P = dendrogram(Z, 0)
plt.show()
sklearn
Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
svm = SVC()
parameters = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}  # example grid; the original left parameters undefined
grid_search = GridSearchCV(svm, parameters, scoring = 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
Algorithms
XGBoost
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)
watchlist = [(dtest, 'val'), (dtrain, 'train')]
params = {
    'booster': 'gbtree',
    'objective': 'reg:linear',
    'early_stopping_rounds': 50,  # usually passed to xgb.train() directly rather than inside params
    'eval_metric': 'rmse',
    'gamma': 0,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.9,
    'min_child_weight': 1,
    'eta': 0.02,
    'seed': 123456,
    'nthread': 3,
    'silent': 0
}
model = xgb.train(params, dtrain, num_boost_round=200, evals = watchlist)
predict_y = model.predict(dtest, ntree_limit = model.best_ntree_limit)
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'early_stopping_rounds': 50,
    'eval_metric': 'auc',
    'gamma': 0,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.9,
    'min_child_weight': 1,
    'eta': 0.02,
    'seed': 123456,
    'nthread': 3,
    'silent': 0
}
watchlist = [(dtest, 'val'), (dtrain, 'train')]
model = xgb.train(params, dtrain, num_boost_round=200, evals = watchlist)
predict_y = model.predict(dtest, ntree_limit= model.best_ntree_limit)
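The same model can also be trained through XGBoost's scikit-learn wrapper, which plugs into the GridSearchCV pattern used elsewhere in these notes; a sketch with assumed parameter values:
from xgboost import XGBClassifier
clf = XGBClassifier(max_depth = 5, learning_rate = 0.02, n_estimators = 200,
                    subsample = 0.6, colsample_bytree = 0.9)
clf.fit(X_train, y_train)
pred_proba = clf.predict_proba(X_test)[:, 1]  # probability of the positive class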
SVM
from sklearn.svm import SVC
from sklearn.metrics import classification_report
svm = SVC()
svm.fit(X_train, y_train)
svm.support_
svm.support_vectors_
svm.n_support_
svm.dual_coef_
svm.intercept_
pred = svm.predict(X_test)
print(classification_report(y_test, pred))
K-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print(classification_report(y_test, pred))
parameters = {
    'n_neighbors': [3, 5, 10, 15, 20, 30],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, parameters, scoring='accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
KMeans
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 3)
kmeans.fit(features)
kmeans.cluster_centers_
kmeans.labels_
kmeans.fit_predict(features)
cluster_label = kmeans.predict(features)
# compute the mean distortion for K from 1 to 9
# use scipy to compute the distances
from scipy.spatial.distance import cdist
K = range(1, 10)
meandistortions = []
for k in K:
    kmeans = KMeans(n_clusters = k)
    kmeans.fit(features)
    meandistortions.append(sum(np.min(cdist(features, kmeans.cluster_centers_, 'euclidean'), axis = 1)) / features.shape[0])
plt.plot(K, meandistortions, 'bx--')
plt.xlabel('k')
plt.ylabel('loss')
plt.title('find the best k value')
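A complementary way to choose K is the silhouette score (higher is better); a sketch reusing the same features:
from sklearn.metrics import silhouette_score
for k in range(2, 10):  # the silhouette score needs at least 2 clusters
    labels = KMeans(n_clusters = k).fit_predict(features)
    print(k, silhouette_score(features, labels))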
GBDT
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
gbdt = GradientBoostingRegressor()
gbdt.fit(X_train, y_train)
gbdt.score(X_test, y_test)
print(gbdt.feature_importances_)
parameters = {
    'n_estimators': [30, 50, 80, 100],
    'learning_rate': [0.1, 0.2, 0.5, 1]
}
gbdt = GradientBoostingRegressor()
grid_search = GridSearchCV(gbdt, parameters, scoring = 'r2', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
gbdt = GradientBoostingClassifier()
gbdt.fit(X_train, y_train)
gbdt.score(X_test, y_test)
print(gbdt.feature_importances_)
parameters = {
    'n_estimators': [30, 50, 80, 100],
    'learning_rate': [0.1, 0.2, 0.5, 1]
}
gbdt = GradientBoostingClassifier()
grid_search = GridSearchCV(gbdt, parameters, scoring = 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
AdaBoost
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
dtree = DecisionTreeRegressor()
adaboost = AdaBoostRegressor(base_estimator = dtree)
adaboost.fit(X_train, y_train)
adaboost.score(X_test, y_test)
print(adaboost.feature_importances_)
parameters = {
    'n_estimators': [30, 50, 80, 100],
    'learning_rate': [0.01, 0.1, 0.2]
}
dtree = DecisionTreeRegressor()
adaboost = AdaBoostRegressor(base_estimator = dtree)
grid_search = GridSearchCV(adaboost, parameters, scoring = 'r2', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
dtree = DecisionTreeClassifier()
adaboost = AdaBoostClassifier(base_estimator = dtree)
adaboost.fit(X_train, y_train)
adaboost.score(X_test, y_test)
print(adaboost.feature_importances_)
parameters = {
    'n_estimators': [30, 50, 80, 100],
    'learning_rate': [0.1, 0.2, 0.5, 1]
}
dtree = DecisionTreeClassifier()
adaboost = AdaBoostClassifier(base_estimator = dtree)
grid_search = GridSearchCV(adaboost, parameters, scoring = 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
Linear regression
Simple (one-variable) linear regression
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)
print(regr.coef_)
print(regr.intercept_)
y_pred = regr.predict(train_x)
plt.scatter(train_x, train_y, color='black')
plt.plot(train_x, y_pred, color='blue', linewidth=3)  # draw the fitted line, not the raw targets
plt.show()
Multiple linear regression
Ordinary least squares
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
regr.score(X_train, y_train)
print(regr.coef_)
print(regr.intercept_)
Ridge regression (L2 regularization)
ridgereg = linear_model.Ridge()
ridgereg.fit(X_train, y_train)
ridgereg.score(X_train, y_train)
print(ridgereg.coef_)
print(ridgereg.intercept_)
Lasso regression (L1 regularization)
lassoreg = linear_model.Lasso()
lassoreg.fit(X_train, y_train)
lassoreg.score(X_train, y_train)
print(lassoreg.coef_)
print(lassoreg.intercept_)
Prediction
pred_y = regr.predict(X_test)
y_test['pred_y'] = pred_y
pred = pd.concat([X_test, y_test], axis = 1)
Nonlinear regression
Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
x = data[['radius_mean']]
y = data[['concavity_worst']]
quadratic_featurizer = PolynomialFeatures(degree = 5, interaction_only=False, include_bias = False)
x_quadratic = quadratic_featurizer.fit_transform(x)
x_quadratic
regressor_quadratic = linear_model.LinearRegression()
regressor_quadratic.fit(x_quadratic, y)
regressor_quadratic.score(x_quadratic, y)
print(regressor_quadratic.coef_)
print(regressor_quadratic.intercept_)
Prediction
y_pred = regressor_quadratic.predict(x_quadratic)
plt.scatter(x, y, color='black')
plt.plot(x, y_pred, color='blue', linewidth=3)
plt.show()
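If x is not sorted, the plt.plot call above draws a jagged zig-zag; sorting by x first gives a smooth curve (a sketch, assuming x is the single-column DataFrame defined above):
order = x.values.ravel().argsort()
plt.plot(x.values[order], y_pred[order], color='blue', linewidth=3)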
Random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)
print(iris.feature_names)
print(rf.feature_importances_)
parameters = {
    'n_estimators': [5, 10, 15, 20, 30],
    'max_features': [1, 2, 3, 4],
    'criterion': ['mse', 'mae']
}
rf = RandomForestRegressor()
grid_search = GridSearchCV(rf, parameters, scoring = 'r2', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)
print(iris.feature_names)
print(rf.feature_importances_)
parameters = {
    'n_estimators': [5, 10, 15, 20, 30],
    'max_features': [1, 2, 3, 4],
    'criterion': ['gini', 'entropy']
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, parameters, scoring = 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
classification_report(y_train, y_train_pred)
y_test_pred = lr.predict(X_test)
classification_report(y_test, y_test_pred)
lr.intercept_
lr.coef_
pd.DataFrame(list(zip(np.transpose(lr.coef_), train_X.columns)), columns = ['coef', 'columns'])
Multiclass classification
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class = 'ovr')
lr.fit(x, y)
pred = lr.predict(x)
classification_report(y, pred)
matrix = confusion_matrix(y, pred)
matrix
Decision tree
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
dtree = tree.DecisionTreeClassifier(max_depth = 3)
dtree.fit(X_train, y_train)
dtree.classes_
dtree.feature_importances_
dtree.max_features_
dtree.n_classes_
dtree.n_features_in_
dtree.n_outputs_
dtree.tree_
pred = dtree.predict(X_test)
print(classification_report(y_test, pred))
# display the tree image directly in Jupyter
from IPython.display import Image
import pydotplus
dot_data = tree.export_graphviz(dtree, out_file=None,
                                feature_names = iris.feature_names,
                                class_names = iris.target_names,
                                filled = True, rounded = True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
}
dtree = tree.DecisionTreeClassifier()
grid_search = GridSearchCV(dtree, parameters, scoring= 'accuracy', cv = 5)
grid_search.fit(x, y)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
Regression tree
from sklearn import tree
dtree = tree.DecisionTreeRegressor(max_depth=3)
dtree.fit(X_train, y_train)
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_pred = dtree.predict(X_test)
# explained variance
explained_variance_score(y_test, y_pred)
# mean absolute error
mean_absolute_error(y_test, y_pred)
# mean squared error
mean_squared_error(y_test, y_pred)
# coefficient of determination R²
r2_score(y_test, y_pred)
# visualize the decision tree
from IPython.display import Image
import pydotplus
dot_data = tree.export_graphviz(dtree, out_file= None,
                                filled=True, rounded=True,
                                special_characters = True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Naive Bayes
GaussianNB
from sklearn import naive_bayes
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
nb1 = naive_bayes.GaussianNB()
nb1.fit(X_train, y_train)
pred = nb1.predict(X_test)
print(classification_report(y_test, pred))
MultinomialNB
nb2 = naive_bayes.MultinomialNB()
nb2.fit(X, y)
print(nb2.predict(X))
BernoulliNB
nb3 = naive_bayes.BernoulliNB()
nb3.fit(X, y)
print(nb3.predict(X))
anaconda
conda list
conda install numpy
anaconda search -t conda tensorflow
anaconda show jjhelmus/tensorflow