# 作者:欧新宇(Xinyu OU)
# 本文档所展示的测试结果,均运行于:Intel Core i7-7700K CPU 4.2GHz
import pandas as pd

# `display` is injected automatically by IPython/Jupyter. Fall back to
# `print` so this section also runs as a plain Python script instead of
# raising NameError.
try:
    display
except NameError:
    display = print

# A small frame with one numeric column and one string column.
fruits = pd.DataFrame({
    '数值特征': [5, 6, 7, 8, 9],
    '类型特征': ['西瓜', '香蕉', '桔子', '苹果', '葡萄'],
})
display(fruits)

# With no `columns` argument get_dummies one-hot encodes the string column
# and passes the numeric column through unchanged.
fruits_dum = pd.get_dummies(fruits)
display(fruits_dum)

# Cast the numeric column to str and name it in `columns` so that
# get_dummies encodes that column too.
fruits['数值特征'] = fruits['数值特征'].astype(str)
# Show the result explicitly: a bare expression is only echoed when it is
# the last statement of a notebook cell and is silently dropped otherwise.
display(pd.get_dummies(fruits, columns=['数值特征']))
import numpy as np
import matplotlib.pyplot as plt
# Seeded generator so the synthetic data set is reproducible.
rnd = np.random.RandomState(38)

# 50 sample positions drawn uniformly from [-5, 5).
x = rnd.uniform(low=-5, high=5, size=50)

# Noise-free signal: a cosine wave riding on a linear trend.
y_no_noise = np.cos(6 * x) + x

# Estimators expect a 2-D feature matrix, so use a single column.
X = x.reshape(-1, 1)

# Add standard-normal noise to the signal, then halve the result.
y = (y_no_noise + rnd.normal(size=len(x))) / 2
# Scatter the raw samples as red dots (marker only, no connecting line).
plt.plot(X, y, 'o', color='r')
plt.show()
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Evenly spaced grid over [-5, 5), shaped as a single feature column so it
# can be passed straight to predict().
line = np.linspace(-5, 5, num=1000, endpoint=False).reshape(-1, 1)

# Fit both regressors, with default settings, on the raw 1-D feature.
mlpr = MLPRegressor()
mlpr.fit(X, y)
knr = KNeighborsRegressor()
knr.fit(X, y)

# Draw each model's prediction curve over the training points.
plt.plot(line, mlpr.predict(line), label='MLP')
plt.plot(line, knr.predict(line), label='KNN')
plt.plot(X, y, 'o', color='r')
plt.legend(loc='best')
plt.show()
# Eleven equally spaced edges over [-5, 5], i.e. ten bins.
bins = np.linspace(-5, 5, num=11)

# For every sample, the index of the bin its feature value falls into.
target_bin = np.digitize(X, bins=bins)

# Show the bin edges next to the first ten samples and their bin indices.
print('装箱数据范围:\n{}'.format(bins))
first_ten = slice(None, 10)
print('\n前十个数据点的特征值:\n{}'.format(X[first_ten]))
print('\n前十个数据点所在的箱子:\n{}'.format(target_bin[first_ten]))
from sklearn.preprocessing import OneHotEncoder

# One-hot encode each sample's bin index into a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to
# the legacy one on older releases.
try:
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(sparse=False)

onehot.fit(target_bin)
X_in_bin = onehot.transform(target_bin)

print('装箱后的数据形态:{}'.format(X_in_bin.shape))
print('\n装箱后的前十个数据点:\n{}'.format(X_in_bin[:10]))
# Bin the prediction grid with the same edges and encode it with the
# already-fitted encoder, so it is transformed the same way as the
# training features.
new_line = onehot.transform(np.digitize(line, bins=bins))

# Refit both model types on the binned (one-hot) representation.
new_mlpr = MLPRegressor()
new_mlpr.fit(X_in_bin, y)
new_knr = KNeighborsRegressor()
new_knr.fit(X_in_bin, y)

# Prediction curves of the refitted models over the original samples.
plt.plot(line, new_mlpr.predict(new_line), label='New MLP')
plt.plot(line, new_knr.predict(new_line), label='New KNN')
plt.plot(X, y, 'o', color='r')
plt.legend(loc='best')
plt.show()
# Minimal illustration of np.hstack: two 1-D sequences are joined end to
# end into one array.
array_1 = [1, 2, 3, 4, 5]
array_2 = [6, 7, 8, 9, 0]
array_3 = np.hstack([array_1, array_2])
print('将数组2添加到数据1中后得到:{}'.format(array_3))
# Put the raw feature and its one-hot bin indicators side by side, so the
# model sees both the original value and the bin it belongs to.
X_stack = np.hstack([X, X_in_bin])
# Printed explicitly: a bare `X_stack.shape` expression shows nothing when
# this file runs as a script.
print(X_stack.shape)

# The prediction grid gets the same column layout as the training data.
line_stack = np.hstack([line, new_line])

mlpr_interact = MLPRegressor().fit(X_stack, y)
plt.plot(line, mlpr_interact.predict(line_stack),
         label='MLP for interaction')
plt.ylim(-4, 4)

# Dotted vertical line at every bin edge (loop body re-indented; at column
# 0 the `for` statement was a syntax error).
for vline in bins:
    plt.plot([vline, vline], [-5, 5], ':', c='k')

plt.legend(loc='lower right')
plt.plot(X, y, 'o', c='r')
plt.show()
# Interaction features: the bin indicators followed by the raw value
# multiplied by each indicator (X is broadcast across the indicator
# columns), so the raw value survives only in its own bin's column.
X_multi = np.hstack([X_in_bin, X * X_in_bin])
print(X_multi.shape)
print(X_multi[0])

mlpr_multi = MLPRegressor().fit(X_multi, y)

# Build the prediction grid with the same construction as the training
# features.
line_multi = np.hstack([new_line, line * new_line])
plt.plot(line, mlpr_multi.predict(line_multi), label='MLP Regressor')

# Dotted vertical line at every bin edge (loop body re-indented; at column
# 0 the `for` statement was a syntax error).
for vline in bins:
    plt.plot([vline, vline], [-5, 5], ':', c='gray')

plt.plot(X, y, 'o', c='r')
plt.legend(loc='lower right')
plt.show()
from sklearn.preprocessing import PolynomialFeatures

# Expand the single feature into its powers up to degree 20, without the
# constant bias column.
poly = PolynomialFeatures(degree=20, include_bias=False)
X_poly = poly.fit_transform(X)
print(X_poly.shape)
print('原始数据集中的第一个样本特征:\n{}'.format(X[0]))
print('\n处理后的数据集中第一个样本特征:\n{}'.format(X_poly[0]))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2 in favour of get_feature_names_out(). Prefer the new method when it
# exists; .tolist() keeps the printed form a plain list of str as before.
if hasattr(poly, 'get_feature_names_out'):
    feature_names = poly.get_feature_names_out().tolist()
else:
    feature_names = poly.get_feature_names()
print('PolynomialFeatures对原始数据的处理:\n{}'.format(feature_names))
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the polynomial expansion of the feature.
LNR_poly = LinearRegression()
LNR_poly.fit(X_poly, y)

# Expand the prediction grid with the already-fitted transformer.
line_poly = poly.transform(line)
plt.plot(line, LNR_poly.predict(line_poly), label='Linear Regressor')

# Clip the view to the data range plus a 0.5 margin on every side.
plt.xlim(np.min(X) - 0.5, np.max(X) + 0.5)
plt.ylim(np.min(y) - 0.5, np.max(y) + 0.5)

plt.plot(X, y, 'o', color='r')
plt.legend(loc='lower right')
plt.show()
import pandas as pd

# Read the stock table; the file is decoded as GBK.
stock = pd.read_csv('d:/stock dataset/071013.csv', encoding='GBK')
print(stock.head())

# Regression target: the '涨幅%%' column (presumably the percentage price
# change -- confirm against the CSV header).
y = stock['涨幅%%']
print(y.shape)
print(y[0])

# Features: every column from '现价' through '流通股(亿)', both inclusive
# (label slices with .loc include both endpoints).
features = stock.loc[:, '现价':'流通股(亿)']
X = features.values
print(X.shape)
print(X[:1])
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out a test set with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=62)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Two hidden layers of 100 units each, seeded for repeatable training.
mlpr = MLPRegressor(
    random_state=62,
    hidden_layer_sizes=(100, 100),
    alpha=0.001,
)
mlpr.fit(X_train_scaled, y_train)

# score() is evaluated on the held-out, scaled test split.
print('模型准确率:{:.2f}'.format(mlpr.score(X_test_scaled, y_test)))
# Values of the '名称' column for the rows whose target value is at least 9.
wanted = stock['名称']
print(wanted[y >= 9])
from sklearn.feature_selection import SelectPercentile, f_regression

# Keep the top-scoring 50% of the features.
# SelectPercentile defaults to f_classif, an ANOVA F-test meant for class
# labels. The target here is continuous and modelled with a regressor, so
# score the features with f_regression instead.
select = SelectPercentile(score_func=f_regression, percentile=50)
select.fit(X_train_scaled, y_train)
X_train_selected = select.transform(X_train_scaled)
print('经过缩放的特征形态:{}'.format(X_train_scaled.shape))
print('特征选择后的特征形态:{}'.format(X_train_selected.shape))

# Boolean mask over the input features: True where a feature was kept.
mask = select.get_support()
print(mask)
# Draw the mask as a single-row image, one cell per feature.
plt.matshow(mask.reshape(1, -1), cmap=plt.cm.cool)
plt.xlabel("Features Selected")
plt.show()

# Apply the same selection to the test split and retrain the same MLP
# configuration on the reduced feature set.
X_test_selected = select.transform(X_test_scaled)
mlpr_sp = MLPRegressor(random_state=62, hidden_layer_sizes=(100, 100),
                       alpha=0.001)
mlpr_sp.fit(X_train_selected, y_train)
print('特征选择后模型得分:{:.2f}'.format(mlpr_sp.score(X_test_selected,
                                                      y_test)))
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Keep the features whose random-forest importance reaches the median
# importance.
sfm = SelectFromModel(
    estimator=RandomForestRegressor(n_estimators=100, random_state=38),
    threshold='median',
)
sfm.fit(X_train_scaled, y_train)
X_train_sfm = sfm.transform(X_train_scaled)
print('基于随机森林模型进行特征后的数据形态:{}'.format(X_train_sfm.shape))

# Boolean mask of the kept features, drawn as a single-row image.
mask_sfm = sfm.get_support()
print(mask_sfm)
plt.matshow(mask_sfm.reshape(1, -1), cmap=plt.cm.cool)
plt.xlabel('Features Selected')
plt.show()

# Same MLP configuration as before, trained on the selected columns and
# scored on the equally reduced test split.
X_test_sfm = sfm.transform(X_test_scaled)
mlpr_sfm = MLPRegressor(
    random_state=62,
    hidden_layer_sizes=(100, 100),
    alpha=0.001,
)
mlpr_sfm.fit(X_train_sfm, y_train)
print('随机森林进行特征选择后的模型得分:{:.2f}'.format(
    mlpr_sfm.score(X_test_sfm, y_test)))
from sklearn.feature_selection import RFE

# Recursive feature elimination driven by a random forest, stopping once
# 12 features remain.
rfe = RFE(
    estimator=RandomForestRegressor(n_estimators=100, random_state=38),
    n_features_to_select=12,
)
rfe.fit(X_train_scaled, y_train)

# Boolean mask of the surviving features, drawn as a single-row image.
mask = rfe.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap=plt.cm.cool)
plt.xlabel('Features Selected')
plt.show()

# Reduce both splits to the selected columns and retrain the same MLP
# configuration on them.
X_train_rfe = rfe.transform(X_train_scaled)
X_test_rfe = rfe.transform(X_test_scaled)
mlpr_rfe = MLPRegressor(
    random_state=62,
    hidden_layer_sizes=(100, 100),
    alpha=0.001,
)
mlpr_rfe.fit(X_train_rfe, y_train)
print("RFE选择特征后的模型得分:{:.2f}".format(mlpr_rfe.score(X_test_rfe,
                                                         y_test)))

# Score from the RFE object itself, called on the full (unreduced) scaled
# test features.
print('{:.2f}'.format(rfe.score(X_test_scaled, y_test)))