Machine Learning with Sklearn in Practice: The KNN Algorithm
KNN Iris Classification
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets

X, y = datasets.load_iris(return_X_y=True)  # return features X and labels y
X = X[:, :2]  # keep only the first two features so the plane can be plotted
plt.scatter(X[:, 0], X[:, 1], c=y)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)

x1 = np.linspace(4, 8, 100)   # x axis from 4 to 8
y1 = np.linspace(2, 4.5, 80)  # y axis from 2 to 4.5
X1, Y1 = np.meshgrid(x1, y1)
X1 = X1.reshape(-1, 1)
Y1 = Y1.reshape(-1, 1)
X_test = np.concatenate([X1, Y1], axis=1)  # shape is (8000, 2)

lc1 = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
lc2 = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])
y_ = knn.predict(X_test)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap=lc1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=lc2)
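As an aside, the dense prediction scatter can also be drawn as a filled contour plot. This is only a minimal sketch, assuming the fitted knn, the grids x1/y1, and the colormaps lc1/lc2 from the block above are still in scope:

# A sketch of the same decision regions drawn with plt.contourf instead of
# a dense scatter; assumes knn, x1, y1, lc1, lc2, X, y from the block above.
X1g, Y1g = np.meshgrid(x1, y1)                # keep the 2-D grid shape this time
grid = np.c_[X1g.ravel(), Y1g.ravel()]        # (8000, 2) query points
Z = knn.predict(grid).reshape(X1g.shape)      # predicted class per grid cell
plt.contourf(X1g, Y1g, Z, cmap=lc1)           # filled decision regions
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=lc2)  # training points on top
plt.show()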
Selecting KNN Parameters

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import cross_val_score

X, y = datasets.load_iris(return_X_y=True)
knn = KNeighborsClassifier()
score = cross_val_score(knn, X, y, scoring="accuracy", cv=6)
print(score)

errors = []
for i in range(1, 14):  # try k up to roughly sqrt(150)
    knn = KNeighborsClassifier(n_neighbors=i)
    score = cross_val_score(knn, X, y, scoring="accuracy", cv=6).mean()
    errors.append(1 - score)
plt.plot(np.arange(1, 14), errors)

weights = ["uniform", "distance"]
for w in weights:
    knn = KNeighborsClassifier(n_neighbors=12, weights=w)
    print(cross_val_score(knn, X, y, scoring="accuracy", cv=6).mean())

Output:
0.98
0.9733333333333333

result = {}
for k in range(1, 14):
    for w in weights:
        knn = KNeighborsClassifier(n_neighbors=k, weights=w)
        sm = cross_val_score(knn, X, y, scoring="accuracy", cv=6).mean()
        result[w + str(k)] = sm
result

Output:
{'uniform1': 0.96, 'distance1': 0.96,
 'uniform2': 0.94, 'distance2': 0.96,
 'uniform3': 0.9666666666666667, 'distance3': 0.9666666666666667,
 'uniform4': 0.9666666666666667, 'distance4': 0.9666666666666667,
 'uniform5': 0.9666666666666667, 'distance5': 0.9666666666666667,
 'uniform6': 0.9666666666666667, 'distance6': 0.96,
 'uniform7': 0.9733333333333333, 'distance7': 0.9733333333333333,
 'uniform8': 0.9666666666666667, 'distance8': 0.9666666666666667,
 'uniform9': 0.9733333333333333, 'distance9': 0.9733333333333333,
 'uniform10': 0.96, 'distance10': 0.96,
 'uniform11': 0.9733333333333333, 'distance11': 0.9733333333333333,
 'uniform12': 0.98, 'distance12': 0.9733333333333333,
 'uniform13': 0.9733333333333333, 'distance13': 0.9733333333333333}

np.array(list(result.values())).argmax()  # 22
list(result)[22]                          # 'uniform12'
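Rather than reading the curve by eye, the best k can also be pulled out programmatically. A small sketch, reusing the errors list from the loop above:

# Picks the k with the lowest cross-validation error from the curve above;
# assumes the errors list from the loop over range(1, 14) is still in scope.
best_k = int(np.argmin(errors)) + 1  # +1 because k starts at 1, not 0
print("best k:", best_k, "error:", errors[best_k - 1])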
KNN Breast Cancer Diagnosis

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

cancer = pd.read_csv("/Users/zhucan/Desktop/cancer.csv", sep="\t")
cancer.drop("ID", axis=1, inplace=True)
X = cancer.iloc[:, 1:]
y = cancer["Diagnosis"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

knn = KNeighborsClassifier()
params = {"n_neighbors": [i for i in range(1, 30)],
          "weights": ["distance", "uniform"],
          "p": [1, 2]}
gcv = GridSearchCV(knn, params, scoring="accuracy", cv=6)
gcv.fit(X_train, y_train)
gcv.best_estimator_
gcv.best_score_
gcv.best_params_

y_ = gcv.predict(X_test)
gcv.score(X_test, y_test)  # at this point gcv delegates to gcv.best_estimator_
pd.crosstab(index=y_test, columns=y_, rownames=["True"], colnames=["Predict"])  # confusion matrix

Output:
KNeighborsClassifier(n_neighbors=4, p=1, weights='distance')
0.9516666666666667
{'n_neighbors': 4, 'p': 1, 'weights': 'distance'}
0.9385964912280702

from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(y_, y_test)
print(classification_report(y_test, y_, target_names=["B", "M"]))

Reading the report: for class B (benign), precision is 78/(78+5) = 0.94 and recall is 78/(78+2) = 0.97; for class M (malignant), precision is 29/(29+2) = 0.94 and recall is 29/(29+5) = 0.85.
The model is better at finding the healthy cases than the sick ones.
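The same numbers can be recomputed by hand from the confusion-matrix cells. A small sketch using the counts above (78, 2, 5, 29):

# Recomputing precision and recall from the confusion-matrix cells;
# the counts are taken from the crosstab/report above.
import numpy as np

cm = np.array([[78, 2],   # true B: 78 predicted B, 2 predicted M
               [5, 29]])  # true M: 5 predicted B, 29 predicted M

precision = cm.diagonal() / cm.sum(axis=0)  # column-wise: how pure each prediction is
recall = cm.diagonal() / cm.sum(axis=1)     # row-wise: how much of each class is found
print("precision (B, M):", precision.round(2))  # [0.94 0.94]
print("recall    (B, M):", recall.round(2))     # [0.97 0.85]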
KNN Data Normalization
# Min-max normalization
X_norm1 = (X - X.min()) / (X.max() - X.min())
X_train, X_test, y_train, y_test = train_test_split(X_norm1, y, test_size=0.2)
knn = KNeighborsClassifier()
params = {"n_neighbors": [i for i in range(1, 30)],
          "weights": ["distance", "uniform"],
          "p": [1, 2]}
gcv = GridSearchCV(knn, params, scoring="accuracy", cv=6)
gcv.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
y_ = gcv.predict(X_test)
accuracy_score(y_test, y_)

Output:
0.9649122807017544

# An alternative: standardization (z-score)
X_norm2 = (X - X.mean()) / X.std()

from sklearn.preprocessing import MinMaxScaler, StandardScaler
mms = MinMaxScaler()
mms.fit(X)
X2 = mms.transform(X)  # same effect as the manual min-max normalization

ss = StandardScaler()
X3 = ss.fit_transform(X)
X3  # same effect as the manual standardization
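A quick sketch to confirm the scalers match the manual formulas. One caveat worth flagging: StandardScaler uses the population std (ddof=0), while pandas' .std() defaults to the sample std (ddof=1), so the z-score comparison is only exact with ddof=0.

# Verifying the manual formulas against the sklearn scalers;
# assumes X, X_norm1, X2, X3 from the block above are still in scope.
print(np.allclose(X2, X_norm1.values))   # min-max: True
X_norm2 = (X - X.mean()) / X.std(ddof=0)  # ddof=0 to match StandardScaler
print(np.allclose(X3, X_norm2.values))   # z-score: True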
Data Splitting in sklearn

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold  # both split the data into folds

data = np.random.randint(0, 10, size=(8, 2))
target = np.array([0, 0, 1, 0, 1, 1, 1, 0])

# train and test are index arrays; the indices are enough to retrieve the data
kfold = KFold(n_splits=4)  # lowercase name so the KFold class is not shadowed
for train, test in kfold.split(data, target):
    print(target[train], target[test])

Output:
[1 0 1 1 1 0] [0 0]
[0 0 1 1 1 0] [1 0]
[0 0 1 0 1 0] [1 1]
[0 0 1 0 1 1] [1 0]

# StratifiedKFold makes each of the 4 folds keep the same class proportions as the full dataset
skfold = StratifiedKFold(n_splits=4)
for train, test in skfold.split(data, target):
    print(target[train], target[test])

Output:
[0 0 1 1 1 0] [0 1]
[0 1 0 1 1 0] [0 1]
[0 0 1 1 1 0] [0 1]
[0 0 1 0 1 1] [1 0]

# train_test_split, KFold, and StratifiedKFold all serve to split the data
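These splitter objects can also be passed directly as the cv argument of cross_val_score, avoiding the manual loop. A sketch on the iris data (X_iris/y_iris are illustrative names, not from the original):

# Passing a splitter as cv runs the same stratified splits without a manual loop.
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets

X_iris, y_iris = datasets.load_iris(return_X_y=True)
knn = KNeighborsClassifier(n_neighbors=5)
skf = StratifiedKFold(n_splits=4)
print(cross_val_score(knn, X_iris, y_iris, cv=skf))  # one accuracy per fold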
Converting String Columns for Training and Prediction
data = pd.read_csv("/Users/zhucan/Desktop/salary.txt")
data.drop(labels=["final_weight", "education", "capital_gain", "capital_loss"],
          axis=1, inplace=True)
X = data.iloc[:, 0:-1]
y = data["salary"]

# Ways to convert the str columns to int/float so the algorithm can compute distances:
# map, apply, transform
u = X["workclass"].unique()
u

Output:
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

np.argwhere(u == 'Local-gov')[0, 0]

Output:
4

def convert(x):
    # map each category to its integer position in u
    return np.argwhere(u == x)[0, 0]
X["workclass"] = X["workclass"].map(convert)

cols = ['marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
for col in cols:
    u = X[col].unique()
    def convert(x):
        return np.argwhere(u == x)[0, 0]
    X[col] = X[col].map(convert)

The full script, end to end:

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

data = pd.read_csv("/Users/zhucan/Desktop/salary.txt")
data.head()
data.drop(labels=["final_weight", "education", "capital_gain", "capital_loss"],
          axis=1, inplace=True)

X = data.iloc[:, 0:-1]
y = data["salary"]

u = X["workclass"].unique()
def convert(x):
    return np.argwhere(u == x)[0, 0]
X["workclass"] = X["workclass"].map(convert)

cols = ['marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
for col in cols:
    u = X[col].unique()
    def convert(x):
        return np.argwhere(u == x)[0, 0]
    X[col] = X[col].map(convert)

kfold = KFold(10)
knn = KNeighborsClassifier()
accuracy = 0
for train, test in kfold.split(X, y):
    knn.fit(X.loc[train], y[train])
    acc = knn.score(X.loc[test], y[test])
    accuracy += acc / 10
print(accuracy)

Output:
0.7973345728987424
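As an aside, the per-value np.argwhere lookup scans u once per element; pandas ships a vectorized equivalent. A sketch, assuming a fresh copy of the frame whose string columns have not yet been converted (X_raw is an illustrative name):

# pd.factorize assigns each unique value an integer code in order of first
# appearance, which is the same scheme as the convert() function above.
X_raw = data.iloc[:, 0:-1].copy()  # assumes data still holds the original strings
for col in ["workclass"] + cols:
    X_raw[col], uniques = pd.factorize(X_raw[col])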