大作业,细节以后再补
USPS
import h5py
import numpy as np
import cv2
import torch  # NOTE(review): not referenced in this script; import kept as in the original

# Load data.
# Data from https://www.kaggle.com/bistaumanga/usps-dataset?select=usps.h5
# Another data source:
# https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#usps
path = 'usps.h5'
with h5py.File(path, 'r') as hf:
    train = hf.get('train')
    x_tr = train.get('data')[:]
    y_tr = train.get('target')[:]
    test = hf.get('test')
    x_te = test.get('data')[:]
    y_te = test.get('target')[:]

# Labels index a vote array of this length, so they are expected to be
# integer class ids in [0, N_CLASSES).
N_CLASSES = 10
K_VALUES = range(1, 20)


def check_pic(data, label, idx):
    """Display row ``idx`` of ``data`` as a 16x16 image.

    The window title is the string form of ``label[idx]``. The call blocks
    in ``cv2.waitKey(0)`` until a key is pressed.
    """
    pic = data[idx, :]
    pic = pic.reshape((16, 16))
    cv2.imshow(str(label[idx]), pic)
    cv2.waitKey(0)


# Uncomment to preview a training sample:
# check_pic(x_tr, y_tr, 10)


def _ranked_labels(x):
    """Return the training labels ordered from nearest to farthest.

    Distance is the squared Euclidean distance between every row of
    ``x_tr`` and test sample ``x_te[x]``.
    """
    dist = np.sum((x_tr - x_te[x, :]) ** 2, axis=1)
    return y_tr[np.argsort(dist)]


def _majority(neighbour_labels):
    """Return the most frequent label; ties go to the smallest class id."""
    vote = np.zeros(N_CLASSES)
    for lab in neighbour_labels:
        vote[lab] += 1
    return np.argmax(vote)


def knn(x, k):
    """Classify test sample number ``x`` by majority vote of its ``k``
    nearest training samples.

    Args:
        x: Row index into ``x_te``.
        k: Number of neighbours that vote.

    Returns:
        The predicted class id.
    """
    return _majority(_ranked_labels(x)[:k])


# The neighbour ranking of a test sample does not depend on k, so it is
# computed once per sample and reused for every k instead of re-sorting all
# training distances for each (sample, k) pair.
err = {k: 0 for k in K_VALUES}
for x in range(x_te.shape[0]):
    ranked = _ranked_labels(x)
    for k in K_VALUES:
        err[k] += _majority(ranked[:k]) != y_te[x]

# One line per k: "<k> <accuracy on the test set>".
for k in K_VALUES:
    print(k, 1 - err[k] / x_te.shape[0])
UCI sonar
import numpy as np
import pandas as pd

# Load data.
# Data from http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/
path = 'sonar.all-data'
# header=None: the file as distributed by UCI has no header row, so the
# default (header=0) would consume the first sample as column names and
# silently drop it. NOTE(review): confirm your local copy has no header.
data = pd.read_csv(path, header=None).values
labels = data[:, -1]
# The label column makes ``.values`` an object array; convert the feature
# columns to a real float array for the distance computation.
data = data[:, :-1].astype(float)

train_rate = 0.7
K_VALUES = range(1, 20)

# Stratified split: shuffle each class separately and send the first
# ``train_rate`` fraction of it to the training set, the rest to the test set.
all_idx = np.arange(labels.shape[0])
train_parts = []
test_parts = []
for cls in ('R', 'M'):
    shuffled = np.random.permutation(all_idx[labels == cls])
    cut = int(train_rate * shuffled.shape[0])
    train_parts.append(shuffled[:cut])
    test_parts.append(shuffled[cut:])
train_idx = np.concatenate(train_parts)
test_idx = np.concatenate(test_parts)

x_tr = data[train_idx, :]
y_tr = labels[train_idx]
x_te = data[test_idx, :]
y_te = labels[test_idx]


def _ranked_labels(x):
    """Return the training labels ordered from nearest to farthest.

    Distance is the squared Euclidean distance between every row of
    ``x_tr`` and test sample ``x_te[x]``.
    """
    dist = np.sum((x_tr - x_te[x, :]) ** 2, axis=1)
    return y_tr[np.argsort(dist)]


def _majority(neighbour_labels):
    """Return 'M' if it has strictly more votes than 'R', otherwise 'R'."""
    vote = {'M': 0, 'R': 0}
    for lab in neighbour_labels:
        vote[lab] += 1
    return 'M' if vote['M'] > vote['R'] else 'R'


def knn(x, k):
    """Classify test sample number ``x`` by majority vote of its ``k``
    nearest training samples.

    Args:
        x: Row index into ``x_te``.
        k: Number of neighbours that vote.

    Returns:
        'M' or 'R'; a tied vote resolves to 'R'.
    """
    return _majority(_ranked_labels(x)[:k])


# The neighbour ranking of a test sample does not depend on k, so it is
# computed once per sample and reused for every k.
err = {k: 0 for k in K_VALUES}
for x in range(x_te.shape[0]):
    ranked = _ranked_labels(x)
    for k in K_VALUES:
        err[k] += _majority(ranked[:k]) != y_te[x]

# One line per k: "<k> <accuracy on the test set>".
for k in K_VALUES:
    print(k, 1 - err[k] / x_te.shape[0])
UCI iris
from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
data = iris['data']
labels = iris['target']

train_rate = 0.7
K_VALUES = range(1, 20)
# Labels index the vote array directly, so they are treated as class ids
# 0..n_classes-1 (3 classes for iris) instead of hard-coding the count.
n_classes = int(labels.max()) + 1

# Stratified split: shuffle each class separately and send the first
# ``train_rate`` fraction of it to the training set, the rest to the test set.
idxx = np.arange(labels.shape[0])
train_idx = []
test_idx = []
for i in range(n_classes):
    shuffled = np.random.permutation(idxx[labels == i])
    cut = int(train_rate * shuffled.shape[0])
    train_idx.append(shuffled[:cut])
    test_idx.append(shuffled[cut:])
train_idx = np.concatenate(tuple(train_idx))
test_idx = np.concatenate(tuple(test_idx))

x_tr = data[train_idx, :]
y_tr = labels[train_idx]
x_te = data[test_idx, :]
y_te = labels[test_idx]


def _ranked_labels(x):
    """Return the training labels ordered from nearest to farthest.

    Distance is the squared Euclidean distance between every row of
    ``x_tr`` and test sample ``x_te[x]``.
    """
    dist = np.sum((x_tr - x_te[x, :]) ** 2, axis=1)
    return y_tr[np.argsort(dist)]


def _majority(neighbour_labels):
    """Return the most frequent label; ties go to the smallest class id."""
    vote = np.zeros(n_classes)
    for lab in neighbour_labels:
        vote[lab] += 1
    return np.argmax(vote)


def knn(x, k):
    """Classify test sample number ``x`` by majority vote of its ``k``
    nearest training samples.

    Args:
        x: Row index into ``x_te``.
        k: Number of neighbours that vote.

    Returns:
        The predicted class id.
    """
    return _majority(_ranked_labels(x)[:k])


# The neighbour ranking of a test sample does not depend on k, so it is
# computed once per sample and reused for every k.
err = {k: 0 for k in K_VALUES}
for x in range(x_te.shape[0]):
    ranked = _ranked_labels(x)
    for k in K_VALUES:
        err[k] += _majority(ranked[:k]) != y_te[x]

# One line per k: "<k> <accuracy on the test set>".
for k in K_VALUES:
    print(k, 1 - err[k] / x_te.shape[0])