Neural Networks & Multi-Layer Perceptron
Definition of Neural Networks
- Neural networks are computing systems vaguely inspired by the biological neural networks that constitute animal brains
A computing system inspired by the networks of neurons that make up animal brains.
=> Does this mean they should be made to look ever more like the human brain? No.
Neural networks are already evolving in a direction different from the human brain; just as airplanes were inspired by bats and birds but then developed in their own way, neural networks do not need to keep imitating the brain.
- Neural networks are function approximators that stack affine transformations followed by nonlinear transformations
That is, a neural network is a function approximator built by stacking affine transformations followed by nonlinear transformations.
Affine transformation:
https://luv-n-interest.tistory.com/810
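An affine transformation is a linear map followed by a translation, $f(x) = W^T x + b$, which is exactly what `nn.Linear` computes. A minimal sketch (the shapes are arbitrary, chosen only for illustration):

import torch
import torch.nn as nn

x = torch.randn(4, 3)   # batch of 4 vectors in R^3
W = torch.randn(3, 2)   # linear part
b = torch.randn(2)      # translation part

manual = x @ W + b      # affine transformation: linear map + shift

lin = nn.Linear(3, 2)   # nn.Linear stores its weight as (out, in), so it computes x @ W.T + b
with torch.no_grad():
    lin.weight.copy_(W.T)
    lin.bias.copy_(b)

print(torch.allclose(manual, lin(x)))  # True: nn.Linear is exactly this affine map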
Linear Neural Networks
Data: $\mathfrak{D} = \{(x_i, y_i)\}_{i=1}^{N}$
Model: $\hat{y} = W^T x + b$
Loss: $loss = \frac{1}{N}\sum_{i=1}^{N}(y_i - \hat{y}_i)^2$
backpropagation:
$\frac{\partial loss}{\partial w} = -\frac{1}{N}\sum_{i=1}^{N}2(y_i - wx_i - b)x_i$
$\frac{\partial loss}{\partial b} = -\frac{1}{N}\sum_{i=1}^{N}2(y_i - wx_i - b)$
$w := w-\eta \frac{\partial loss}{\partial w}$
$b := b-\eta \frac{\partial loss}{\partial b}$
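A minimal NumPy sketch of these update rules for a 1-D linear model (the data below is synthetic and only for illustration):

import numpy as np

# synthetic data from y = 3x + 2 plus noise (illustrative only)
N = 100
x = np.random.rand(N)
y = 3 * x + 2 + 0.1 * np.random.randn(N)

w, b, eta = 0.0, 0.0, 0.1
for step in range(1000):
    y_hat = w * x + b
    # gradients of the MSE loss with respect to w and b
    dw = (-2 / N) * np.sum((y - y_hat) * x)
    db = (-2 / N) * np.sum(y - y_hat)
    # gradient descent updates
    w -= eta * dw
    b -= eta * db

print(w, b)  # should end up close to 3 and 2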
Multilayer Linear Neural Networks
Stacking linear layers still yields nothing but a linear function, so a nonlinear function has to be inserted in between (a small check of this claim follows the equation below).
$$\rho : \text{activation function}$$
$$y = W^T_2 h = W^T_2\rho(W^T_1 x)$$
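A small sketch checking the claim above: two stacked linear layers collapse into a single linear map, and only a nonlinearity $\rho$ between them (here ReLU) breaks this (dimensions are arbitrary):

import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.randn(5, 4)
lin_1 = nn.Linear(4, 8, bias=False)
lin_2 = nn.Linear(8, 3, bias=False)

# two stacked linear layers are equivalent to one linear layer with the composed weight
stacked = lin_2(lin_1(x))
composed = x @ (lin_2.weight @ lin_1.weight).T
print(torch.allclose(stacked, composed, atol=1e-6))  # True: still a single linear map

# inserting rho (here ReLU) between the layers makes the composition nonlinear
nonlinear = lin_2(F.relu(lin_1(x)))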
activation function (e.g. sigmoid, tanh, ReLU)
loss function
Regression Task: $MSE = \frac{1}{N}\sum_{i=1}^{N}\sum_{d=1}^{D}(y_i^{(d)} - \hat{y}_i^{(d)})^2$
Classification Task: $CE = -\frac{1}{N}\sum_{i=1}^{N}\sum_{d=1}^{D}y_i^{(d)}\log\hat{y}_i^{(d)}$
Probabilistic Task: $MLE = \frac{1}{N}\sum_{i=1}^{N}\sum_{d=1}^{D}\log\mathcal{N}(y_i^{(d)};\hat{y}_i^{(d)},1)$
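How these three losses map onto PyTorch calls (a sketch with random tensors; `nn.CrossEntropyLoss` expects raw logits and integer class labels, and the Gaussian log-likelihood with unit variance is just MSE up to a constant):

import torch
import torch.nn as nn

N, D = 8, 10
pred = torch.randn(N, D)                # model outputs (logits for classification)
target_reg = torch.randn(N, D)          # regression targets
target_cls = torch.randint(0, D, (N,))  # class indices for classification

mse = nn.MSELoss()(pred, target_reg)          # regression: mean squared error
ce = nn.CrossEntropyLoss()(pred, target_cls)  # classification: softmax + negative log-likelihood
# probabilistic view: log N(y; y_hat, 1) = -0.5*(y - y_hat)^2 + const,
# so maximizing this log-likelihood is the same as minimizing the MSE
loglik_gauss = -0.5 * ((target_reg - pred) ** 2).mean()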
PyTorch for MLP
- Writing the MLP code
import
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # render figures at higher resolution
print("PyTorch version: {}".format(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("device: {}".format(device))
Dataset
from torchvision import datasets,transforms
mnist_train = datasets.MNIST(root='./data/', train = True, transform= transforms.ToTensor(), download = True)
mnist_test = datasets.MNIST(root='./data/', train = False, transform= transforms.ToTensor(), download = True)
print("mnist_train: \n", mnist_train,"\n")
print("mnist_test: \n", mnist_test,"\n")
print("Done")
Data Iterator (DataLoader)
BATCH_SIZE = 256
train_iter = torch.utils.data.DataLoader(mnist_train,batch_size = BATCH_SIZE, shuffle = True, num_workers = 1)
test_iter = torch.utils.data.DataLoader(mnist_test, batch_size = BATCH_SIZE, shuffle = True, num_workers = 1)
print("Done")
define the MLP model
class MultiLayerPerceptronClass(nn.Module):
    '''
    MLP class
    '''
    def __init__(self, name='mlp', xdim=784, hdim=256, ydim=10):
        super().__init__()
        self.name = name
        self.xdim = xdim
        self.hdim = hdim
        self.ydim = ydim
        self.lin_1 = nn.Linear(xdim, hdim)
        self.lin_2 = nn.Linear(hdim, ydim)
        self.init_param()  # initialize parameters
    def init_param(self):
        nn.init.kaiming_normal_(self.lin_1.weight)  # He initialization
        nn.init.zeros_(self.lin_1.bias)
        nn.init.kaiming_normal_(self.lin_2.weight)
        nn.init.zeros_(self.lin_2.bias)
    def forward(self, x):
        net = x
        net = self.lin_1(net)
        net = F.relu(net)
        net = self.lin_2(net)
        return net
M = MultiLayerPerceptronClass(name = 'mlp', xdim = 784, hdim = 256, ydim = 10).to(device)
loss = nn.CrossEntropyLoss()
optm = optim.Adam(M.parameters(), lr = 1e-3)
print("Done")
check params
np.set_printoptions(precision = 3) # print NumPy arrays rounded to 3 decimal places
n_param = 0
for p_idx, (param_name, param) in enumerate(M.named_parameters()):
    param_numpy = param.detach().cpu().numpy()
    n_param += len(param_numpy.reshape(-1))
    print("{} name: {} shape: {}".format(p_idx, param_name, param_numpy.shape))
    print("    val:{}".format(param_numpy.reshape(-1)[:5]))
print("Total number of params: {}".format(n_param))
evaluation func.
def func_eval(model, data_iter, device):
    with torch.no_grad():  # no gradient computation or tracking
        model.eval()  # switch BN / Dropout to evaluation mode
        n_total, n_correct = 0, 0
        for batch_in, batch_out in data_iter:
            y_trgt = batch_out.to(device)
            model_pred = model(batch_in.view(-1, 28*28).to(device))
            _, y_pred = torch.max(model_pred.data, 1)  # max over dim 1; returns (max values, indices of the max)
            n_correct += (y_pred == y_trgt).sum().item()  # count correct predictions (sum) and take the Python number (item)
            n_total += batch_in.size(0)  # total number of samples seen
        val_accr = (n_correct / n_total)
        model.train()  # back to train mode
    return val_accr
print("Done")
initial Evaluation
M.init_param()
train_accr = func_eval(M,train_iter, device)
test_accr = func_eval(M,test_iter, device)
print("train accr : {}, test accr: {}".format(train_accr, test_accr))
Train
print("start training")
M.init_param()  # re-initialize the weights
M.train()
EPOCHS, print_every = 10, 1
for epoch in range(EPOCHS):
    loss_val_sum = 0
    for batch_in, batch_out in train_iter:
        y_pred = M.forward(batch_in.view(-1, 28*28).to(device))
        loss_out = loss(y_pred, batch_out.to(device))  # CE; a tensor with grad_fn, e.g. tensor(0.3388, grad_fn=<NllLossBackward0>)
        optm.zero_grad()     # reset gradients
        loss_out.backward()  # backpropagate
        optm.step()          # update parameters
        loss_val_sum += loss_out
    loss_val_avg = loss_val_sum / len(train_iter)
    if ((epoch % print_every == 0) or (epoch == EPOCHS - 1)):
        train_accr = func_eval(M, train_iter, device)
        test_accr = func_eval(M, test_iter, device)
        print("epoch:{}, loss:{}, train_accr:{}, test_accr:{}".format(epoch, loss_val_avg, train_accr, test_accr))
print("Done")
Test
n_sample = 25
sample_indices = np.random.choice(len(mnist_test.targets),n_sample,replace = True)
test_x = mnist_test.data[sample_indices]
test_y = mnist_test.targets[sample_indices]
with torch.no_grad():
    y_pred = M.forward(test_x.view(-1, 28*28).type(torch.float).to(device))
    y_pred = y_pred.argmax(axis=1)
plt.figure(figsize=(10, 10))
for idx in range(n_sample):
    plt.subplot(5, 5, idx + 1)
    plt.imshow(test_x[idx], cmap="gray")
    plt.axis("off")
    plt.title("Pred:{}, Label:{}".format(y_pred[idx], test_y[idx]))
plt.show()
print("Done")