AlexNet 구현 -PyTorch

Naver boostcamp -ai tech/week 04,05

AlexNet 구현 -PyTorch

끵뀐꿩긘 2022. 10. 12. 07:51

AlexNet이란?

2012년 ILSVRC 우승모델로 top-5 error rate 17%를 기록한 CV에서 딥러닝의 부활을 알린 모델이다.

이 대회 이후로 ILSVRC 에서는 딥러닝을 이용한 모델들이 주를 이루게 된다.

AlexNet의 구조

5개의 Conv layer와 3개의 fc layer로 이루어진 모델이다.

저 당시에는 GPU 메모리가 부족하여 학습을 하나의 gpu로 실행하지 못하고 2개의 gpu에 나누어 실행하여 중간 층들이 2개이다.

특징으로는,

- non-saturating인 ReLU 함수를 사용하여 학습의 속도를 높이고, 역전파가 잘 이루어지도록 하였다

- dropout을 사용하여 과적합을 막고 앙상블 효과로 성능을 향상하였다

- LRN(local response normalization)을 사용하여 뉴런의 출력값을 보다 경쟁적으로 만들었다. (현재는 BN에 대체됨)

AlexNet의 구현

MNIST 데이터셋을 사용하였다.

import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch 
import torchvision 
import torch.nn as nn 
import torch.optim as optim 
import torch.nn.functional as F 
from PIL import Image
from torchvision import transforms, datasets
from torch.utils.data import Dataset, DataLoader
import torch.nn.init as init

# 기본 설정
epochs = 10
batch_size = 64
device = ("cuda" if torch.cuda.is_available() else "cpu")
class_names = [0,1,2,3,4,5,6,7,8,9]

print(torch.__version__)
print(device)

>>>
1.12.1+cu113
cuda

# transform & 데이터 셋
transform = transforms.Compose([
	# AlexNet의 초기 입력에 맞추어 227*227사이즈의 이미지로 바꾸어주었다
    transforms.Resize(227), 
    # grayscale인 MNIST데이터를 강제로 3채널로 맞추어주었다
    transforms.Grayscale(num_output_channels=3),
    # 이미지를 텐서로 정규화 효과있음
    transforms.ToTensor()
])

# 학습 데이터
train_data = datasets.MNIST(
    root = "./data",
    train = True,
    download = True,
    transform = transform
)

# 테스트 데이터
test_data = datasets.MNIST(
    root = "./data",
    train = False,
    download = True,
    transform = transform
)

# 데이터 로더
train_dataloader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
test_dataloader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

AlexNet 모델

# AlexNet -LRN 구현 x
class alexnet(nn.Module):
  def __init__(self):
    super().__init__()
    self.conv1 = nn.Sequential(
        nn.Conv2d(in_channels = 3,
                  out_channels = 96,
                  kernel_size = 11,
                  stride = 4,
                  padding = 0
                  ), 
        # input size : (3,227,227) # output size: (96,55,55)  # kernel size = (96,11,11)
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 3, stride = 2) # 55 -> (55 -(3 -1)-1)/2 +1 = 27 
    )

    self.conv2 = nn.Sequential(
        nn.Conv2d(in_channels = 96,
                  out_channels = 256,
                  kernel_size = 5,
                  stride = 1,
                  padding = 2
                  ), 
        # input size : (96,27,27) # output size: (256,27,27)  # kernel size = (256,5,5)
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 3, stride = 2) # 27 -> (27 -(3 -1)-1)/2 +1 = 13
    )

    self.conv3 = nn.Sequential(
        nn.Conv2d(in_channels = 256,
                  out_channels = 384,
                  kernel_size = 3,
                  stride = 1,
                  padding = 1
                  ), 
        # input size : (256,13,13) # output size: (384,13,13)  # kernel size = (384,3,3)
        nn.ReLU()
    )
    self.conv4 = nn.Sequential(
        nn.Conv2d(in_channels = 384,
                  out_channels = 384,
                  kernel_size = 3,
                  stride = 1,
                  padding = 1
                  ), 
        # input size : (384,13,13) # output size: (384,13,13)  # kernel size = (384,3,3)
        nn.ReLU()
    )

    self.conv5 = nn.Sequential(
        nn.Conv2d(in_channels = 384,
                  out_channels = 256,
                  kernel_size = 3,
                  stride = 1,
                  padding = 1
                  ), 
        # input size : (384,13,13) # output size: (256,13,13)  # kernel size = (256,3,3)
        nn.ReLU(),
        nn.MaxPool2d(3,2) # 13 -> (13 -(3 -1)-1)/2 +1 = 6
    )

    self.fc1 = nn.Sequential(
        nn.Linear(256*6*6, 4096),
        nn.ReLU(),
        nn.Dropout(p = 0.5) # 드롭아웃
    )
    self.fc2 = nn.Sequential(
        nn.Linear(4096, 4096),
        nn.ReLU(),
        nn.Dropout(p = 0.5) # 드롭아웃
    )
    self.fc3 = nn.Linear(4096, 10)
  
  def forward(self,x):
    out = self.conv1(x)
    out = self.conv2(out)
    out = self.conv3(out)
    out = self.conv4(out)
    out = self.conv5(out)
    out = out.view(out.size(0), -1) # flatten
    out = self.fc1(out)
    out = self.fc2(out)
    out = self.fc3(out)
    out = F.log_softmax(out,dim=1) 
    # softmax까지 적용해줌 -> loss function으로 nll_loss를 사용하여야함

    return out
 
  # 가중치 초기화 - ReLU를 사용하므로 he 초기화 실행
  def init_weights(self):
        for m in self.modules():
            if isinstance(m,nn.Conv2d): # init conv
                nn.init.kaiming_normal_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m,nn.Linear): # lnit dense
                nn.init.kaiming_normal_(m.weight)
                nn.init.zeros_(m.bias)

model = alexnet().to(device) # 모델 생성
criterion = F.nll_loss # 모델에 softmax처리가 없는 경우 torch.nn.CrossEntropyLoss 사용
optimizer = optim.Adam(model.parameters())
model

>>>
alexnet(
  (conv1): Sequential(
    (0): Conv2d(3, 96, kernel_size=(11, 11), stride=(4, 4))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (conv4): Sequential(
    (0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (conv5): Sequential(
    (0): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=9216, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
  )
  (fc2): Sequential(
    (0): Linear(in_features=4096, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
  )
  (fc3): Linear(in_features=4096, out_features=10, bias=True)
)

# torch summary
from torchsummary import summary as summary_

summary_(model,(3,227,227), batch_size)

>>>
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1           [64, 96, 55, 55]          34,944
              ReLU-2           [64, 96, 55, 55]               0
         MaxPool2d-3           [64, 96, 27, 27]               0
            Conv2d-4          [64, 256, 27, 27]         614,656
              ReLU-5          [64, 256, 27, 27]               0
         MaxPool2d-6          [64, 256, 13, 13]               0
            Conv2d-7          [64, 384, 13, 13]         885,120
              ReLU-8          [64, 384, 13, 13]               0
            Conv2d-9          [64, 384, 13, 13]       1,327,488
             ReLU-10          [64, 384, 13, 13]               0
           Conv2d-11          [64, 256, 13, 13]         884,992
             ReLU-12          [64, 256, 13, 13]               0
        MaxPool2d-13            [64, 256, 6, 6]               0
           Linear-14                 [64, 4096]      37,752,832
             ReLU-15                 [64, 4096]               0
          Dropout-16                 [64, 4096]               0
           Linear-17                 [64, 4096]      16,781,312
             ReLU-18                 [64, 4096]               0
          Dropout-19                 [64, 4096]               0
           Linear-20                   [64, 10]          40,970
================================================================
Total params: 58,322,314
Trainable params: 58,322,314
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 37.74
Forward/backward pass size (MB): 706.65
Params size (MB): 222.48
Estimated Total Size (MB): 966.87
----------------------------------------------------------------

# train_accr, test_acrr 평가 함수
def func_eval(model,data_iter,device):
    with torch.no_grad(): # 가중치 업데이트 중지
        n_total,n_correct = 0,0
        model.eval() # dropout과 BN 중지
        for batch_in,batch_out in data_iter: # 모든 배치에 대해서 
            y_trgt = batch_out.to(device)
            model_pred = model(batch_in.to(device))
            _,y_pred = torch.max(model_pred.data,1) # torch.max() => (최대값, 최대값의 인덱스)
            n_correct += (y_pred==y_trgt).sum().item()
            n_total += batch_in.size(0)
        val_accr = (n_correct/n_total)
        model.train() # back to train mode 
    return val_accr

# 학습 & 평가
print ("Start training.")
model.init_weights() # 파라미터 초기화
model.train() # to train mode 
print_every = 1
for epoch in range(epochs):
    loss_val_sum = 0
    for batch_in,batch_out in train_dataloader:
        # Forward path
        y_pred = model.forward(batch_in.to(device))
        loss_out = criterion(y_pred,batch_out.to(device))
        # Update
        optimizer.zero_grad()      # reset gradient 
        loss_out.backward()      # backpropagate
        optimizer.step()      # optimizer update
        loss_val_sum += loss_out
    loss_val_avg = loss_val_sum/len(train_dataloader)
    # Print
    if ((epoch%print_every)==0) or (epoch==(epochs-1)):
        train_accr = func_eval(model,train_dataloader,device)
        test_accr = func_eval(model,test_dataloader,device)
        print ("epoch:[%d] loss:[%.3f] train_accr:[%.3f] test_accr:[%.3f]."%
               (epoch,loss_val_avg,train_accr,test_accr))
print ("Done")

>>>
Start training.
epoch:[0] loss:[0.240] train_accr:[0.975] test_accr:[0.978].
epoch:[1] loss:[0.076] train_accr:[0.981] test_accr:[0.980].
epoch:[2] loss:[0.063] train_accr:[0.982] test_accr:[0.981].
epoch:[3] loss:[0.059] train_accr:[0.988] test_accr:[0.985].
epoch:[4] loss:[0.052] train_accr:[0.990] test_accr:[0.989].
epoch:[5] loss:[0.047] train_accr:[0.990] test_accr:[0.988].
epoch:[6] loss:[0.045] train_accr:[0.994] test_accr:[0.990].
epoch:[7] loss:[0.046] train_accr:[0.995] test_accr:[0.991].
epoch:[8] loss:[0.046] train_accr:[0.992] test_accr:[0.988].
epoch:[9] loss:[0.037] train_accr:[0.994] test_accr:[0.990].
Done

# 예측 모델 시각화
test = iter(test_dataloader)
test_x, test_y = next(test)
with torch.no_grad():
  model.eval() # to evaluation mode 
  y_pred = model.forward(test_x.to(device))
y_pred = y_pred.argmax(axis=1)
plt.figure(figsize=(30,30))
for idx in range(batch_size):
    plt.subplot(8, 8, idx+1)
    plt.imshow(test_x[idx].permute(1,2,0), cmap='gray')
    plt.axis('off')
    plt.title("Pred:%d, Label:%d"%(y_pred[idx],test_y[idx]))
plt.show()    
print ("Done")

학습 자체는 오래걸리지 않은 것 같은데, 아마 평가 과정에서 train과 test 데이터셋을 모두 보고 평가하여 40분이라는 오랜 시간이 걸린듯하다..