cs231n Assignment 2: Q4 (CNN, Group Normalization 구현)

cs231n

cs231n Assignment 2: Q4 (CNN, Group Normalization 구현)

츄츄츄츄츄츄츄 2023. 2. 18. 00:29

내 풀이 링크:https://github.com/lionkingchuchu/cs231n.git

GitHub - lionkingchuchu/cs231n: cs231n Spring 2022 Assignment

cs231n Spring 2022 Assignment. Contribute to lionkingchuchu/cs231n development by creating an account on GitHub.

github.com

이번에는 Convolutional layer를 구현하는 문제이다. 지금까지 이미지 분류를 할 때에는 각 픽셀을 일자로 늘어뜨려서 픽셀의 각 값에 weight를 곱하고 bias를 더한 output을 배출하는 layer를 여러 개 이어붙여 신경망을 만들었다. 그러나 이 방식은 사진 데이터가 갖고 있는 특성인 모양 (가로, 세로) 을 전혀 고려하지 않은 분류 방식이다.

Convolution은 이러한 사진의 특성을 분류에 반영하기 위해 임의의 사각형 모양의 weight를 가진 필터를 사진 이미지 위에서 옮겨 가며 곱하고, bias를 더해 2차원 (H, W) 형식의 output을 만들어 낸다. 사진 이미지는 RGB 채널 3개가 있으므로 필터도 채널마다 하나씩, 3개가 한 세트로 존재한다 (3, H, W). 한 세트의 출력은 3개 채널에서 나온 결과를 모두 더한 것이므로 output의 차원은 2차원이다 (H, W). 그리고 이러한 필터 세트를 한 layer에 N개씩 만들어, 총 3차원 (N, H, W) 의 output을 만들어 내 사진의 가로, 세로 특성을 유지한 채로 신경망을 깊게 만들 수 있다.

def conv_forward_naive(x, w, b, conv_param):
    """Naive (loop-based) forward pass of a convolutional layer.

    Args:
        x: Input array indexed as (sample, channel, row, col).
        w: Filter array indexed as (filter, channel, row, col).
        b: Bias array indexed by filter.
        conv_param: Dict with the keys 'pad' (zero padding added on each
            side of both spatial axes) and 'stride' (step between windows).

    Returns:
        Tuple (out, cache): out is indexed as (sample, filter, row, col) and
        cache is (x, w, b, conv_param) for the backward pass.
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    pad = conv_param['pad']
    stride = conv_param['stride']

    num_samples, num_filters = x.shape[0], w.shape[0]
    k_h, k_w = w.shape[2], w.shape[3]
    out_h = 1 + (x.shape[2] + 2 * pad - k_h) // stride
    out_w = 1 + (x.shape[3] + 2 * pad - k_w) // stride

    # Zero-pad only the two spatial axes.
    padded = np.pad(x, [(0, 0), (0, 0), (pad, pad), (pad, pad)])
    out = np.zeros([num_samples, num_filters, out_h, out_w])

    for row in range(out_h):
        top = row * stride
        for col in range(out_w):
            left = col * stride
            for n in range(num_samples):
                # Receptive field across all channels for this output pixel.
                window = padded[n, :, top:top + k_h, left:left + k_w]
                for f in range(num_filters):
                    out[n, f, row, col] = np.sum(window * w[f, :, :, :]) + b[f]

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    cache = (x, w, b, conv_param)
    return out, cache

루프를 사용해 naive한 방법으로 convolution layer를 구현해 보았다. 정직하게 필터를 가로부터 세로까지 input 위에서 옮겨 가며 out값에 반영해 계산했다. 이 과제에서는 지금까지의 다른 layer 구현과는 다르게 convolution layer는 별도의 빠른 방법을 구현하지 않고 naive한 방법만 구현하게 한다. 아마 빠른 방법은 구현하기 어려운가 보다.

from imageio import imread
from PIL import Image

# Load the two demo images from the notebook image folder.
kitten = imread('cs231n/notebook_images/kitten.jpg')
puppy = imread('cs231n/notebook_images/puppy.jpg')
# kitten is wide, and puppy is already square
# d is the width/height difference; trimming d columns in total (split
# between the left and right edges) makes the kitten image square.
d = kitten.shape[1] - kitten.shape[0]
kitten_cropped = kitten[:, d//2:-d//2, :]

img_size = 200   # Make this smaller if it runs too slow
resized_puppy = np.array(Image.fromarray(puppy).resize((img_size, img_size)))
resized_kitten = np.array(Image.fromarray(kitten_cropped).resize((img_size, img_size)))
# Batch of two images; the transpose moves the last (channel) axis to the front
# so each image fits the (3, img_size, img_size) slot.
x = np.zeros((2, 3, img_size, img_size))
x[0, :, :, :] = resized_puppy.transpose((2, 0, 1))
x[1, :, :, :] = resized_kitten.transpose((2, 0, 1))

# Set up a convolutional weights holding 2 filters, each 3x3
w = np.zeros((2, 3, 3, 3))

# The first filter converts the image to grayscale.
# Set up the red, green, and blue channels of the filter.
# Only the centre tap is non-zero, so each output pixel is a weighted sum
# (0.3, 0.6, 0.1) of the three channel values at the same location.
w[0, 0, :, :] = [[0, 0, 0], [0, 0.3, 0], [0, 0, 0]]
w[0, 1, :, :] = [[0, 0, 0], [0, 0.6, 0], [0, 0, 0]]
w[0, 2, :, :] = [[0, 0, 0], [0, 0.1, 0], [0, 0, 0]]

# Second filter detects horizontal edges in the blue channel.
# The top and bottom rows have opposite signs, so a uniform 3x3 region sums to 0.
w[1, 2, :, :] = [[1, 2, 1], [0, 0, 0], [-1, -2, -1]]

# Vector of biases. We don't need any bias for the grayscale
# filter, but for the edge detection filter we want to add 128
# to each output so that nothing is negative.
b = np.array([0, 128])

# Compute the result of convolving each input in x with each filter in w,
# offsetting by b, and storing the results in out.
out, _ = conv_forward_naive(x, w, b, {'stride': 1, 'pad': 1})

def imshow_no_ax(img, normalize=True):
    """Show an image as uint8 with the axes hidden.

    Args:
        img: Array-like image data passed on to ``plt.imshow``.
        normalize: When True, linearly rescale the values so the minimum maps
            to 0 and the maximum to 255 before the uint8 cast.
    """
    if normalize:
        img_max, img_min = np.max(img), np.min(img)
        span = img_max - img_min
        if span > 0:
            img = 255.0 * (img - img_min) / span
        else:
            # A constant image has zero range; avoid the 0/0 division (NaN)
            # and render it as all zeros instead.
            img = np.zeros(np.shape(img))
    plt.imshow(img.astype('uint8'))
    plt.gca().axis('off')

# Show the original images and the results of the conv operation
# Layout is a 2x3 grid: row 1 is the puppy, row 2 the kitten; the columns are
# the original image, filter 0 (grayscale) and filter 1 (edges).
plt.subplot(2, 3, 1)
imshow_no_ax(puppy, normalize=False)
plt.title('Original image')
plt.subplot(2, 3, 2)
imshow_no_ax(out[0, 0])
plt.title('Grayscale')
plt.subplot(2, 3, 3)
imshow_no_ax(out[0, 1])
plt.title('Edges')
plt.subplot(2, 3, 4)
imshow_no_ax(kitten_cropped, normalize=False)
plt.subplot(2, 3, 5)
imshow_no_ax(out[1, 0])
plt.subplot(2, 3, 6)
imshow_no_ax(out[1, 1])
plt.show()

만든 convolution layer로 기존 이미지에서 grayscale 이미지 만드는 필터, horizontal edge 찾는 필터를 사용해 결과 이미지를 생성해 본다. 특이한 점으로는 grayscale 이미지 필터는 중앙값에만 R: 0.3, G: 0.6, B: 0.1 필터를 주어 구현했다. grayscale 이미지 만들때에는 단순히 RGB값을 더할 줄 알았는데 특정 비율로 더하는건가? 원래 그런지 모르겠지만 명도(밝기) 조절을 위해 한 것 같다. (실제로 사람 눈은 초록색에 가장 민감하고 파란색에 가장 둔감해서, 밝기를 구할 때 0.299R + 0.587G + 0.114B 와 같은 가중합을 쓰는 것이 일반적이다.)

그리고 두번째 horizontal edge찾는 필터는 [[1, 2, 1], [0, 0, 0], [-1, -2, -1]] 이런 방식으로 구현했다. 이렇게 하면 만약 해당 3x3 지역이 모두 같은 값 (a) 이라면 1a + 2a + 1a + -a + -2a + -1a = 0 상쇄되어 0이 될 것이다. 만약 3x3지역의 위 3 픽셀, 아래 3픽셀이 다르다면 상쇄되지 않고 차이가 크면 클 수록 큰 값이 출력 될 것이다. 이런 식으로 edge를 찾을 수 있구나 했다.

잘 된 것을 보니 실제로 신경망을 학습시키면 각 layer가 필터의 가중치들을 이리저리 좋은 방향으로 바꿔 가며 edge를 찾는 layer도 생길 것이고, grayscale 값을 보는 layer도 생길 것이고, 우리가 생각할 수 없는 다른 컴퓨터만의 방법으로 feature를 찾는 layer들도 생길 것이다.

def conv_backward_naive(dout, cache):
    """Naive (loop-based) backward pass of a convolutional layer.

    Args:
        dout: Upstream gradient indexed as (sample, filter, row, col).
        cache: Tuple (x, w, b, conv_param) as returned by conv_forward_naive.

    Returns:
        Tuple (dx, dw, db): gradients with the same shapes as x, w and b.
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    x, w, b, conv_param = cache
    dout = np.asarray(dout)

    pad = conv_param['pad']
    stride = conv_param['stride']

    N, _, H, W = x.shape
    F, _, HH, WW = w.shape
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride

    x_pad = np.pad(x, [(0, 0), (0, 0), (pad, pad), (pad, pad)])

    # Allocate in a dtype that can hold the upstream gradient; zeros_like on
    # integer inputs would silently truncate float gradients.
    dw = np.zeros(w.shape, dtype=np.result_type(w.dtype, dout.dtype))
    dx_pad = np.zeros(x_pad.shape, dtype=np.result_type(x.dtype, dout.dtype))
    db = np.zeros(b.shape, dtype=np.result_type(b.dtype, dout.dtype))

    for n in range(N):
        for f in range(F):
            for i in range(H_out):
                for j in range(W_out):
                    h_start = i * stride
                    w_start = j * stride
                    g = dout[n, f, i, j]
                    db[f] += g
                    dw[f, :, :, :] += g * x_pad[n, :, h_start:h_start + HH, w_start:w_start + WW]
                    dx_pad[n, :, h_start:h_start + HH, w_start:w_start + WW] += g * w[f, :, :, :]

    # Strip the padding with explicit end indices: pad:-pad collapses to an
    # empty slice (0:-0) when pad == 0.
    dx = dx_pad[:, :, pad:pad + H, pad:pad + W]

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return dx, dw, db

backward pass도 layer에 어려운 수식이 들어간 것이 아니라 기존 wx + b 논리만 들어갔기 때문에 루프를 잘 활용하고, array의 차원과 범위를 잘 생각하면 쉽게 구현할 수 있다.

다음으로 pooling layer 구현이 있는데 보통 max pooling을 사용한다. max pooling은 해당 지역 안의 최대값만을 반영해 기존의 (H, W)의 사진 데이터 차원을 downsampling 할 수 있게 한다.

def max_pool_forward_naive(x, pool_param):
    """Naive (loop-based) forward pass of a max-pooling layer.

    Args:
        x: Input array indexed as (sample, channel, row, col).
        pool_param: Dict with the keys 'pool_height', 'pool_width' and
            'stride'.

    Returns:
        Tuple (out, cache): out holds the maximum of every pooling window and
        cache is (x, pool_param) for the backward pass.
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    win_h = pool_param['pool_height']
    win_w = pool_param['pool_width']
    step = pool_param['stride']

    out_h = 1 + (x.shape[2] - win_h) // step
    out_w = 1 + (x.shape[3] - win_w) // step

    num_samples, num_channels = x.shape[0], x.shape[1]
    out = np.zeros([num_samples, num_channels, out_h, out_w])

    for row in range(out_h):
        top = row * step
        for col in range(out_w):
            left = col * step
            for n in range(num_samples):
                for c in range(num_channels):
                    region = x[n, c, top:top + win_h, left:left + win_w]
                    out[n, c, row, col] = np.max(region)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    cache = (x, pool_param)
    return out, cache

def max_pool_backward_naive(dout, cache):
    """Naive (loop-based) backward pass of a max-pooling layer.

    Each upstream gradient value is routed to the position of the maximum
    inside its pooling window (the first maximum, as reported by np.argmax).

    Args:
        dout: Upstream gradient indexed as (sample, channel, row, col).
        cache: Tuple (x, pool_param) as returned by max_pool_forward_naive.

    Returns:
        dx: Gradient with respect to x, same shape as x.
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    x, pool_param = cache
    ph = pool_param['pool_height']
    pw = pool_param['pool_width']
    stride = pool_param['stride']

    H_out = 1 + (x.shape[2] - ph) // stride
    W_out = 1 + (x.shape[3] - pw) // stride

    N = x.shape[0]
    C = x.shape[1]
    dx = np.zeros_like(x)

    for n in range(N):
        for c in range(C):
            for i in range(H_out):
                for j in range(W_out):
                    h_start = i * stride
                    w_start = j * stride
                    window = x[n, c, h_start:h_start + ph, w_start:w_start + pw]
                    # argmax is a flat index into the (ph, pw) window, so the
                    # row offset is idx // pw (not idx // ph); unravel_index
                    # does this conversion for any window shape.
                    r, s = np.unravel_index(np.argmax(window), window.shape)
                    dx[n, c, h_start + r, w_start + s] += dout[n][c][i][j]

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return dx

max pooling forward, backward 구현은 아까 convolution처럼 루프를 사용하고, 특정 값만 반영한다는 점에서 ReLU함수와 비슷하게 backward 과정을 하면 된다. ReLU는 0보다 큰 값만 반영하고, max pooling은 최대 값만 반영하는 방식이므로 비슷하게 최대값의 위치에만 dout을 더해주는 방식으로 구현하면 된다.

다음으로 이미 구현된 fast convolution layer과 내가 구현한 naive convolution layer 속도 비교가 있다. fast / naive pooling layer은 40배 정도 빨랐다.

다음으로 실제 CNN 신경망을 구현하는 문제이다. 기존 affine layer처럼 weight, bias가 있다. convolution layer만의 다른 점은 처음 설명한 것처럼 사진의 가로, 세로의 필터와 채널개수를 반영해야 하므로 weight이 4차원 모양이다. (필터개수, 3(RGB), H, W).

전체 신경망 형태는 아래와 같다.

conv - relu - 2x2 max pool - affine - relu - affine - softmax

loss() 함수는 기존의 해왔던 것처럼 layer를 지나가며 조합만 잘 해가면 된다.

만든 CNN 신경망이 잘 작동하는지 확인하기 위해 train data를 overfit 시켜보고, 이에 대한 loss와 accuracy 그래프이다.

train acc 가 거의 1.0으로 잘 작동하는 것을 확인할 수 있다.

# Build the conv net with a small weight scale and light L2 regularization.
model = ThreeLayerConvNet(weight_scale=0.001, hidden_dim=500, reg=0.001)

# Train for a single epoch with Adam, printing progress every 20 iterations.
solver = Solver(
    model,
    data,
    num_epochs=1,
    batch_size=50,
    update_rule='adam',
    optim_config={'learning_rate': 1e-3,},
    verbose=True,
    print_every=20
)
solver.train()

(Epoch 1 / 1) train acc: 0.482000; val_acc: 0.488000

다음으로 실제 CIFAR-10 데이터를 train 해본다. epoch 1에 위와 같은 acc를 얻을 수 있었다. 아래는 각 필터의 가중치들을 시각화 해 본 결과이다. 우리는 알수 없지만 컴퓨터만의 방식으로 스스로의 weight를 만들어 낸 것 같다.

다음은 spatial batch normalization 구현이다. spatial batch normalization은 전에 배운 batch normalization 을 똑같이 하는데, 이번에는 convolution layer의 데이터 차원인 (N, C, H, W) 차원에서 하는 것이다. 4가지 차원 중 어느 축을 기준으로 normalize 하는지가 문제인데, 채널 C 기준으로 한다. 즉 각 채널(필터)마다 배치 전체와 모든 공간 위치 (N, H, W) 에 걸쳐 평균과 분산을 구한다. 같은 필터가 만든 feature map은 어떤 데이터의 어떤 위치에서 나왔든 같은 분포를 가져야 하므로, 이렇게 해야 한 필터 (C) 에 의한 출력값의 평균과 분산을 일정하게 유지할 수 있기 때문이다.

def spatial_batchnorm_forward(x, gamma, beta, bn_param):
    """Forward pass of spatial batch normalization.

    Reuses batchnorm_forward by presenting every (sample, row, col) position
    as one row of a 2-D matrix whose columns are the channels.

    Args:
        x: Input array of shape (N, C, H, W).
        gamma: Scale parameter, forwarded to batchnorm_forward.
        beta: Shift parameter, forwarded to batchnorm_forward.
        bn_param: Options dict, forwarded to batchnorm_forward.

    Returns:
        Tuple (out, cache): out has the same shape as x; cache is whatever
        batchnorm_forward returned.
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    N, C, H, W = x.shape
    # Move the channel axis last BEFORE flattening. Reshaping (N, C, H, W)
    # straight to (-1, C) would interleave values from different channels in
    # each column, so the statistics would not be per-channel.
    x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)
    # NOTE(review): assumes batchnorm_forward normalizes each column of a
    # 2-D input independently - confirm against its definition.
    out_flat, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
    # Undo the flattening and put the channel axis back in position 1.
    out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return out, cache

기존 batch norm 함수를 x를 잠깐 flatten 해 적용하고, 다시 기존의 차원 모양으로 reshape 해주는 방식으로 쉽게 구현할 수 있다. backward도 같은 원리를 사용하면 된다.

다음은 spatial group normalization인데, 좀 복잡하다. 각 채널 C마다 따로 normalize 하지 않고, 전체 채널을 G개의 그룹으로 나누어 각 채널 그룹 단위로 normalize 한다. 만약 채널이 4개고 G가 2라면, (0,1) 채널끼리 normalize, (2,3) 채널끼리 normalize 하는 방식이다.

위 방식을 사용하는 이유는 기존의 Fully Connected nets은 batch norm 과 비슷한 layer norm을 사용하면 각 hidden dims 만큼의 뉴런이 비슷하게 활성화 되어 normalize 하는 방법이 어느정도 효과가 있었다. 반면에 convolution의 각 hidden dims는 뉴런의 활성화 정도가 비슷하지 않고 특정 뉴런만 활성화 된다고 한다.

생각해 보면 각 사진은 피사체를 중심에 두고, 배경을 가장자리에 두기에 사진 분류의 초점은 피사체 중심인 사진의 중심부를 중요하게 생각 할 것이고, 이에 따라 중앙 지역을 보는 뉴런들이 더 활성화 될 것이다. 그래서 hidden dims 전체를 normalize 하기 보다, 채널을 묶어서 normalize 해보았는데 효과가 좋았다고 한다.

논문: [1803.08494] Group Normalization (arxiv.org)

Group Normalization

Batch Normalization (BN) is a milestone technique in the development of deep learning, enabling various networks to train. However, normalizing along the batch dimension introduces problems --- BN's error increases rapidly when the batch size becomes small

arxiv.org

def spatial_groupnorm_forward(x, gamma, beta, G, gn_param):
    """Forward pass of spatial group normalization.

    The C channels are split into G consecutive groups, and every
    (sample, group) slice is normalized with its own mean and variance.

    Args:
        x: Input array of shape (N, C, H, W).
        gamma: Scale, multiplied with the normalized (N, C, H, W) array.
        beta: Shift, added after scaling.
        G: Number of channel groups; the reshape requires C to be a
            multiple of G.
        gn_param: Dict; the optional key "eps" (default 1e-5) is the constant
            added to the variance.

    Returns:
        Tuple (out, cache): out has shape (N, C, H, W); cache is
        (x reshaped to (N, G, C // G, H, W), mean, var, gamma, G).
    """
    eps = gn_param.get("eps", 1e-5)

    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    N, C, H, W = x.shape
    grouped = np.reshape(x, [N, G, C // G, H, W])

    # Statistics per (sample, group); keepdims gives shape (N, G, 1, 1, 1).
    group_axes = (2, 3, 4)
    mean = np.mean(grouped, axis=group_axes, keepdims=True)
    var = np.var(grouped, axis=group_axes, keepdims=True)

    centered = grouped - mean
    normalized = np.reshape(centered / np.sqrt(var + eps), [N, C, H, W])
    out = normalized * gamma + beta
    cache = (grouped, mean, var, gamma, G)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return out, cache

구현은 먼저 기존의 (N, C, H, W) 를 (N, G, C', H, W) 로 나눈다. G번째 그룹의 C' 번째 채널 형식으로 저장 될 것이다. group을 기점으로 normalize 하기 위해 axis (C', H, W) 만큼의 mean, variance를 구하고 normalize 해 준다.

def spatial_groupnorm_backward(dout, cache):
    """Backward pass of spatial group normalization.

    Args:
        dout: Upstream gradient of shape (N, C, H, W).
        cache: Tuple (x, mean, var, gamma, G) from spatial_groupnorm_forward,
            where x is already reshaped to (N, G, C // G, H, W) and mean/var
            were computed over axes (2, 3, 4) with keepdims=True.

    Returns:
        Tuple (dx, dgamma, dbeta): dx has the shape of dout; dgamma and dbeta
        are summed over axes (0, 2, 3) with keepdims=True.
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # The cache does not carry eps, so this must match the forward default.
    eps = 1e-5
    x, mean, var, gamma, G = cache  # x: (N, G, C//G, H, W); mean, var: (N, G, 1, 1, 1)
    M = x.shape[2] * x.shape[3] * x.shape[4]  # elements per (sample, group)
    centered = x - mean
    inv_std = 1 / np.sqrt(var + eps)
    x_normal = centered * inv_std

    dgamma = np.sum(dout * np.reshape(x_normal, dout.shape), axis=(0, 2, 3), keepdims=True)
    dbeta = np.sum(dout, axis=(0, 2, 3), keepdims=True)
    # Gradient w.r.t. the normalized activations, viewed per group.
    dx_normal = np.reshape(dout * gamma, x.shape)

    dlvar = np.sum(dx_normal * centered * -0.5 * (var + eps) ** -1.5,
                   axis=(2, 3, 4), keepdims=True)

    # Parenthesized so the second term is really part of the sum; as a bare
    # line starting with "+" it was a separate, discarded expression.
    dlmean = (
        np.sum(dx_normal * -inv_std, axis=(2, 3, 4), keepdims=True)
        + dlvar * np.sum(-2 * centered, axis=(2, 3, 4), keepdims=True) / M
    )

    dx = dx_normal * inv_std + dlvar * 2 * centered / M + dlmean / M
    dx = np.reshape(dx, dout.shape)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return dx, dgamma, dbeta

backward 구현이 참 어려웠는데 전에 했던 layer normalization 에서 기존 batch normalization에서 어떤 axis를 어떻게 바꾸고 dimension을 어떻게 조정 하고 했는지를 잘 살펴보면서 똑같이 axis를 2, 3, 4에 대입하여 시도하면 겨우겨우 구현 할 수 있다.

'cs231n' 카테고리의 다른 글

cs231n Assignment 2: Q6 (Saliency map, Fooling images, Class visualization 구현) (0)	2023.02.21
cs231n Assignment 2: Q5 (Pytorch 사용해보기) (0)	2023.02.20
cs231n Assignment 2: Q3 (Dropout 구현) (0)	2023.02.15
cs231n Assignment 2: Q2 (Batch Normalization, Layer Normalization 구현) (0)	2023.02.14
cs231n Assignment 2: Q1 (Fully Connected Network 구현) (0)	2023.02.14

현재글cs231n Assignment 2: Q4 (CNN, Group Normalization 구현)

King of the Jungle Lion

self supervised learning, 세그먼트 트리, BFS, saliency map, cs231n, 스프라그-그런디, DFS, 최대 유량, fooling image, 세그먼트트리, 볼록 껍질, 파이썬, class visualization, Strongly Connected Components, 트라이, 최대유량 최소컷, Group Normalization, dp, 백준, SimCLR,

Today :
Yesterday :

King of the Jungle Lion