Fine-Tuning an Object Detection Model (Using Object Tracking Results)

Introduction

In my previous post I wrote about object tracking.
touch-sp.hatenablog.com
This time I tested whether those tracking results can be used as training data for an object detection model.
I shot a single video of about 30 seconds and extracted roughly 800 frames from it.
That one video is the only data used.
The fine-tuning code is almost unchanged from the GluonCV tutorial.

Preparing the Training Data, Part 1 (Creating an LST File)

import os
import numpy as np
import mxnet as mx
from gluoncv import model_zoo
from gluoncv.model_zoo.siamrpn.siamrpn_tracker import SiamRPNTracker

import cv2 

def write_line(img_path, im_shape, boxes, ids, idx):
    h, w, c = im_shape
    # header fields: A = header length (4), B = label width (5),
    # C = image width, D = image height
    A = 4
    B = 5
    C = w
    D = h
    # concat id and bboxes
    labels = np.hstack((ids.reshape(-1, 1), boxes)).astype('float')
    # normalized bboxes (recommended)
    labels[:, (1, 3)] /= float(w)
    labels[:, (2, 4)] /= float(h)
    # flatten
    labels = labels.flatten().tolist()
    str_idx = [str(idx)]
    str_header = [str(x) for x in [A, B, C, D]]
    str_labels = [str(x) for x in labels]
    str_path = [img_path]
    line = '\t'.join(str_idx + str_header + str_labels + str_path) + '\n'
    return line

# read all frames from the mp4 file
video_frames = []
video_path = 'test.mp4'
cap = cv2.VideoCapture(video_path)
while True:
    ret, img = cap.read()
    if not ret:
        break
    video_frames.append(img)
cap.release()

# load the pretrained SiamRPN tracking model
net = model_zoo.get_model('siamrpn_alexnet_v2_otb15', pretrained=True, root='./model', ctx=mx.gpu())
tracker = SiamRPNTracker(net)

# initial position of the target in the first frame
# (top-left x, top-left y, width, height)
gt_bbox = [212, 228, 119, 202]
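# The box above is specific to my video. One convenient way to pick the
# initial box interactively is OpenCV's selectROI, which returns the same
# (x, y, w, h) format (a sketch, not how this box was actually chosen):
#     x, y, w, h = cv2.selectROI('select target', video_frames[0], False)
#     cv2.destroyAllWindows()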

if not os.path.exists('./img'):
    os.makedirs('./img')

with open('train.lst', 'w') as fw:
    for ind, frame in enumerate(video_frames):
        if ind == 0:
            tracker.init(frame, gt_bbox, ctx=mx.gpu())
            pred_bbox = gt_bbox
        else:
            outputs = tracker.track(frame, ctx=mx.gpu())
            pred_bbox = outputs['bbox']
        pred_bbox = list(map(int, pred_bbox))

        # convert the tracker's (x, y, w, h) box to (xmin, ymin, xmax, ymax)
        boxes = np.array([[pred_bbox[0], pred_bbox[1], pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]]])
        line = write_line('%04d.jpg'%(ind), frame.shape, boxes, np.array([0]), ind)
        fw.write(line)
        cv2.imwrite(os.path.join('./img', '%04d.jpg'%(ind)), frame)
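
For reference, each line that write_line produces is one tab-separated record: the frame index, a four-value header (header length 4, label width 5, image width, image height), one (class, xmin, ymin, xmax, ymax) group per box with coordinates normalized to [0, 1], and finally the image path. Here is a minimal sketch that parses the first line back, assuming train.lst has just been written:

with open('train.lst') as f:
    fields = f.readline().rstrip('\n').split('\t')

idx = fields[0]
header = fields[1:5]  # [A, B, width, height] = [4, 5, image width, image height]
img_path = fields[-1]
labels = [float(x) for x in fields[5:-1]]
boxes = [labels[i:i + 5] for i in range(0, len(labels), 5)]
print(idx, header, img_path)
print('boxes (class, xmin, ymin, xmax, ymax):', boxes)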

Preparing the Training Data, Part 2 (Running im2rec.py)

python im2rec.py train.lst ./img --pass-through --pack-label
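
Here --pass-through packs the JPEG files as-is without re-encoding them, and --pack-label stores the variable-length label list from the LST file in each record. As an optional sanity check (the RecordFileDetection snippet further below is more convenient), the generated train.rec/train.idx pair can also be read back with MXNet's low-level recordio API. A minimal sketch, assuming both files sit in the current directory:

import mxnet as mx

# open the indexed record file pair produced by im2rec.py
record = mx.recordio.MXIndexedRecordIO('train.idx', 'train.rec', 'r')
item = record.read_idx(0)
header, img = mx.recordio.unpack_img(item)  # header.label holds the packed labels
print('label:', header.label)
print('image shape:', img.shape)
record.close()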

Training

import time
import mxnet as mx
from mxnet import autograd, gluon
import gluoncv as gcv

def get_dataloader(net, train_dataset, data_shape, batch_size, num_workers):
    from gluoncv.data.batchify import Tuple, Stack, Pad
    from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform
    width, height = data_shape, data_shape
    # use fake data to generate fixed anchors for target generation
    with autograd.train_mode():
        _, _, anchors = net(mx.nd.zeros((1, 3, height, width)))
    batchify_fn = Tuple(Stack(), Stack(), Stack())  # stack image, cls_targets, box_targets
    train_loader = gluon.data.DataLoader(
        train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)),
        batch_size, True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
    return train_loader

from gluoncv import model_zoo
classes = ['baby']
net = model_zoo.get_model('ssd_512_mobilenet1.0_custom', classes=classes,
    pretrained_base=False, transfer='voc', root='./model')

from gluoncv.data import RecordFileDetection
dataset = RecordFileDetection('train.rec', coord_normalized=True)
train_data = get_dataloader(net, dataset, 512, 8, 0)

ctx = [mx.gpu(0)]

net.collect_params().reset_ctx(ctx)
trainer = gluon.Trainer(
    net.collect_params(), 'sgd',
    {'learning_rate': 0.001, 'wd': 0.0005, 'momentum': 0.9})

mbox_loss = gcv.loss.SSDMultiBoxLoss()
ce_metric = mx.metric.Loss('CrossEntropy')
smoothl1_metric = mx.metric.Loss('SmoothL1')

for epoch in range(0, 2):
    ce_metric.reset()
    smoothl1_metric.reset()
    tic = time.time()
    btic = time.time()
    net.hybridize(static_alloc=True, static_shape=True)
    for i, batch in enumerate(train_data):
        batch_size = batch[0].shape[0]
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
        with autograd.record():
            cls_preds = []
            box_preds = []
            for x in data:
                cls_pred, box_pred, _ = net(x)
                cls_preds.append(cls_pred)
                box_preds.append(box_pred)
            sum_loss, cls_loss, box_loss = mbox_loss(
                cls_preds, box_preds, cls_targets, box_targets)
            autograd.backward(sum_loss)
        # since we have already normalized the loss, we don't want to normalize
        # by batch-size anymore
        trainer.step(1)
        ce_metric.update(0, [l * batch_size for l in cls_loss])
        smoothl1_metric.update(0, [l * batch_size for l in box_loss])
        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        if i % 20 == 0:
            print('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                epoch, i, batch_size/(time.time()-btic), name1, loss1, name2, loss2))
        btic = time.time()

net.save_parameters('ssd_512_mobilenet1.0_baby.params')

Other Code (Not Needed for Training)

# checking the LST file
from gluoncv.data import LstDetection
lst_dataset = LstDetection('train.lst', root='./img')
print('length:', len(lst_dataset))
first_img = lst_dataset[0][0]
print('image shape:', first_img.shape)
print('Label example:')
print(lst_dataset[0][1])
print("GluonCV swaps bounding boxes to columns 0-3 by default")

from PIL import Image
img = Image.fromarray(first_img.asnumpy())
img.show()


# downloading im2rec.py
from gluoncv import utils
im2rec = utils.download('https://raw.githubusercontent.com/apache/incubator-mxnet/' +
                        '6843914f642c8343aaa9a09db803b6af6f5d94a2/tools/im2rec.py', 'im2rec.py')

# how to run im2rec.py
#python im2rec.py train.lst ./img --pass-through --pack-label


# checking the REC file
from gluoncv.utils import viz
from gluoncv.data import RecordFileDetection
from matplotlib import pyplot as plt

dataset = RecordFileDetection('train.rec', coord_normalized=True)

print('length:', len(dataset))

classes = ['baby']  # only one foreground class here
image, label = dataset[0]
print('label:', label)
ax = viz.plot_bbox(image, bboxes=label[:, :4], labels=label[:, 4:5], class_names=classes)
plt.show()


# checking the results (inference)
from gluoncv import utils, data, model_zoo
from matplotlib import pyplot as plt

classes = ['baby']
net = model_zoo.get_model('ssd_512_mobilenet1.0_custom', classes=classes, pretrained_base=False, root='./model')
net.load_parameters('ssd_512_mobilenet1.0_baby.params')
x, image = data.transforms.presets.ssd.load_test('test3.jpg', 512)
cid, score, bbox = net(x)
ax = utils.viz.plot_bbox(image, bbox[0], score[0], cid[0], class_names=classes)
plt.axis('off')
plt.show()
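
Note that plot_bbox only draws detections whose score is at least 0.5 by default. To inspect the raw detections yourself, here is a minimal sketch that reuses the score and bbox arrays from above (the 0.5 cutoff is arbitrary, and the padding entries the model fills with score -1 are dropped along with everything below the threshold):

scores = score[0].asnumpy().flatten()
keep = scores >= 0.5  # arbitrary cutoff for illustration
for box, s in zip(bbox[0].asnumpy()[keep], scores[keep]):
    print('score {:.2f}, box {}'.format(s, box.astype(int)))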

Results

f:id:touch-sp:20200514005758p:plain
f:id:touch-sp:20200514004811p:plain
f:id:touch-sp:20200514004845p:plain

Impressions

I was surprised that a single video of about 30 seconds was enough to reach this level of accuracy.
What's more, training ran for only two epochs and took less than ten minutes.
Preparing the training data also took under ten minutes, since it amounted to nothing more than shooting the video and running the tracker over it.

Environment

Windows10 Pro
NVIDIA GeForce GTX1080
Python 3.7.7
certifi==2020.4.5.1
chardet==3.0.4
cycler==0.10.0
gluoncv==0.7.0
graphviz==0.8.4
idna==2.6
kiwisolver==1.2.0
matplotlib==3.2.1
mxnet-cu101 @ https://repo.mxnet.io/dist/python/cu101/mxnet_cu101-1.6.0-py2.py3-none-win_amd64.whl
numpy==1.16.6
opencv-python==4.2.0.34
Pillow==7.1.2
portalocker==1.7.0
pyparsing==2.4.7
python-dateutil==2.8.1
pywin32==227
requests==2.18.4
scipy==1.4.1
six==1.14.0
tqdm==4.46.0
urllib3==1.22