Introduction
Last time I wrote about Object Tracking.
touch-sp.hatenablog.com
This time I tested whether the tracker's output could be used as training data for an object detection model.
I shot a video of about 30 seconds and extracted roughly 800 images from it.
Only that single video is used.
The fine-tuning code follows the tutorial almost verbatim.
Preparing the training data - 1 (creating the LST file)
import os
import numpy as np
import mxnet as mx
from gluoncv import model_zoo
from gluoncv.model_zoo.siamrpn.siamrpn_tracker import SiamRPNTracker
import cv2

def write_line(img_path, im_shape, boxes, ids, idx):
    h, w, c = im_shape
    # for header, we use minimal length 2, plus width and height
    # with A: 4, B: 5, C: width, D: height
    A = 4
    B = 5
    C = w
    D = h
    # concat id and bboxes
    labels = np.hstack((ids.reshape(-1, 1), boxes)).astype('float')
    # normalized bboxes (recommended)
    labels[:, (1, 3)] /= float(w)
    labels[:, (2, 4)] /= float(h)
    # flatten
    labels = labels.flatten().tolist()
    str_idx = [str(idx)]
    str_header = [str(x) for x in [A, B, C, D]]
    str_labels = [str(x) for x in labels]
    str_path = [img_path]
    line = '\t'.join(str_idx + str_header + str_labels + str_path) + '\n'
    return line

# read all frames from the mp4 file
video_frames = []
video_path = 'test.mp4'
cap = cv2.VideoCapture(video_path)
while True:
    ret, img = cap.read()
    if not ret:
        break
    video_frames.append(img)

# load the tracking model
net = model_zoo.get_model('siamrpn_alexnet_v2_otb15', pretrained=True, root='./model', ctx=mx.gpu())
tracker = SiamRPNTracker(net)

# initial position of the target
# (top-left x, top-left y, width, height)
gt_bbox = [212, 228, 119, 202]

if not os.path.exists('./img'):
    os.makedirs('./img')

with open('train.lst', 'w') as fw:
    for ind, frame in enumerate(video_frames):
        if ind == 0:
            tracker.init(frame, gt_bbox, ctx=mx.gpu())
            pred_bbox = gt_bbox
        else:
            outputs = tracker.track(frame, ctx=mx.gpu())
            pred_bbox = outputs['bbox']
        pred_bbox = list(map(int, pred_bbox))
        boxes = np.array([[pred_bbox[0], pred_bbox[1],
                           pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]]])
        line = write_line('%04d.jpg' % (ind), frame.shape, boxes, np.array([0]), ind)
        fw.write(line)
        cv2.imwrite(os.path.join('./img', '%04d.jpg' % (ind)), frame)
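As a quick sanity check, write_line can also be called by hand. The frame size below (640x480) is a made-up value for illustration; the box is the initial gt_bbox converted to corner format. The output line is index, header (4, 5, width, height), the normalized id+bbox, and the image path, joined by tabs:

import numpy as np

# hypothetical example: assume a 640x480 frame with one box (xmin, ymin, xmax, ymax)
dummy_shape = (480, 640, 3)
dummy_boxes = np.array([[212, 228, 331, 430]])
print(write_line('0000.jpg', dummy_shape, dummy_boxes, np.array([0]), 0))
# -> '0\t4\t5\t640\t480\t0.0\t0.33125\t0.475\t0.5171875\t0.8958333333333334\t0000.jpg\n'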
Preparing the training data - 2 (running im2rec.py)
python im2rec.py train.lst ./img --pass-through --pack-label
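If I remember correctly, --pass-through tells im2rec.py to pack each image as-is without resizing or re-encoding, and --pack-label is needed so that the multi-column bounding-box labels from the LST file are stored in each record. Running this produces train.rec and train.idx, which the training script reads.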
Training
import time
import mxnet as mx
from mxnet import autograd, gluon
import gluoncv as gcv

def get_dataloader(net, train_dataset, data_shape, batch_size, num_workers):
    from gluoncv.data.batchify import Tuple, Stack, Pad
    from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform
    width, height = data_shape, data_shape
    # use fake data to generate fixed anchors for target generation
    with autograd.train_mode():
        _, _, anchors = net(mx.nd.zeros((1, 3, height, width)))
    batchify_fn = Tuple(Stack(), Stack(), Stack())  # stack image, cls_targets, box_targets
    train_loader = gluon.data.DataLoader(
        train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)),
        batch_size, True, batchify_fn=batchify_fn, last_batch='rollover',
        num_workers=num_workers)
    return train_loader

from gluoncv import model_zoo
classes = ['baby']
net = model_zoo.get_model('ssd_512_mobilenet1.0_custom', classes=classes,
                          pretrained_base=False, transfer='voc', root='./model')

from gluoncv.data import RecordFileDetection
dataset = RecordFileDetection('train.rec', coord_normalized=True)
train_data = get_dataloader(net, dataset, 512, 8, 0)

ctx = [mx.gpu(0)]
net.collect_params().reset_ctx(ctx)
trainer = gluon.Trainer(
    net.collect_params(), 'sgd',
    {'learning_rate': 0.001, 'wd': 0.0005, 'momentum': 0.9})

mbox_loss = gcv.loss.SSDMultiBoxLoss()
ce_metric = mx.metric.Loss('CrossEntropy')
smoothl1_metric = mx.metric.Loss('SmoothL1')

for epoch in range(0, 2):
    ce_metric.reset()
    smoothl1_metric.reset()
    tic = time.time()
    btic = time.time()
    net.hybridize(static_alloc=True, static_shape=True)
    for i, batch in enumerate(train_data):
        batch_size = batch[0].shape[0]
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
        with autograd.record():
            cls_preds = []
            box_preds = []
            for x in data:
                cls_pred, box_pred, _ = net(x)
                cls_preds.append(cls_pred)
                box_preds.append(box_pred)
            sum_loss, cls_loss, box_loss = mbox_loss(
                cls_preds, box_preds, cls_targets, box_targets)
            autograd.backward(sum_loss)
        # since we have already normalized the loss, we don't want to normalize
        # by batch-size anymore
        trainer.step(1)
        ce_metric.update(0, [l * batch_size for l in cls_loss])
        smoothl1_metric.update(0, [l * batch_size for l in box_loss])
        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        if i % 20 == 0:
            print('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                epoch, i, batch_size / (time.time() - btic), name1, loss1, name2, loss2))
        btic = time.time()

net.save_parameters('ssd_512_mobilenet1.0_baby.params')
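As an aside on the 'custom' model: my understanding is that transfer='voc' copies the weights of ssd_512_mobilenet1.0_voc and resets the output head to the classes you pass in. The GluonCV finetuning tutorial shows an equivalent, more explicit way to do the same thing (a sketch, not part of my script):

from gluoncv import model_zoo

classes = ['baby']
# load the VOC-pretrained SSD, then swap its 20 VOC classes for ours
net = model_zoo.get_model('ssd_512_mobilenet1.0_voc', pretrained=True, root='./model')
net.reset_class(classes)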
Other code (not required for training)
# check the LST file
from gluoncv.data import LstDetection
lst_dataset = LstDetection('train.lst', root='./img')
print('length:', len(lst_dataset))
first_img = lst_dataset[0][0]
print('image shape:', first_img.shape)
print('Label example:')
print(lst_dataset[0][1])
print("GluonCV swaps bounding boxes to columns 0-3 by default")
from PIL import Image
img = Image.fromarray(first_img.asnumpy())
img.show()

# download im2rec.py
from gluoncv import utils
im2rec = utils.download('https://raw.githubusercontent.com/apache/incubator-mxnet/' +
                        '6843914f642c8343aaa9a09db803b6af6f5d94a2/tools/im2rec.py',
                        'im2rec.py')

# how to run im2rec.py
# python im2rec.py train.lst ./img --pass-through --pack-label

# check the rec file
from gluoncv.utils import viz
from gluoncv.data import RecordFileDetection
from matplotlib import pyplot as plt
dataset = RecordFileDetection('train.rec', coord_normalized=True)
print('length:', len(dataset))
classes = ['baby']  # only one foreground class here
image, label = dataset[0]
print('label:', label)
ax = viz.plot_bbox(image, bboxes=label[:, :4], labels=label[:, 4:5], class_names=classes)
plt.show()

# check the result
from gluoncv import utils, data, model_zoo
from matplotlib import pyplot as plt
classes = ['baby']
net = model_zoo.get_model('ssd_512_mobilenet1.0_custom', classes=classes,
                          pretrained_base=False, root='./model')
net.load_parameters('ssd_512_mobilenet1.0_baby.params')
x, image = data.transforms.presets.ssd.load_test('test3.jpg', 512)
cid, score, bbox = net(x)
ax = utils.viz.plot_bbox(image, bbox[0], score[0], cid[0], class_names=classes)
plt.axis('off')
plt.show()
Results
Impressions
I was surprised that shooting a single video of about 30 seconds was enough to reach this level of accuracy.
What's more, training ran for only 2 epochs and took less than 10 minutes.
Incidentally, preparing the training data also took less than 10 minutes, since all it involved was shooting the video and running the tracker over it.
Environment
Windows 10 Pro
NVIDIA GeForce GTX 1080
Python 3.7.7
certifi==2020.4.5.1
chardet==3.0.4
cycler==0.10.0
gluoncv==0.7.0
graphviz==0.8.4
idna==2.6
kiwisolver==1.2.0
matplotlib==3.2.1
mxnet-cu101 @ https://repo.mxnet.io/dist/python/cu101/mxnet_cu101-1.6.0-py2.py3-none-win_amd64.whl
numpy==1.16.6
opencv-python==4.2.0.34
Pillow==7.1.2
portalocker==1.7.0
pyparsing==2.4.7
python-dateutil==2.8.1
pywin32==227
requests==2.18.4
scipy==1.4.1
six==1.14.0
tqdm==4.46.0
urllib3==1.22