環境
Windows10 Pro GPUなし Python 3.8.2
バージョンの確認(pip freeze)
インストールが必要なのは「mxnet」と「gluoncv」と「opencv-python」のみ。
pip install mxnet
pip install gluoncv
pip install opencv-python
その他は勝手についてくる。
certifi==2020.4.5.1 chardet==3.0.4 cycler==0.10.0 gluoncv==0.7.0 graphviz==0.8.4 idna==2.6 kiwisolver==1.2.0 matplotlib==3.2.1 mxnet==1.6.0 numpy==1.18.4 opencv-python==4.2.0.34 Pillow==7.1.2 portalocker==1.7.0 pyparsing==2.4.7 python-dateutil==2.8.1 pywin32==227 requests==2.18.4 scipy==1.4.1 six==1.15.0 tqdm==4.46.0 urllib3==1.22
学習データの準備-1(LSTファイルの作成)
事前に「VOCtrainval_11-May-2012.tar」をダウンロードして解凍しておく。
tarファイルの解凍はWindowsからも可能。(こちらを参照)
import os
import xml.etree.ElementTree as ET  # cElementTree is deprecated (removed in 3.9)

import numpy as np


def write_line(img_path, im_shape, boxes, ids, idx):
    """Format one image's annotations as a single MXNet .lst record line.

    Parameters
    ----------
    img_path : str
        Image path/filename, written verbatim as the last field.
    im_shape : tuple
        Image shape as (height, width, channels).
    boxes : numpy.ndarray
        Pixel bounding boxes, shape (N, 4) as (xmin, ymin, xmax, ymax).
    ids : numpy.ndarray
        Class id per box, shape (N,).
    idx : int
        Record index, written as the first field.

    Returns
    -------
    str
        One tab-separated line terminated with a newline.
    """
    h, w, _ = im_shape
    # Header: minimal length 2 -> A = 4 header fields, B = 5 values per label,
    # then image width and height.
    A, B, C, D = 4, 5, w, h
    # Prepend the class-id column: each row becomes [id, xmin, ymin, xmax, ymax].
    labels = np.hstack((ids.reshape(-1, 1), boxes)).astype('float')
    # Normalize box coordinates to [0, 1] (recommended for .rec datasets).
    labels[:, (1, 3)] /= float(w)
    labels[:, (2, 4)] /= float(h)
    fields = [str(idx)]
    fields += [str(x) for x in (A, B, C, D)]
    fields += [str(x) for x in labels.flatten().tolist()]
    fields.append(img_path)
    return '\t'.join(fields) + '\n'


def _head_boxes(root):
    """Collect pixel bounding boxes of every 'head' part in a VOC annotation tree."""
    boxes = []
    for part in root.findall('object/part'):
        if part.find('name').text != 'head':
            continue
        bb = part.find('bndbox')
        boxes.append([int(bb.find(tag).text)
                      for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
    return boxes


def main():
    """Build train.lst from the VOC trainval split.

    Only images containing at least one 'head' part get a record; all boxes
    are written with class id 0 (single class, 'face').
    """
    # Imported here so write_line/_head_boxes stay importable without mxnet.
    import mxnet as mx

    with open('./ImageSets/Layout/trainval.txt') as f:
        names = [s.split()[0] for s in f.readlines()]

    with open('train.lst', 'w') as fw:
        rec_idx = 0  # running record index (do not shadow builtin `id`)
        for name in names:
            tree = ET.parse(os.path.join('./Annotations', name + '.xml'))
            root = tree.getroot()
            img_filename = root.find('filename').text
            # The image is read only to obtain its (h, w, c) shape.
            img = mx.image.imread(os.path.join('./JPEGImages', img_filename))
            boxes = _head_boxes(root)
            if boxes:
                all_boxes = np.array(boxes)
                all_ids = np.zeros(len(boxes), 'int32')
                fw.write(write_line(img_filename, img.shape,
                                    all_boxes, all_ids, rec_idx))
                rec_idx += 1


if __name__ == '__main__':
    main()
学習データの準備-2(im2rec.pyの実行)
python im2rec.py train.lst ./JPEGImages --pass-through --pack-label
学習
import time

import mxnet as mx
from mxnet import autograd, gluon
import gluoncv as gcv
from gluoncv import model_zoo
from gluoncv.data import RecordFileDetection


def get_dataloader(net, train_dataset, data_shape, batch_size, num_workers):
    """Return a DataLoader yielding batched (image, cls_targets, box_targets)."""
    from gluoncv.data.batchify import Tuple, Stack, Pad
    from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform

    width = height = data_shape
    # One dummy forward pass in train mode yields the fixed SSD anchors,
    # which the transform needs to generate training targets.
    with autograd.train_mode():
        _, _, anchors = net(mx.nd.zeros((1, 3, height, width)))
    transform = SSDDefaultTrainTransform(width, height, anchors)
    # Stack image, cls_targets and box_targets each into a batched NDArray.
    stacker = Tuple(Stack(), Stack(), Stack())
    return gluon.data.DataLoader(
        train_dataset.transform(transform),
        batch_size,
        True,
        batchify_fn=stacker,
        last_batch='rollover',
        num_workers=num_workers)


# Single-class SSD (MobileNet 1.0 backbone) transferred from the VOC weights.
classes = ['face']
net = model_zoo.get_model('ssd_512_mobilenet1.0_custom',
                          classes=classes,
                          pretrained_base=False,
                          transfer='voc',
                          root='./model')

dataset = RecordFileDetection('train.rec', coord_normalized=True)
train_data = get_dataloader(net, dataset, 512, 8, 0)

# Train across four CPU contexts; replace with [mx.gpu(0)] when a GPU exists.
ctx = [mx.cpu(n) for n in range(4)]
net.collect_params().reset_ctx(ctx)

trainer = gluon.Trainer(
    net.collect_params(), 'sgd',
    {'learning_rate': 0.001, 'wd': 0.0005, 'momentum': 0.9})

mbox_loss = gcv.loss.SSDMultiBoxLoss()
ce_metric = mx.metric.Loss('CrossEntropy')
smoothl1_metric = mx.metric.Loss('SmoothL1')

for epoch in range(0, 2):
    ce_metric.reset()
    smoothl1_metric.reset()
    tic = time.time()
    btic = time.time()
    net.hybridize(static_alloc=True, static_shape=True)
    for i, batch in enumerate(train_data):
        batch_size = batch[0].shape[0]
        # Scatter image, cls_targets and box_targets across all contexts.
        data, cls_targets, box_targets = (
            gluon.utils.split_and_load(part, ctx_list=ctx, batch_axis=0)
            for part in batch)
        with autograd.record():
            cls_preds = []
            box_preds = []
            for x in data:
                cls_pred, box_pred, _ = net(x)
                cls_preds.append(cls_pred)
                box_preds.append(box_pred)
            sum_loss, cls_loss, box_loss = mbox_loss(
                cls_preds, box_preds, cls_targets, box_targets)
            autograd.backward(sum_loss)
        # The loss is already normalized, so don't normalize by batch size again.
        trainer.step(1)
        for metric, losses in ((ce_metric, cls_loss), (smoothl1_metric, box_loss)):
            metric.update(0, [l * batch_size for l in losses])
        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        if i % 20 == 0:
            print('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                epoch, i, batch_size/(time.time()-btic), name1, loss1, name2, loss2))
        btic = time.time()

net.save_parameters('ssd_512_mobilenet1.0_face.params')
結果の確認
from gluoncv import utils, data, model_zoo
from matplotlib import pyplot as plt

# Rebuild the same single-class architecture, then load the trained weights.
classes = ['face']
net = model_zoo.get_model('ssd_512_mobilenet1.0_custom',
                          classes=classes,
                          pretrained_base=False,
                          root='./model')
net.load_parameters('ssd_512_mobilenet1.0_face.params')

# Preprocess the test image into the 512-pixel SSD input tensor; `image`
# keeps the resized original for plotting.
x, image = data.transforms.presets.ssd.load_test('biking.jpg', 512)

# Forward pass returns class ids, confidence scores and bounding boxes.
class_ids, scores, bboxes = net(x)

# Visualize detections for the first (only) image in the batch.
ax = utils.viz.plot_bbox(image, bboxes[0], scores[0], class_ids[0],
                         class_names=classes)
plt.axis('off')
plt.show()