Training a Face Detector with the Pascal VOC Dataset

Environment

Windows 10 Pro
No GPU
Python 3.8.2

Version check (pip freeze)

Only mxnet, gluoncv, and opencv-python need to be installed explicitly:

pip install mxnet
pip install gluoncv
pip install opencv-python

Everything else comes along automatically as dependencies.

certifi==2020.4.5.1
chardet==3.0.4
cycler==0.10.0
gluoncv==0.7.0
graphviz==0.8.4
idna==2.6
kiwisolver==1.2.0
matplotlib==3.2.1
mxnet==1.6.0
numpy==1.18.4
opencv-python==4.2.0.34
Pillow==7.1.2
portalocker==1.7.0
pyparsing==2.4.7
python-dateutil==2.8.1
pywin32==227
requests==2.18.4
scipy==1.4.1
six==1.15.0
tqdm==4.46.0
urllib3==1.22

Preparing the training data, part 1 (creating the LST file)

Download VOCtrainval_11-May-2012.tar beforehand and extract it.
Tar files can also be extracted on Windows (see here).
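Recent builds of Windows 10 ship a built-in tar.exe, so extraction also works from the command prompt. The archive unpacks into VOCdevkit/VOC2012, and the scripts below assume that directory as the working directory:

tar -xf VOCtrainval_11-May-2012.tar
cd VOCdevkit\VOC2012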

import os
import numpy as np
import mxnet as mx
import xml.etree.ElementTree as ET

def write_line(img_path, im_shape, boxes, ids, idx):
    h, w, c = im_shape
    # LST detection header: A = header length (4), B = per-box label
    # width (5), C = image width, D = image height
    A = 4
    B = 5
    C = w
    D = h
    # concat id and bboxes
    labels = np.hstack((ids.reshape(-1, 1), boxes)).astype('float')
    # normalized bboxes (recommended)
    labels[:, (1, 3)] /= float(w)
    labels[:, (2, 4)] /= float(h)
    # flatten
    labels = labels.flatten().tolist()
    str_idx = [str(idx)]
    str_header = [str(x) for x in [A, B, C, D]]
    str_labels = [str(x) for x in labels]
    str_path = [img_path]
    line = '\t'.join(str_idx + str_header + str_labels + str_path) + '\n'
    return line

# the Layout image set lists the images that carry person-part
# (head/hand/foot) annotations
with open('./ImageSets/Layout/trainval.txt') as f:
    lines = [s.split()[0] for s in f.readlines()]


with open('train.lst', 'w') as fw:

    idx = 0
    for line in lines:
        xml_filename = line + '.xml'
        xml_file_path = os.path.join('./Annotations', xml_filename)

        tree = ET.parse(xml_file_path)
        root = tree.getroot()

        img_filename = root.find('filename').text

        img_file_path = os.path.join('./JPEGImages', img_filename)
        img = mx.image.imread(img_file_path)

        # collect the bounding boxes of every 'head' part annotated
        # inside the person objects
        bndbox_all = []
        for child in root.findall('object/part'):
            bndbox = []
            if child.find('name').text == 'head':
                bndbox.append(int(child.find('bndbox').find('xmin').text))
                bndbox.append(int(child.find('bndbox').find('ymin').text))
                bndbox.append(int(child.find('bndbox').find('xmax').text))
                bndbox.append(int(child.find('bndbox').find('ymax').text))
            if len(bndbox)>0:
                bndbox_all.append(bndbox)
            
        # write one LST line per image that contains at least one head
        if len(bndbox_all) > 0:
            all_boxes = np.array(bndbox_all)
            all_ids = np.zeros(len(bndbox_all), 'int32')
            line = write_line(img_filename, img.shape, all_boxes, all_ids, idx)
            fw.write(line)
            idx += 1
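For reference, each line written to train.lst is tab-separated: the running index, the four header values (A=4, B=5, then image width C and height D), the flattened labels (class id followed by normalized xmin/ymin/xmax/ymax for each box), and finally the image filename. With illustrative coordinates (fields shown space-separated here), a line with a single head box looks like:

0  4  5  500  375  0.0  0.296  0.213  0.394  0.472  2007_000027.jpg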

Preparing the training data, part 2 (running im2rec.py)

python im2rec.py train.lst ./JPEGImages --pass-through --pack-label
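im2rec.py is the conversion script from the MXNet repository (tools/im2rec.py); place a copy next to train.lst before running the command. It produces train.rec and train.idx. As a quick sanity check, the record file can be read back with the same RecordFileDetection class the training code below uses (a minimal sketch):

from gluoncv.data import RecordFileDetection

# load the generated record file and inspect the first sample
dataset = RecordFileDetection('train.rec', coord_normalized=True)
img, label = dataset[0]
print(img.shape)  # e.g. (375, 500, 3)
print(label)      # one row of box coordinates plus class id per head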

Training

import time
import mxnet as mx
from mxnet import autograd, gluon
import gluoncv as gcv

def get_dataloader(net, train_dataset, data_shape, batch_size, num_workers):
    from gluoncv.data.batchify import Tuple, Stack, Pad
    from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform
    width, height = data_shape, data_shape
    # use fake data to generate fixed anchors for target generation
    with autograd.train_mode():
        _, _, anchors = net(mx.nd.zeros((1, 3, height, width)))
    batchify_fn = Tuple(Stack(), Stack(), Stack())  # stack image, cls_targets, box_targets
    train_loader = gluon.data.DataLoader(
        train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)),
        batch_size, True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
    return train_loader

from gluoncv import model_zoo
classes = ['face']
net = model_zoo.get_model('ssd_512_mobilenet1.0_custom', classes=classes,
    pretrained_base=False, transfer='voc', root='./model')

from gluoncv.data import RecordFileDetection
dataset = RecordFileDetection('train.rec', coord_normalized=True)
train_data = get_dataloader(net, dataset, 512, 8, 0)

# no GPU here, so split each batch across four CPU contexts
ctx = [mx.cpu(0), mx.cpu(1), mx.cpu(2), mx.cpu(3)]
# ctx = [mx.gpu(0)]  # use this instead when a GPU is available

net.collect_params().reset_ctx(ctx)
trainer = gluon.Trainer(
    net.collect_params(), 'sgd',
    {'learning_rate': 0.001, 'wd': 0.0005, 'momentum': 0.9})

mbox_loss = gcv.loss.SSDMultiBoxLoss()
ce_metric = mx.metric.Loss('CrossEntropy')
smoothl1_metric = mx.metric.Loss('SmoothL1')

for epoch in range(0, 2):
    ce_metric.reset()
    smoothl1_metric.reset()
    tic = time.time()
    btic = time.time()
    net.hybridize(static_alloc=True, static_shape=True)
    for i, batch in enumerate(train_data):
        batch_size = batch[0].shape[0]
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
        with autograd.record():
            cls_preds = []
            box_preds = []
            for x in data:
                cls_pred, box_pred, _ = net(x)
                cls_preds.append(cls_pred)
                box_preds.append(box_pred)
            sum_loss, cls_loss, box_loss = mbox_loss(
                cls_preds, box_preds, cls_targets, box_targets)
            autograd.backward(sum_loss)
        # since we have already normalized the loss, we don't want to normalize
        # by batch-size anymore
        trainer.step(1)
        ce_metric.update(0, [l * batch_size for l in cls_loss])
        smoothl1_metric.update(0, [l * batch_size for l in box_loss])
        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        if i % 20 == 0:
            print('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                epoch, i, batch_size/(time.time()-btic), name1, loss1, name2, loss2))
        btic = time.time()

net.save_parameters('ssd_512_mobilenet1.0_face.params')
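The .params file holds only the weights, so loading it later means rebuilding the network through model_zoo as in the next section. If a standalone symbol/params pair is preferred (e.g. for deployment without the GluonCV model definitions), gluoncv.utils.export_block can write one out; a minimal sketch, assuming the hybridized net from the training above:

from gluoncv.utils import export_block

# writes ssd_512_mobilenet1.0_face-symbol.json and
# ssd_512_mobilenet1.0_face-0000.params
export_block('ssd_512_mobilenet1.0_face', net,
             data_shape=(512, 512, 3), preprocess=True, layout='HWC')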

Checking the result

from gluoncv import utils, data, model_zoo
from matplotlib import pyplot as plt

classes = ['face']
net = model_zoo.get_model('ssd_512_mobilenet1.0_custom', classes=classes, pretrained_base=False, root='./model')
net.load_parameters('ssd_512_mobilenet1.0_face.params')
x, image = data.transforms.presets.ssd.load_test('biking.jpg', 512)
cid, score, bbox = net(x)
ax = utils.viz.plot_bbox(image, bbox[0], score[0], cid[0], class_names=classes)
plt.axis('off')
plt.show()

(Figure: detection result on biking.jpg)
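Incidentally, the opencv-python package from the install list is not actually used above. To feed frames loaded through cv2 (e.g. from a webcam) into the detector, one option is the transform_test preset, which applies the same preprocessing that load_test performs; a minimal sketch, reusing net from the snippet above:

import cv2
import mxnet as mx
from gluoncv import data

# cv2 delivers BGR frames, so convert to RGB before the SSD test transform
frame = cv2.imread('biking.jpg')
rgb = mx.nd.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), dtype='uint8')
x, image = data.transforms.presets.ssd.transform_test(rgb, short=512)
cid, score, bbox = net(x)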

Impressions

It does not detect every single face, but it is good enough.
Admittedly, this is mostly just copy-pasted from the GluonCV tutorial.
OpenCV alone can apparently do face detection as well, but its detection rate is said to drop for faces in profile or facing away.
This model detects faces from behind, too.