Yolov5をiOSで使う方法です
Yolov5は物体検出機械学習モデルです。
iPhoneやiPadで使えるように、CoreML形式に変換します。
変換したモデルはそのままでは使えない
CoreMLToolsでシンプルに変換したYoloモデルの出力は、クラスごとの大量のボックスなので、そのままでは使えません。
デコード用のレイヤーと、Non Max Suppressionのレイヤーを追加して、信頼度の高いボックスの座標を絞り込む必要があります。
これらの処理をすることで、iOSのVisionフレームワークで扱えるようになり、また冒頭の画像のような簡単なファイルプレビュー機能が使えるようになります。
iOSで使えるようにするモデル編集スクリプトがこちら
変換済みモデル(COCOデータセット)はこちら
独自データセットでトレーニングしたYolov5モデルもこちらの変換スクリプトでCoreML形式にできます。
変換コード解説
Yolov5のリポジトリ内にあるエクスポートコードを利用して、PyTorchモデルをCoreMLモデルに変換します。
python export.py --weights yolo5s.pt --train --include "coreml"
デコードレイヤーを定義します。
# Just run to define the decode function
import torch
# `classLabels` is not defined in this snippet; it must already exist (one
# entry per class) before these lines run. To use placeholder names instead,
# uncomment the next line:
# classLabels = [f"label{i}" for i in range(80)]
numberOfClassLabels = len(classLabels)
# Number of class labels plus 5; the decode function slices indices 0-2 (x, y),
# 2-4 (w, h), 4-5 (object confidence) and 5+ (label confidences).
outputSize = numberOfClassLabels + 5

# Attention: Some models are reversed!
# When True, both the strides and the anchors below are used in reverse order.
reverseModel = False

strides = [8, 16, 32]
if reverseModel:
    strides.reverse()
# Side length of each feature map for a 640-pixel input.
featureMapDimensions = [640 // stride for stride in strides]

anchors = (
    [10, 13, 16, 30, 33, 23],
    [30, 61, 62, 45, 59, 119],
    [116, 90, 156, 198, 373, 326],
)  # Take these from the <model>.yml in yolov5
if reverseModel:
    anchors = anchors[::-1]
# One row of anchors per stride, reshaped to (3, -1, 1, 1, 2) so that each
# anchorGrid[i] can later be expanded over the feature map.
anchorGrid = torch.tensor(anchors).float().view(3, -1, 1, 1, 2)
def make_grid(nx, ny):
    '''
    Build the grid of cell indices for a feature map.

    nx: number of cells along x (columns).
    ny: number of cells along y (rows).

    Returns a float tensor of shape (ny, nx, 2) where element [y, x] holds
    the pair (x, y).
    '''
    # Broadcasting two aranges gives the same result as torch.meshgrid with
    # "ij" indexing, without depending on the `indexing` keyword, which is
    # required on recent PyTorch releases and unavailable on older ones.
    xv = torch.arange(nx).view(1, nx).expand(ny, nx)
    yv = torch.arange(ny).view(ny, 1).expand(ny, nx)
    return torch.stack((xv, yv), 2).float()
def addExportLayerToCoreml(builder):
    '''
    Adds the yolov5 export layer to the coreml model

    For every output currently listed in ``builder.spec.description.output``
    this appends layers that apply a sigmoid, decode the box centre and size,
    scale the coordinates by 1/640 and multiply the object confidence with
    the per-label confidences. The per-output results are flattened to 2D,
    concatenated, and declared as the outputs "raw_confidence" and
    "raw_coordinates".

    Uses the module-level ``strides``, ``featureMapDimensions``,
    ``anchorGrid``, ``numberOfClassLabels`` and ``make_grid``.

    builder: object providing the ``add_*`` and ``set_output`` methods called
        below (presumably a coremltools NeuralNetworkBuilder - confirm against
        the caller). Layers are added through it; nothing is returned.
    '''
    outputNames = [output.name for output in builder.spec.description.output]

    # Total number of boxes over all outputs, accumulated from the anchor
    # tensors actually emitted (25200 for the default 640-pixel configuration).
    numberOfBoxes = 0

    for i, outputName in enumerate(outputNames):
        # formulas: https://github.com/ultralytics/yolov5/issues/471
        builder.add_activation(
            name=f"sigmoid_{outputName}",
            non_linearity="SIGMOID",
            input_name=outputName,
            output_name=f"{outputName}_sigmoid",
        )

        ### Coordinates calculation ###
        # input (1, 3, nC, nC, 85), output (1, 3, nC, nC, 2) -> nC = 640 / strides[i]
        builder.add_slice(
            name=f"slice_coordinates_xy_{outputName}",
            input_name=f"{outputName}_sigmoid",
            output_name=f"{outputName}_sliced_coordinates_xy",
            axis="width",
            start_index=0,
            end_index=2,
        )
        # x,y * 2
        builder.add_elementwise(
            name=f"multiply_xy_by_two_{outputName}",
            input_names=[f"{outputName}_sliced_coordinates_xy"],
            output_name=f"{outputName}_multiplied_xy_by_two",
            mode="MULTIPLY",
            alpha=2,
        )
        # x,y * 2 - 0.5
        builder.add_elementwise(
            name=f"subtract_0_5_from_xy_{outputName}",
            input_names=[f"{outputName}_multiplied_xy_by_two"],
            output_name=f"{outputName}_subtracted_0_5_from_xy",
            mode="ADD",
            alpha=-0.5,
        )
        grid = make_grid(
            featureMapDimensions[i], featureMapDimensions[i]).numpy()
        # x,y * 2 - 0.5 + grid[i]
        builder.add_bias(
            name=f"add_grid_from_xy_{outputName}",
            input_name=f"{outputName}_subtracted_0_5_from_xy",
            output_name=f"{outputName}_added_grid_xy",
            b=grid,
            shape_bias=grid.shape,
        )
        # (x,y * 2 - 0.5 + grid[i]) * stride[i]
        builder.add_elementwise(
            name=f"multiply_xy_by_stride_{outputName}",
            input_names=[f"{outputName}_added_grid_xy"],
            output_name=f"{outputName}_calculated_xy",
            mode="MULTIPLY",
            alpha=strides[i],
        )

        # input (1, 3, nC, nC, 85), output (1, 3, nC, nC, 2)
        builder.add_slice(
            name=f"slice_coordinates_wh_{outputName}",
            input_name=f"{outputName}_sigmoid",
            output_name=f"{outputName}_sliced_coordinates_wh",
            axis="width",
            start_index=2,
            end_index=4,
        )
        # w,h * 2
        builder.add_elementwise(
            name=f"multiply_wh_by_two_{outputName}",
            input_names=[f"{outputName}_sliced_coordinates_wh"],
            output_name=f"{outputName}_multiplied_wh_by_two",
            mode="MULTIPLY",
            alpha=2,
        )
        # (w,h * 2) ** 2
        builder.add_unary(
            name=f"power_wh_{outputName}",
            input_name=f"{outputName}_multiplied_wh_by_two",
            output_name=f"{outputName}_power_wh",
            mode="power",
            alpha=2,
        )
        # (w,h * 2) ** 2 * anchor_grid[i]
        anchor = anchorGrid[i].expand(
            -1, featureMapDimensions[i], featureMapDimensions[i], -1).numpy()
        # Every dimension except the last (w, h) one indexes a distinct box.
        numberOfBoxes += anchor.shape[0] * anchor.shape[1] * anchor.shape[2]
        builder.add_load_constant_nd(
            name=f"anchors_{outputName}",
            output_name=f"{outputName}_anchors",
            constant_value=anchor,
            shape=anchor.shape,
        )
        builder.add_elementwise(
            name=f"multiply_wh_with_achors_{outputName}",
            input_names=[f"{outputName}_power_wh", f"{outputName}_anchors"],
            output_name=f"{outputName}_calculated_wh",
            mode="MULTIPLY",
        )

        builder.add_concat_nd(
            name=f"concat_coordinates_{outputName}",
            input_names=[
                f"{outputName}_calculated_xy", f"{outputName}_calculated_wh"],
            output_name=f"{outputName}_raw_coordinates",
            axis=-1,
        )
        # Scale the pixel coordinates by 1/640.
        builder.add_scale(
            name=f"normalize_coordinates_{outputName}",
            input_name=f"{outputName}_raw_coordinates",
            output_name=f"{outputName}_raw_normalized_coordinates",
            W=torch.tensor([1 / 640]).numpy(),
            b=0,
            has_bias=False,
        )

        ### Confidence calculation ###
        builder.add_slice(
            name=f"slice_object_confidence_{outputName}",
            input_name=f"{outputName}_sigmoid",
            output_name=f"{outputName}_object_confidence",
            axis="width",
            start_index=4,
            end_index=5,
        )
        # NOTE(review): end_index=0 is presumably "slice to the end" - confirm
        # against the add_slice documentation.
        builder.add_slice(
            name=f"slice_label_confidence_{outputName}",
            input_name=f"{outputName}_sigmoid",
            output_name=f"{outputName}_label_confidence",
            axis="width",
            start_index=5,
            end_index=0,
        )
        # confidence = object_confidence * label_confidence
        builder.add_multiply_broadcastable(
            name=f"multiply_object_label_confidence_{outputName}",
            input_names=[
                f"{outputName}_label_confidence",
                f"{outputName}_object_confidence"],
            output_name=f"{outputName}_raw_confidence",
        )

        # input: (1, 3, nC, nC, C), output: (3 * nC^2, C)
        builder.add_flatten_to_2d(
            name=f"flatten_confidence_{outputName}",
            input_name=f"{outputName}_raw_confidence",
            output_name=f"{outputName}_flatten_raw_confidence",
            axis=-1,
        )
        builder.add_flatten_to_2d(
            name=f"flatten_coordinates_{outputName}",
            input_name=f"{outputName}_raw_normalized_coordinates",
            output_name=f"{outputName}_flatten_raw_coordinates",
            axis=-1,
        )

    # Stack the flattened results of all outputs along the box dimension.
    builder.add_concat_nd(
        name="concat_confidence",
        input_names=[f"{name}_flatten_raw_confidence" for name in outputNames],
        output_name="raw_confidence",
        axis=-2,
    )
    builder.add_concat_nd(
        name="concat_coordinates",
        input_names=[f"{name}_flatten_raw_coordinates" for name in outputNames],
        output_name="raw_coordinates",
        axis=-2,
    )

    builder.set_output(
        output_names=["raw_confidence", "raw_coordinates"],
        output_dims=[(numberOfBoxes, numberOfClassLabels), (numberOfBoxes, 4)],
    )
Non Max Suppression を定義します。
# Just run to define the NMS function
def createNmsModelSpec(nnSpec):
    '''
    Create a coreml model with nms to filter the results of the model

    nnSpec: model spec whose first two ``description.output`` entries are
        copied to become both the inputs and the outputs of the new spec
        (the outputs are then renamed "confidence" and "coordinates").

    Returns the new spec, configured as a non maximum suppression model that
    reads "raw_confidence" / "raw_coordinates" and uses the module-level
    ``classLabels`` and ``numberOfClassLabels``.
    '''
    nmsSpec = ct.proto.Model_pb2.Model()
    nmsSpec.specificationVersion = 4

    # Define input and outputs of the model
    for i in range(2):
        nnOutput = nnSpec.description.output[i].SerializeToString()

        nmsSpec.description.input.add()
        nmsSpec.description.input[i].ParseFromString(nnOutput)

        nmsSpec.description.output.add()
        nmsSpec.description.output[i].ParseFromString(nnOutput)

    nmsSpec.description.output[0].name = "confidence"
    nmsSpec.description.output[1].name = "coordinates"

    # Define output shape of the model
    outputSizes = [numberOfClassLabels, 4]
    for i, fixedSize in enumerate(outputSizes):
        maType = nmsSpec.description.output[i].type.multiArrayType
        # First dimension of both output is the number of boxes, which should be flexible
        maType.shapeRange.sizeRanges.add()
        maType.shapeRange.sizeRanges[0].lowerBound = 0
        maType.shapeRange.sizeRanges[0].upperBound = -1
        # Second dimension is fixed, for "confidence" it's the number of classes,
        # for coordinates it's position (x, y) and size (w, h)
        maType.shapeRange.sizeRanges.add()
        maType.shapeRange.sizeRanges[1].lowerBound = fixedSize
        maType.shapeRange.sizeRanges[1].upperBound = fixedSize
        # Drop the fixed shape copied from nnSpec; the range above replaces it.
        del maType.shape[:]

    # Define the model type non maximum suppression
    nms = nmsSpec.nonMaximumSuppression
    nms.confidenceInputFeatureName = "raw_confidence"
    nms.coordinatesInputFeatureName = "raw_coordinates"
    nms.confidenceOutputFeatureName = "confidence"
    nms.coordinatesOutputFeatureName = "coordinates"
    nms.iouThresholdInputFeatureName = "iouThreshold"
    nms.confidenceThresholdInputFeatureName = "confidenceThreshold"
    # Some good default values for the two additional inputs, can be overwritten when using the model
    nms.iouThreshold = 0.6
    nms.confidenceThreshold = 0.4
    nms.stringClassLabels.vector.extend(classLabels)

    return nmsSpec
CoreMLモデルにデコードレイヤーとNon Max Suppressionを追加します。
# Run the functions to add the decode layer and NMS to the model.
# `builder` and `combineModelsAndExport` are not defined in this snippet and
# must already exist before these lines run.
# Order matters: the decode layers are added through `builder` first, and the
# NMS spec is then created from `builder.spec`.
addExportLayerToCoreml(builder)
nmsSpec = createNmsModelSpec(builder.spec)
combineModelsAndExport(builder.spec, nmsSpec, f"yolo5s.mlmodel") # The model will be saved in this path.
これで、Visionで使えるCoreMLモデルが保存されます。
iOSでの使用方法
デコードとNMSは、以下のリポジトリと書籍を参照しています。
🐣
フリーランスエンジニアです。
お仕事のご相談はこちらまで
rockyshikoku@gmail.com
Core MLやARKitを使ったアプリを作っています。
機械学習/AR関連の情報を発信しています。