概要

ハンドトラッキングと人物切り抜きをUnity単体で完結させる方法と、TouchDesignerからSpoutで送受信する方法を試したのでその記録。

環境

OS：Windows 10 Home
CPU：i7-8750H
GPU：NVIDIA GeForce RTX 2070 Max-Q

Unity 2021.3.4f1

使用したプラグイン

ハンドトラッキング
MediaPipeUnityPlugin
人物切り抜き
keijiro/NNCam
UnityでSpout
keijiro/KlakSpout

1. Mediapipeでハンドトラッキング

↑こちらの記事を参考に、MediaPipeUnityPluginを使用した。

準備

プロジェクトファイルをZipでダウンロードした後、Releases/v0.10.3のMediaPipeUnity.0.10.3.unitypackageをimportしてAssets/MediaPipeUnity/Samples/Scenesが動くことを確認する。

サンプルシーンは中身をいじるには複雑な構造になっていたので、WikiのTutorial/Official Solutionを順番に真似ていくことにした。

Face Tracking部分を一通りなぞった後、以下2つからhand_landmark_full.bytesやsidePacketを参考にしてHand Trackingに対応させていった。

結果

マーカーを表示するコード全文

Inspectorはこんな感じ。

HandtrackingAddLayer.cs

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.UI;
using Mediapipe.Unity.CoordinateSystem;
using Stopwatch = System.Diagnostics.Stopwatch; // for Timestamp

namespace Mediapipe.Unity.Tutorial
{
	/// <summary>
	/// Shows the first web camera on a <see cref="RawImage"/>, feeds every camera frame to a
	/// MediaPipe <see cref="CalculatorGraph"/> built from the assigned config text, and draws
	/// the hand landmarks polled from the "hand_landmarks" stream via the annotation controller.
	/// </summary>
	public class HandtrackingAddLayer : MonoBehaviour
	{
		// RawImage that displays the camera feed, and the requested camera resolution / frame rate.
		[SerializeField] private RawImage _screen;
		[SerializeField] private int _width;
		[SerializeField] private int _height;
		[SerializeField] private int _fps;

		// Hand tracking: graph config text, the running graph and the model asset loader.
		[SerializeField] private TextAsset _configAsset;
		private CalculatorGraph _graph;
		private ResourceManager _resourceManager;
		private enum ModelComplexity { Lite = 0, Full = 1,}
		[SerializeField] private ModelComplexity _modelComplexity = ModelComplexity.Full;
		[SerializeField] private int _maxNumHands = 2;

		// Camera input: live texture plus the CPU-side copies handed to MediaPipe.
		private WebCamTexture _webCamTexture;
		private Texture2D _inputTexture;
		private Color32[] _pixelData;

		// Draws the landmark markers on top of the camera image.
		[SerializeField] private MultiHandLandmarkListAnnotationController _annotationController;

		/// <summary>
		/// Coroutine entry point: starts the camera, loads the model assets, starts the graph
		/// and then loops forever, pushing one frame per rendered frame and drawing the result.
		/// </summary>
		/// <exception cref="System.Exception">No web camera device is available.</exception>
		private IEnumerator Start()
		{
			if (WebCamTexture.devices.Length == 0)
			{
				throw new System.Exception("Web Camera devices are not found");
			}

			// Use the first camera reported by Unity.
			var webCamDevice = WebCamTexture.devices[0];
			_webCamTexture = new WebCamTexture(webCamDevice.name, _width, _height, _fps);
			_webCamTexture.Play();

			// Until the device delivers real frames the texture reports a small placeholder size.
			yield return new WaitUntil(() => _webCamTexture.width > 16);
			_screen.rectTransform.sizeDelta = new Vector2(_width, _height);

			// The device may not honour the requested size, so size the CPU buffers from the
			// resolution the camera actually delivers; GetPixels32 needs an exactly matching array.
			var frameWidth = _webCamTexture.width;
			var frameHeight = _webCamTexture.height;

			// MediaPipe takes CPU-side image data wrapped in an ImageFrame.
			_inputTexture = new Texture2D(frameWidth, frameHeight, TextureFormat.RGBA32, false);
			_pixelData = new Color32[frameWidth * frameHeight];

			// Show the camera feed.
			_screen.texture = _webCamTexture;

			// Only the landmark and palm-detection models differ between Lite and Full.
			_resourceManager = new LocalResourceManager();
			var useLite = _modelComplexity == ModelComplexity.Lite;
			yield return _resourceManager.PrepareAssetAsync(useLite ? "hand_landmark_lite.bytes" : "hand_landmark_full.bytes");
			yield return _resourceManager.PrepareAssetAsync("hand_recrop.bytes");
			yield return _resourceManager.PrepareAssetAsync("handedness.txt");
			yield return _resourceManager.PrepareAssetAsync(useLite ? "palm_detection_lite.bytes" : "palm_detection_full.bytes");

			// The graph config text is supplied from outside through the Inspector.
			_graph = new CalculatorGraph(_configAsset.text);

			// Output stream that carries the detected hand landmarks.
			var handLandmarksStream = new OutputStream<NormalizedLandmarkListVectorPacket, List<NormalizedLandmarkList>>(_graph, "hand_landmarks");
			handLandmarksStream.StartPolling().AssertOk();

			// Side packets handed to StartRun.
			var sidePacket = new SidePacket();
			sidePacket.Emplace("model_complexity", new IntPacket((int)_modelComplexity));
			sidePacket.Emplace("num_hands", new IntPacket(_maxNumHands));
			// These three values align the marker coordinates with the camera image.
			sidePacket.Emplace("input_rotation", new IntPacket(0));
			sidePacket.Emplace("input_horizontally_flipped", new BoolPacket(false));
			sidePacket.Emplace("input_vertically_flipped", new BoolPacket(true));

			_graph.StartRun(sidePacket).AssertOk();

			var stopwatch = new Stopwatch();
			stopwatch.Start();

			// Rect used to convert normalized landmarks to local coordinates of the screen.
			var screenRect = _screen.GetComponent<RectTransform>().rect;

			// Reused every iteration instead of allocating a new yield instruction per frame.
			var waitForEndOfFrame = new WaitForEndOfFrame();

			while (true)
			{
				_inputTexture.SetPixels32(_webCamTexture.GetPixels32(_pixelData));
				var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, frameWidth, frameHeight, frameWidth * 4, _inputTexture.GetRawTextureData<byte>());
				// Timestamp in microseconds. Elapsed.Ticks is always in 100 ns units, whereas
				// ElapsedTicks is in Stopwatch.Frequency units and only matches on some platforms.
				var currentTimestamp = stopwatch.Elapsed.Ticks / (System.TimeSpan.TicksPerMillisecond / 1000);
				_graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();

				yield return waitForEndOfFrame;

				if (handLandmarksStream.TryGetNext(out var multiLandmarks))
				{
					// Draw the markers.
					_annotationController.DrawNow(multiLandmarks);

					// Log the position of one landmark per detected hand.
					if (multiLandmarks != null && multiLandmarks.Count > 0)
					{
						foreach (var landmarks in multiLandmarks)
						{
							// Landmark[9]: base of the middle finger.
							var posTarget = landmarks.Landmark[9];
							Debug.Log($"Unity Local Coordinates: {screenRect.GetPoint(posTarget)}, Image Coordinates: {posTarget}");
						}
					}
				}
				else
				{
					// No packet this frame: clear the markers.
					_annotationController.DrawNow(null);
				}
			}
		}

		/// <summary>
		/// Stops the camera, shuts the graph down and releases the CPU-side input texture.
		/// </summary>
		private void OnDestroy()
		{
			if (_webCamTexture != null)
			{
				_webCamTexture.Stop();
			}

			try
			{
				if (_graph != null)
				{
					try
					{
						// Close the input stream and wait for the graph to finish before disposing it.
						_graph.CloseInputStream("input_video").AssertOk();
						_graph.WaitUntilDone().AssertOk();
					}
					finally
					{
						_graph.Dispose();
						Debug.Log("Done");
					}
				}
			}
			finally
			{
				// Textures created with "new" are not released automatically; destroy explicitly
				// even when graph shutdown throws.
				if (_inputTexture != null)
				{
					Destroy(_inputTexture);
				}
			}
		}
	}
}

取得したい各LandMarkのIDはここから。

keijiro/NNCamで人物切り抜き

準備

keijiro/NNCamのプロジェクトファイルをZipでダウンロードした後、RoomフォルダをNNCamフォルダ下に移動して、NNCamフォルダをunitypackageとしてExport。
Mediapipeのハンドトラッキングを動かしていたプロジェクトにimportする。

NNCamのREADMEに書いてあるとおり、Releases/v0.0.1からBodyPixOnnx.zipをダウンロードし、作成したONNXフォルダ下に置く。

NNCam/Roomのシーンを再生しようとすると、MediaPipeUnityPluginと、NNCamが使用するBarracudaとで、Google.Protobuf.dllが重複しているという以下のエラーが出るので、

Multiple precompiled assemblies with the same name Google.Protobuf.dll included on the current platform.
Only one assembly with the same name is allowed per platform.

Library/PackageCache/com.unity.barracuda@80909e3320/Barracuda/Runtime/Plugins/ProtoBuffer/Google.Protobuf.dllのほうを削除する。

プロジェクトを立ち上げ直すとエラーが解消されているはず。

結果

NNCamのModelはRes Net 50-stride32を使用。

人物切り抜きとハンドトラッキングを合わせたコード全文

Inspectorはこんな感じ。

HandtrackingNNCam.cs

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.UI;
using Stopwatch = System.Diagnostics.Stopwatch; // for Timestamp
using NNCam; // from keijiro

namespace Mediapipe.Unity.Tutorial
{
	/// <summary>
	/// Combines NNCam person segmentation with MediaPipe hand tracking: the web camera image is
	/// composited over a background through the NNCam mask on a <see cref="RawImage"/>, while
	/// the same frames are sent to a <see cref="CalculatorGraph"/> whose hand landmarks are
	/// drawn by the annotation controller.
	/// </summary>
	public class HandtrackingNNCam : MonoBehaviour
	{
		// Camera input: live texture, requested resolution / frame rate, CPU-side copies and display target.
		private WebCamTexture _webCamTexture;
		[SerializeField] private int _width;
		[SerializeField] private int _height;
		[SerializeField] private int _fps;
		private Texture2D _inputTexture;
		private Color32[] _pixelData;
		[SerializeField] private RawImage _screen;

		// Hand tracking: graph config text, model selection, the running graph and the asset loader.
		[SerializeField] private TextAsset _configAsset;
		private enum ModelComplexity { Lite = 0, Full = 1,}
		[SerializeField] private ModelComplexity _modelComplexity = ModelComplexity.Full;
		[SerializeField] private int _maxNumHands = 2;
		private CalculatorGraph _graph;
		private ResourceManager _resourceManager;

		// Draws the landmark markers on top of the image.
		[SerializeField] private MultiHandLandmarkListAnnotationController _annotationController;

		// NNCam: background image, mask threshold, model resources and the compositing shader.
		[SerializeField] private Texture2D _background = null;
		[SerializeField, Range(0.01f, 0.99f)] private float _threshold = .5f;
		[SerializeField] private ResourceSet _resources = null;
		[SerializeField] private Shader _shader = null;
		private SegmentationFilter _filter;
		private Material _material;

		/// <summary>
		/// Updates the person-segmentation mask from the current camera image.
		/// </summary>
		private void Update()
		{
			// Start may have aborted (e.g. no camera); skip instead of dereferencing null every frame.
			if (_filter == null || _webCamTexture == null)
			{
				return;
			}

			_filter.ProcessImage(_webCamTexture);
		}

		/// <summary>
		/// Coroutine entry point: sets up NNCam and the camera, loads the model assets, starts
		/// the graph and then loops forever, pushing one frame per rendered frame, refreshing
		/// the compositing material and drawing the detected landmarks.
		/// </summary>
		/// <exception cref="System.Exception">No web camera device is available.</exception>
		private IEnumerator Start()
		{
			// NNCam: create the filter and apply the compositing material to the RawImage.
			_filter = new SegmentationFilter(_resources);
			_material = new Material(_shader);
			_screen.material = _material;

			if (WebCamTexture.devices.Length == 0)
			{
				throw new System.Exception("Web Camera devices are not found");
			}

			// Use the first camera reported by Unity.
			var webCamDevice = WebCamTexture.devices[0];
			_webCamTexture = new WebCamTexture(webCamDevice.name, _width, _height, _fps);
			_webCamTexture.Play();

			// Until the device delivers real frames the texture reports a small placeholder size.
			yield return new WaitUntil(() => _webCamTexture.width > 16);

			_screen.rectTransform.sizeDelta = new Vector2(_width, _height);

			// The device may not honour the requested size, so size the CPU buffers from the
			// resolution the camera actually delivers; GetPixels32 needs an exactly matching array.
			var frameWidth = _webCamTexture.width;
			var frameHeight = _webCamTexture.height;

			// MediaPipe takes CPU-side image data wrapped in an ImageFrame.
			_inputTexture = new Texture2D(frameWidth, frameHeight, TextureFormat.RGBA32, false);
			_pixelData = new Color32[frameWidth * frameHeight];

			// Show the camera feed.
			_screen.texture = _webCamTexture;

			// Only the landmark and palm-detection models differ between Lite and Full.
			_resourceManager = new LocalResourceManager();
			var useLite = _modelComplexity == ModelComplexity.Lite;
			yield return _resourceManager.PrepareAssetAsync(useLite ? "hand_landmark_lite.bytes" : "hand_landmark_full.bytes");
			yield return _resourceManager.PrepareAssetAsync("hand_recrop.bytes");
			yield return _resourceManager.PrepareAssetAsync("handedness.txt");
			yield return _resourceManager.PrepareAssetAsync(useLite ? "palm_detection_lite.bytes" : "palm_detection_full.bytes");

			// The graph config text is supplied from outside through the Inspector.
			_graph = new CalculatorGraph(_configAsset.text);

			// Output stream that carries the detected hand landmarks.
			var handLandmarksStream = new OutputStream<NormalizedLandmarkListVectorPacket, List<NormalizedLandmarkList>>(_graph, "hand_landmarks");
			handLandmarksStream.StartPolling().AssertOk();

			// Side packets handed to StartRun.
			var sidePacket = new SidePacket();
			sidePacket.Emplace("model_complexity", new IntPacket((int)_modelComplexity));
			sidePacket.Emplace("num_hands", new IntPacket(_maxNumHands));
			// These three values align the marker coordinates with the camera image.
			sidePacket.Emplace("input_rotation", new IntPacket(0));
			sidePacket.Emplace("input_horizontally_flipped", new BoolPacket(false));
			sidePacket.Emplace("input_vertically_flipped", new BoolPacket(true));

			_graph.StartRun(sidePacket).AssertOk();

			var stopwatch = new Stopwatch();
			stopwatch.Start();

			// Reused every iteration instead of allocating a new yield instruction per frame.
			var waitForEndOfFrame = new WaitForEndOfFrame();

			while (true)
			{
				_inputTexture.SetPixels32(_webCamTexture.GetPixels32(_pixelData));
				var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, frameWidth, frameHeight, frameWidth * 4, _inputTexture.GetRawTextureData<byte>());
				// Timestamp in microseconds. Elapsed.Ticks is always in 100 ns units, whereas
				// ElapsedTicks is in Stopwatch.Frequency units and only matches on some platforms.
				var currentTimestamp = stopwatch.Elapsed.Ticks / (System.TimeSpan.TicksPerMillisecond / 1000);
				_graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();

				// Refresh the inputs of the material applied to the RawImage.
				_material.SetTexture("_MainTex", _webCamTexture);
				_material.SetTexture("_Background", _background);
				_material.SetTexture("_Mask", _filter.MaskTexture);
				_material.SetFloat("_Threshold", _threshold);

				yield return waitForEndOfFrame;

				if (handLandmarksStream.TryGetNext(out var multiLandmarks))
				{
					// Draw the markers.
					_annotationController.DrawNow(multiLandmarks);
				}
				else
				{
					// No packet this frame: clear the markers.
					_annotationController.DrawNow(null);
				}
			}
		}

		/// <summary>
		/// Stops the camera, shuts the graph down and releases the NNCam filter, the material
		/// and the CPU-side input texture.
		/// </summary>
		private void OnDestroy()
		{
			try
			{
				if (_webCamTexture != null)
				{
					_webCamTexture.Stop();
				}

				if (_graph != null)
				{
					try
					{
						// Close the input stream and wait for the graph to finish before disposing it.
						_graph.CloseInputStream("input_video").AssertOk();
						_graph.WaitUntilDone().AssertOk();
					}
					finally
					{
						_graph.Dispose();
						Debug.Log("Done");
					}
				}
			}
			finally
			{
				// Runs even when graph shutdown throws, and tolerates a Start that never got
				// far enough to create these objects.
				_filter?.Dispose();
				if (_material != null)
				{
					Destroy(_material);
				}
				if (_inputTexture != null)
				{
					Destroy(_inputTexture);
				}
			}
		}
	}
}

Compositor.shaderをカメラの入力に対応させるため、_CameraFeedを_MainTexで置き換える。

CompositorForMediapipe.shader

// Composites a foreground texture over a background texture using a mask texture.
// Adapted from NNCam's Compositor shader so that the foreground is read from _MainTex.
Shader "Hidden/NNCam/CompositorForMediapipe"
{
    Properties
    {
        _MainTex ("Albedo (RGB)", 2D) = "white" {}
        _Background("", 2D) = ""{}
        _Mask("", 2D) = ""{}
    }

    CGINCLUDE

    #include "UnityCG.cginc"

    // Foreground, background and mask textures.
    sampler2D _MainTex;
    sampler2D _Background;
    sampler2D _Mask;
    // Mask cut-off; not listed in Properties, so it has to be supplied from script.
    float _Threshold;

    // Pass-through vertex stage: clip-space position plus unmodified UV.
    void Vertex(float4 position : POSITION,
                float2 uv : TEXCOORD0,
                out float4 outPosition : SV_Position,
                out float2 outUV : TEXCOORD0)
    {
        outPosition = UnityObjectToClipPos(position);
        outUV = uv;
    }

    // Blends background and foreground by the mask's red channel, with a soft edge
    // spanning 0.1 on either side of _Threshold (clamped to [0, 1]).
    float4 Fragment(float4 position : SV_Position,
                    float2 uv : TEXCOORD0) : SV_Target
    {
        float3 bg = tex2D(_Background, uv).rgb;
        float3 fg = tex2D(_MainTex, uv).rgb;
        float mask = tex2D(_Mask, uv).r;
        float th1 = max(0, _Threshold - 0.1);
        float th2 = min(1, _Threshold + 0.1);
        return float4(lerp(bg, fg, smoothstep(th1, th2, mask)), 1);
    }

    ENDCG

    SubShader
    {
        Pass
        {
            CGPROGRAM
            #pragma vertex Vertex
            #pragma fragment Fragment
            ENDCG
        }
    }
}

TouchDesignerで切り抜き、Spoutでやり取り

準備

こちらを参考にSpoutの環境を整える。
keijiro/KlakSpoutのREADMEの手順に従って、Scoped Registries（スコープ付きレジストリ）を利用してKlakSpoutをimportする。

うまくいくと、Window/Package Manager/My Registriesにkeijiro作のプラグインがたくさん表示される。

スコープ付きレジストリとは何か、を解説している記事。

結果

Nvidia Background TOPが優秀だけど、ディレイあり。

Spoutの受信とハンドトラッキングを合わせたコード全文

Inspectorはこんな感じ。

Target TextureにSpoutを受ける用のRenderTextureを作成してセットしたらすぐに受け取れた。

入力画像をハンドトラッキングの推論に渡すところは、

_inputTexture.SetPixels32(_webCamTexture.GetPixels32(_pixelData));

Color32[]で入っていれば良さそうなので、以下の記事を参考にRenderTextureを変換してみる。

HandtrackingSpout.cs

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.UI;
using Stopwatch = System.Diagnostics.Stopwatch; // for Timestamp

namespace Mediapipe.Unity.Tutorial
{
	/// <summary>
	/// Runs MediaPipe hand tracking on frames that arrive in a <see cref="RenderTexture"/>
	/// (filled by a Spout receiver): the texture is shown on a <see cref="RawImage"/>, read
	/// back to the CPU every frame and sent to a <see cref="CalculatorGraph"/> whose hand
	/// landmarks are drawn by the annotation controller.
	/// </summary>
	public class HandtrackingSpout : MonoBehaviour
	{
		// Input: RenderTexture to read from, on-screen size and the display target.
		[SerializeField] private RenderTexture _spoutIn;
		[SerializeField] private int _width;
		[SerializeField] private int _height;
		private Texture2D _inputTexture;
		// Reused CPU read-back target for GetPixels, so no texture is created per frame.
		private Texture2D _readbackTexture;
		[SerializeField] private RawImage _screen;

		// Hand tracking: graph config text, model selection, the running graph and the asset loader.
		[SerializeField] private TextAsset _configAsset;
		private enum ModelComplexity { Lite = 0, Full = 1,}
		[SerializeField] private ModelComplexity _modelComplexity = ModelComplexity.Full;
		[SerializeField] private int _maxNumHands = 2;
		private CalculatorGraph _graph;
		private ResourceManager _resourceManager;

		// Draws the landmark markers on top of the image.
		[SerializeField] private MultiHandLandmarkListAnnotationController _annotationController;

		/// <summary>
		/// Coroutine entry point: loads the model assets, starts the graph and then loops
		/// forever, pushing one read-back frame per rendered frame and drawing the result.
		/// </summary>
		private IEnumerator Start()
		{
			// Size the CPU copy from the RenderTexture itself: SetPixels32 needs exactly
			// width * height pixels, which the Inspector values are not guaranteed to match.
			var frameWidth = _spoutIn.width;
			var frameHeight = _spoutIn.height;

			// MediaPipe takes CPU-side image data wrapped in an ImageFrame.
			_inputTexture = new Texture2D(frameWidth, frameHeight, TextureFormat.RGBA32, false);

			_screen.rectTransform.sizeDelta = new Vector2(_width, _height);

			// Show the Spout image.
			_screen.texture = _spoutIn;

			// Only the landmark and palm-detection models differ between Lite and Full.
			_resourceManager = new LocalResourceManager();
			var useLite = _modelComplexity == ModelComplexity.Lite;
			yield return _resourceManager.PrepareAssetAsync(useLite ? "hand_landmark_lite.bytes" : "hand_landmark_full.bytes");
			yield return _resourceManager.PrepareAssetAsync("hand_recrop.bytes");
			yield return _resourceManager.PrepareAssetAsync("handedness.txt");
			yield return _resourceManager.PrepareAssetAsync(useLite ? "palm_detection_lite.bytes" : "palm_detection_full.bytes");

			// The graph config text is supplied from outside through the Inspector.
			_graph = new CalculatorGraph(_configAsset.text);

			// Output stream that carries the detected hand landmarks.
			var handLandmarksStream = new OutputStream<NormalizedLandmarkListVectorPacket, List<NormalizedLandmarkList>>(_graph, "hand_landmarks");
			handLandmarksStream.StartPolling().AssertOk();

			// Side packets handed to StartRun.
			var sidePacket = new SidePacket();
			sidePacket.Emplace("model_complexity", new IntPacket((int)_modelComplexity));
			sidePacket.Emplace("num_hands", new IntPacket(_maxNumHands));
			sidePacket.Emplace("input_rotation", new IntPacket(0));
			sidePacket.Emplace("input_horizontally_flipped", new BoolPacket(false));
			sidePacket.Emplace("input_vertically_flipped", new BoolPacket(true));

			_graph.StartRun(sidePacket).AssertOk();

			var stopwatch = new Stopwatch();
			stopwatch.Start();

			// Reused every iteration instead of allocating a new yield instruction per frame.
			var waitForEndOfFrame = new WaitForEndOfFrame();

			while (true)
			{
				_inputTexture.SetPixels32(GetPixels(_spoutIn));
				var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, frameWidth, frameHeight, frameWidth * 4, _inputTexture.GetRawTextureData<byte>());
				// Timestamp in microseconds. Elapsed.Ticks is always in 100 ns units, whereas
				// ElapsedTicks is in Stopwatch.Frequency units and only matches on some platforms.
				var currentTimestamp = stopwatch.Elapsed.Ticks / (System.TimeSpan.TicksPerMillisecond / 1000);
				_graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();

				yield return waitForEndOfFrame;

				if (handLandmarksStream.TryGetNext(out var multiLandmarks))
				{
					// Draw the markers.
					_annotationController.DrawNow(multiLandmarks);
				}
				else
				{
					// No packet this frame: clear the markers.
					_annotationController.DrawNow(null);
				}
			}
		}

		/// <summary>
		/// Reads the contents of a RenderTexture back to the CPU as a Color32 array so that it
		/// can be handed to MediaPipe.
		/// </summary>
		/// <param name="rt">RenderTexture to read; it is made active only for the duration of the read.</param>
		/// <returns>The pixels of <paramref name="rt"/>, rt.width * rt.height entries.</returns>
		private Color32[] GetPixels(RenderTexture rt)
		{
			// Create the read-back texture once and reuse it; a Texture2D created per call
			// and never destroyed leaks native memory every frame.
			if (_readbackTexture == null || _readbackTexture.width != rt.width || _readbackTexture.height != rt.height)
			{
				if (_readbackTexture != null)
				{
					Destroy(_readbackTexture);
				}
				_readbackTexture = new Texture2D(rt.width, rt.height, TextureFormat.RGBA32, false);
			}

			var previousActive = RenderTexture.active;
			RenderTexture.active = rt;
			try
			{
				// ReadPixels fills the texture's CPU-side data; Apply() is not needed because
				// the texture is never sampled on the GPU.
				_readbackTexture.ReadPixels(new UnityEngine.Rect(0, 0, rt.width, rt.height), 0, 0);
			}
			finally
			{
				// Always restore whatever was active before.
				RenderTexture.active = previousActive;
			}

			return _readbackTexture.GetPixels32();
		}

		/// <summary>
		/// Shuts the graph down and releases the textures created by this component.
		/// </summary>
		private void OnDestroy()
		{
			try
			{
				if (_graph != null)
				{
					try
					{
						// Close the input stream and wait for the graph to finish before disposing it.
						_graph.CloseInputStream("input_video").AssertOk();
						_graph.WaitUntilDone().AssertOk();
					}
					finally
					{
						_graph.Dispose();
						Debug.Log("Done");
					}
				}
			}
			finally
			{
				// Textures created with "new" are not released automatically; destroy explicitly
				// even when graph shutdown throws.
				if (_inputTexture != null)
				{
					Destroy(_inputTexture);
				}
				if (_readbackTexture != null)
				{
					Destroy(_readbackTexture);
				}
			}
		}
	}
}

余談

Nvidia Background TOPについては、このチュートリアルがわかりやすい。

あとkeijiroはすごい。

【Unity】Mediapipeでハンドトラッキング + 人物切り抜き（ONNX or TouchDesignerからSpout）

概要

目次

環境

使用したプラグイン

1. Mediapipeでハンドトラッキング

準備

結果

マーカーを表示するコード全文

keijiro/NNCamで人物切り抜き

準備

結果

人物切り抜きとハンドトラッキングを合わせたコード全文

TouchDesignerで切り抜き、Spoutでやり取り

準備

結果

Spoutの受信とハンドトラッキングを合わせたコード全文

余談