What I want to build
Voice calls with someone you have just met are manageable, but a video call is a much higher hurdle. Still, being able to see the other person's expressions makes them feel more approachable than voice alone.
So I decided to build a call feature that hides both the face and the background while still conveying facial expressions to the other person.
What I'm using and why
- Language: Swift, because this is an iOS calling app.
- Calling: TRTC (Tencent Real-Time Communication). I have worked with Agora and MediaSoup before, but never with TRTC.
- Vision framework, to run face tracking on images and draw the resulting expression. ARKit would actually give cleaner face tracking, but my real face and background could flash into view for a moment, so I gave up on it.
Preparation
- Sign up for Tencent Cloud: https://www.tencentcloud.com
  There is a free tier, so don't worry about cost.
- The console is English/Chinese only, but the documentation is available in Japanese and worth a look: https://www.tencentcloud.com/jp/document/product/647/35086
That's all.
Time to implement
The documentation is solid, so if you work through the "solution without UI" guide at the URL above from top to bottom, you can get to working video calls very quickly.
Explaining every step is tedious, so I'll just post the code.
I wrote a manager class that handles everything related to TRTC.
import Foundation
import TXLiteAVSDK_Professional

protocol TRTCManagerDelegate: AnyObject {
    func didJoinedRoom()
    func didUserJoined(userId: String)
    func didUserLeaved(userId: String)
    func onRenderRemoteUser(pixelBuffer: CVPixelBuffer?)
    func onRenderLocalUser(pixelBuffer: CVPixelBuffer?)
}

class TRTCManager: NSObject {
    private let trtcCloud = TRTCCloud.sharedInstance()
    weak var delegate: TRTCManagerDelegate?

    override init() {
        super.init()
        // TRTC setup: use custom video capture and receive raw frames through the render delegate.
        trtcCloud.delegate = self
        trtcCloud.enableCustomVideoCapture(.small, enable: true)
        trtcCloud.setLocalVideoRenderDelegate(self,
                                              pixelFormat: ._NV12,
                                              bufferType: .pixelBuffer)
    }

    func join(userId: String, roomId: Int) {
        // Join the room.
        let params = TRTCParams()
        params.sdkAppId = 12345678 // Replace with the SDKAppID issued in the Tencent Cloud console.
        params.roomId = UInt32(roomId)
        params.userId = userId
        // userSig generation is ported from the official demo.
        params.userSig = GenerateTestUserSig.genTestUserSig(identifier: userId)
        params.role = .anchor
        trtcCloud.enterRoom(params, appScene: .videoCall)
    }

    func leave() {
        trtcCloud.exitRoom()
    }

    // Normally this alone would be enough to implement a local camera view.
    func startLocalView(view: UIView) {
        let param = TRTCRenderParams()
        param.fillMode = .fill
        param.mirrorType = .auto
        trtcCloud.setLocalRenderParams(param)
        trtcCloud.startLocalAudio(.speech)
    }

    func startRemoteView(userId: String, view: UIView) {
        trtcCloud.startRemoteView(userId,
                                  streamType: .small,
                                  view: view)
    }

    // Use this instead of startLocalView when you want to process the image locally before sending it.
    func sendCustomVideoData(image: UIImage?) {
        guard let buffer = image?.ciImage?.pixelBuffer else {
            return
        }
        let videoFrame = TRTCVideoFrame()
        videoFrame.pixelFormat = ._NV12
        videoFrame.bufferType = .pixelBuffer
        videoFrame.pixelBuffer = buffer
        trtcCloud.sendCustomVideoData(.small, frame: videoFrame)
    }

    // Same idea as above, but taking a CVPixelBuffer directly.
    func sendCustomVideoData(buffer: CVPixelBuffer?) {
        let videoFrame = TRTCVideoFrame()
        // Note: the capture output in the view controller delivers 32BGRA buffers; if colors look off,
        // the declared pixelFormat may need to match the actual buffer format.
        videoFrame.pixelFormat = ._NV12
        videoFrame.bufferType = .pixelBuffer
        videoFrame.pixelBuffer = buffer
        trtcCloud.sendCustomVideoData(.small, frame: videoFrame)
    }
}

extension TRTCManager: TRTCCloudDelegate {
    func onError(_ errCode: TXLiteAVError, errMsg: String?, extInfo: [AnyHashable: Any]?) {
        print("onError: errorCode \(errCode.rawValue)")
    }

    func onEnterRoom(_ result: Int) {
        if result > 0 {
            delegate?.didJoinedRoom()
        }
    }

    func onExitRoom(_ reason: Int) {
        print("onExitRoom: reason \(reason)")
    }

    func onRemoteUserEnterRoom(_ userId: String) {
        // When someone enters the room, register ourselves as their video render delegate.
        trtcCloud.setRemoteVideoRenderDelegate(userId,
                                               delegate: self,
                                               pixelFormat: ._NV12,
                                               bufferType: .pixelBuffer)
        delegate?.didUserJoined(userId: userId)
    }

    func onRemoteUserLeaveRoom(_ userId: String, reason: Int) {
        delegate?.didUserLeaved(userId: userId)
    }
}

extension TRTCManager: TRTCVideoRenderDelegate {
    func onRenderVideoFrame(_ frame: TRTCVideoFrame, userId: String?, streamType: TRTCVideoStreamType) {
        DispatchQueue.main.async { [weak self] in
            if let userId,
               !userId.isEmpty {
                // A remote user's frame arrived: hand its buffer to the view controller via the delegate.
                self?.delegate?.onRenderRemoteUser(pixelBuffer: frame.pixelBuffer)
            }
        }
    }
}
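Before moving on to the view controller, this is roughly how the manager is meant to be driven. The CallCoordinator below is a hypothetical caller I'm sketching purely to show the flow; the real caller is the view controller that follows.

import CoreVideo

// Hypothetical caller: join, push frames, leave, and react to room events via the delegate.
final class CallCoordinator: TRTCManagerDelegate {
    private let manager = TRTCManager()

    func start(userId: String) {
        manager.delegate = self
        manager.join(userId: userId, roomId: 1)
    }

    // Call once per captured (and optionally processed) camera frame.
    func push(_ buffer: CVPixelBuffer) {
        manager.sendCustomVideoData(buffer: buffer)
    }

    func stop() {
        manager.leave()
    }

    // Delegate callbacks: react to room events and incoming remote frames here.
    func didJoinedRoom() {}
    func didUserJoined(userId: String) {}
    func didUserLeaved(userId: String) {}
    func onRenderRemoteUser(pixelBuffer: CVPixelBuffer?) {}
    func onRenderLocalUser(pixelBuffer: CVPixelBuffer?) {}
}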
Here is the ViewController side.
import UIKit
import AVKit
import Vision

class TalkRoomViewController: UIViewController {
    @IBOutlet weak var localUserImageView: UIImageView!
    @IBOutlet weak var remoteUserImageView: UIImageView!
    @IBOutlet weak var remoteUserView: UIView!

    private let captureSession = AVCaptureSession()
    private let serialQueue = DispatchQueue(label: "Hoge")
    private let faceImageDetector = ImageDetector()
    private let trtcManager = TRTCManager()
    var userId = ""

    override func viewDidLoad() {
        super.viewDidLoad()
        trtcManager.delegate = self
        // Mirror the local preview so it behaves like a typical front-camera view.
        localUserImageView.transform = CGAffineTransform(scaleX: -1, y: 1)
        setup()
    }

    override func viewWillAppear(_ animated: Bool) {
        super.viewWillAppear(animated)
        joinRoom()
        serialQueue.async { [weak self] in
            self?.captureSession.startRunning()
        }
    }

    override func viewWillDisappear(_ animated: Bool) {
        super.viewWillDisappear(animated)
        serialQueue.async { [weak self] in
            self?.captureSession.stopRunning()
        }
    }

    private func joinRoom() {
        trtcManager.join(userId: userId, roomId: 1)
    }

    private func setup() {
        captureSession.beginConfiguration()
        guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .front),
              let deviceInput = try? AVCaptureDeviceInput(device: device),
              captureSession.canAddInput(deviceInput)
        else { return }
        captureSession.addInput(deviceInput)

        let videoDataOutput = AVCaptureVideoDataOutput()
        videoDataOutput.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String: Int(kCVPixelFormatType_32BGRA)]
        videoDataOutput.alwaysDiscardsLateVideoFrames = true
        videoDataOutput.setSampleBufferDelegate(self, queue: serialQueue)
        guard captureSession.canAddOutput(videoDataOutput) else { return }
        captureSession.addOutput(videoDataOutput)

        for connection in videoDataOutput.connections {
            if connection.isVideoOrientationSupported {
                connection.videoOrientation = .portrait
            }
        }
        captureSession.commitConfiguration()
    }

    @IBAction func didSelectClose(_ sender: Any) {
        trtcManager.leave()
        dismiss(animated: true)
    }
}

extension TalkRoomViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        // The local view takes the frame from AVCapture, processes it, and displays the result directly.
        trtcManager.sendCustomVideoData(buffer: sampleBuffer.imageBuffer)
        faceImageDetector.detectFaceImage(from: sampleBuffer) { [weak self] result in
            DispatchQueue.main.async {
                self?.localUserImageView.image = result
            }
        }
    }
}

extension TalkRoomViewController: TRTCManagerDelegate {
    func didJoinedRoom() {
    }

    func didUserJoined(userId: String) {
        trtcManager.startRemoteView(userId: userId,
                                    view: remoteUserView)
    }

    func didUserLeaved(userId: String) {
    }

    func onRenderRemoteUser(pixelBuffer: CVPixelBuffer?) {
        // The remote view takes the frame received from TRTC, processes it, and displays the result.
        if let pixelBuffer {
            let ciImage = CIImage(cvImageBuffer: pixelBuffer)
            let cgImage = CIContext(options: nil).createCGImage(ciImage, from: ciImage.extent)
            faceImageDetector.detectFaceImage(source: cgImage) { result in
                DispatchQueue.main.async { [weak self] in
                    self?.remoteUserImageView.image = result
                }
            }
        }
    }

    func onRenderLocalUser(pixelBuffer: CVPixelBuffer?) {
        // Not invoked in this flow; the local preview is driven by the AVCapture frames above.
        if let pixelBuffer {
            localUserImageView.image = UIImage(ciImage: CIImage(cvImageBuffer: pixelBuffer))
        }
    }
}
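One thing the code above does not show: the capture session and startLocalAudio need camera and microphone access, so the usual NSCameraUsageDescription / NSMicrophoneUsageDescription entries have to be in Info.plist, and it is safer to request permission before starting the session. A minimal sketch follows; the requestCapturePermissions helper is my own illustration, not part of the project above.

import Foundation
import AVFoundation

// Hypothetical helper: request camera and microphone access before captureSession.startRunning().
func requestCapturePermissions(completion: @escaping (Bool) -> Void) {
    AVCaptureDevice.requestAccess(for: .video) { videoGranted in
        AVAudioSession.sharedInstance().requestRecordPermission { audioGranted in
            DispatchQueue.main.async {
                completion(videoGranted && audioGranted)
            }
        }
    }
}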
The part that draws the face from an image using the Vision framework:
import UIKit
import Vision
import AVKit

class ImageDetector {
    // Detects face landmarks in a CGImage and hands back an image containing only the drawn landmarks.
    func detectFaceImage(source image: CGImage?, handler: @escaping (UIImage?) -> Void) {
        guard let image else {
            handler(nil)
            return
        }
        let detectFaceRequest = VNDetectFaceLandmarksRequest { [weak self] request, error in
            if error == nil {
                if let results = request.results as? [VNFaceObservation] {
                    for faceObservation in results {
                        guard let self = self,
                              let landmarks = faceObservation.landmarks else {
                            continue
                        }
                        let boundingRect = faceObservation.boundingBox
                        let resultImage = self.drawOnImage(sourceSize: CGSize(width: image.width, height: image.height), boundingRect: boundingRect, faceLandmarks: landmarks)
                        handler(resultImage)
                    }
                }
            } else {
                print(error!.localizedDescription)
            }
        }
        let vnImage = VNImageRequestHandler(cgImage: image, options: [:])
        try? vnImage.perform([detectFaceRequest])
    }

    func detectFaceImage(from sampleBuffer: CMSampleBuffer, handler: @escaping (UIImage?) -> Void) {
        guard let image = image(from: sampleBuffer) else {
            handler(nil)
            return
        }
        detectFaceImage(source: image.cgImage, handler: handler)
    }

    // Draws the landmark regions onto a transparent canvas the size of the source image.
    private func drawOnImage(sourceSize: CGSize, boundingRect: CGRect, faceLandmarks: VNFaceLandmarks2D) -> UIImage {
        UIGraphicsBeginImageContextWithOptions(sourceSize, false, 1)
        let context = UIGraphicsGetCurrentContext()!
        // Vision coordinates are bottom-left based, so flip the context vertically.
        context.translateBy(x: 0.0, y: sourceSize.height)
        context.scaleBy(x: 1.0, y: -1.0)
        context.setLineJoin(.round)
        context.setLineCap(.round)
        context.setShouldAntialias(true)
        context.setAllowsAntialiasing(true)

        // Draw the overlay.
        context.setLineWidth(1.0)
        if let faceContour = faceLandmarks.faceContour {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: faceContour, color: UIColor.magenta.cgColor)
        }
        if let leftEye = faceLandmarks.leftEye {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: leftEye, color: UIColor.magenta.cgColor, close: true)
        }
        if let rightEye = faceLandmarks.rightEye {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: rightEye, color: UIColor.magenta.cgColor, close: true)
        }
        if let leftPupil = faceLandmarks.leftPupil {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: leftPupil, color: UIColor.magenta.cgColor, close: true)
        }
        if let rightPupil = faceLandmarks.rightPupil {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: rightPupil, color: UIColor.magenta.cgColor, close: true)
        }
        if let nose = faceLandmarks.nose {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: nose, color: UIColor.magenta.cgColor)
        }
        if let noseCrest = faceLandmarks.noseCrest {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: noseCrest, color: UIColor.magenta.cgColor)
        }
        if let outerLips = faceLandmarks.outerLips {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: outerLips, color: UIColor.magenta.cgColor, close: true)
        }
        if let innerLips = faceLandmarks.innerLips {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: innerLips, color: UIColor.magenta.cgColor, close: true)
        }
        if let leftEyebrow = faceLandmarks.leftEyebrow {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: leftEyebrow, color: UIColor.magenta.cgColor, close: true)
        }
        if let rightEyebrow = faceLandmarks.rightEyebrow {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: rightEyebrow, color: UIColor.magenta.cgColor, close: true)
        }

        let coloredImg: UIImage = UIGraphicsGetImageFromCurrentImageContext()!
        UIGraphicsEndImageContext()
        return coloredImg
    }

    // Strokes one landmark region, scaling its normalized points into image coordinates.
    private func drawFeature(_ context: CGContext,
                             sourceSize: CGSize,
                             boundingRect: CGRect,
                             feature: VNFaceLandmarkRegion2D,
                             color: CGColor,
                             close: Bool = false) {
        let rectWidth = sourceSize.width * boundingRect.width
        let rectHeight = sourceSize.height * boundingRect.height
        context.setStrokeColor(color)
        context.setFillColor(color)
        let mappedPoints = feature.normalizedPoints.map { CGPoint(x: boundingRect.origin.x * sourceSize.width + $0.x * rectWidth, y: boundingRect.origin.y * sourceSize.height + $0.y * rectHeight) }
        if mappedPoints.isEmpty {
            return
        }
        if close {
            // Closed regions (eyes, lips, ...): indices wrap around so the curve connects back to the start.
            mappedPoints.enumerated().forEach { offset, element in
                switch offset {
                case 0:
                    break
                default:
                    let beforeOffset = (mappedPoints.count + offset - 1) % mappedPoints.count
                    let afterOffset = (mappedPoints.count + offset + 1) % mappedPoints.count
                    let moreAfterOffset = (mappedPoints.count + offset + 2) % mappedPoints.count
                    context.move(to: mappedPoints[beforeOffset])
                    context.addCurve(to: mappedPoints[moreAfterOffset],
                                     control1: element, control2: mappedPoints[afterOffset])
                }
            }
        } else {
            // Open regions (face contour, nose crest, ...): stop at the last point.
            mappedPoints.enumerated().forEach { offset, element in
                switch offset {
                case 0:
                    break
                default:
                    let beforeOffset = offset - 1
                    let afterOffset = offset + 1
                    let moreAfterOffset = offset + 2
                    if mappedPoints.indices.contains(beforeOffset),
                       mappedPoints.indices.contains(afterOffset),
                       mappedPoints.indices.contains(moreAfterOffset) {
                        context.move(to: mappedPoints[beforeOffset])
                        context.addCurve(to: mappedPoints[moreAfterOffset],
                                         control1: element, control2: mappedPoints[afterOffset])
                    }
                }
            }
        }
        context.strokePath()
    }

    private func image(from sampleBuffer: CMSampleBuffer) -> UIImage? {
        if let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
            let ciImage = CIImage(cvPixelBuffer: pixelBuffer)
            let imageRect = CGRect(x: 0, y: 0, width: CVPixelBufferGetWidth(pixelBuffer), height: CVPixelBufferGetHeight(pixelBuffer))
            let context = CIContext()
            if let image = context.createCGImage(ciImage, from: imageRect) {
                return UIImage(cgImage: image)
            }
        }
        return nil
    }
}
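ImageDetector can also be tried on its own with a still image, for example in a playground, which is handy for checking how the landmark drawing looks before wiring up the call. A small sketch, where the asset name "sample_face" is just a placeholder:

import UIKit

// Hypothetical standalone check of the landmark drawing against a bundled test photo.
let detector = ImageDetector()
if let cgImage = UIImage(named: "sample_face")?.cgImage {
    detector.detectFaceImage(source: cgImage) { drawn in
        // "drawn" contains only the magenta landmark strokes on a transparent canvas,
        // so neither the face nor the background of the original photo is visible.
        print(drawn?.size ?? .zero)
    }
}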
The finished result
Thoughts after building it
About TRTC (from an iOS point of view)
- About as easy to use as Agora; for the most basic kind of call it may even be simpler than Agora.
- The documentation and samples are very thorough.
- The console being Chinese/English only makes it awkward to use.
About the Vision framework
- The implementation is simple and easy to follow.
- The amount of facial information you get is a bit limited...
About posting on Qiita
- This is my first post, and it turned out to be more work than writing the code. lol
- I plan to keep writing little by little from now on, with a smaller scope per post.
What I want to try next
TRTC seems capable of most of what I want, so next I'd like to build something like an app that processes the video into a Denpa Shōnen-style UI for calls, or one that draws the face more cleanly during a call.