What I want to achieve
Having a voice call with someone you have just met is fine, but a video call is a much higher hurdle. That said, seeing the other person's expressions makes them feel far more approachable than voice alone.
So I decided to build a calling feature that hides both the face and the background while still conveying facial expressions to the other side.
What I'm using and why
- Language: Swift
  Because this is an iOS calling app.
- Calling layer: TRTC (Tencent Real-Time Communication)
  I have used Agora and MediaSoup before, but never TRTC, so I wanted to try it.
- Vision framework (runs face tracking on camera frames and draws the facial expression)
  ARKit would actually give cleaner face tracking, but my real face and background could flash on screen for a split second, so I gave up on it.
Preparation
- Sign up for Tencent Cloud
  https://www.tencentcloud.com
  There is a free tier, so you do not need to worry about cost.
- The console is English / Chinese only, but Japanese documentation is available and worth a look
  https://www.tencentcloud.com/jp/document/product/647/35086
That's all.
On to the implementation
The documentation is solid, so if you work through the "solution without UI" guide at the URL above from top to bottom, you can get to a working video call very quickly.
Explaining every step would be tedious, so I will just post the code.
I created a manager class that handles all the TRTC-related bits and pieces.
```swift
import Foundation
import TXLiteAVSDK_Professional
protocol TRTCManagerDelegate: AnyObject {
    func didJoinedRoom()
    
    func didUserJoined(userId: String)
    func didUserLeaved(userId: String)
    
    func onRenderRemoteUser(pixelBuffer: CVPixelBuffer?)
    func onRenderLocalUser(pixelBuffer: CVPixelBuffer?)
}
class TRTCManager: NSObject {
    private let trtcCloud = TRTCCloud.sharedInstance()
    
    weak var delegate: TRTCManagerDelegate?
    
    override init() {
        super.init()
        // TRTC setup: enable custom video capture so we can push our own processed frames
        trtcCloud.delegate = self
        trtcCloud.enableCustomVideoCapture(.small, enable: true)
        trtcCloud.setLocalVideoRenderDelegate(self,
                                              pixelFormat: ._NV12,
                                              bufferType: .pixelBuffer)
    }
    
    func join(userId: String, roomId: Int) {
        // Join the room
        let params = TRTCParams()
        params.sdkAppId = 12345678 // Replace with the SDKAppID from your Tencent Cloud console
        params.roomId = UInt32(roomId)
        params.userId = userId
        // userSig generation is ported from the official demo (for testing only)
        params.userSig = GenerateTestUserSig.genTestUserSig(identifier: userId)
        params.role = .anchor
        
        trtcCloud.enterRoom(params, appScene: .videoCall)
    }
    
    func leave() {
        trtcCloud.exitRoom()
    }
    
    // Normally just calling this is enough to get the local camera view
    func startLocalView(view: UIView) {
        let param = TRTCRenderParams()
        param.fillMode = .fill
        param.mirrorType = .auto
        trtcCloud.setLocalRenderParams(param)
        trtcCloud.startLocalPreview(true, view: view) // front-camera preview rendered into the given view
        trtcCloud.startLocalAudio(.speech)
    }
    
    func startRemoteView(userId: String, view: UIView) {
        trtcCloud.startRemoteView(userId,
                                  streamType: .small,
                                  view: view)
    }
    // Use this instead of startLocalView when you want to process the image locally before sending it
    func sendCustomVideoData(image: UIImage?) {
        // Note: ciImage / pixelBuffer are only non-nil when the UIImage wraps a CIImage that was created from a CVPixelBuffer
        guard let buffer = image?.ciImage?.pixelBuffer else {
            return
        }
        let videoFrame = TRTCVideoFrame()
        videoFrame.pixelFormat = ._NV12
        videoFrame.bufferType = .pixelBuffer
        videoFrame.pixelBuffer = buffer
        trtcCloud.sendCustomVideoData(.small, frame: videoFrame)
    }
    
    // Variant 2: use this when you already have a CVPixelBuffer to send
    func sendCustomVideoData(buffer: CVPixelBuffer?) {
        guard let buffer else {
            return
        }
        let videoFrame = TRTCVideoFrame()
        videoFrame.pixelFormat = ._NV12
        videoFrame.bufferType = .pixelBuffer
        videoFrame.pixelBuffer = buffer
        trtcCloud.sendCustomVideoData(.small, frame: videoFrame)
    }
}
extension TRTCManager: TRTCCloudDelegate {
    func onError(_ errCode: TXLiteAVError, errMsg: String?, extInfo: [AnyHashable : Any]?) {
        print("onError: errorCode \(errCode.rawValue)")
    }
    
    func onEnterRoom(_ result: Int) {
        // result > 0 means success (the value is the time it took to enter the room, in ms); negative values are error codes
        if result > 0 {
            delegate?.didJoinedRoom()
        }
    }
    
    func onExitRoom(_ reason: Int) {
        print("onExitRoom: reason \(reason)")
    }
    
    func onRemoteUserEnterRoom(_ userId: String) {
        // When a remote user enters the room, register ourselves as the render delegate for their video stream
        trtcCloud.setRemoteVideoRenderDelegate(userId,
                                               delegate: self,
                                               pixelFormat: ._NV12,
                                               bufferType: .pixelBuffer)
        delegate?.didUserJoined(userId: userId)
    }
    
    func onRemoteUserLeaveRoom(_ userId: String, reason: Int) {
        delegate?.didUserLeaved(userId: userId)
    }
}
extension TRTCManager: TRTCVideoRenderDelegate {
    func onRenderVideoFrame(_ frame: TRTCVideoFrame, userId: String?, streamType: TRTCVideoStreamType) {
        DispatchQueue.main.async { [weak self] in
            if let userId,
               !userId.isEmpty {
                // When a remote user's frame arrives, hand its buffer to the view controller via the delegate
                self?.delegate?.onRenderRemoteUser(pixelBuffer: frame.pixelBuffer)
            }
            // Local frames arrive here too, with a nil/empty userId; they are ignored because
            // the local preview is drawn from the AVCapture callback instead
        }
    }
}
```
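A note on `GenerateTestUserSig`: it is the helper from the official demo and embeds the SDKAppID and secret key in the app, so it is only meant for local testing. In a real deployment the UserSig is normally issued by your own server. The sketch below is just an illustration of that flow; the endpoint URL, the response shape, and the `fetchUserSig` helper are my own assumptions, not part of the TRTC SDK.

```swift
import Foundation

// Hypothetical response from your own signing server (not part of the TRTC SDK)
struct UserSigResponse: Decodable {
    let userSig: String
}

// Minimal sketch: ask the backend to generate a UserSig for this user.
// "https://example.com/api/user-sig" is a placeholder endpoint you would implement yourself.
func fetchUserSig(userId: String) async throws -> String {
    var components = URLComponents(string: "https://example.com/api/user-sig")!
    components.queryItems = [URLQueryItem(name: "userId", value: userId)]
    let (data, _) = try await URLSession.shared.data(from: components.url!)
    return try JSONDecoder().decode(UserSigResponse.self, from: data).userSig
}
```

With something like this in place, `join(userId:roomId:)` would receive the fetched signature instead of calling `GenerateTestUserSig` directly.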
Here is the view controller side.
```swift
import UIKit
import AVKit
import Vision
class TalkRoomViewController: UIViewController {
    @IBOutlet weak var localUserImageView: UIImageView!
    @IBOutlet weak var remoteUserImageView: UIImageView!
    @IBOutlet weak var remoteUserView: UIView!
    
    private let captureSession = AVCaptureSession()
    
    private let serialQueue = DispatchQueue(label: "Hoge")
    
    private let faceImageDetector = ImageDetector()
    
    private let trtcManager = TRTCManager()
    
    var userId = ""
    override func viewDidLoad() {
        super.viewDidLoad()
        trtcManager.delegate = self
        localUserImageView.transform = CGAffineTransform(scaleX: -1, y: 1)
        setup()
    }
    
    override func viewWillAppear(_ animated: Bool) {
        super.viewWillAppear(animated)
        
        joinRoom()
        serialQueue.async { [weak self] in
            self?.captureSession.startRunning()
        }
    }
    
    override func viewWillDisappear(_ animated: Bool) {
        super.viewWillDisappear(animated)
        
        serialQueue.async { [weak self] in
            self?.captureSession.stopRunning()
        }
    }
    
    private func joinRoom() {
        trtcManager.join(userId: userId, roomId: 1)
    }
    
    private func setup() {
        captureSession.beginConfiguration()
        
        guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .front),
              let deviceInput = try? AVCaptureDeviceInput(device: device),
            captureSession.canAddInput(deviceInput)
        else { return }
        captureSession.addInput(deviceInput)
        
        let videoDataOutput = AVCaptureVideoDataOutput()
        videoDataOutput.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String: Int(kCVPixelFormatType_32BGRA)]
        videoDataOutput.alwaysDiscardsLateVideoFrames = true
        videoDataOutput.setSampleBufferDelegate(self, queue: serialQueue)
        
        guard captureSession.canAddOutput(videoDataOutput) else { return }
        captureSession.addOutput(videoDataOutput)
        
        for connection in videoDataOutput.connections {
            if connection.isVideoOrientationSupported {
                connection.videoOrientation = .portrait
            }
        }
        captureSession.commitConfiguration()
    }
    
    @IBAction func didSelectClose(_ sender: Any) {
        trtcManager.leave()
        dismiss(animated: true)
    }
    
    private var count = 0
}
extension TalkRoomViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        // Send the raw capture buffer to TRTC; locally, show only the processed (landmarks-only) image from the same frame
        trtcManager.sendCustomVideoData(buffer: sampleBuffer.imageBuffer)
        faceImageDetector.detectFaceImage(from: sampleBuffer) { [weak self] result in
            DispatchQueue.main.async {
                self?.localUserImageView.image = result
            }
        }
    }
}
extension TalkRoomViewController: TRTCManagerDelegate {
    func didJoinedRoom() {
        
    }
    
    func didUserJoined(userId: String) {
        trtcManager.startRemoteView(userId: userId,
                                    view: remoteUserView)
    }
    
    func didUserLeaved(userId: String) {
        
    }
    
    func onRenderRemoteUser(pixelBuffer: CVPixelBuffer?) {
        // The remote view shows the frame received from TRTC after processing it
        if let pixelBuffer {
            let ciImage = CIImage.init(cvImageBuffer: pixelBuffer)
            let cgImage = CIContext(options: nil).createCGImage(ciImage, from: ciImage.extent)
            faceImageDetector.detectFaceImage(source: cgImage) { result in
                DispatchQueue.main.async { [weak self] in
                    self?.remoteUserImageView.image = result
                }
            }
        }
    }
    
    func onRenderLocalUser(pixelBuffer: CVPixelBuffer?) {
        if let pixelBuffer {
            localUserImageView.image = UIImage.init(ciImage: CIImage.init(cvImageBuffer: pixelBuffer))
        }
    }
}
```
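One thing the view controller glosses over: the capture session (and the call itself) only works once the user has granted camera and microphone access, and `NSCameraUsageDescription` / `NSMicrophoneUsageDescription` must be present in Info.plist. Below is a minimal sketch of gating the room join on camera permission; the `ensureCameraAccess` helper and where you call it are my own assumptions, not something from the original code.

```swift
import Foundation
import AVFoundation

// Check / request camera access before starting the capture session and joining the room.
// Microphone access (.audio) would be handled the same way.
func ensureCameraAccess(_ completion: @escaping (Bool) -> Void) {
    switch AVCaptureDevice.authorizationStatus(for: .video) {
    case .authorized:
        completion(true)
    case .notDetermined:
        AVCaptureDevice.requestAccess(for: .video) { granted in
            DispatchQueue.main.async { completion(granted) }
        }
    default:
        // .denied or .restricted: the user has to change this in Settings
        completion(false)
    }
}
```

You could call this from `viewDidLoad` and only proceed to `joinRoom()` and `captureSession.startRunning()` once it reports true.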
The part that uses the Vision framework to draw the face from an image
```swift
import UIKit
import Vision
import AVKit
class ImageDetector {
    // Run face landmark detection and hand back an image containing only the drawn landmarks.
    // Note that the handler is only called when at least one face is detected.
    func detectFaceImage(source image: CGImage?, handler: @escaping (UIImage?) -> Void) {
        guard let image else {
            return
        }
        let detectFaceRequest = VNDetectFaceLandmarksRequest { [weak self] (request, error) in
            if error == nil {
                if let results = request.results as? [VNFaceObservation] {
                    for faceObservation in results {
                        guard let self = self,
                              let landmarks = faceObservation.landmarks else {
                            continue
                        }
                        let boundingRect = faceObservation.boundingBox
                        
                        let resultImage = self.drawOnImage(sourceSize: CGSize(width: image.width, height: image.height), boundingRect: boundingRect, faceLandmarks: landmarks)
                        handler(resultImage)
                    }
                }
            } else {
                print(error!.localizedDescription)
            }
        }
        let vnImage = VNImageRequestHandler(cgImage: image, options: [:])
        try? vnImage.perform([detectFaceRequest])
    }
    
    func detectFaceImage(from sampleBuffer: CMSampleBuffer, handler: @escaping (UIImage?) -> Void) {
        guard let image = image(from: sampleBuffer) else {
            handler(nil)
            return
        }
        detectFaceImage(source: image.cgImage, handler: handler)
    }
    
    // Draw the detected landmark regions onto a transparent canvas the size of the source image
    private func drawOnImage(sourceSize: CGSize, boundingRect: CGRect, faceLandmarks: VNFaceLandmarks2D) -> UIImage {
        UIGraphicsBeginImageContextWithOptions(sourceSize, false, 1)
        let context = UIGraphicsGetCurrentContext()!
        context.translateBy(x: 0.0, y: sourceSize.height)
        context.scaleBy(x: 1.0, y: -1.0)
        //context.setBlendMode(CGBlendMode.colorBurn)
        context.setLineJoin(.round)
        context.setLineCap(.round)
        context.setShouldAntialias(true)
        context.setAllowsAntialiasing(true)
        
        //draw overlay
        context.setLineWidth(1.0)
        
        if let faceContour = faceLandmarks.faceContour {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: faceContour, color: UIColor.magenta.cgColor)
        }
        
        if let leftEye = faceLandmarks.leftEye {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: leftEye, color: UIColor.magenta.cgColor, close: true)
        }
        if let rightEye = faceLandmarks.rightEye {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: rightEye, color: UIColor.magenta.cgColor, close: true)
        }
        if let leftPupil = faceLandmarks.leftPupil {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: leftPupil, color: UIColor.magenta.cgColor, close: true)
        }
        if let rightPupil = faceLandmarks.rightPupil {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: rightPupil, color: UIColor.magenta.cgColor, close: true)
        }
        
        if let nose = faceLandmarks.nose {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: nose, color: UIColor.magenta.cgColor)
        }
        if let noseCrest = faceLandmarks.noseCrest {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: noseCrest, color: UIColor.magenta.cgColor)
        }
        
//        if let medianLine = faceLandmarks.medianLine {
//            drawFeature(medianLine, color: UIColor.magenta.cgColor)
//        }
        
        if let outerLips = faceLandmarks.outerLips {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: outerLips, color: UIColor.magenta.cgColor, close: true)
        }
        if let innerLips = faceLandmarks.innerLips {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: innerLips, color: UIColor.magenta.cgColor, close: true)
        }
        
        if let leftEyebrow = faceLandmarks.leftEyebrow {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: leftEyebrow, color: UIColor.magenta.cgColor, close: true)
        }
        if let rightEyebrow = faceLandmarks.rightEyebrow {
            drawFeature(context, sourceSize: sourceSize, boundingRect: boundingRect, feature: rightEyebrow, color: UIColor.magenta.cgColor, close: true)
        }
        
        let coloredImg : UIImage = UIGraphicsGetImageFromCurrentImageContext()!
        UIGraphicsEndImageContext()
        return coloredImg
    }
    
    // Stroke one landmark region. Vision returns points normalized to the face bounding box,
    // so they are mapped back into source-image coordinates before drawing.
    private func drawFeature(_ context: CGContext,
                             sourceSize: CGSize,
                             boundingRect: CGRect,
                             feature: VNFaceLandmarkRegion2D,
                             color: CGColor,
                             close: Bool = false) {
        let rectWidth = sourceSize.width * boundingRect.width
        let rectHeight = sourceSize.height * boundingRect.height
        context.setStrokeColor(color)
        context.setFillColor(color)
        let mappedPoints = feature.normalizedPoints.map { CGPoint(x: boundingRect.origin.x * sourceSize.width + $0.x * rectWidth, y: boundingRect.origin.y * sourceSize.height + $0.y * rectHeight) }
        if mappedPoints.isEmpty {
            return
        }
        if close {
            mappedPoints.enumerated().forEach { offset, element in
                switch offset {
                case 0:
                    break
                default:
                    let beforeOffset = (mappedPoints.count + offset - 1) % mappedPoints.count
                    let afterOffset = (mappedPoints.count + offset + 1) % mappedPoints.count
                    let moreAfterOffset = (mappedPoints.count + offset + 2) % mappedPoints.count
                    context.move(to: mappedPoints[beforeOffset])
                    context.addCurve(to: mappedPoints[moreAfterOffset],
                                     control1: element, control2: mappedPoints[afterOffset])
                }
            }
        } else {
            mappedPoints.enumerated().forEach { offset, element in
                switch offset {
                case 0:
                    break
                default:
                    let beforeOffset = offset - 1
                    let afterOffset = offset + 1
                    let moreAfterOffset = offset + 2
                    if mappedPoints.indices.contains(beforeOffset),
                       mappedPoints.indices.contains(afterOffset),
                       mappedPoints.indices.contains(moreAfterOffset) {
                        context.move(to: mappedPoints[beforeOffset])
                        context.addCurve(to: mappedPoints[moreAfterOffset],
                                         control1: element, control2: mappedPoints[afterOffset])
                    }
                }
            }
        }
        
        context.strokePath()
    }
    
    private func image(from sampleBuffer: CMSampleBuffer) -> UIImage? {
        if let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
            let ciImage = CIImage(cvPixelBuffer: pixelBuffer)
            let imageRect = CGRect(x: 0, y: 0, width: CVPixelBufferGetWidth(pixelBuffer), height: CVPixelBufferGetHeight(pixelBuffer))
            let context = CIContext()
            if let image = context.createCGImage(ciImage, from: imageRect) {
                return UIImage(cgImage: image)
            }
        }
        return nil
    }
}
```
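As an aside, `onRenderRemoteUser` converts each frame CVPixelBuffer → CIImage → CGImage before detection, and `image(from:)` does the same for camera frames. Vision can also consume a CVPixelBuffer directly, which skips the CGImage round trip. Here is a minimal sketch of that variant; the `detectFaceLandmarks(in:completion:)` helper is mine, not part of the code above.

```swift
import Vision
import CoreVideo

// Run the same landmarks request directly on a CVPixelBuffer, without creating a CGImage first.
func detectFaceLandmarks(in pixelBuffer: CVPixelBuffer,
                         completion: @escaping ([VNFaceObservation]) -> Void) {
    let request = VNDetectFaceLandmarksRequest { request, _ in
        completion((request.results as? [VNFaceObservation]) ?? [])
    }
    let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
    do {
        try handler.perform([request])
    } catch {
        print(error.localizedDescription)
        completion([])
    }
}
```

The drawing code would not change, since `boundingBox` and `normalizedPoints` are the same regardless of how the request handler was created.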
The result
Thoughts after building it
About TRTC (from an iOS perspective)
- About as easy to use as Agora; for the most basic call it might even be simpler than Agora.
- The documentation and samples are quite extensive.
- The console being Chinese / English only makes it awkward to use.
 
About the Vision framework
- The implementation is simple and easy to follow.
- The face information you can get is a bit limited... (see the sketch after this list)
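To make that concrete: `VNFaceLandmarks2D` exposes only a fixed set of 2D regions (plus the observation's bounding box and roll/yaw angles), with nothing like ARKit's blend shapes or face mesh. The helper below just lists which regions were returned for a given face; it is an illustration, not part of the app's code.

```swift
import Vision

// List which of the fixed VNFaceLandmarks2D regions Vision actually returned for one face.
func availableRegions(in landmarks: VNFaceLandmarks2D) -> [String] {
    let regions: [(String, VNFaceLandmarkRegion2D?)] = [
        ("faceContour", landmarks.faceContour),
        ("leftEye", landmarks.leftEye),
        ("rightEye", landmarks.rightEye),
        ("leftPupil", landmarks.leftPupil),
        ("rightPupil", landmarks.rightPupil),
        ("leftEyebrow", landmarks.leftEyebrow),
        ("rightEyebrow", landmarks.rightEyebrow),
        ("nose", landmarks.nose),
        ("noseCrest", landmarks.noseCrest),
        ("medianLine", landmarks.medianLine),
        ("outerLips", landmarks.outerLips),
        ("innerLips", landmarks.innerLips)
    ]
    return regions.compactMap { $0.1 == nil ? nil : $0.0 }
}
```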
 
About posting on Qiita
- This was my first post, and honestly it is more of a hassle than writing the code. lol
- I think I will keep writing bit by bit from now on, with a much smaller scope per post.
 
What I want to try next
TRTC seems able to handle most of what I want to do, so I would like to try building things like a calling app that processes the video into a Denpa Shonen style UI, or one that draws the face a little more nicely during the call.
