Webカメラのキャプチャ
OpenCVによるキャプチャはよく見かけるけど、PyAVでもキャプチャできる。
今回は、PyAVでキャプチャしたら、解像度設定もちゃんとでき、
同じ解像度だとFPSがOpenCVよりよかったという話。
あと、imshowは遅いので、1920x1080 を3画面出すなら、OpenGL使ったほうが速い。
1画面なら30fpsは出るので、必要なfpsで使い分けしてもいいかも。
(OpenGLはなんだかんだ面倒だし)
OpenCVで解像度指定してもうまくいかない
今回、たまたまあったFullHD対応Webカメラ(Logi C615 HD WebCam)で、
OpenCVでキャプチャしたら、なぜか960x720までしか解像度が上がらず、
ネットを調べても、そんなことで困っているひとがおらず、
他の方法を調べてみた。ちなみにその時のコード。
import os
import cv2
from datetime import datetime
from collections import deque
import time
class Dict(dict):
    """dict subclass that allows attribute-style access to its keys.

    Missing names raise AttributeError (not KeyError) so that
    ``hasattr()``, ``copy.copy()`` and pickling keep working; the
    original ``__getattr__ = dict.__getitem__`` leaked KeyError.
    """
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None
# VideoCapture property ids to dump, mapped id -> name.
# getattr() is used instead of eval() to build the lookups safely.
param_id2name = dict(
    (getattr(cv2, f"CAP_PROP_{n}"), n) for n in """
        FRAME_WIDTH FRAME_HEIGHT FPS
        FOURCC FORMAT MODE CONVERT_RGB
        BRIGHTNESS CONTRAST SATURATION
        HUE GAIN EXPOSURE WB_TEMPERATURE
        GAMMA FOCUS PAN TILT ROLL IRIS
        AUTO_EXPOSURE AUTOFOCUS AUTO_WB
        TEMPERATURE ZOOM
        TRIGGER TRIGGER_DELAY SETTINGS
    """.split()
)

# Directory where captured still images are saved.
dirname = 'cap_c615a'

# Capture resolution.
# BUGFIX: width and height were swapped (height=1920, width=1080).
# The C615 tops out at 1920x1080 (WxH), so requesting height=1920 is
# an invalid mode and the driver falls back to a smaller one.
# cap_width = 960
# cap_height = 720
# cap_width = 3840
# cap_height = 2160
cap_width = 1920
cap_height = 1080

# Values applied to each VideoCapture.
# NOTE: a dict literal silently collapses duplicate keys, so listing
# FRAME_WIDTH/FRAME_HEIGHT several times (the "set it repeatedly"
# workaround) was a no-op.  If repetition is ever needed, repeat the
# cap.set() calls in a loop instead of duplicating dict entries.
cap_sets = {
    cv2.CAP_PROP_FOURCC: cv2.VideoWriter_fourcc(*"MJPG"),
    cv2.CAP_PROP_FPS: 30,
    cv2.CAP_PROP_FRAME_HEIGHT: cap_height,
    cv2.CAP_PROP_FRAME_WIDTH: cap_width,
    # cv2.CAP_PROP_SETTINGS: 1,
}

# USB cameras: capture three devices simultaneously via DirectShow.
cap_def = Dict(
    cap1=Dict(args=[3, cv2.CAP_DSHOW], sets=cap_sets, ),
    cap2=Dict(args=[1, cv2.CAP_DSHOW], sets=cap_sets, ),
    cap3=Dict(args=[2, cv2.CAP_DSHOW], sets=cap_sets, ),
)

# Timestamps of the last N_SAMPLE frames, kept in a queue for FPS estimation.
N_SAMPLE = 10
q = deque(time.time() for _ in range(N_SAMPLE))

# Opened VideoCapture objects keyed by name.
caps = {}
# One row per property id; each device's actual value is appended for dumping.
param_list = [[k] for k in param_id2name.keys()]
# Open every configured device, apply its settings, and collect the
# values the driver actually accepted for a final dump.
for cam_name, cam_cfg in cap_def.items():
    print(cam_name)
    cap = cv2.VideoCapture(*cam_cfg.args)
    caps[cam_name] = cap
    for prop_id, value in cam_cfg.sets.items():
        # print(f"setting: {param_id2name[prop_id]}={value}")
        if not cap.set(prop_id, value):
            print(f"setting failed: {param_id2name[prop_id]}={value}")
    # Record the effective value of every tracked property for this device.
    # print(cap.get(cv2.CAP_PROP_FRAME_WIDTH), cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    for row in param_list:
        row.append(cap.get(row[0]))
# Dump the effective settings (one line per property, one column per device).
for row in param_list:
    print(param_id2name[row[0]], row[1:])
# Main loop: read one frame per camera, overlay FPS, display, and handle keys.
frame = {}
while True:
    for name, cap in caps.items():
        _, frame[name] = cap.read()
    # FPS averaged over the most recent N_SAMPLE frames.
    now = time.time()
    fps = N_SAMPLE / (now - q.popleft())
    q.append(now)
    for name, img in frame.items():
        if img is not None:
            # Overlay the measured FPS in the top-left corner.
            cv2.putText(img, '{:6.3f}fps'.format(fps), (10, 20),
                        cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 0))
            cv2.imshow(name, img)
    key = cv2.waitKey(10) & 0xFF
    if key == ord('c'):
        # 'c' saves the current frame of every camera as a timestamped PNG.
        dt = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        print("capture", dt)
        os.makedirs(dirname, exist_ok=True)
        for name, img in frame.items():
            if img is not None:
                cv2.imwrite(f'{dirname}/cap_{dt}_{name}.png', img)
    if key == 27:  # ESC quits
        break
for cap in caps.values():
    cap.release()
cv2.destroyAllWindows()
PyAVによるキャプチャ
PyAVはffmpeg
のラッパーなので、これでキャプチャできるか確認。
参考: DirectShow - FFmpeg
デバイスの一覧のやり方
!ffmpeg -list_devices true -f dshow -i dummy
・・・中略・・・
[dshow @ 000001a49f18e580] "Logi C615 HD WebCam"
[dshow @ 000001a49f18e580] Alternative name "@device_pnp_\\?\usb#vid_046d&pid_082c&mi_02#6&15d3e9a7&0&0002#{65e8773d-8f56-11d0-a3b9-00a0c9223196}\global"
[dshow @ 000001a49f18e580] "Logi C615 HD WebCam"
[dshow @ 000001a49f18e580] Alternative name "@device_pnp_\\?\usb#vid_046d&pid_082c&mi_02#6&c7e554f&0&0002#{65e8773d-8f56-11d0-a3b9-00a0c9223196}\global"
[dshow @ 000001a49f18e580] "Logi C615 HD WebCam"
[dshow @ 000001a49f18e580] Alternative name "@device_pnp_\\?\usb#vid_046d&pid_082c&mi_02#9&8216d50&0&0002#{65e8773d-8f56-11d0-a3b9-00a0c9223196}\global"
設定可能なデバイス用の設定項目
!ffmpeg -list_options true -f dshow -i video="Logi C615 HD WebCam"
・・・中略・・・
[dshow @ 000001f8a70ee480] DirectShow video device options (from video devices)
[dshow @ 000001f8a70ee480] Pin "キャプチャ" (alternative pin name "0")
[dshow @ 000001f8a70ee480] pixel_format=yuyv422 min s=640x480 fps=5 max s=640x480 fps=30
・・・中略・・・
[dshow @ 000001f8a70ee480] pixel_format=yuyv422 min s=1600x896 fps=5 max s=1600x896 fps=7.5
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=864x480 fps=5 max s=864x480 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=864x480 fps=5 max s=864x480 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=1024x576 fps=5 max s=1024x576 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=1024x576 fps=5 max s=1024x576 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=800x600 fps=5 max s=800x600 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=800x600 fps=5 max s=800x600 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=960x720 fps=5 max s=960x720 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=960x720 fps=5 max s=960x720 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=1280x720 fps=5 max s=1280x720 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=1280x720 fps=5 max s=1280x720 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=1600x896 fps=5 max s=1600x896 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=1600x896 fps=5 max s=1600x896 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=1920x1080 fps=5 max s=1920x1080 fps=30
[dshow @ 000001f8a70ee480] vcodec=mjpeg min s=1920x1080 fps=5 max s=1920x1080 fps=30
video=Logi C615 HD WebCam: Immediate exit requested
ffmpeg でキャプチャしてみる
ffmpeg -f dshow -video_device_number 1 -video_size 1600x896 -vcodec mjpeg -i video="Logi C615 HD WebCam" test.avi
[mjpeg @ 000002015b1e9540] unable to decode APP fields: Invalid data found when processing input
Input #0, dshow, from 'video=Logi C615 HD WebCam':
Duration: N/A, start: 213165.622000, bitrate: N/A
Stream #0:0: Video: mjpeg (Baseline) (MJPG / 0x47504A4D), yuvj422p(pc, bt470bg/unknown/unknown), 1600x896, 30 fps, 30 tbr, 10000k tbn, 10000k tbc
Stream mapping:
Stream #0:0 -> #0:0 (mjpeg (native) -> mpeg4 (native))
Press [q] to stop, [?] for help
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
[swscaler @ 000002015b254a00] deprecated pixel format used, make sure you did set range correctly
Output #0, avi, to 'test.avi':
Metadata:
ISFT : Lavf58.45.100
Stream #0:0: Video: mpeg4 (FMP4 / 0x34504D46), yuv420p, 1600x896, q=2-31, 200 kb/s, 30 fps, 30 tbn, 30 tbc
Metadata:
encoder : Lavc58.91.100 mpeg4
Side data:
cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
Last message repeated 15 times
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
Last message repeated 13 times
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
Last message repeated 14 times
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
Last message repeated 15 times
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
Last message repeated 14 times
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
Last message repeated 14 times
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
Last message repeated 15 times
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
Last message repeated 14 times
[mjpeg @ 000002015b1f10c0] unable to decode APP fields: Invalid data found when processing input
frame= 124 fps= 30 q=31.0 Lsize= 1573kB time=00:00:04.13 bitrate=3116.8kbits/s speed=1.01x
video:1564kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.545260%
ちなみに、C615だとめっちゃ警告出るけど、どうしようもないみたい。
Getting "unable to decode APP fields" while playing USB webcam stream through ffplay
PyAVでキャプチャしてみる
引数は、大体FFmpegと同じ。遅延が大きかったのでrtbufsizeを最低に設定。
さすがに3台同時だと10~15fps、2台だと15~20fps、1台なら30fps
import logging
import av
import cv2
from collections import deque
import time
logging.basicConfig()
# Silence libav's warning spam (the C615 triggers constant
# "unable to decode APP fields" messages).
logging.getLogger('libav').setLevel(logging.FATAL)

# Timestamps of the last N_SAMPLE frames, used to estimate FPS.
N_SAMPLE = 10
q = deque(time.time() for _ in range(N_SAMPLE))

# FFmpeg/dshow input options shared by every device.
con_options = dict(
    video_size='1920x1080',
    # video_size='960x720',
    # video_size='1280x720',
    vcodec='mjpeg',
    framerate='30',
    rtbufsize='1',  # minimal realtime buffer to cut latency
    # show_video_device_dialog='true',
)
con_def = [
    dict(format='dshow', file=r'video=Logi C615 HD WebCam', options=dict(con_options, video_device_number='1') ),
    dict(format='dshow', file=r'video=Logi C615 HD WebCam', options=dict(con_options, video_device_number='0') ),
    dict(format='dshow', file=r'video=Logi C615 HD WebCam', options=dict(con_options, video_device_number='2') ),
    # dict(format='dshow', file=r'video=@device_pnp_\\?\usb#vid_046d&pid_082c&mi_02#9&8216d50&0&0002#{65e8773d-8f56-11d0-a3b9-00a0c9223196}\global', options=con_options ),
    # dict(format='dshow', file=r'video=@device_pnp_\\?\usb#vid_046d&pid_082c&mi_02#6&15d3e9a7&0&0002#{65e8773d-8f56-11d0-a3b9-00a0c9223196}\global', options=con_options ),
    # dict(format='dshow', file=r'video=@device_pnp_\\?\usb#vid_046d&pid_082c&mi_02#6&c7e554f&0&0002#{65e8773d-8f56-11d0-a3b9-00a0c9223196}\global', options=con_options ),
]
containers = [av.open(**args) for args in con_def]
try:
    # Pull one decoded frame from every device per iteration.
    for frames in zip(*[c.decode(video=0) for c in containers]):
        images = [f.to_ndarray(format='bgr24') for f in frames]
        # FPS over the most recent N_SAMPLE frames.
        now = time.time()
        fps = N_SAMPLE / (now - q.popleft())
        q.append(now)
        for idx, img in enumerate(images):
            cv2.putText(img, '{:6.3f}fps'.format(fps), (10, 20),
                        cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 0))
            cv2.imshow(f'VIDEO {idx}', img)
        if cv2.waitKey(10) == 27:  # ESC quits
            break
except KeyboardInterrupt:
    print("KeyboardInterrupt")
finally:
    for c in containers:
        c.close()
    cv2.destroyAllWindows()
PyOpenGL による描画
どうも3画面同時で10~15fpsになるのは、cv2.imshow が遅いからみたい。
表示なしで更新頻度見ると30fpsぐらい出そう。
PythonでOpenCVの画像をOpenGLで表示する
を参考に、OpenGLで表示。
下記コードで試すと、3画面同時で20~25fpsぐらい、もう少し速くならないかな。
from OpenGL.GL import *
from OpenGL.GLU import *
from OpenGL.GLUT import *
import threading
import cv2
import numpy as np
class OpenGLWindow:
    """GLUT-backed window that continuously displays OpenCV (BGR) images.

    Each instance creates one GLUT window and, when given a
    ``read_image_iterator``, starts a daemon thread that keeps pulling
    frames from it. A process-wide Manager singleton drives redisplay
    of every window from a single GLUT idle callback.
    """

    @classmethod
    def get_manager(cls):
        # Access the process-wide window-manager singleton.
        return cls.Manager.get_instance()

    @classmethod
    def mainloop(cls):
        # Enter the GLUT main loop for all created windows (blocks).
        cls.get_manager().mainloop()

    class Manager:
        """Singleton tracking window ids and triggering their redraws."""

        @classmethod
        def get_instance(cls):
            # Lazily create the single shared instance on first use.
            if not hasattr(cls, "_instance"):
                cls._instance = cls()
            return cls._instance

        def __init__(self):
            # GLUT window ids registered for idle-driven redisplay.
            self.__winids = []

        def append(self, winid):
            # Register a window id created by OpenGLWindow.__init__.
            self.__winids.append(winid)

        def __idle(self):
            # Idle callback: request a redraw of every registered window.
            for winid in self.__winids:
                glutSetWindow(winid)
                glutPostRedisplay()

        def mainloop(self):
            glutIdleFunc(self.__idle)
            # Make glutMainLoop() return (instead of exiting the process)
            # when the last window is closed.
            glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE,GLUT_ACTION_GLUTMAINLOOP_RETURNS);
            glutMainLoop()

    def update_image(self, image):
        # Swap in the latest frame; draw() picks it up on the next redisplay.
        # NOTE(review): no lock -- presumably relies on the GIL making this
        # reference assignment atomic; confirm if frames ever tear.
        self.image = image

    def run(self):
        """Reader-thread body: pull frames until closed or exhausted."""
        if self.read_image_iterator is None:
            return
        for image in self.read_image_iterator:
            self.update_image(image)
            if self.closing:
                break
        self.read_image_iterator = None

    def draw(self):
        """GLUT display callback: draw self.image as a full-window texture."""
        # Paste into texture to draw at high speed
        img = cv2.cvtColor(self.image, cv2.COLOR_BGR2RGB) #BGR-->RGB
        h, w = img.shape[:2]
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, w, h, 0, GL_RGB, GL_UNSIGNED_BYTE, img)
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
        glColor3f(1.0, 1.0, 1.0)
        # Enable texture map
        glEnable(GL_TEXTURE_2D)
        # Set texture map method
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR)
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR)
        # Draw a textured quad; the V texture coordinates are flipped so
        # the image's top row maps to the top of the window.
        glBegin(GL_QUADS)
        # bottom-left
        glTexCoord2d(0.0, 1.0)
        glVertex3d(-w/2, -h/2, 0.0)
        # bottom-right
        glTexCoord2d(1.0, 1.0)
        glVertex3d( w/2, -h/2, 0.0)
        # top-right
        glTexCoord2d(1.0, 0.0)
        glVertex3d( w/2, h/2, 0.0)
        # top-left
        glTexCoord2d(0.0, 0.0)
        glVertex3d(-w/2, h/2, 0.0)
        glEnd()
        glFlush();
        glutSwapBuffers()

    def reshape(self, w, h):
        """GLUT reshape callback: keep projection 1:1 with the window size."""
        self.windowWidth = w
        self.windowHeight = h
        glViewport(0, 0, w, h)
        glLoadIdentity()
        # Make the display area proportional to the size of the view
        glOrtho(-w/2, w/2, -h/2, h/2, -1.0, 1.0)

    def closeWindow(self):
        # Leave the GLUT main loop (all windows).
        print('closeWindow')
        glutLeaveMainLoop()

    def close(self):
        # GLUT close callback: ask the reader thread (run()) to stop.
        self.closing = True

    def keyboard(self, key, x, y):
        # convert byte to str
        key = key.decode('utf-8')
        # press q to exit
        if key == 'q':
            self.closeWindow()

    def init(self):
        # Light-gray clear color.
        glClearColor(0.7, 0.7, 0.7, 0.7)

    def __init__(
        self, argv=(),
        x=0, y=0, width=720, height=480,
        title="Display", read_image_iterator=None, auto_start=True,
    ):
        """Create a GLUT window.

        argv: forwarded to glutInit.
        x, y: initial window position; width, height: initial size.
        title: window title.
        read_image_iterator: iterable yielding BGR images to display.
        auto_start: start the reader thread immediately when an
            iterator is given.
        """
        self.title = title
        # White placeholder image shown until the first frame arrives.
        self.image = np.full((height, width, 3), 255, np.uint8)
        self.windowWidth = width
        self.windowHeight = height
        self.closing = False
        self.read_image_iterator = read_image_iterator
        # Daemon thread so the process can exit while frames keep coming.
        self.thread = threading.Thread(target=self.run, daemon=True)
        glutInitWindowPosition(x, y);
        glutInitWindowSize(width, height);
        glutInit(argv)
        glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE )
        winid = glutCreateWindow(title)
        glutDisplayFunc(self.draw)
        glutReshapeFunc(self.reshape)
        glutKeyboardFunc(self.keyboard)
        glutCloseFunc(self.close)
        self.get_manager().append(winid)
        self.init()
        if auto_start and read_image_iterator is not None:
            self.thread.start()
import logging
import av
import cv2
from collections import deque
import time
logging.basicConfig()
# Silence libav warnings (the C615 floods "unable to decode APP fields").
logging.getLogger('libav').setLevel(logging.FATAL)
# Number of frame timestamps used when estimating FPS.
N_SAMPLE = 10
# FFmpeg/dshow input options shared by every device.
con_options = dict(
    video_size='1920x1080',
    # video_size='960x720',
    # video_size='1280x720',
    # video_size='1600x896',
    vcodec='mjpeg',
    framerate='30',
    rtbufsize='1',  # minimal realtime buffer to cut latency
    # show_video_device_dialog='true',
)
# One entry per capture device (same camera model, distinguished by
# video_device_number; alternative PnP-path addressing kept for reference).
con_def = [
    dict(format='dshow', file=r'video=Logi C615 HD WebCam', options=dict(con_options, video_device_number='1') ),
    dict(format='dshow', file=r'video=Logi C615 HD WebCam', options=dict(con_options, video_device_number='0') ),
    dict(format='dshow', file=r'video=Logi C615 HD WebCam', options=dict(con_options, video_device_number='2') ),
    #dict(format='dshow', file=r'video=@device_pnp_\\?\usb#vid_046d&pid_082c&mi_02#9&8216d50&0&0002#{65e8773d-8f56-11d0-a3b9-00a0c9223196}\global', options=con_options ),
    #dict(format='dshow', file=r'video=@device_pnp_\\?\usb#vid_046d&pid_082c&mi_02#6&15d3e9a7&0&0002#{65e8773d-8f56-11d0-a3b9-00a0c9223196}\global', options=con_options ),
    #dict(format='dshow', file=r'video=@device_pnp_\\?\usb#vid_046d&pid_082c&mi_02#6&c7e554f&0&0002#{65e8773d-8f56-11d0-a3b9-00a0c9223196}\global', options=con_options ),
]
def read_frame(args):
    """Generator: open the capture device described by *args* (keyword
    arguments for av.open) and yield each decoded video frame as a
    BGR24 ndarray. The container is closed when the generator is
    exhausted or garbage-collected."""
    src = av.open(**args)
    try:
        for raw in src.decode(video=0):
            yield raw.to_ndarray(format='bgr24')
    finally:
        print("close")
        src.close()
def draw_fps(reader):
    """Generator wrapping a frame iterator: stamp a running FPS figure
    onto each frame before yielding it.

    FPS is averaged over the last N_SAMPLE frames using a queue of
    arrival timestamps.
    """
    stamps = deque(time.time() for _ in range(N_SAMPLE))
    for img in reader:
        # FPS over the most recent N_SAMPLE frames.
        t = time.time()
        rate = N_SAMPLE / (t - stamps.popleft())
        stamps.append(t)
        cv2.putText(img, '{:6.3f}fps'.format(rate), (10, 20),
                    cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 0))
        yield img
# Create one OpenGL window per device, each fed by its own
# read_frame -> draw_fps pipeline, then hand control to GLUT.
window_width = 1920
window_height = 1080
windows = []
for idx, dev_args in enumerate(con_def):
    pipeline = draw_fps(read_frame(dev_args))
    win = OpenGLWindow(
        title=f"Cap{idx}",
        read_image_iterator=pipeline,
        x=idx * window_width,
        width=window_width,
        height=window_height,
    )
    windows.append(win)
OpenGLWindow.mainloop()