動画のキャプション作成 #ObjectDetection

動画からキャプションを作成してみました

この動画のキャプションを作成すると以下のようになりました。
https://www.youtube.com/shorts/u0TfMuEoVqA

この動画は、最新の24年式のトロトアックを紹介する内容です。外側と内側の様子が紹介されています。ディテクトされたオブジェクトにはキーボード、ブロッコリー、電車、象などが含まれています。

動画内容によっては、キャプションらしいものが作成できました。

物体検知やWhisperのモデルをより大きいものに変更することで、キャプションの精度が改善されることを期待します。

動画ファイルから音声を抽出し、Whisperを利用して文字起こしします。

def extract_audio_and_transcribe(video_path, audio_path='extracted_audio.wav'):
    """Write a video's audio track to a file and transcribe it with Whisper.

    Args:
        video_path: Path of the video file to open.
        audio_path: Path the extracted audio is written to before it is
            passed to the transcription model.

    Returns:
        The transcribed text (the ``'text'`` entry of the Whisper result),
        or an empty string when the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # A clip without an audio track has nothing to transcribe;
            # return early instead of failing on ``None.write_audiofile``.
            return ''
        try:
            audio.write_audiofile(audio_path)
        finally:
            audio.close()
    finally:
        # Release the clip even when extraction fails part-way.
        video.close()

    model = whisper.load_model("base")
    result = model.transcribe(audio_path)
    print(result['text'])
    return result['text']

SSDを利用して、動画ファイルから物体名を抽出します。

def detect_objects_in_video(video_path, detection_model, class_names, *,
                            frame_interval=10, count_threshold=5):
    """Run object detection on sampled video frames and list frequent objects.

    Every ``frame_interval``-th frame is converted from BGR to RGB, given a
    leading batch axis, and passed to ``detection_model``. For each sampled
    frame, the names of objects detected more than ``count_threshold`` times
    in that frame are collected.

    Args:
        video_path: Path of the video file to read with OpenCV.
        detection_model: Callable that takes a uint8 tensor with a leading
            batch axis and returns a mapping containing ``'num_detections'``
            and ``'detection_classes'``.
        class_names: Container indexed by integer class id that supports
            ``len()``; ids at or beyond its length are treated as unknown.
        frame_interval: Run detection on one frame out of this many.
        count_threshold: A name is kept for a frame only when it was detected
            more than this many times in that frame.

    Returns:
        A list with one entry per sampled frame; each entry is a list of the
        object names that passed the filter (each name appears once).

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    if frame_interval < 1:
        raise ValueError("frame_interval must be a positive integer")

    cap = cv2.VideoCapture(video_path)
    descriptions = []
    frame_count = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Only run the detector on every frame_interval-th frame.
            if frame_count % frame_interval == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                batch = np.expand_dims(frame_rgb, axis=0).astype('uint8')

                detections = detection_model(tf.convert_to_tensor(batch))
                num_detections = int(detections['num_detections'])
                detection_classes = (
                    detections['detection_classes'][0].numpy().astype(int)
                )

                # Map class ids to names, guarding against out-of-range ids.
                detected_objects = [
                    class_names[class_id] if class_id < len(class_names) else 'Unknown'
                    for class_id in detection_classes[:num_detections]
                ]

                # Keep a name once when it occurs more than count_threshold
                # times in this frame; rarer names and 'Unknown' are dropped.
                # NOTE(review): detection scores are not consulted, so the
                # count acts as the only filter - confirm this is intended.
                object_counts = Counter(detected_objects)
                filtered_objects = [
                    obj
                    for obj, count in object_counts.items()
                    if obj != 'Unknown' and count > count_threshold
                ]
                descriptions.append(filtered_objects)

            frame_count += 1
    finally:
        # Release the capture even if conversion or inference raises.
        cap.release()

    print(descriptions)
    return descriptions

ChatGPT(API)を利用し、物体名と音声テキストを合わせてキャプションを作成します

# Caption generation with the OpenAI chat completions API (gpt-3.5-turbo)
def generate_caption(transcription, descriptions):
    """Generate a Japanese video caption from a transcript and detected objects.

    Args:
        transcription: Transcribed speech of the video.
        descriptions: Detected object names, either as a flat iterable of
            names or as an iterable of per-frame lists/tuples of names.

    Returns:
        The generated caption with surrounding whitespace stripped, or
        ``None`` if the API request or the streaming loop raised.
    """
    # Flatten per-frame lists into unique names (first-seen order). Joining
    # the raw per-frame lists would embed list reprs such as "[]" for every
    # sampled frame and inflate the prompt with duplicates.
    unique_names = dict.fromkeys(
        str(name)
        for entry in descriptions
        for name in (entry if isinstance(entry, (list, tuple)) else (entry,))
    )
    descriptions_text = ', '.join(unique_names)
    prompt = f"Describe this video based on the transcription and detected objects: {transcription} and objects detected are {descriptions_text}"

    # Create an OpenAI client instance
    client = OpenAI()

    try:
        # Start a streamed chat completion with gpt-3.5-turbo
        stream = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Generate a video caption."},
                {"role": "user", "content": prompt},
                {"role": "user", "content": "日本語で回答して" }
            ],
            max_tokens= 3000,  # Adjust max_tokens to stay within the limit
            stream=True,  # Enable streaming
            temperature=0.5,  # Controls the randomness of the generated text
            top_p=0.9  # Cumulative-probability cutoff for next-token sampling
        )

        chunks = []
        for response in stream:
            # Some stream chunks carry no choices; skip them rather than
            # failing on choices[0].
            if not response.choices:
                continue
            content = response.choices[0].delta.content
            if content:
                chunks.append(content)

        return ''.join(chunks).strip()

    except Exception as e:
        # Best-effort boundary: report the failure and signal it with None.
        print(f"An error occurred: {e}")
        return None