GPT-4VとDALL·E 3のAPIでimg2imgを実装（Flask使用）

Posted at 2023-12-03

カメラで人の顔の写真を撮り、その顔をアバターにする仕組みを実装しました。

カメラ起動ボタン

ディレクトリ構造は下記です。

ディレクトリ構成図

app.py
└── templates
    └── index.html

index.htmlにカメラボタンをHTMLとJavaScriptで記述します。

index.html

 
    <section>
   <!-- Button that starts the camera -->
        <button id="cameraButton">カメラを起動</button>
        <!-- Button that captures a still frame -->
        <button id="captureButton">撮影</button>
        <!-- Generate button; hidden until a frame has been captured -->
        <button id="saveButton" style="display:none;">生成</button>
        <!-- Live camera preview; hidden until the camera button is clicked -->
        <video id="cameraStream" width="640" height="480" autoplay style="display:none;"></video>
        <!-- Hidden img element that later receives the generated avatar image -->
        <img id="yourImageElementId" src="" alt="Generated Image" style="display: none;">
        <!-- Hidden img element that shows the captured frame -->
        <img id="capturedImage" style="display:none;">
    </section>

    <script>
        // Look up the DOM elements used by the camera UI.
        const cameraButton = document.getElementById('cameraButton');
        // NOTE(review): the markup shown in this file has no element with id
        // "stopCameraButton", so this lookup presumably yields null — confirm.
        const stopCameraButton = document.getElementById('stopCameraButton');
        const captureButton = document.getElementById('captureButton');
        const saveButton = document.getElementById('saveButton');
        const cameraStream = document.getElementById('cameraStream');
        const capturedImage = document.getElementById('capturedImage');
        // Active MediaStream; stays null until startCamera assigns it.
        let stream = null;

        // Ask the browser for a video stream and attach it to the <video> element.
        // A failure to obtain the stream is logged to the console.
        const startCamera = () => {
            // Remember the stream so stopCamera can stop it later, then start playback.
            const attachStream = (mediaStream) => {
                stream = mediaStream;
                cameraStream.srcObject = mediaStream;
                cameraStream.play();
            };
            const reportFailure = (error) => {
                console.error("カメラの起動に失敗しました:", error);
            };

            const constraints = { video: true };
            navigator.mediaDevices
                .getUserMedia(constraints)
                .then(attachStream)
                .catch(reportFailure);
        };
        // Stop every track of the stream opened by startCamera and detach it
        // from the <video> element. Does nothing when no stream has been stored.
        const stopCamera = () => {
            if (!stream) {
                return;
            }
            for (const track of stream.getTracks()) {
                track.stop();
            }
            cameraStream.srcObject = null;
        };
        
        // Copy the current video frame onto an off-screen canvas, show it in the
        // capturedImage element as a PNG data URL and reveal the generate button.
        // When the video has no frame yet (videoWidth/videoHeight are 0, e.g. the
        // camera has not been started) nothing is captured and a warning is logged,
        // instead of producing an empty image from a 0x0 canvas.
        const captureImage = () => {
            const { videoWidth, videoHeight } = cameraStream;
            if (!videoWidth || !videoHeight) {
                console.warn('カメラの映像がまだ取得できていないため、撮影できません。');
                return;
            }

            // Draw the current frame at the video's native resolution.
            const canvas = document.createElement('canvas');
            canvas.width = videoWidth;
            canvas.height = videoHeight;
            canvas.getContext('2d').drawImage(cameraStream, 0, 0, canvas.width, canvas.height);

            // Show the still image and make the generate button available.
            capturedImage.src = canvas.toDataURL('image/png');
            capturedImage.style.display = 'block';
            saveButton.style.display = 'block';
        };

        // Stop whatever stream is currently attached to the <video> element and
        // hide the element. (No handler in the script shown here calls this.)
        const saveImage = () => {
            const attachedStream = cameraStream.srcObject;

            if (attachedStream) {
                for (const track of attachedStream.getTracks()) {
                    track.stop();
                }
            }

            cameraStream.style.display = 'none';
        };

        // Wire up the camera buttons once the DOM has been parsed.
        document.addEventListener('DOMContentLoaded', () => {
            const cameraButton = document.getElementById('cameraButton');
            const stopCameraButton = document.getElementById('stopCameraButton');
            const captureButton = document.getElementById('captureButton');
            const cameraStream = document.getElementById('cameraStream');

            // Start the camera and reveal the <video> element.
            cameraButton.addEventListener('click', () => {
                startCamera();
                cameraStream.style.display = 'block';
            });

            // The page markup may not contain a stop button. Calling
            // addEventListener on null would throw here and prevent the capture
            // listener below from ever being registered, so only bind when present.
            if (stopCameraButton) {
                stopCameraButton.addEventListener('click', stopCamera);
            }

            // Capture a still frame from the live video.
            captureButton.addEventListener('click', captureImage);
        });

        // When the generate button is clicked, render the current video frame to a
        // PNG blob and pass it to uploadImageAndSaveData.
        // NOTE(review): this grabs the live frame at click time, not the still that
        // captureImage put into #capturedImage — confirm that this is intended.
        document.getElementById('saveButton').addEventListener('click', () => {
            const cameraStream = document.getElementById('cameraStream');
            const { videoWidth, videoHeight } = cameraStream;

            // A video without a frame yields a 0x0 canvas, which cannot be encoded.
            if (!videoWidth || !videoHeight) {
                console.error('カメラの映像が取得できていないため、画像を生成できません。');
                return;
            }

            const canvas = document.createElement('canvas');
            canvas.width = videoWidth;
            canvas.height = videoHeight;
            canvas.getContext('2d').drawImage(cameraStream, 0, 0, canvas.width, canvas.height);

            canvas.toBlob((imageBlob) => {
                // toBlob passes null when the image cannot be created.
                if (!imageBlob) {
                    console.error('画像データの作成に失敗しました。');
                    return;
                }
                uploadImageAndSaveData(imageBlob);
            }, 'image/png');
        });

    </script>

ここまでで、カメラを起動して、撮影、保存するという機能が実装できます。
ただ、GPT-4Vにはローカルのファイルパスをそのまま渡すことはできないので、今回は画像をクラウドに上げてURLで渡します。

そこで、Firebase Storageに画像をアップします。先ほどのコードの下に追記します。なお、下記のimport文はモジュールスクリプトでしか使えないため、scriptタグは<script type="module">にしておく必要があります。

Firebaseを設定する

index.html

        import { initializeApp } from "https://www.gstatic.com/firebasejs/9.22.2/firebase-app.js";

        import {
            getFirestore,
            collection,
            doc,
        } from "https://www.gstatic.com/firebasejs/9.22.2/firebase-firestore.js";

        import {
            getStorage,
            ref,
            uploadBytes,
            getDownloadURL,
        } from "https://www.gstatic.com/firebasejs/9.22.2/firebase-storage.js";


        // Firebase project settings. Every value below is a placeholder that
        // must be replaced with the configuration of your own Firebase project.
        const firebaseConfig = {
            apiKey: "YOUR_API_KEY",
            authDomain: "XXXXX",
            projectId: "XXXXX",
            storageBucket: "XXXXX",
            messagingSenderId: "XXXXX",
            appId: "XXXXX",
        };

        // Initialise the Firebase app and obtain the Storage handle that the
        // upload code passes to ref().
        const app = initializeApp(firebaseConfig);
        const storage = getStorage(app);

APIキーなどの設定値は自分のものを入れる必要があります。firebaseConfigは下記が参考になります。
https://firebase.google.com/docs/web/setup?hl=ja

Firebaseに保存して、Flaskと通信する

index.htmlに下記も追記する形で、
上記で設定したFirebase Storageに画像を保存します。

index.html


// Upload the captured PNG blob to Firebase Storage as images/<timestamp>.png
// and obtain its download URL. Failures are logged to the console.
const uploadImageAndSaveData = (imageBlob) => {
    const storageRef = ref(storage, `images/${new Date().getTime()}.png`);
    uploadBytes(storageRef, imageBlob)
        .then(snapshot => getDownloadURL(snapshot.ref))
        .then(downloadURL => {
            // (＊１) Send downloadURL to the Flask server here and receive the
            // analysis result (described later in the article).
        })
        .catch(error => {
            console.error('エラーが発生しました:', error);
        });
};

本来的に実装したいのはimg2imgですが、openaiAPIではまだ実装できないので、一度テキスト化して画像にする、img2txt2imgで実装します。
なので、①img2txt　②txt2img で実装しています。

まずは①img2txt。
保存したURLをdownloadURLで返してもらいます。そのURLにGPT4VのAPIをかまして、何が映っているか、テキストにしてもらいます。

プロンプトは400トークン以内が望ましいということで、画像に何が写っているのかを150トークン以内で記述してもらいます。

（プロンプトについては下記サイトが参考になりました。）
DALL-E3 (ダリスリー) の無料教科書:初級編
https://note.com/chatgpt4graph/n/na2c61100d60d

下記コードを（＊１）に流し込みます。

①img2txt

index.html

// Insert at (*1): POST the download URL as JSON to the Flask /analyze-image
// endpoint and return the fetch promise so the next .then receives the response.
 return fetch('/analyze-image', {
    method: 'POST',
    headers: {
        'Content-Type': 'application/json'
    },
    body: JSON.stringify({ image_url: downloadURL })
});

flask、openAIのAPIを使用するにはインストールが必要ですので、ターミナルに下記を打ち込んでください。

pip install Flask
pip install openai

ここでようやくPythonファイルの作成です。
ディレクトリ構造は上の説明を参考にしてください。

app.py

# app.py
from flask import Flask, request, render_template, jsonify
import os
from openai import OpenAI
import base64
import time

app = Flask(__name__)

# Directory name for saved files, stored in the Flask config
# (not read anywhere else in the code shown here).
UPLOAD_FOLDER = 'uploaded_images'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Create the OpenAI client; replace the placeholder with your own API key.
openai_client = OpenAI(api_key="YOUR_API_KEY") 

# Home page route
@app.route('/')
def home():
    """Render templates/index.html, the page that hosts the camera UI."""
    return render_template('index.html')

# Image upload route
@app.route('/upload', methods=['POST'])
def upload_image():
    """Placeholder upload endpoint; the body is elided in the article and it only returns a fixed message."""
    # ...
    return "画像がアップロードされました"

# Image-analysis route
@app.route('/analyze-image', methods=['POST'])
def analyze_image():
    """Ask the vision model to describe the person in the image at the posted URL.

    Expects a JSON body of the form ``{"image_url": "<URL>"}``.

    Returns:
        The model's text description serialised as JSON, or a JSON error
        object with HTTP status 400 when ``image_url`` is missing or the
        body is not a JSON object.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body,
    # so every malformed request is answered with the same explicit 400.
    payload = request.get_json(silent=True)
    image_url = payload.get('image_url') if isinstance(payload, dict) else None
    if not image_url:
        return jsonify({"error": "image_url is required"}), 400
    print("Received image URL:", image_url)  # debug output

    response = openai_client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", 
                       "text": "Describe the person in this image, paying attention to hair color, and hair texture. Also describe the clothing and color from the chest up. Do not touch the background elements or facial expressions."
                     },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url 
                        }
                    }
                ]
            }
        ],
        # Cap the length of the description returned by the model.
        max_tokens=150
    )

    content = response.choices[0].message.content
    print(content)  # debug output

    return jsonify(content)

これで、どんな人が写真に写っているかを記述してもらうことができます。
なるべくシンプルな人物像をここで描写してもらう必要があります。
なので、背景や表情については触れないように禁則を設けています。
ここで表情を描写してしまうと、画像を生成する時にいくら「笑顔の」というプロンプトを書いても、「しかめっつらの」といった表情の描写に引っ張られてしまうからです。
また、服装の色味は出来栄えに影響するのであえて描写してもらいます。

②txt2img

そうして戻ってきた値を変数dataとして扱い、イラストにして欲しいというpromptに${data}で流し込みます。
①img2txtの次に足します。

index.html

![dall-e-3_1701269172_0.png](https://qiita-image-store.s3.ap-northeast-1.amazonaws.com/0/3341129/d37fa648-6db5-9c8a-1c57-0a125653574b.png)

    // Parse the /analyze-image response body as JSON; the parsed value is
    // interpolated at the end of the prompt below.
    .then(response => response.json())
    .then(data => {
    
        const prompt = `Please create an illustration of a single cheerful person in a sticker style. The background is white. Please do not include any elements other than the person. The character features large eyes, no nose. Keep it simple with bright colors and few bold lines. The body from the chest up is small, which emphasizes the Japanese cartoon look.${data}`;

    
        // POST the combined prompt as JSON to the Flask /generate-image endpoint.
        return fetch('/generate-image', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({ prompt: prompt })  // prompt built from the analysis result
        });
    })

Flaskもapp.pyの下に追記します。

app.py

# app.py
@app.route('/generate-image', methods=['POST'])
def generate_image():
    """Generate one image from the posted prompt and save it under static/img.

    Expects a JSON body of the form ``{"prompt": "<text>"}``.

    Returns:
        JSON ``{"image_path": "img/<file>.png"}`` where the path is relative
        to the ``static`` directory, or a JSON error object with HTTP status
        400 when ``prompt`` is missing or the body is not a JSON object.
    """
    payload = request.get_json(silent=True)
    prompt = payload.get('prompt') if isinstance(payload, dict) else None
    if not prompt:
        return jsonify({"error": "prompt is required"}), 400

    # Call the OpenAI image API; the image is returned base64-encoded.
    response = openai_client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        n=1,
        size="1024x1024",
        response_format="b64_json",
        quality="standard",
        style="vivid"
    )

    # Build the name exactly once: evaluating time.time() separately for the
    # saved file and for the returned path can straddle a second boundary and
    # hand the client a path that does not exist.
    image_path = f"img/dall-e-3_{int(time.time())}_0.png"
    filename = os.path.join("static", image_path)

    # Make sure the target directory exists before writing.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # n=1 is requested above, so there is a single image to save.
    with open(filename, "wb") as f:
        f.write(base64.b64decode(response.data[0].b64_json))

    return jsonify({"image_path": image_path})

if __name__ == '__main__':
    app.run(debug=True)

イメージ画像が保存されたurlをjsonで返してもらいます。
そうしてようやく生成された画像をHTMLに表示することができます。
ちなみに、プロンプト次第で出力結果に大きく影響があるので、注意が必要です。

ここに出来上がった画像を表示するコードも付加した全体のコードは下記になります。

index.html


// Upload the captured frame to Firebase Storage, POST its download URL to
// /analyze-image, POST the resulting prompt to /generate-image, and finally
// show the generated image in #yourImageElementId.
//
// imageBlob: the PNG blob to upload.
// Returns the promise chain so callers may wait for completion; errors are
// logged to the console and do not reject the returned promise.
const uploadImageAndSaveData = (imageBlob) => {
    // Treat non-2xx responses as failures instead of parsing an error body as
    // if it were a successful result.
    const parseJsonResponse = (response) => {
        if (!response.ok) {
            throw new Error(`サーバーがエラーを返しました (HTTP ${response.status})`);
        }
        return response.json();
    };

    // POST a JSON payload to the Flask server.
    const postJson = (url, payload) => fetch(url, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(payload)
    });

    // Hide the live video while the avatar is being generated.
    cameraStream.style.display = 'none';

    const storageRef = ref(storage, `images/${new Date().getTime()}.png`);

    return uploadBytes(storageRef, imageBlob)
        .then((snapshot) => getDownloadURL(snapshot.ref))
        // Send the download URL to the Flask server and receive the description.
        .then((downloadURL) => postJson('/analyze-image', { image_url: downloadURL }))
        .then(parseJsonResponse)
        .then((data) => {
            const prompt = `Please create an illustration of a single cheerful person in a sticker style. The background is white. Please do not include any elements other than the person. The character features large eyes, no nose. Keep it simple with bright colors and few bold lines. The body from the chest up is small, which emphasizes the Japanese cartoon look.${data}`;

            // Request a new image based on the analysis result.
            return postJson('/generate-image', { prompt });
        })
        .then(parseJsonResponse)
        .then((data) => {
            // Path of the generated image, relative to /static/.
            const imagePath = data.image_path;
            console.log('画像パス:', imagePath);

            // Point the <img> element at the generated file and reveal it.
            const imageElement = document.getElementById('yourImageElementId');
            imageElement.src = `/static/${imagePath}`;
            imageElement.style.display = 'block';

            // Cap the displayed width at 100px.
            imageElement.style.maxWidth = '100px';
        })
        .catch((error) => {
            console.error('エラーが発生しました:', error);
        });
};

これで、少し遠回りですが、img2imgを実装できているはずです。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up