Summary
What it does: run FramePack from the command line (CLI) (as of May 11, 2025).
Prerequisite: the fork version of FramePack is already up and running (see takusandayo's article).
Acknowledgments: this post builds on the following articles and repositories. Thank you for writing and sharing them.
- takusandayo's article: "[Apple Silicon] How to run FramePack on a Mac (verified on real hardware with 24 GB of memory)"
- yanosen_jp's article: "Making FramePack generate videos non-stop"
- brandon929's MPS-compatible fork: https://github.com/brandon929/FramePack
- lllyasviel's original repository: https://github.com/lllyasviel/FramePack
Example run
How to run: open Terminal, change into the FramePack directory, and run the script with arguments (the minimum is --image_path and --prompt; running it without them prints a usage message).
Output: generated videos are written to the outputs folder.
% cd ~/FramePack/
% python run_osx.py --seed 31337 --image_path ./image1.png --resolution 416 --prompt "A man walking"
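The remaining options follow the argparse definitions in the script below; for example, to disable TeaCache and generate a 3-second clip:
% python run_osx.py --image_path ./image1.png --prompt "A man walking" --no_teacache --total_second_length 3.0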
CLI script
Create a file named run_osx.py with the contents below and copy it into the FramePack folder.
*Based on yanosen_jp's script for the original version, adapted here to the fork.
run_osx.py
import os
os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import torch
import traceback
import einops
import numpy as np
import argparse
import math
from PIL import Image
from diffusers import AutoencoderKLHunyuanVideo
from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, generate_timestamp
from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
from transformers import SiglipImageProcessor, SiglipVisionModel
from diffusers_helper.clip_vision import hf_clip_vision_encode
from diffusers_helper.bucket_tools import find_nearest_bucket
# OSX: --resolution option added
def parse_args():
parser = argparse.ArgumentParser(description="Generate video from image and prompt using FramePack.")
    parser.add_argument('--image_path', type=str, required=True, help='Path to the input image.')
    parser.add_argument('--prompt', type=str, required=True, help='Text prompt describing the desired video content.')
parser.add_argument('--output_dir', type=str, default='./outputs/', help='Directory to save the output video and intermediate files.')
parser.add_argument('--n_prompt', type=str, default="", help='Negative text prompt.')
parser.add_argument('--seed', type=int, default=31337, help='Random seed for generation.')
parser.add_argument('--total_second_length', type=float, default=5.0, help='Total length of the video in seconds.')
parser.add_argument('--latent_window_size', type=int, default=9, help='Latent window size (should not change).')
parser.add_argument('--steps', type=int, default=25, help='Number of sampling steps.')
parser.add_argument('--cfg', type=float, default=1.0, help='CFG Scale (should not change).')
parser.add_argument('--gs', type=float, default=10.0, help='Distilled CFG Scale.')
parser.add_argument('--rs', type=float, default=0.0, help='CFG Re-Scale (should not change).')
parser.add_argument('--gpu_memory_preservation', type=float, default=6.0, help='GPU memory preservation in GB.')
parser.add_argument('--use_teacache', action='store_true', default=True, help='Use TeaCache for potentially faster generation.')
parser.add_argument('--no_teacache', action='store_false', dest='use_teacache', help='Do not use TeaCache.')
    parser.add_argument('--resolution', type=int, default=416, help='Target resolution bucket (min 240, max 720).')
args = parser.parse_args()
return args
@torch.no_grad()
def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, outputs_folder, resolution, high_vram, text_encoder, text_encoder_2, tokenizer, tokenizer_2, vae, feature_extractor, image_encoder, transformer):
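    # Section count estimate: output is saved at 30 fps and each latent
    # section contributes roughly latent_window_size * 4 frames.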
total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
total_latent_sections = int(max(round(total_latent_sections), 1))
job_id = generate_timestamp()
output_filename_final = None
print("Starting video generation...")
try:
# Clean GPU
if not high_vram:
unload_complete_models(
text_encoder, text_encoder_2, image_encoder, vae, transformer
)
# Text encoding
print("Text encoding ...")
if not high_vram:
fake_diffusers_current_device(text_encoder, gpu)
load_model_as_complete(text_encoder_2, target_device=gpu)
llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
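        # cfg == 1 disables real classifier-free guidance, so the negative
        # embeddings are never applied; zeros avoid a second text-encoder pass.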
if cfg == 1:
llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
else:
llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
# Processing input image
print("Image processing ...")
input_image_np = np.array(input_image)
H, W, C = input_image_np.shape
        # OSX: resolution was fixed at 640 in the original; now taken from --resolution
height, width = find_nearest_bucket(H, W, resolution=resolution)
input_image_np = resize_and_center_crop(input_image_np, target_width=width, target_height=height)
        # OSX: seed appended to the output filename
Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}_input_' + str(seed) + '.png'))
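        # Scale uint8 [0, 255] to [-1, 1] and reshape HWC -> B, C, T, H, W.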
input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
# VAE encoding
print("VAE encoding ...")
if not high_vram:
load_model_as_complete(vae, target_device=gpu)
start_latent = vae_encode(input_image_pt, vae)
# CLIP Vision
print("CLIP Vision encoding ...")
if not high_vram:
load_model_as_complete(image_encoder, target_device=gpu)
image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
# Dtype
llama_vec = llama_vec.to(transformer.dtype)
llama_vec_n = llama_vec_n.to(transformer.dtype)
clip_l_pooler = clip_l_pooler.to(transformer.dtype)
clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
# Sampling
print("Start sampling ...")
rnd = torch.Generator("cpu").manual_seed(seed)
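        # A window of latent_window_size latents decodes to
        # latent_window_size * 4 - 3 pixel frames (the VAE compresses time 4x).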
num_frames = latent_window_size * 4 - 3
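        # Context buffer along time: 1 "post" clean latent + 2 half-rate +
        # 16 quarter-rate history latents (matches the [1, 2, 16] split below).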
history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu()
history_pixels = None
total_generated_latent_frames = 0
latent_paddings = reversed(range(total_latent_sections))
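        # Sections are generated in reverse order (anti-drifting). Upstream
        # FramePack notes that for more than 4 sections, repeating padding 2
        # tends to look better than simply extending reversed(range(...)).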
if total_latent_sections > 4:
latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
for i, latent_padding in enumerate(latent_paddings):
is_last_section = latent_padding == 0
latent_padding_size = latent_padding * latent_window_size
print(f"Generating section {i+1}/{total_latent_sections} (padding={latent_padding}, last={is_last_section})...")
# print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}')
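            # Index layout along time:
            # [clean_pre | blank padding | new window | clean_post | 2x | 4x]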
indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
clean_latents_pre = start_latent.to(history_latents)
clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
if not high_vram:
unload_complete_models()
move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
if use_teacache:
transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
else:
transformer.initialize_teacache(enable_teacache=False)
def callback(d):
current_step = d['i'] + 1
percentage = int(100.0 * current_step / steps)
print(f" Sampling Step: {current_step}/{steps} ({percentage}%)", end='\r')
# No preview generation needed for CLI
return
            # OSX: dtype changed from torch.bfloat16 to transformer.dtype
generated_latents = sample_hunyuan(
transformer=transformer,
sampler='unipc',
width=width,
height=height,
frames=num_frames,
real_guidance_scale=cfg,
distilled_guidance_scale=gs,
guidance_rescale=rs,
num_inference_steps=steps,
generator=rnd,
prompt_embeds=llama_vec,
prompt_embeds_mask=llama_attention_mask,
prompt_poolers=clip_l_pooler,
negative_prompt_embeds=llama_vec_n,
negative_prompt_embeds_mask=llama_attention_mask_n,
negative_prompt_poolers=clip_l_pooler_n,
device=gpu,
dtype=transformer.dtype,
image_embeddings=image_encoder_last_hidden_state,
latent_indices=latent_indices,
clean_latents=clean_latents,
clean_latent_indices=clean_latent_indices,
clean_latents_2x=clean_latents_2x,
clean_latent_2x_indices=clean_latent_2x_indices,
clean_latents_4x=clean_latents_4x,
clean_latent_4x_indices=clean_latent_4x_indices,
callback=callback,
)
print() # Newline after step progress
if is_last_section:
generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
total_generated_latent_frames += int(generated_latents.shape[2])
history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
if not high_vram:
offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
load_model_as_complete(vae, target_device=gpu)
real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
print(" Decoding latents...")
if history_pixels is None:
history_pixels = vae_decode(real_history_latents, vae).cpu()
else:
section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
overlapped_frames = latent_window_size * 4 - 3
current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
if not high_vram:
unload_complete_models()
            # OSX: seed appended to the output filename
output_filename_final = os.path.join(outputs_folder, f'{job_id}_' + str(seed) +'.mp4') # Use a consistent final name
print(f" Saving video segment to {output_filename_final}...")
            # OSX: fps changed from 24 to 30
            # mp4 crf fixed at 16
save_bcthw_as_mp4(history_pixels, output_filename_final, fps=30, crf=16)
print(f" Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}")
if is_last_section:
break
except Exception as e:
traceback.print_exc()
print(f"Error during generation: {e}")
    # finally block added; at this point the original releases server-side resources instead
finally:
if not high_vram:
print("Unloading models...")
unload_complete_models(
text_encoder, text_encoder_2, image_encoder, vae, transformer
)
print("Video generation finished.")
return output_filename_final
if __name__ == "__main__":
args = parse_args()
print("Arguments:")
for k, v in vars(args).items():
print(f" {k}: {v}")
print("\nInitializing...")
    # OSX: MPS support; torch.mps branch added
if torch.cuda.is_available():
free_mem_gb = get_cuda_free_memory_gb(gpu)
else:
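        # torch.mps.recommended_max_memory() returns bytes; convert to GB.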
free_mem_gb = torch.mps.recommended_max_memory() / 1024 / 1024 / 1024
high_vram = free_mem_gb > 60
print(f'Free VRAM {free_mem_gb} GB')
print(f'High-VRAM Mode: {high_vram}')
# Load Models
print("Loading models...")
text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()
print("Models loaded.")
# Configure Models
vae.eval()
text_encoder.eval()
text_encoder_2.eval()
image_encoder.eval()
transformer.eval()
if not high_vram:
vae.enable_slicing()
vae.enable_tiling()
transformer.high_quality_fp32_output_for_inference = True
# print('transformer.high_quality_fp32_output_for_inference = True') # Less verbose
transformer.to(dtype=torch.bfloat16)
vae.to(dtype=torch.float16)
image_encoder.to(dtype=torch.float16)
text_encoder.to(dtype=torch.float16)
text_encoder_2.to(dtype=torch.float16)
vae.requires_grad_(False)
text_encoder.requires_grad_(False)
text_encoder_2.requires_grad_(False)
image_encoder.requires_grad_(False)
transformer.requires_grad_(False)
if not high_vram:
print("Installing dynamic swap for low VRAM...")
DynamicSwapInstaller.install_model(transformer, device=gpu)
DynamicSwapInstaller.install_model(text_encoder, device=gpu)
else:
print("Moving models to GPU (High VRAM)...")
text_encoder.to(gpu)
text_encoder_2.to(gpu)
image_encoder.to(gpu)
vae.to(gpu)
transformer.to(gpu)
os.makedirs(args.output_dir, exist_ok=True)
# Load Input Image
print(f"Loading input image from: {args.image_path}")
try:
input_image = Image.open(args.image_path).convert('RGB')
except FileNotFoundError:
print(f"Error: Input image not found at {args.image_path}")
exit(1)
except Exception as e:
print(f"Error loading image: {e}")
exit(1)
    # OSX: resolution passed through to the worker
# Run Worker
final_video_path = worker(
input_image=input_image,
prompt=args.prompt,
n_prompt=args.n_prompt,
seed=args.seed,
total_second_length=args.total_second_length,
latent_window_size=args.latent_window_size,
steps=args.steps,
cfg=args.cfg,
gs=args.gs,
rs=args.rs,
gpu_memory_preservation=args.gpu_memory_preservation,
use_teacache=args.use_teacache,
outputs_folder=args.output_dir,
resolution=args.resolution,
high_vram=high_vram,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
tokenizer=tokenizer,
tokenizer_2=tokenizer_2,
vae=vae,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
transformer=transformer
)
if final_video_path:
print(f"\nSuccessfully generated video: {final_video_path}")
else:
print("\nVideo generation failed.")
Notes:
Main changes from the fork version for the CLI script
- mp4 crf = 16 (fixed)
- fps = 30 (fixed)
- seed appended to the output filename (added so the seed can be identified when it is randomized at run time)
- a few other minor tweaks
Some of the changes from the original to the fork (see the fork's page for details)
- MPS support (macOS / Apple Silicon)
- --resolution option added, with a default of 416 (the original is fixed at 640). This affects compute cost and the amount of memory and VRAM required; see the sketch below.
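To check which bucketed output size a given --resolution maps to before running a full generation, you can call the repo's bucket helper directly. A minimal sketch, assuming the fork's find_nearest_bucket signature used in run_osx.py above and run from the FramePack folder (the file name preview_bucket.py is hypothetical):
# preview_bucket.py (hypothetical helper, not part of the repo)
# Prints the bucketed (height, width) for a given input size and --resolution.
from diffusers_helper.bucket_tools import find_nearest_bucket

h, w = find_nearest_bucket(768, 1024, resolution=416)  # input image H, W; --resolution value
print(f"bucketed size: {h} x {w}")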
Interrupt: press Ctrl+C to stop generation.