はじめに
EVO-X2でDepth Anything 3を動かしてみました
開発環境
- EVO-X2
- Python 3.12
導入
下記を参考に、Pythonプログラムを作成
da3.py
import torch
import subprocess
import sys
import os
import time
import glob
import shutil
from datetime import datetime
# ----------------------------------------------------------------------------
# Environment bootstrap: check the Python version, then install any missing
# third-party packages before importing them.
# ----------------------------------------------------------------------------

# Python version check: require 3.8+ before doing anything else.
if sys.version_info < (3, 8):
    print(f"❌ Python 3.8以降が必要です。現在のバージョン: {sys.version}")
    sys.exit(1)
print(f"✅ Python {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")

# Verify package installation: importable module name -> pip distribution name.
required_packages = {
    'depth_anything_3': 'awesome-depth-anything-3',
    'addict': 'addict',
    'matplotlib': 'matplotlib',
    'PIL': 'pillow',
    'numpy': 'numpy',
    'torch': 'torch',
}

missing_packages = []
for module_name, package_name in required_packages.items():
    try:
        __import__(module_name)
    except ImportError:
        missing_packages.append(package_name)

if missing_packages:
    print(f"📦 Installing missing packages: {', '.join(missing_packages)}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_packages)
    # FIX: packages installed while the interpreter is already running may not
    # be visible to the import system until the finder caches are refreshed
    # (see importlib.invalidate_caches() docs); without this the re-imports
    # below can still raise ImportError on some setups.
    import importlib
    importlib.invalidate_caches()

# Import again, now that everything is guaranteed to be installed.
from depth_anything_3.api import DepthAnything3
from depth_anything_3.utils.export.glb import export_to_glb
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import urllib.request
# ============================================================================
# 1. Device selection
# ============================================================================
# Prefer CUDA when available; otherwise fall back to CPU with a warning.
if torch.cuda.is_available():
    device = "cuda"
    gpu_name = torch.cuda.get_device_name(0)
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✅ GPU detected: {gpu_name} ({vram:.1f} GB VRAM)")
else:
    device = "cpu"
    gpu_name = "None"
    vram = 0
    print("⚠️ No GPU detected. Using CPU (will be slower)")
# ============================================================================
# 2. Load the model
# ============================================================================
# Options: "DA3-SMALL", "DA3-BASE", "DA3-LARGE", "DA3-GIANT", "DA3NESTED-GIANT-LARGE"
model_size = "DA3-LARGE"

print(f"\n📥 Loading {model_size}...")
print("| Model | Speed | Quality | VRAM |")
print("|-------|-------|---------|------|")
print("| SMALL | ⚡⚡⚡ | ★★☆ | 4GB |")
print("| BASE | ⚡⚡ | ★★★ | 6GB |")
print("| LARGE | ⚡ | ★★★★ | 8GB |")
print("| GIANT | 🐢 | ★★★★★ | 12GB |")
print("| NESTED | 🐢 | ★★★★★+ | 16GB |")

# Download (if needed) and load the pretrained weights, then move the model
# to the selected device in eval mode.
start = time.time()
model = DepthAnything3.from_pretrained(f"depth-anything/{model_size}").to(device).eval()
print(f"✅ Model loaded in {time.time()-start:.1f}s")
# ============================================================================
# 3. Try a sample image
# ============================================================================
print("\n🖼️ Testing with sample image...")
os.makedirs("samples", exist_ok=True)

sample_path = "samples/mountain.jpg"
url = "https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=1280"
if not os.path.exists(sample_path):
    print("📥 Downloading sample image...")
    urllib.request.urlretrieve(url, sample_path)

# Run inference on the single sample image.
result = model.inference([sample_path])

# Side-by-side visualization: input photo on the left, depth map on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
input_ax, depth_ax = axes

input_ax.imshow(result.processed_images[0])
input_ax.set_title("📸 Input", fontsize=14, fontweight='bold')
input_ax.axis("off")

depth = result.depth[0]
im = depth_ax.imshow(depth, cmap='Spectral_r')
depth_ax.set_title(f"🌊 Depth (range: {depth.min():.1f}m - {depth.max():.1f}m)", fontsize=14, fontweight='bold')
depth_ax.axis("off")
plt.colorbar(im, ax=depth_ax, fraction=0.046, pad=0.04, label='Depth (m)')

plt.tight_layout()
plt.savefig("samples/depth_result.png", dpi=150, bbox_inches='tight')
print("💾 Saved visualization to samples/depth_result.png")
plt.close()

# Report the raw prediction shapes returned by the model.
print(f"\n📊 Output shapes:")
print(f" Depth: {result.depth.shape}")
print(f" Confidence: {result.conf.shape}")
print(f" Camera intrinsics: {result.intrinsics.shape}")
# ============================================================================
# 4. 3D reconstruction from your own images
# ============================================================================
print("\n" + "="*60)
print("📤 4. Use Your Own Images")
print("="*60)

# Directory containing the user's images (change as needed).
upload_dir = "my_images"

print(f"\n💡 Tips for best results:")
print(" - Move the camera, not the objects")
print(" - 30-50% overlap between consecutive images")
print(" - Avoid motion blur")
print(" - Good lighting helps!")

if not os.path.exists(upload_dir):
    print(f"\n❌ Directory '{upload_dir}' not found.")
    print(f"Please create the directory and add your images there.")
    print(f" mkdir {upload_dir}")
    image_files = []
else:
    # FIX: match extensions case-insensitively — cameras commonly produce
    # ".JPG"/".PNG", which the previous lowercase-only glob patterns missed.
    # A set comprehension also de-duplicates on case-insensitive filesystems.
    image_extensions = {".jpg", ".jpeg", ".png", ".webp"}
    image_files = sorted({
        path
        for path in glob.glob(os.path.join(upload_dir, "*"))
        if os.path.splitext(path)[1].lower() in image_extensions
    })
    print(f"\n✅ Found {len(image_files)} images in '{upload_dir}'")

# Preview the first few images in a single row.
if len(image_files) > 0:
    n_preview = min(6, len(image_files))
    fig, axes = plt.subplots(1, n_preview, figsize=(3*n_preview, 3))
    if n_preview == 1:
        axes = [axes]  # subplots returns a bare Axes for a single column
    for i, img_path in enumerate(image_files[:n_preview]):
        # FIX: close each PIL image after plotting (imshow copies the pixel
        # data), instead of leaking open file handles.
        with Image.open(img_path) as img:
            axes[i].imshow(img)
        axes[i].set_title(f"#{i+1}", fontsize=10)
        axes[i].axis("off")
    if len(image_files) > n_preview:
        print(f" (showing first {n_preview} of {len(image_files)})")
    plt.tight_layout()
    plt.savefig("samples/image_preview.png", dpi=150, bbox_inches='tight')
    plt.close()
# 3D reconstruction (needs at least two overlapping views).
if len(image_files) >= 2:
    print(f"\n🔄 Processing {len(image_files)} images...")
    start = time.time()
    result = model.inference(
        image_files,
        process_res_method="upper_bound_resize",
    )
    inference_time = time.time() - start
    print(f"✅ Inference done in {inference_time:.1f}s ({len(image_files)/inference_time:.1f} img/s)")

    # Export the predicted point cloud (plus camera frusta) to GLB.
    output_dir = "output_3d"
    os.makedirs(output_dir, exist_ok=True)
    print("📦 Generating 3D point cloud...")
    export_to_glb(
        result,
        export_dir=output_dir,
        show_cameras=True,
        conf_thresh_percentile=20,  # Filter low-confidence points
        num_max_points=500_000,
    )
    print(f"\n✅ 3D model saved to {output_dir}/")

    # List the exported files with their sizes. (The exists() guard was
    # redundant — makedirs above guarantees the directory is present.)
    for f in os.listdir(output_dir):
        filepath = os.path.join(output_dir, f)
        size = os.path.getsize(filepath) / (1024 * 1024)  # MB
        print(f" 📄 {f}: {size:.2f} MB")

    # BUG FIX: the exporter writes "scene.glb" (visible in the run log), not
    # "point_cloud.glb", so the success message below never fired. Look for
    # any .glb file in the output directory instead of a hard-coded name.
    glb_files = sorted(glob.glob(os.path.join(output_dir, "*.glb")))
    if glb_files:
        glb_file = glb_files[0]
        print(f"\n🎉 GLB file created!")
        print(f"📁 Location: {os.path.abspath(glb_file)}")
        print(f"\n👉 View your model: https://gltf-viewer.donmccurdy.com/")
elif len(image_files) == 1:
    print("\n⚠️ At least 2 images are needed for 3D reconstruction.")
else:
    print("\n⚠️ No images found. Skipping 3D reconstruction.")
# ============================================================================
# 5. Visualize multi-view results
# ============================================================================
# Only runs after a multi-image reconstruction (the single-image depth map
# was already visualized in step 3).
# FIX: the original condition was "> 0 and > 1" — the first conjunct is
# implied by the second, so "> 1" alone is sufficient.
if 'result' in locals() and len(result.depth) > 1:
    print("\n" + "="*60)
    print("📊 5. Visualize Results")
    print("="*60)

    # Grid of per-frame depth maps, up to 4 columns wide.
    n_images = len(result.depth)
    cols = min(4, n_images)
    rows = (n_images + cols - 1) // cols
    fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows))
    axes = np.array(axes).flatten() if n_images > 1 else [axes]
    for i in range(n_images):
        depth = result.depth[i]
        axes[i].imshow(depth, cmap='Spectral_r')
        axes[i].set_title(f"Frame {i+1}", fontsize=10)
        axes[i].axis("off")
    # Hide any unused subplots in the grid.
    for i in range(n_images, len(axes)):
        axes[i].axis("off")
    plt.suptitle("🌊 Depth Maps", fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig("samples/all_depth_maps.png", dpi=150, bbox_inches='tight')
    print("💾 Saved depth maps to samples/all_depth_maps.png")
    plt.close()

    # Camera trajectory: recover each camera center from the world-to-camera
    # extrinsic [R|t] as -R^T @ t, then plot the path in 3D.
    if len(result.extrinsics) > 0:
        from mpl_toolkits.mplot3d import Axes3D
        positions = []
        for ext in result.extrinsics:
            R = ext[:3, :3]
            t = ext[:3, 3]
            cam_pos = -R.T @ t
            positions.append(cam_pos)
        positions = np.array(positions)

        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(positions[:, 0], positions[:, 1], positions[:, 2],
                   c=range(len(positions)), cmap='viridis', s=100, marker='o')
        ax.plot(positions[:, 0], positions[:, 1], positions[:, 2],
                'b-', alpha=0.5, linewidth=1)
        ax.scatter(*positions[0], c='green', s=200, marker='^', label='First')
        ax.scatter(*positions[-1], c='red', s=200, marker='v', label='Last')
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        ax.set_title('📷 Camera Trajectory', fontsize=14, fontweight='bold')
        ax.legend()
        plt.tight_layout()
        plt.savefig("samples/camera_trajectory.png", dpi=150, bbox_inches='tight')
        print("💾 Saved camera trajectory to samples/camera_trajectory.png")
        plt.close()
        print(f"📍 {len(positions)} camera poses estimated")

print("\n" + "="*60)
print("✅ Processing complete!")
print("="*60)
ライブラリのインストール
pip install awesome-depth-anything-3 matplotlib pillow numpy torch
画像を my_images/ フォルダに配置
mkdir my_images
実行
実行してみましょう
python da3.py
✅ Python 3.12.12
[INFO ] ModelCache initialized
📦 Installing missing packages: awesome-depth-anything-3, addict...
[WARN ] Dependency `gsplat` is required for rendering 3DGS. Install via: pip install git+https://github.com/nerfstudio-project/gsplat.git@0b4dddf04cb687367602c01196913cde6a743d70
⚠️ No GPU detected. Using CPU (will be slower)
📥 Loading DA3-LARGE...
| Model | Speed | Quality | VRAM |
|-------|-------|---------|------|
| SMALL | ⚡⚡⚡ | ★★☆ | 4GB |
| BASE | ⚡⚡ | ★★★ | 6GB |
| LARGE | ⚡ | ★★★★ | 8GB |
| GIANT | 🐢 | ★★★★★ | 12GB |
| NESTED | 🐢 | ★★★★★+ | 16GB |
config.json: 1.21kB [00:00, 1.21MB/s]
[INFO ] Model cache MISS: da3-large on cpu. Loading...
[INFO ] using MLP layer as FFN
[INFO ] Model cached: da3-large on cpu
[INFO ] Using standard InputProcessor (optimized CPU pipeline)
model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.64G/1.64G [01:29<00:00, 18.3MB/s]
✅ Model loaded in 92.7s
🖼️ Testing with sample image...
📥 Downloading sample image...
[INFO ] Processed Images Done taking 0.07385778427124023 seconds. Shape: torch.Size([1, 3, 336, 504])
[INFO ] Model Forward Pass Done. Time: 5.336712837219238 seconds
[INFO ] Conversion to Prediction Done. Time: 0.0010001659393310547 seconds
💾 Saved visualization to samples/depth_result.png
📊 Output shapes:
Depth: (1, 336, 504)
Confidence: (1, 336, 504)
Camera intrinsics: (1, 3, 3)
サンプル画像の深度推定結果
============================================================
📤 4. Use Your Own Images
============================================================
💡 Tips for best results:
- Move the camera, not the objects
- 30-50% overlap between consecutive images
- Avoid motion blur
- Good lighting helps!
✅ Found 2 images in 'my_images'
🔄 Processing 2 images...
[INFO ] Processed Images Done taking 0.02064800262451172 seconds. Shape: torch.Size([2, 3, 280, 504])
[INFO ] Model Forward Pass Done. Time: 8.261384725570679 seconds
[INFO ] Conversion to Prediction Done. Time: 0.0 seconds
✅ Inference done in 8.3s (0.2 img/s)
📦 Generating 3D point cloud...
[INFO ] conf_thresh_percentile: 20
[INFO ] num max points: 500000
[INFO ] Exporting to GLB with num_max_points: 500000
'cp' は、内部コマンドまたは外部コマンド、
操作可能なプログラムまたはバッチ ファイルとして認識されていません。
✅ 3D model saved to output_3d/
📄 depth_vis: 0.00 MB
📄 scene.glb: 2.07 MB
============================================================
📊 5. Visualize Results
============================================================
💾 Saved depth maps to samples/all_depth_maps.png
💾 Saved camera trajectory to samples/camera_trajectory.png
📍 2 camera poses estimated
============================================================
✅ Processing complete!
============================================================
my_images/に配置した2枚の画像
深度推定結果
output_3d/depth_vis
2枚の画像から3D再構築したモデル
output_3d/scene.glb
glbビューワー作った pic.twitter.com/HD55CzGsNr
— がちもとさん (@sotongshi) December 9, 2025
お疲れさまでした。







