More than 1 year has passed since last update.

C&W Attack 実装

Posted at 2023-09-09

はじめに

「C&W Attack」という敵対的サンプル攻撃をPyTorchで実装しました。
画像データはCIFAR10を用いました。
実装にあたり書籍「AIセキュリティから学ぶディープラーニング[技術]入門」に掲載のソースコードを参考にしました。

参考元ソースコード

実装

本稿では L2 Attack、Targeted Attack を実装します。

モジュールのインポート

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

データの用意

# Normalisation constants shared by all three colour channels
mu = 0.5
sigma = 0.5

# Convert images to tensors, then apply per-channel (x - mu) / sigma
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((mu,) * 3, (sigma,) * 3),
])

# CIFAR-10 training split, served as shuffled mini-batches of 100
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=100, shuffle=True, num_workers=2)

# CIFAR-10 test split, served one image at a time in dataset order
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False, num_workers=2)

モデルの用意

def Net():
    """Build a ResNet-50 whose final fully connected layer outputs 10 logits.

    Returns:
        The torchvision ResNet-50 (created with ``pretrained=True``) with its
        ``fc`` layer replaced by a fresh ``nn.Linear(in_features, 10)``.
    """
    backbone = models.resnet50(pretrained=True)
    fc_inputs = backbone.fc.in_features
    # Swap the classification head for a 10-class one
    backbone.fc = nn.Linear(fc_inputs, 10)
    return backbone

# Select the device: CUDA when requested and available, otherwise CPU
use_cuda = True
device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")

# Build the network and move it to the selected device
model = Net().to(device)

# Load the trained weights. map_location remaps the stored tensors onto
# `device`, so a checkpoint saved on a GPU also loads on a CPU-only machine.
path = "./exp/bestloss.pth"
model.load_state_dict(torch.load(path, map_location=device))

# Switch to evaluation mode so layers such as BatchNorm / Dropout use their
# inference behaviour
model.eval()

前準備

# Maps a CIFAR-10 label index (0-9) to its human-readable class name
class2name = dict(enumerate((
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)))

攻撃

class CW2:
    """Generate targeted adversarial examples with the C&W (L2) attack.

    The adversarial image is parameterised as ``tanh(w)``, so every value stays
    in ``[-1, 1]``. Inputs are therefore assumed to be normalised to that range
    (e.g. mean 0.5 / std 0.5 applied to ``[0, 1]`` pixels).

    Attributes:
        model (callable): Model that maps an image batch to logits.
        k (float): Confidence margin; the target logit must exceed every other
            logit by at least ``k``.
        learning_rate (float): Learning rate of Adam.
        binary_search_steps (int): Number of binary-search rounds over ``c``.
        max_iterations (int): Maximum number of Adam iterations per round.
        initial_c (float): Initial value of the trade-off constant ``c``.
    """

    def __init__(self, model, k = 0, learning_rate = 0.01,
                 binary_search_steps = 9, max_iterations = 1000,
                 initial_c = 0.001):
        # Store every argument as an instance attribute
        self.model = model
        self.k = k
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.binary_search_steps = binary_search_steps
        self.initial_c = initial_c

    def is_satisfied_with_k(self, logits, target_class):
        """Check whether the target logit wins even after subtracting ``k``.

        The input ``logits`` are left untouched; the margin is applied to a
        detached copy.

        Args:
            logits (tensor or ndarray): 1-D logits, one entry per class.
            target_class (int): Index of the target class.
        Returns:
            satisfied (bool): True when ``logits[target_class] - k`` is the
                maximum of the adjusted logits, otherwise False.
        """
        adjusted = torch.as_tensor(logits).detach().clone()
        if not adjusted.is_floating_point():
            # Integer logits cannot absorb a fractional margin in place
            adjusted = adjusted.float()
        adjusted[target_class] -= self.k
        return int(torch.argmax(adjusted)) == int(target_class)

    def _model_device(self, fallback):
        """Return the device of the model's first parameter, or ``fallback``."""
        parameters = getattr(self.model, "parameters", None)
        first = next(iter(parameters()), None) if callable(parameters) else None
        return fallback if first is None else first.device

    def generate(self, original_image, target_class_ohe):
        """Generate an adversarial example for one image.

        Args:
            original_image (tensor): Batch holding a single image; it is passed
                to the model as-is and only the first row of the returned
                logits is used. Values are assumed to lie in ``[-1, 1]``.
            target_class_ohe (tensor): One-hot vector of the target class, with
                one entry per model output.
        Returns:
            o_best_adv_image (tensor): The adversarial example with the
                smallest squared L2 distance found, same shape as
                ``original_image`` and placed on the model's device. If no
                iterate ever satisfied the constraint, an all-zero tensor.
        """
        original = torch.as_tensor(original_image, dtype=torch.float32).detach()
        # Work on the model's device so the attack also runs on CUDA models
        device = self._model_device(original.device)
        original = original.to(device)
        target_mask = torch.as_tensor(target_class_ohe).to(device=device, dtype=torch.float32)
        target_class = int(torch.argmax(target_mask))

        # c together with its lower and upper bounds for the binary search
        c = self.initial_c
        c_lower = 0
        c_upper = 1e10

        # Optimisation variable; the adversarial image is tanh(w)
        w = torch.zeros(original.shape, dtype=torch.float32, device=device, requires_grad=True)

        def evaluate(c):
            """Return (adv_image, logits, squared L2 distance, full objective)."""
            adv_image = torch.tanh(w)  # normalised images live in [-1, 1]

            # Objective 1: squared L2 distance to the original image
            distortion = torch.sum(torch.square(adv_image - original))

            logits = self.model(adv_image)[0]
            target_logit = torch.sum(target_mask * logits)
            # Largest non-target logit: the target slot is overwritten with the
            # minimum logit so it cannot be selected by the max
            other_max_logit = torch.max((1 - target_mask) * logits + target_mask * torch.min(logits))

            # Objective 2: hinge on the logit margin, weighted by c
            margin = other_max_logit - target_logit + self.k
            penalty = c * torch.maximum(margin.new_zeros(()), margin)

            return adv_image, logits, distortion, distortion + penalty

        # w and the Adam state are created once and carried across all
        # binary-search rounds
        opt = torch.optim.Adam([w], lr=self.learning_rate)

        # Best result over the whole search
        o_best_objective1 = float("inf")
        o_best_adv_image = torch.zeros_like(original)
        o_best_class = -1  # dummy class until an adversarial example is found

        # Convergence is checked every 10% of max_iterations; at least every
        # iteration so that max_iterations < 10 does not divide by zero
        check_every = max(1, self.max_iterations // 10)

        for outer_step in range(self.binary_search_steps):
            # Best result within this binary-search round
            best_objective1 = float("inf")
            best_class = -1

            # Objective recorded at the previous convergence check
            prev_objective = float("inf")

            for iteration in range(self.max_iterations):
                # One Adam step. Only the gradient w.r.t. w is computed, so the
                # model's own parameter gradients are left untouched.
                objective = evaluate(c)[3]
                (grad_w,) = torch.autograd.grad(objective, w)
                w.grad = grad_w
                opt.step()

                # Re-evaluate the updated w once, without building a graph
                with torch.no_grad():
                    adv_image, logits, distortion, objective = evaluate(c)
                objective1 = distortion.item()

                if iteration % check_every == 0:
                    current_objective = objective.item()
                    # Leave the Adam loop when the objective barely changed or grew
                    if current_objective > prev_objective * 0.9999:
                        break
                    prev_objective = current_objective

                if not self.is_satisfied_with_k(logits, target_class):
                    continue

                # Constraint satisfied: update the per-round best ...
                if objective1 < best_objective1:
                    best_objective1 = objective1
                    best_class = target_class

                # ... and the overall best
                if objective1 < o_best_objective1:
                    o_best_objective1 = objective1
                    o_best_class = target_class
                    o_best_adv_image = adv_image.clone()

            if best_class == target_class:
                # Success in this round: current c becomes the upper bound and
                # the next c is the midpoint of the bounds
                c_upper = c
                c = (c_lower + c_upper) / 2
            else:
                # Failure: current c becomes the lower bound
                c_lower = c
                if c_upper < 1e9:
                    # An upper bound is known: bisect
                    c = (c_lower + c_upper) / 2
                else:
                    # No successful c yet: grow c by a factor of 10
                    c *= 10

            # Progress log
            print('Binary Search {0}/{1}'.format(outer_step + 1, self.binary_search_steps))
            print('  L2 square: {0:.2f} - c: {1:.2f} - class: {2}'.format(o_best_objective1, c, o_best_class))

        return o_best_adv_image

# First test image with a leading batch dimension added; its label is not used
original_image = testset[0][0].unsqueeze(0)
# One-hot vector of the target class (class 0)
target_class_ohe = F.one_hot(torch.tensor(0), num_classes=10)

k=0 の攻撃

attack = CW2(model, k=0)  # CW2 instance with k = 0
adv_image_k0 = attack.generate(original_image, target_class_ohe)  # generate the adversarial example
# generate() falls back to an all-zero NumPy array when the attack never succeeds
adv_image_k0 = torch.as_tensor(adv_image_k0, dtype=torch.float32)

# Inference on the adversarial example: the input is moved to the model's
# device and no autograd graph is needed
with torch.no_grad():
    Y_hat_k0 = model(adv_image_k0.to(device))

# Score: highest softmax probability
score_k0 = torch.max(torch.softmax(Y_hat_k0, dim=1)).item()

# Predicted class
class_k0 = torch.argmax(Y_hat_k0[0]).item()
advk0_name = class2name[class_k0]

# L2 distance to the original, in normalised space.
# .cpu() is required before .numpy() when the tensor lives on a GPU.
adv_img_k0 = adv_image_k0[0].detach().cpu().numpy()
orig_img = original_image[0].cpu().numpy()
L2_k0 = np.linalg.norm(adv_img_k0 - orig_img)

# Undo the normalisation and move channels last for imshow
adv_img_k0 = mu + sigma * adv_img_k0
adv_img_k0 = adv_img_k0.transpose(1, 2, 0)

k=7 の攻撃

attack = CW2(model, k=7)  # CW2 instance with k = 7
adv_image_k7 = attack.generate(original_image, target_class_ohe)  # generate the adversarial example
# generate() falls back to an all-zero NumPy array when the attack never succeeds
adv_image_k7 = torch.as_tensor(adv_image_k7, dtype=torch.float32)

# Inference on the adversarial example: the input is moved to the model's
# device and no autograd graph is needed
with torch.no_grad():
    Y_hat_k7 = model(adv_image_k7.to(device))

# Score: highest softmax probability
score_k7 = torch.max(torch.softmax(Y_hat_k7, dim=1)).item()

# Predicted class
class_k7 = torch.argmax(Y_hat_k7[0]).item()
advk7_name = class2name[class_k7]

# L2 distance to the original, in normalised space.
# .cpu() is required before .numpy() when the tensor lives on a GPU.
adv_img_k7 = adv_image_k7[0].detach().cpu().numpy()
orig_img = original_image[0].cpu().numpy()
L2_k7 = np.linalg.norm(adv_img_k7 - orig_img)

# Undo the normalisation and move channels last for imshow
adv_img_k7 = mu + sigma * adv_img_k7
adv_img_k7 = adv_img_k7.transpose(1, 2, 0)

可視化

# Classify the clean image so the first panel can show its label and score
with torch.no_grad():
    Y_orig = model(original_image.to(device))
original_score = torch.max(torch.softmax(Y_orig, dim=1)).item()
original_class = torch.argmax(Y_orig[0]).item()

# Undo the normalisation and move channels last for imshow
img = (mu + sigma * original_image[0].cpu().numpy()).transpose(1, 2, 0)

plt.figure(figsize=(15, 15))

# (title, image) for each panel: original, k = 0 and k = 7 adversarial examples
panels = [
    ('Original\n {0} - {1:.2f}%'.format(class2name[original_class], original_score * 100), img),
    ('Adversarial (k = 0)\n {0} - {1:.2f}%, l2 = {2:.2f}'.format(class2name[class_k0], score_k0 * 100, L2_k0), adv_img_k0),
    ('Adversarial (k = 7)\n {0} - {1:.2f}%, l2 = {2:.2f}'.format(class2name[class_k7], score_k7 * 100, L2_k7), adv_img_k7),
]

for position, (title, image) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    # Hide the tick axes; they carry no information for images
    plt.gca().axes.xaxis.set_visible(False)
    plt.gca().axes.yaxis.set_visible(False)
    plt.title(title)
    plt.imshow(image)

可視化は以下のようになります。

k=0, k=7 ともに airplane と誤認識させることに成功しています。
ここでよく見ると、k=0 ではスコア（airplane の確率）が 50.71% であるのに対して、k=7 ではスコアが 99.88% と高くなっています。（その分、L2 ノルムも大きくなっています）

終わりに

原論文によると、C&W Attack は当時有効とされていた Distillation Defense を破った手法とのこと。
時間を見つけて防御手法も調査していきたい。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up