factor_analyzerを使わずに因子分析とバリマックス回転をスクラッチで実装してみる【Python】

Last updated at 2025-07-27Posted at 2025-07-26

1.モチベーション

『マンガでわかる統計学因子分析編』（オーム社）の解説のステップを再現する
おそろしく大変なので、手で計算する代わりにPythonでスクラッチで実装する
factor_analyzerをここでは使用しない
反復主因子法と基準化バリマックス法のプロセスについて理解を深める

2.コード掲載

import numpy as np
import pandas as pd

def create_dataset():
    ''' サンプルデータを作成 '''
    global questions, attributes

    responses = [
        [5, 5, 5, 4, 4, 2],
        [5, 4, 5, 2, 2, 2],
        [4, 4, 4, 4, 4, 4],
        [2, 3, 4, 3, 3, 3],
        [3, 3, 3, 3, 4, 1],
        [5, 4, 5, 3, 2, 3],
        [5, 5, 5, 4, 5, 5],
        [3, 1, 2, 5, 4, 4],
        [4, 1, 3, 3, 2, 3],
        [1, 2, 2, 2, 2, 2],
        [3, 2, 3, 1, 1, 1],
        [4, 3, 4, 4, 3, 4],
        [3, 2, 3, 4, 5, 5],
        [4, 3, 4, 5, 4, 5],
        [2, 2, 3, 5, 5, 4],
    ]
    respondents = ["アさん", "イさん", "ウさん", "エさん", "オさん", 
                   "カさん", "キさん", "クさん", "ケさん", "コさん", 
                   "サさん", "シさん", "スさん", "セさん", "ソさん"
                   ]
    questions = ["Q1a", "Q1b", "Q1c", "Q1d", "Q1e", "Q1f"]
    attributes = ["外観の雰囲気", "店内の雰囲気", "ウェイトレスの応対",
                  "紅茶の味", "紅茶の価格", "ティーカップのセンス"
                  ]
    df = pd.DataFrame(responses, index=respondents, columns=questions)   
    return df


def do_normalize(df):
    ''' 列データを標準化する '''
    normalized = pd.DataFrame()
    cols = df.columns
    for col in cols:
        mean = np.mean(df[col], axis=0)
        std = np.std(df[col], axis=0, ddof=1)
        score = (df[col]-mean)/std
        normalized[col] = score
    return normalized


def get_corr_table(indata):
    ''' 相関行列を算出する '''
    corr_table = indata.corr(method='pearson')
    return pd.DataFrame(corr_table, index=questions, columns=questions)


def do_factor_analysis(corr_table, nf=2, do_rotation=False):
    ''' 反復主因子法を行う     
    引数:        
        corr_table: 相関行列
        nf: 因子数
        do_rotation: バリマックス回転を行う場合、'varimax'を指定 
    '''

    v_reduced = np.nan          # 固有ベクトル行列（最大からnf番目以降の固有値の要素をゼロに置き換えたもの）
    w_reduced = np.nan          # 固有値の対角行列（最大からnf番目以降の固有値の要素をゼロに置き換えたもの）
    matrix_reduced = np.nan     # v_reducedとw_reducedによって復元される行列
    epsilon = 1/1_000_000_000

    def replace_diag_by_residuals(corr_table):
        ''' 相関行列の対角成分（共通性：1-d^2）を重回帰式の回帰係数で置き換える '''
        from sklearn.linear_model import LinearRegression

        outdata = corr_table.copy()
        elements = corr_table.shape[0]
        for i in range(elements):
            col = corr_table.columns[i]
            model = LinearRegression()
            X, y = df_normalized.drop(columns=col), df_normalized[col]
            model.fit(X, y)
            r_squared = model.score(X, y)
            outdata.iloc[i,i] = r_squared        
        return outdata
    

    def eig_decomp_reduction(indata):
        ''' 固有値分解を行い、最大からnf番目以降の固有値の要素をゼロに置き換える '''
        nonlocal matrix_reduced, v_reduced, w_reduced

        # 固有値分解
        eigenvalues, eigenvectors = np.linalg.eig(indata)        
        v = eigenvectors
        w = np.diag(eigenvalues)  # 固有値から対角行列を作成（他の要素はすべて0）
        reconstructed_matrix = v @ w @ v.T
        assert np.all(np.abs(indata.values - reconstructed_matrix) < epsilon), \
          'Eigenvalues Decomposition Error'

        # 因子の数をnfとし、それ以外の固有値をゼロとする
        v_reduced = v[:, 0:nf]
        w_reduced = w[0:nf, 0:nf]
        matrix_reduced = v_reduced @ w_reduced @ v_reduced.T

        # indataの対角値を新たな共通性の値で置き換える
        commonalities = np.diag(matrix_reduced)
        next_input = indata.values.copy()
        np.fill_diagonal(next_input, commonalities)       
             
        return pd.DataFrame(next_input)


    def do_loop(indata):
        '''
        いずれかの共通性が１を超えそうになるまで eig_decomp_reduction (STEP 12-14) を繰り返し、
        最後の繰り返しにおける因子情報を返す
        '''
        nonlocal matrix_reduced, v_reduced, w_reduced
        
        outdata = indata.copy()       
        history = []
        while np.all(np.diag(outdata) < 1):
            outdata = eig_decomp_reduction(outdata)
            history.append({
                'v_reduced': v_reduced.copy(),
                'w_reduced': w_reduced.copy(),
                'matrix_reduced': matrix_reduced.copy()
            })        
        v_reduced = history[-2]['v_reduced'].copy()
        w_reduced = history[-2]['w_reduced'].copy()
        matrix_reduced = history[-2]['matrix_reduced'].copy()

        return v_reduced, w_reduced, matrix_reduced


    def align_factor_signs(loadings):
        ''' 各因子の負荷量の符号を揃える '''
        loadings_aligned = loadings.copy()
        for i in range(loadings.shape[1]):
            factor = loadings[:, i]
            if np.sum(factor < 0) > np.sum(factor > 0):
                loadings_aligned[:, i] = -factor
        return loadings_aligned


    def do_varimax_rotation(factor_loadings):
        ''' バリマックス回転を行う '''            
        from scipy import optimize
        assert nf == 2, "Sorry, only two-factor Varimax rotation is supported."

        def yield_rotation_matrix(theta):
            ''' 角度thetaの回転行列を生成する '''
            theta = float(theta)
            R_left  = np.array([[np.cos(-theta), -np.sin(-theta)], [np.sin(-theta), np.cos(-theta)]])
            R_right = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
            assert np.all(np.abs(R_left @ R_right - np.identity(2)) < epsilon), \
              'Rotation Matrices Specification Error'
            return R_left, R_right
 
 
        def objective(theta):
            ''' 基準化バリマックス法の目的変数 V と theta の関係式 '''
            R_left, R_right = yield_rotation_matrix(theta)
            B_left = factor_loadings @ R_left
            B_right = R_right @ factor_loadings.T                
            V = np.var(B_left**2, axis=0, ddof=0).sum()  
            return -V   # optimize.minimize を使って最大化するのでマイナスをつける


        # 最適化を実行
        theta_initial = np.radians(0)
        result = optimize.minimize(objective, theta_initial, bounds=[(np.radians(-180), np.radians(180))])            
        theta_optimized = result.x[0]
        V_max = - result.fun
        print(f"回転させる角度: {np.degrees(theta_optimized):.2f}°")

        R_left, R_right = yield_rotation_matrix(theta_optimized)
        B_left = factor_loadings @ R_left
        B_right = R_right @ factor_loadings.T          
        factor_loadings_rotated = B_left
        print(f'バリマックス回転後の因子負荷量：\n{factor_loadings_rotated}\n')

        return factor_loadings_rotated


    # 因子分析を実行
    start_data = replace_diag_by_residuals(corr_table)
    v_reduced, w_reduced, matrix_reduced = do_loop(start_data)
    
    # 因子負荷行列（因子パターン行列）を求める
    factor_loadings = v_reduced @ np.sqrt(w_reduced)
    factor_loadings = align_factor_signs(factor_loadings)
    assert np.all(matrix_reduced - (factor_loadings @ factor_loadings.T) < epsilon), \
      'Factor Loadings Computation Error'

    print(f'回転前の因子負荷量：\n{factor_loadings}')
    if not do_rotation:
        return factor_loadings

    # バリマックス回転を行う場合
    elif do_rotation == 'varimax':        
        factor_loadings_after_rotation = do_varimax_rotation(factor_loadings)
        return factor_loadings_after_rotation

if __name__ == '__main__':
    pd.set_option('display.float_format', '{:.2f}'.format)
    np.set_printoptions(precision=2)

    df = create_dataset()
    df_normalized = do_normalize(df)
    corr_table = get_corr_table(df_normalized)
    factor_loadings_rotated = do_factor_analysis(corr_table, nf=2, do_rotation='varimax')

上記のコードの実行結果は以下のとおり。
本に書かれている内容と同じ結果となった。

回転前の因子負荷量：
[[ 0.67 -0.41]
 [ 0.74 -0.49]
 [ 0.8  -0.57]
 [ 0.57  0.75]
 [ 0.55  0.66]
 [ 0.51  0.6 ]]
 
回転させる角度: 36.39°
バリマックス回転後の因子負荷量：
[[0.78 0.07]
 [0.89 0.04]
 [0.99 0.01]
 [0.01 0.94]
 [0.05 0.86]
 [0.06 0.79]]

3.ダブルチェック

factor_analyzerライブラリのドキュメントを見ると

残差最小法 ('minres')
主成分法 ('principal')
最尤法 ('ml')

の３種類のメソッドが使用でき、反復主因子法はサポートされていない。
代わりに統計ソフト SAS では反復主因子法が使用できるので、 SAS の結果と比べてみる。
先にPython側の表示桁数を上げておく。

pd.set_option('display.float_format', '{:.5f}'.format)
np.set_printoptions(precision=5)

proc factor data=df nfactors=2 method=prinit priors=SMC maxiter=2 rotate=varimax;
  var Q1a Q1b Q1c Q1d Q1e Q1f ;
run;

結果は以下のとおり。

(中略)

回転させる角度: 36.39°
バリマックス回転後の因子負荷量：
[[0.78282 0.06618]
 [0.89055 0.04324]
 [0.98667 0.01333]
 [0.01193 0.94159]
 [0.05182 0.86122]
 [0.05614 0.79022]]

The SAS System
(中略)

      Rotated Factor Pattern

            Factor1         Factor2

Q1a         0.78285         0.06586
Q1b         0.89057         0.04288
Q1c         0.98668         0.01293
Q1d         0.01231         0.94159
Q1e         0.05217         0.86120
Q1f         0.05646         0.79020

小数点以下4桁くらいのレベルまでおおよそ一致している。

4.次回

プロマックス回転のコードも実装してみる（時間があれば）。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up