5
4

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

単回帰分析(最小二乗法) in C++

Last updated at Posted at 2018-09-23

C++による単回帰分析
x(横軸):身長、y(縦軸):年収の2次元データに対し、最小二乗法により近似直線を計算する。このデータは自作した。左の列が身長[cm]、右の列が年収[万円]である。

height-income.csv
162,420
165,470
168,540
170,550
171,565
172,570
175,580
178,590
181,600
183,625

身長と年収には正の相関があると聞いたことがある。確かに、特に男性だと体格の良い方が強くて頼もしく見えるから、結果的に仕事も上手くいきやすいのかもしれない...逆に小さいと舐められるケースもあるから、それで少し不利になるのかもしれない...
とはいえ、上記のデータほどの差はないだろうけど。

slr.cpp
/* 単回帰分析 (simple linear regression) */
#include <vector>

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

using namespace std;

// Calculate the average
// 平均値を計算
float calcAverage(vector<float> &f){

  float average = 0.0;
  for(int i = 0; i < f.size(); i++){
    average += f[i]/(float)(f.size());
  }

  return average;

}

// Calculate the mean square
// 2乗平均を計算
float calcMeanSquare(vector<float> &f){

  float meanSquare = 0.0;
  for(int i = 0; i < f.size(); i++){
    meanSquare += f[i]*f[i]/(float)(f.size());
  }

  return meanSquare;

}

// Calculate the decentration
// 分散を計算
float calcDecentration(vector<float> &f){

  float average           = calcAverage(f);
  float meanSquare        = calcMeanSquare(f);
  float decentration = meanSquare - average*average;

  return decentration;

}

// Calculate the covariance
// 共分散を計算
float calcCovariance(vector<float> &f, vector<float> &g){

  float covariance = 0.0;
  for(int i = 0; i < f.size(); i++){
    covariance += f[i]*g[i]/(float)(f.size());
  }
  covariance -= calcAverage(f) * calcAverage(g);

  return covariance;

}

// Calculate the slope of the approximate line
// 近似直線の傾きを計算
float calcSlope(vector<float> &x, vector<float> &y){

  float Xdecentration = calcDecentration(x);
  float covariance    = calcCovariance(x, y);
  float slope         = covariance/Xdecentration;

  return slope;

}

// Calculate the intercept of the approximate line
// 近似直線の切片を計算
float calcIntercept(vector<float> &x, vector<float> &y){

  float Xdecentration = calcDecentration(x);
  float covariance    = calcCovariance(x, y);
  float Xaverage      = calcAverage(x);
  float Yaverage      = calcAverage(y);
  float intercept     = Yaverage - (covariance/Xdecentration)*Xaverage;

  return intercept;

}

// Load a file
// ファイルの読み込み
void readFile(char *filename, vector<float> &x, vector<float> &y){

    float Xtmp = 0.0, Ytmp = 0.0;

    FILE *fp;
    fp = fopen(filename, "r");
    if(fp == NULL){
      //cout << "can't open file ." << endl;
      printf("can't open file.\n");
      exit(1);
    }

    while(fscanf(fp, "%f, %f\n", &Xtmp, &Ytmp) != EOF){
      x.push_back(Xtmp);
      y.push_back(Ytmp);
    }

    fclose(fp);

}

// Export a file
// ファイル書き出し
int writeFile(char *filename, vector<float> &x, vector<float> &y_actual, vector<float> &y_predective){

  FILE *fp;
  fp = fopen(filename, "w");
  if(fp == NULL){
    printf("can't open file.\n");
    exit(1);
  }

  for(int i = 0; i < x.size(); i++){
    fprintf(fp, "%f, %f, %f\n", x[i], y_actual[i], y_predective[i]);
  }

  fclose(fp);

  return(0);

}

int main(void){

  int i;
  float corre = 0.0;
  vector<float> x, y_predective, y_actual, error;
  char inputFilename[30]  = "height-income.csv";
  char outputFilename[50] = "height-income-approximation.csv";

  // Load file
  // ファイル読み込み
  readFile(inputFilename, x, y_actual);

  // 相関係数
  corre = calcCovariance(x, y_actual)/( sqrt(calcDecentration(x))*sqrt(calcDecentration(y_actual)) );
  printf("correlation coefficience : %5.4f\n", corre);

  // Calculate the approximate line
  // 近似直線を求める
  float a = calcSlope(x, y_actual);
  float b = calcIntercept(x, y_actual);

  printf("slope     : %f\n", a);
  printf("intercept : %f\n\n", b);

  // 近似直線の配列に格納
  for(i = 0; i < x.size(); i++){
    y_predective.push_back(a*x[i]+b);
    error.push_back(y_actual[i] - (a*x[i]+b));
  }

  for(i = 0; i < x.size(); i++){
    printf("No%3d : x = %6.2f, y(actual) = %6.2f, y(theoretical) = %6.2f, error = %6.2f\n", i+1, x[i], y_actual[i], y_predective[i], error[i]);
  }

  // ファイル書き出し
  writeFile(outputFilename, x, y_actual, y_predective);

  return(0);

}

↓コンソール出力
近似直線はy = 8.41x + -899となる。
y(actual)は実際のデータの値、y(theoretical)が近似直線から算出した値、errorは両者の誤差。
y(actual)とy(theoretical)の誤差の大きさの割に相関係数(correlation coefficience)が1に近くて高いような...

correlation coefficience : 0.9197
slope     : 8.408483
intercept : -899.463379

No  1 : x = 162.00, y(actual) = 420.00, y(theoretical) = 462.71, error = -42.71
No  2 : x = 165.00, y(actual) = 470.00, y(theoretical) = 487.94, error = -17.94
No  3 : x = 168.00, y(actual) = 540.00, y(theoretical) = 513.16, error =  26.84
No  4 : x = 170.00, y(actual) = 550.00, y(theoretical) = 529.98, error =  20.02
No  5 : x = 171.00, y(actual) = 565.00, y(theoretical) = 538.39, error =  26.61
No  6 : x = 172.00, y(actual) = 570.00, y(theoretical) = 546.80, error =  23.20
No  7 : x = 175.00, y(actual) = 580.00, y(theoretical) = 572.02, error =   7.98
No  8 : x = 178.00, y(actual) = 590.00, y(theoretical) = 597.25, error =  -7.25
No  9 : x = 181.00, y(actual) = 600.00, y(theoretical) = 622.47, error = -22.47
No 10 : x = 183.00, y(actual) = 625.00, y(theoretical) = 639.29, error = -14.29

↓元の2次元データ(青点)と近似直線(緑線)

height-income-slr.png

5
4
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
5
4

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?