Edited at

Spark MLlibで相関係数を算出してみる。

More than 5 years have passed since the last update.


環境


  • Mac OS X Version 10.9.5

  • Scala 2.10.4

  • Spark 1.1.0

  • Spark MLlib 1.1.0

  • sbt 0.13.1


準備

/path/to/project/root

|-- build.sbt
|-- src
| |-- main
| | |-- scala
| | | |-- Stoppable.scala
| |-- test
| | |-- scala
| | | |-- SparkMLlibCorrelationSpec.scala


build.sbt

// Project identity.
name := "Spark MLlib examples"

version := "0.0.1-SNAPSHOT"

// Compiler version used to build the project.
scalaVersion := "2.10.4"

// Turn on lint, deprecation, unchecked and feature warnings.
scalacOptions ++= Seq("-Xlint", "-deprecation", "-unchecked", "-feature", "-Xelide-below", "ALL")

// specs2 is only needed by the specification under src/test/scala, so it is
// restricted to the "test" configuration and kept off the main classpath.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.1.0",
  "org.apache.spark" %% "spark-mllib" % "1.1.0",
  "org.specs2" %% "specs2" % "2.4.1" % "test"
)



実装


src/test/scala/SparkMLlibCorrelationSpec.scala

import commons._                                                                                                                                                                  

import org.apache.spark._
import org.apache.spark.mllib.stat._
import org.apache.spark.mllib.stat.correlation._
import org.apache.spark.SparkContext._

import org.specs2._

/**
 * specs2 specification that computes a Pearson correlation coefficient with
 * Spark MLlib for a small temperature / beer-shipment data set and checks it
 * against a known value.
 */
class SparkMLlibCorrelationSpec extends Specification with Stoppable {
  def is = s2"""

Spark MLlib

Correlation
correlation1 $correlation1
"""

  // Temperature and the number of beer shipments
  val xData = Array(12.1, 15.3, 18.6, 21.7, 26.1, 32.1)
  val yData = Array(45.0, 520.0, 2864.0, 6874.0, 25487.0, 102870.0)

  // `using` hands back the value of its body, so the coefficient can be bound
  // to an immutable val; the SparkContext is stopped when the body finishes.
  val retCorrelation1: Double =
    using(new SparkContext("local[1]", "SparkMLlibCorrelationSpec", System.getenv("SPARK_HOME"))) { sc =>
      val x = sc.parallelize(xData)
      val y = sc.parallelize(yData)
      Statistics.corr(x, y, "pearson")
    }

  // Page that works out this correlation coefficient: http://dekiru.net/article/4576/
  // Compared with a tolerance rather than `must_==`, so that a difference in the
  // last floating-point digits does not fail the example.
  def correlation1 = retCorrelation1 must beCloseTo(0.8633475827899126, 1e-9)
}



src/main/scala/Stoppable.scala

package commons                                                                                                                                                                   

import scala.language.reflectiveCalls

/**
 * Mix-in providing a loan-pattern helper for resources that are released by
 * calling a `stop()` method.
 */
trait Stoppable {

  /** Structural type matching any value that exposes `def stop(): Unit`. */
  type T = { def stop(): Unit }

  /**
   * Applies `f` to `resource` and then calls `resource.stop()`, whether `f`
   * returns normally or throws.
   *
   * @param resource the value handed to `f` and stopped afterwards
   * @param f        the computation to run with the resource
   * @tparam A       concrete resource type; must have a `stop(): Unit` method
   * @tparam B       result type of `f`
   * @return the value produced by `f`
   */
  def using[A <: T, B](resource: A)(f: A => B): B =
    try {
      f(resource)
    } finally {
      // Call through the structural type T; this is why the file imports
      // scala.language.reflectiveCalls.
      resource.stop()
    }
}



実行

$ sbt '~test-only SparkMLlibCorrelationSpec'


参考

http://spark.apache.org/docs/1.1.0/mllib-statistics.html#correlations