Motivation
To learn the basic how-tos for handling a DataFrame in SMILE.
Environment
Jupyter Lab, IJava, and Temurin JDK 11 (a JDK version greater than 9 is required)
Usage of IJava and maven
First, load the repositories and dependencies.
For example,
%mavenRepo scijava-public https://maven.scijava.org/content/repositories/public/
%maven net.imagej:ij:1.54h
%maven com.github.haifengl:smile-core:3.0.2
%maven com.github.haifengl:smile-plot:3.0.2
%maven org.slf4j:slf4j-simple:2.0.11
(ImageJ is loaded here only as an example; you do not need it.)
Import classes
//example
import smile.io.*;
import smile.plot.swing.*;
import smile.stat.distribution.*;
import smile.stat.*;
import static smile.math.MathEx.*;
import smile.math.matrix.*;
import smile.data.*;
import smile.data.type.*;
import smile.stat.hypothesis.*;
import java.util.*;
Load csv, arff, or more
see tutorial docs;
https://haifengl.github.io/data.html
Show Graph
//download iris.arff from web.
import java.io.InputStream;
import java.nio.file.*;
// Download iris.arff into the working directory, overwriting any previous copy.
String iris = "https://storm.cis.fordham.edu/~gweiss/data-mining/weka-data/iris.arff";
// try-with-resources closes the network stream even when the copy fails.
try (InputStream in = new java.net.URL(iris).openStream()) {
    Files.copy(in, Paths.get("iris.arff"), StandardCopyOption.REPLACE_EXISTING);
}
Then,
import smile.io.*;
import smile.plot.swing.*;
import smile.stat.distribution.*;
// Load the downloaded ARFF file; Read.arff gives back a DataFrame.
var iris = Read.arff("iris.arff");
// Scatter plot of "sepallength" against "sepalwidth"; the "class" column and the '*' mark
// are passed as the remaining arguments (presumably category and point symbol — confirm against the SMILE API).
var canvas = ScatterPlot.of(iris, "sepallength", "sepalwidth", "class", '*').canvas();
// Label the two axes with the column names that were plotted.
canvas.setAxisLabels("sepallength", "sepalwidth");
// Open the plot in a Swing window.
canvas.window();
Alternatively, you can display the graph directly in the notebook.
// Render the canvas into a 400 x 400 image so it can be displayed in the notebook.
canvas.toBufferedImage(400, 400);
Tips for DataFrame
Show a summary (only numeric columns are shown)
// Trailing expression without a semicolon: its value is displayed as the cell output.
iris.summary()
get column values
// Fetch a single column by index; column(...) is the accessor the helper functions in this notebook use.
iris.column(0);
//or
//iris.column("col name you want");
get row values
// Fetch a single row by index (here index 0).
iris.get(0);
get column names
// names() gives the column names as a String array.
String[] cols = iris.names();
select columns and copy
// Keep only the two named columns; the result is a separate DataFrame.
DataFrame selected = iris.select("sepallength", "sepalwidth");
select rows and copy
// Build new DataFrames from the rows whose "class" value equals 0 and 1 respectively.
// NOTE(review): assumes the nominal "class" column is stored as byte codes and that
// 0 / 1 correspond to setosa / versicolor — confirm against the loaded schema.
var setosa = DataFrame.of(iris.stream().filter(row -> row.getByte("class") == 0));
var versicolor = DataFrame.of(iris.stream().filter(row -> row.getByte("class") == 1));
Create a DataFrame from scratch
For example.
// zero variance, multi corr, null value are included.
var row0 = new Object[]{0,0 ,3.5,1,5.3,"M"};
var row1 = new Object[]{1,10,4.7,1,1.8,"F"};
var row2 = new Object[]{2,20,9.7,1,5.5,"M"};
var row3 = new Object[]{3,30,7.7,1,5.2,"F"};
var row4 = new Object[]{4,40,1.2,1,null,"M"};
var row5 = new Object[]{5,50,2.7,1,4.8,"M"};
StructField sf0 = new StructField("col0", DataType.of(Integer.class));
StructField sf1 = new StructField("col1", DataType.of(Integer.class));
StructField sf2 = new StructField("col2", DataType.of(Double.class));
StructField sf3 = new StructField("col3", DataType.of(Integer.class));
StructField sf4 = new StructField("col4", DataType.of(Double.class));
StructField sf5 = new StructField("col5", DataType.of(String.class));
StructType col_type = new StructType(sf0,sf1,sf2,sf3,sf4,sf5);
var row0_ = Tuple.of(row0,col_type);
var row1_ = Tuple.of(row1,col_type);
var row2_ = Tuple.of(row2,col_type);
var row3_ = Tuple.of(row3,col_type);
var row4_ = Tuple.of(row4,col_type);
var row5_ = Tuple.of(row5,col_type);
ArrayList<Tuple> raw = new ArrayList<Tuple>();
raw.add(row0_);
raw.add(row1_);
raw.add(row2_);
raw.add(row3_);
raw.add(row4_);
raw.add(row5_);
var test_df = DataFrame.of(raw);
or, more simply,
// Example: a 5 x 5 numeric frame plus integer "class" and "group" columns.
int m = 5; // number of rows
int n = 5; // number of columns
String[] cols = {"col0", "col1", "col2", "col3", "col4"};

// Row data and the per-row label / group values.
double[] r0 = {1, 2, 3, 4, 5};
double[] r1 = {1, 2, 3, 4, 5};
double[] r2 = {1, 2, 3, 4, 5};
double[] r3 = {1, 2, 3, 4, 5};
double[] r4 = {5, 4, 3, 2, 1};
int[] label_ = {0, 0, 1, 1, 0};
int[] group_ = {1, 1, 2, 2, 3};

// The matrix simply references the row arrays above.
double[][] mat = {r0, r1, r2, r3, r4};

// Reshape the flat label / group arrays into single-column matrices.
int[][] label = new int[m][];
int[][] group = new int[m][];
for (int row = 0; row < m; row++) {
    label[row] = new int[]{label_[row]};
    group[row] = new int[]{group_[row]};
}

// Numeric columns first, then "class", then "group".
var df = DataFrame.of(mat, cols)
        .merge(DataFrame.of(label, "class"))
        .merge(DataFrame.of(group, "group"));
df
Remove null/missing values
// omitNullRows() yields a frame without the rows that contain null values; test_df itself is not reassigned.
var null_remove_df = test_df.omitNullRows();
Remove Zero Variance
/**
 * Removes the numeric columns whose values are all identical (zero variance).
 *
 * <p>Constancy is decided by comparing the values directly instead of testing a computed
 * standard deviation against {@code 0.0}: a floating-point standard deviation of a constant
 * column can round to a tiny non-zero value (or NaN), which would leave the column in place.
 * Columns with fewer than two values are kept because variance is undefined for them.
 * Columns whose type is not one of Byte, Short, Integer, Long, Float, Double or Decimal
 * are never inspected or dropped.
 *
 * @param df the frame to inspect
 * @return {@code df} itself when no column qualifies, otherwise the frame produced by
 *         dropping the constant numeric columns
 */
DataFrame removeZeroVar(DataFrame df){
    DataType[] types = df.types();
    List<Integer> drops = new ArrayList<>();
    for (int i = 0; i < df.ncol(); i++) {
        if (isZeroVarCandidate(types[i]) && hasZeroVariance(df.column(i).toDoubleArray())) {
            drops.add(i);
        }
    }
    if (drops.isEmpty()) {
        return df;
    }
    return df.drop(drops.stream().mapToInt(Integer::intValue).toArray());
}

/** Tells whether a column of the given type is numeric and therefore eligible for the variance check. */
boolean isZeroVarCandidate(DataType type){
    switch (type.id()) {
        case Byte:
        case Short:
        case Integer:
        case Long:
        case Float:
        case Double:
        case Decimal:
            return true;
        default:
            return false;
    }
}

/** Tells whether all values are equal; false for fewer than two values or when a NaN is present. */
boolean hasZeroVariance(double[] values){
    if (values.length < 2) {
        return false;
    }
    for (int k = 1; k < values.length; k++) {
        // NaN compares unequal to everything, so a column containing NaN is never reported as constant.
        if (values[k] != values[0]) {
            return false;
        }
    }
    return true;
}
And then,
// Apply removeZeroVar to the frame from which null rows were already removed.
var remove_zero_var_df = removeZeroVar(null_remove_df);
One hot encoding
// Step 1: factorize the string column "col5" so it can be encoded.
var pre_one_hot_df = remove_zero_var_df.factorize(new String[]{"col5"});
// Step 2: convert to a Matrix (comparable to np.ndarray) using one-hot encoding,
// without a bias column and without row names.
var one_hot_mat = pre_one_hot_df.toMatrix(/*bias*/false, CategoricalEncoder.ONE_HOT, /*rowNames*/null);
// Display the matrix to check the one-hot result.
one_hot_mat
// Step 3: go back from Matrix to DataFrame so that columns can be dropped by index later.
// The matrix's column names are reused as the DataFrame's column names.
var df_after_one_hot = DataFrame.of(one_hot_mat.toArray(), one_hot_mat.colNames());
/**
 * Example: for every pair of columns whose absolute Spearman correlation is at least
 * {@code threshold}, drops the column with the higher index.
 *
 * <p>Every column is compared with all columns to its right. A column that is already
 * marked for dropping is not tested again, but it still serves as the left-hand side of
 * later comparisons, so the resulting set of dropped columns is the same as with an
 * exhaustive pairwise scan.
 *
 * @param df        frame containing only numeric columns
 * @param threshold minimum absolute correlation at which the right-hand column is dropped
 * @return {@code df} itself when no pair reaches the threshold, otherwise the frame
 *         produced by dropping the marked columns
 */
DataFrame dropCorrOneSide(DataFrame /*do not include non-number type*/df, double threshold){
    int nCols = df.ncol();

    // Convert each column once instead of once per pair.
    // NOTE(review): assumes CorTest.spearman does not modify its input arrays — confirm.
    double[][] columns = new double[nCols][];
    for (int c = 0; c < nCols; c++) {
        columns[c] = df.column(c).toDoubleArray();
    }

    boolean[] marked = new boolean[nCols];
    List<Integer> drops = new ArrayList<>();
    for (int i = 0; i < nCols; i++) {
        for (int j = i + 1; j < nCols; j++) {
            if (marked[j]) {
                continue; // already scheduled for removal, no need to test it again
            }
            double cor = Math.abs(CorTest.spearman(columns[i], columns[j]).cor);
            if (cor >= threshold) {
                marked[j] = true;
                drops.add(j);
            }
        }
    }

    if (drops.isEmpty()) {
        return df;
    }
    return df.drop(drops.stream().mapToInt(Integer::intValue).toArray());
}
Run it and check the result
// Drop the right-hand column of every pair whose absolute correlation is at least 0.9.
var drop_corr_df = dropCorrOneSide(/*do not include non-number type*/df_after_one_hot, 0.9);
// Display the resulting frame.
drop_corr_df
Standardization
// Standardization of every column of drop_corr_df.
import smile.feature.transform.*;
var scalar = new Standardizer();
// Fit on all columns (names() supplies every column name); the result is kept as an invertible transform.
var invertible_transform = scalar.fit(drop_corr_df, drop_corr_df.names());
// Apply the fitted transform to obtain the standardized frame.
var df_std = invertible_transform.apply(drop_corr_df);
// Invert the transform on the standardized frame.
// NOTE(review): inv_df is expected to match drop_corr_df up to rounding — confirm.
var inv_df = invertible_transform.invert(df_std);
// Display the inverted frame.
inv_df
Appendix A : References
Statistical test
https://haifengl.github.io/statistics.html#undefined
DataFrame
https://haifengl.github.io/data.html
Many notebooks tutorials
See "Try it Online" on the SMILE home page.
(Appendix for ImageJ)
// Show an image in the notebook using ImageJ.
import ij.ImagePlus;
// The constructor argument "boats.jpg" is presumably a path relative to the working directory — confirm.
ImagePlus imp = new ImagePlus("boats.jpg");
// Obtain the image as a BufferedImage so it can be displayed in the notebook.
imp.getBufferedImage();