LoginSignup
0
0

Manipulating DataFrame Using SMILE

Last updated at Posted at 2024-02-11

Motivation

To learn how to handle DataFrames in SMILE.

Environment

JupyterLab, IJava, and Temurin JDK 11 (a JDK newer than version 9 is required)

Usage of IJava and maven

First, load the required repositories and dependencies.
For example,

%mavenRepo scijava-public https://maven.scijava.org/content/repositories/public/
%maven net.imagej:ij:1.54h
%maven com.github.haifengl:smile-core:3.0.2
%maven com.github.haifengl:smile-plot:3.0.2
%maven org.slf4j:slf4j-simple:2.0.11

(ImageJ is loaded here only as an example; you do not need it.)

Import classes

//example
import smile.io.*;
import smile.plot.swing.*;
import smile.stat.distribution.*;
import smile.stat.*;
import static smile.math.MathEx.*;
import smile.math.matrix.*;
import smile.data.*;
import smile.data.type.*;
import smile.stat.hypothesis.*;
import java.util.*;

Load csv, arff, or more

see tutorial docs;
https://haifengl.github.io/data.html

Show Graph

//download iris.arff from web.
import java.io.InputStream;
import java.nio.file.*;

// Download iris.arff into the working directory, overwriting any existing copy.
String iris = "https://storm.cis.fordham.edu/~gweiss/data-mining/weka-data/iris.arff";
// try-with-resources closes the network stream even if the copy fails.
try (InputStream in = new java.net.URL(iris).openStream()) {
    Files.copy(in, Paths.get("iris.arff"), StandardCopyOption.REPLACE_EXISTING);
}

Then,

import smile.io.*;
import smile.plot.swing.*;
import smile.stat.distribution.*;

// Read the ARFF file downloaded above; the result is a DataFrame.
var iris = Read.arff("iris.arff");

// Scatter plot of sepal length vs. sepal width, grouped by the "class" column and drawn with '*' marks.
var plot = ScatterPlot.of(iris, "sepallength", "sepalwidth", "class", '*');
var canvas = plot.canvas();
canvas.setAxisLabels("sepallength", "sepalwidth");

// Open the plot in a Swing window.
canvas.window();

Or you can display the graph directly in the notebook.

canvas.toBufferedImage(400, 400);

Tips for DataFrame

show a summary (only numeric columns are included)

iris.summary()

get column values

// Get a single column by its position ...
iris.column(0);
// ... or by its name:
//iris.column("col name you want");

get row values

iris.get(0);

get columns names

String[] cols = iris.names();

select columns and copy

DataFrame selected = iris.select("sepallength", "sepalwidth");

select rows and copy

// Build new DataFrames from the rows whose "class" value, read as a byte code, equals 0 or 1.
// NOTE(review): presumably 0 and 1 are the codes of the first two class labels declared in
// iris.arff (setosa, versicolor) — confirm against the file's attribute declaration.
var setosa = DataFrame.of(iris.stream().filter(row -> row.getByte("class") == 0));
var versicolor = DataFrame.of(iris.stream().filter(row -> row.getByte("class") == 1));

Create a DataFrame from scratch

For example.

// Sample data that deliberately contains the problems handled later in this article:
//   - col3 is constant (zero variance),
//   - col1 is an exact multiple of col0 (perfectly correlated columns),
//   - the fifth row has a null in col4.
Object[][] rows = {
    {0,  0, 3.5, 1, 5.3,  "M"},
    {1, 10, 4.7, 1, 1.8,  "F"},
    {2, 20, 9.7, 1, 5.5,  "M"},
    {3, 30, 7.7, 1, 5.2,  "F"},
    {4, 40, 1.2, 1, null, "M"},
    {5, 50, 2.7, 1, 4.8,  "M"},
};

// Schema: one StructField (name + type) per column, in column order.
StructType col_type = new StructType(
    new StructField("col0", DataType.of(Integer.class)),
    new StructField("col1", DataType.of(Integer.class)),
    new StructField("col2", DataType.of(Double.class)),
    new StructField("col3", DataType.of(Integer.class)),
    new StructField("col4", DataType.of(Double.class)),
    new StructField("col5", DataType.of(String.class))
);

// Wrap every raw row in a Tuple bound to the schema, preserving row order.
ArrayList<Tuple> raw = new ArrayList<Tuple>();
for (Object[] row : rows) {
    raw.add(Tuple.of(row, col_type));
}

var test_df = DataFrame.of(raw);

or, more simply,

//for example
int m = 5;//num of row
int n = 5;//num of col

String[] cols = {"col0","col1","col2","col3","col4"};

// Numeric body of the table: rows 0-3 hold the same values, row 4 holds them reversed.
double[][] mat = {
    {1,2,3,4,5},
    {1,2,3,4,5},
    {1,2,3,4,5},
    {1,2,3,4,5},
    {5,4,3,2,1},
};

int[] label_ = {0,0,1,1,0};
int[] group_ = {1,1,2,2,3};

// DataFrame.of is called with 2-D arrays below, so each 1-D vector is copied into an m x 1 array.
int[][] label = new int[m][1];
int[][] group = new int[m][1];
for (int row = 0; row < m; row++) {
    label[row][0] = label_[row];
    group[row][0] = group_[row];
}

// Merge the numeric columns with the "class" and "group" columns.
var labelColumn = DataFrame.of(label, "class");
var groupColumn = DataFrame.of(group, "group");
var df = DataFrame.of(mat, cols).merge(labelColumn).merge(groupColumn);
df

Remove null/missing values

var null_remove_df = test_df.omitNullRows();

Remove Zero Variance

/**
 * Drops every numeric column whose standard deviation is exactly 0.0.
 *
 * Only columns whose type id is Byte, Short, Integer, Long, Float, Double or
 * Decimal are examined; all other columns are kept untouched.
 *
 * @param df the input data frame
 * @return {@code df} itself when no column qualifies, otherwise the result of
 *         {@code df.drop(...)} for the zero-variance column indices
 */
DataFrame removeZeroVar(DataFrame df){
    final int columnCount = df.ncol();
    final DataType[] columnTypes = df.types();
    final EnumSet<DataType.ID> numericIds = EnumSet.of(
            DataType.ID.Byte, DataType.ID.Double, DataType.ID.Float,
            DataType.ID.Integer, DataType.ID.Short, DataType.ID.Long,
            DataType.ID.Decimal);

    ArrayList<Integer> constantColumns = new ArrayList<>();
    for (int col = 0; col < columnCount; col++) {
        if (!numericIds.contains(columnTypes[col].id())) {
            continue;
        }
        double spread = sd(df.column(col).toDoubleArray());
        if (spread == 0.0) {
            constantColumns.add(col);
        }
    }

    if (constantColumns.isEmpty()) {
        return df;
    }

    // drop() takes primitive indices, so unbox the collected positions.
    int[] dropIndices = constantColumns.stream().mapToInt(Integer::intValue).toArray();
    return df.drop(dropIndices);
}

And then,

var remove_zero_var_df = removeZeroVar(null_remove_df); 

One hot encoding

// Step 1: factorize the string column "col5" so that it can be encoded.
var pre_one_hot_df = remove_zero_var_df.factorize(new String[]{"col5"});
// Step 2: convert to a Matrix object (similar to np.ndarray), requesting one-hot encoding,
// no bias column and no row names.
var one_hot_mat = pre_one_hot_df.toMatrix(/*bias*/false, CategoricalEncoder.ONE_HOT, /*rowNames*/null);
// Display the matrix to check the one-hot encoded columns.
one_hot_mat
// Convert the Matrix back to a DataFrame so that columns can be dropped by index later.
// The column names are taken from the matrix.
var df_after_one_hot = DataFrame.of(one_hot_mat.toArray(), one_hot_mat.colNames());
//For example, drop corr one of the pairs
/**
 * For every pair of columns (i, j) with i &lt; j, computes the Spearman
 * correlation and marks column j for removal when its absolute value is at
 * least {@code threshold}; all marked columns are then dropped.
 *
 * @param df        the input data frame; must not include non-number type columns
 * @param threshold absolute correlation at or above which the later column of a pair is dropped
 * @return {@code df} itself when nothing is marked, otherwise the result of
 *         {@code df.drop(...)} for the marked column indices
 */
DataFrame dropCorrOneSide(DataFrame df, double threshold){
    final int columnCount = df.ncol();

    // Insertion-ordered set: keeps the order in which columns were first marked
    // and silently ignores columns that are marked more than once.
    LinkedHashSet<Integer> marked = new LinkedHashSet<>();

    for (int left = 0; left < columnCount; left++) {
        // Starting at left + 1 visits each unordered pair exactly once.
        for (int right = left + 1; right < columnCount; right++) {
            double rho = CorTest.spearman(df.column(left).toDoubleArray(), df.column(right).toDoubleArray()).cor;
            if (Math.abs(rho) >= threshold) {
                marked.add(right);
            }
        }
    }

    if (marked.isEmpty()) {
        return df;
    }

    // drop() takes primitive indices, so unbox the collected positions.
    int[] dropIndices = marked.stream().mapToInt(Integer::intValue).toArray();
    return df.drop(dropIndices);
}
Run it and check the result:
var drop_corr_df = dropCorrOneSide(/*do not include non-number type*/df_after_one_hot, 0.9);

//check
drop_corr_df

Standardization

// Standardization
import smile.feature.transform.*;
// NOTE(review): the variable name "scalar" presumably means "scaler" — left unchanged.
var scalar = new Standardizer();
// Fit the transform on every column of drop_corr_df (all column names are passed).
var invertible_transform = scalar.fit(drop_corr_df, drop_corr_df.names());
// Apply the fitted transform to obtain the standardized DataFrame.
var df_std = invertible_transform.apply(drop_corr_df);
// Invert the transform on the standardized data.
// NOTE(review): this should reproduce the values of drop_corr_df — confirm by comparing the output.
var inv_df = invertible_transform.invert(df_std);
// Display the inverted DataFrame.
inv_df

Appendix A : References

Statistical test
https://haifengl.github.io/statistics.html#undefined

DataFrame
https://haifengl.github.io/data.html

Many notebook tutorials
See "Try it Online" on the SMILE home page.

(Appendix for ImageJ)

//show image on notebook
import ij.ImagePlus;
ImagePlus imp = new ImagePlus("boats.jpg");
imp.getBufferedImage();
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0