LoginSignup
1
0

More than 5 years have passed since last update.

svm light (libsvm)形式のデータをc++で読み込む(Spirit Qi, 自作)

Posted at

概要

  • Spirit Qiでスマートにsvm light formatのデータをEigenのSparseMatrixとして読み込みたかった。
  • Qiの使い方が悪いのか、自作のloaderとくらべて速くなかった。(かなしい、この程度でQiを使うべきではないのか?)
  • 何かおかしなところがありましたら、ぜひ教えてください。

設定と結果

  • Apple LLVM version 8.1.0 (clang-802.0.42),
  • -O2 -ftree-vectorize -msse2 -mfpmath=sse
  • 読み込むデータセットは, libsvm datasetのreal-sim (n=72309, d=20958, nnz=3709083)
time
Qiを使うやつ 4.67 real, 4.17 user, 0.20 sys
自作 0.99 real, 0.85 user, 0.12 sys
scikit learnのload_svmlight_file 3.67 real, 3.36 user, 0.25 sys

コード

パーサーにBoost Spirit Qiを使う

  • vector<pair<int, vector<pair<size_t, float>>>> として読み込む(vector<Eigen::Triplet<float>>として読み込む方法がよくわからないので...)
#include <algorithm>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_stream.hpp>
#include <boost/spirit/include/support_istream_iterator.hpp>
#include <boost/spirit/include/support_line_pos_iterator.hpp>
#include <boost/tuple/tuple.hpp>
#include <eigen3/Eigen/SparseCore>
#include <fstream>
#include <utility>
#include <vector>

/// Parses the svm-light / libsvm file named by argv[1] with Boost.Spirit Qi
/// and prints the number of parsed rows to stdout.
///
/// Each row is stored as (label, [(feature index, value), ...]).
///
/// @return 0 when the whole file was parsed; 1 on a usage error, when the
///         file cannot be opened, or when the input was not fully consumed.
int main(int argc, char *argv[]) {
  if (argc < 2) {
    std::cerr << "usage: " << (argc > 0 ? argv[0] : "qi_loader")
              << " <svmlight-file>" << std::endl;
    return 1;
  }

  namespace qi = boost::spirit::qi;
  using pair_1 = std::pair<std::size_t, float>;
  std::vector<std::pair<int, std::vector<pair_1>>> v;

  std::ifstream ifs(argv[1]);
  if (!ifs) {
    std::cerr << "cannot open " << argv[1] << std::endl;
    return 1;
  }
  // The grammar matches blanks and newlines explicitly, so the stream must
  // not skip whitespace on its own.
  ifs.unsetf(std::ios::skipws);

  using istreamIter = boost::spirit::istream_iterator;
  istreamIter itb = istreamIter(ifs);
  istreamIter ite = istreamIter();
  const bool ok = qi::parse(
      itb, ite,
      (qi::int_ >> -(qi::lit(' ') >>
                     (qi::int_ >> qi::lit(':') >> qi::float_) % qi::lit(' '))) %
          qi::eol,
      v);

  // The list parser stops in front of a trailing line break because no
  // further row follows it; that remainder is not an error.
  while (itb != ite && (*itb == '\n' || *itb == '\r')) ++itb;

  std::cout << v.size() << std::endl;

  if (!ok || itb != ite) {
    // Anything still unread means the grammar rejected part of the file and
    // the count printed above covers only the rows before that point.
    std::cerr << "parse stopped before the end of " << argv[1] << std::endl;
    return 1;
  }
  return 0;
}

自分で作る

  • vector<Eigen::Triplet>として読み込んでEigen::SparseMatrixにするまでをやっているので、Qiを使うやつよりも不利
  • ぜんぜんスマートじゃない
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include <eigen3/Eigen/SparseCore>

/// Converts the decimal number at the start of `p` to ValueType.
///
/// Accepted syntax: optional leading spaces, an optional '+' or '-', integer
/// digits, an optional '.' followed by fraction digits, and an optional
/// exponent ('e' or 'E', optional sign, digits). Parsing stops at the first
/// character that does not fit; no error is reported and an input without
/// digits yields 0.
///
/// The value is accumulated in double and converted to ValueType once at the
/// end, so float results are not degraded by per-digit rounding.
///
/// @param p NUL-terminated text to parse; must not be null.
/// @return the parsed value converted to ValueType.
template <typename ValueType>
ValueType naive_atot(const char *p) {
  const auto is_digit = [](char c) { return c >= '0' && c <= '9'; };

  while (*p == ' ') ++p;

  bool negative = false;
  if (*p == '-') {
    negative = true;
    ++p;
  } else if (*p == '+') {
    ++p;
  }

  double value = 0.0;
  while (is_digit(*p)) {
    value = value * 10.0 + (*p - '0');
    ++p;
  }

  if (*p == '.') {
    ++p;
    double fraction = 0.0;
    double scale = 1.0;
    while (is_digit(*p)) {
      // Digits past the 18th cannot change a double; skipping them also
      // keeps fraction and scale finite for arbitrarily long inputs.
      if (scale < 1e18) {
        fraction = fraction * 10.0 + (*p - '0');
        scale *= 10.0;
      }
      ++p;
    }
    value += fraction / scale;
  }

  if (*p == 'e' || *p == 'E') {
    ++p;
    bool exponent_negative = false;
    if (*p == '-') {
      exponent_negative = true;
      ++p;
    } else if (*p == '+') {
      ++p;
    }
    int exponent = 0;
    while (is_digit(*p)) {
      // Stop growing once the exponent is far outside the range of double;
      // this bounds the loop below and avoids int overflow.
      if (exponent < 1000) exponent = exponent * 10 + (*p - '0');
      ++p;
    }
    // Zero stays zero; skipping it avoids 0 * inf = NaN for huge exponents.
    if (value != 0.0) {
      double power = 1.0;
      for (int i = 0; i < exponent; ++i) power *= 10.0;
      value = exponent_negative ? value / power : value * power;
    }
  }

  return static_cast<ValueType>(negative ? -value : value);
}

// Line buffer shared by readline() and load_libsvm(). The caller allocates
// my_line with malloc and stores its capacity in max_line_len before the
// first readline() call; readline() grows both when a line does not fit.
static char *my_line = nullptr;
static int max_line_len;

/// Reads the next line of `input` into my_line, growing the buffer as needed.
///
/// The line keeps its trailing '\n' when the file has one; the last line of a
/// file without a final newline is returned as is.
///
/// @param input stream opened for reading.
/// @return my_line, or nullptr at end of input, on a read error before any
///         character was read, or when the buffer cannot be grown (a message
///         is written to stderr and my_line stays valid for free()).
static inline char *readline(FILE *input) {
  if (std::fgets(my_line, max_line_len, input) == nullptr) return nullptr;

  // fgets stops right after a newline, so only the last character read so
  // far has to be inspected instead of rescanning the whole buffer.
  int len = static_cast<int>(std::strlen(my_line));
  while (len == 0 || my_line[len - 1] != '\n') {
    // Doubling beyond 2^30 would overflow int (assumes int has >= 32 bits).
    if (max_line_len >= (1 << 30)) {
      std::fprintf(stderr, "readline: line too long\n");
      return nullptr;
    }
    const int grown_len = max_line_len * 2;
    // Keep the old pointer until realloc succeeds so that a failure neither
    // leaks the buffer nor leaves my_line null.
    char *grown = static_cast<char *>(std::realloc(my_line, grown_len));
    if (grown == nullptr) {
      std::fprintf(stderr, "readline: out of memory\n");
      return nullptr;
    }
    my_line = grown;
    max_line_len = grown_len;

    // End of input in the middle of a line: return what was read so far.
    if (std::fgets(my_line + len, max_line_len - len, input) == nullptr) break;
    len += static_cast<int>(std::strlen(my_line + len));
  }

  return my_line;
}

/// Loads a libsvm / svm-light text file into a sparse matrix and label array.
///
/// Every non-blank line has the form `label idx:value idx:value ...` with
/// one-based feature indices. Blank lines are skipped.
///
/// @param spa_x            receives the feature matrix (rows x max index).
/// @param y                receives one label per stored row.
/// @param file_name        path of the file to read.
/// @param flag_remove_zero when true, rows without any idx:value pair are
///                         dropped together with their label.
/// @return true on success; false when the file cannot be opened, the line
///         buffer cannot be allocated, or a feature index is not an integer
///         >= 1 (for example a `qid:` token). On failure the contents of
///         `spa_x` and `y` are unspecified.
template <typename Scalar, int Major, typename Index>
bool load_libsvm(Eigen::SparseMatrix<Scalar, Major, Index> &spa_x,
                 Eigen::Array<Scalar, Eigen::Dynamic, 1> &y,
                 const std::string &file_name,
                 const bool &flag_remove_zero = true) {
  FILE *fp = std::fopen(file_name.c_str(), "r");
  if (fp == nullptr) return false;

  max_line_len = 1024;
  my_line = static_cast<char *>(std::malloc(max_line_len * sizeof(char)));
  if (my_line == nullptr) {
    std::fclose(fp);
    return false;
  }
  // Single cleanup routine used on every exit path. my_line is read when the
  // lambda runs, so a buffer reallocated by readline() is freed correctly,
  // and the global is reset so it never dangles.
  const auto release = [fp]() {
    std::fclose(fp);
    std::free(my_line);
    my_line = nullptr;
  };

  using Tri = Eigen::Triplet<Scalar>;
  std::vector<Tri> tripletList;
  tripletList.reserve(1024);
  y.resize(1024);

  unsigned int n = 0;  // number of rows stored so far
  unsigned int d = 0;  // largest zero-based column index seen so far
  while (readline(fp) != nullptr) {
    char *label = std::strtok(my_line, " \t\r\n");
    if (label == nullptr) continue;  // blank line
    y[n] = naive_atot<Scalar>(label);

    unsigned int num_ele = 0;
    while (true) {
      char *idx = std::strtok(nullptr, ":");
      char *val = std::strtok(nullptr, " \t");
      if (val == nullptr) break;  // no further idx:value pair on this line

      char *idx_end = nullptr;
      const long one_based = std::strtol(idx, &idx_end, 10);
      // A missing or non-positive index would wrap around once converted to
      // a zero-based unsigned column and corrupt the matrix dimensions.
      if (idx_end == idx || one_based < 1) {
        release();
        return false;
      }
      const unsigned int k = static_cast<unsigned int>(one_based - 1);
      if (d < k) d = k;
      tripletList.emplace_back(n, k, naive_atot<Scalar>(val));
      ++num_ele;
    }

    // A dropped row does not advance n, so the next label overwrites y[n].
    if (!flag_remove_zero || num_ele > 0) ++n;
    if (static_cast<unsigned int>(y.size()) <= n) {
      y.conservativeResize(y.size() * 2);
    }
  }
  release();

  ++d;  // largest zero-based index -> number of columns
  y.conservativeResize(n);
  spa_x.resize(n, d);
  spa_x.setFromTriplets(tripletList.begin(), tripletList.end());
  spa_x.makeCompressed();
  return true;
}

/// Loads the libsvm file named by argv[1] with load_libsvm() and prints
/// "<rows>, <non-zeros>" to stdout.
///
/// @return 0 on success; 1 on a usage error or when loading fails.
int main(int argc, char *argv[]) {
  if (argc < 2) {
    std::cerr << "usage: " << (argc > 0 ? argv[0] : "loader")
              << " <libsvm-file>" << std::endl;
    return 1;
  }

  Eigen::SparseMatrix<float, Eigen::RowMajor, std::ptrdiff_t> sx;
  Eigen::Array<float, Eigen::Dynamic, 1> y;
  const std::string fn = argv[1];
  // Rows without features are kept (flag_remove_zero = false).
  if (!load_libsvm(sx, y, fn, false)) {
    std::cerr << "failed to load " << fn << std::endl;
    return 1;
  }
  std::cout << sx.rows() << ", " << sx.nonZeros() << std::endl;
  return 0;
}

scikit learnのload_svmlight_file (Cython)

from sklearn.datasets import load_svmlight_file
import sys

def main(args):
  """Load an svmlight file and print the shape of its feature matrix.

  `args` must hold exactly two items: an ignored first item and the path of
  the file to load.
  """
  _, path = args
  features, _labels = load_svmlight_file(path)
  print(features.shape)

# Script entry point: hand the raw command-line vector to main().
if __name__ == '__main__':
    main(sys.argv)
1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0