を作りました。
https://rubygems.org/gems/bq
https://github.com/takeru/bq/blob/master/lib/bq.rb
Rubyで文字列を組み立てて、ちょっと複雑なクエリを実行したいときに便利です。
データのインポートとかは標準のbqコマンドでやればいいと思います。
% time bundle exec ruby -rpp -e "require 'bq'; pp Bq::InstalledApp.new(:project_id=>'bq-takeru-test',:token_storage=>'.bq_secret_token.json').query('SELECT weight_pounds, state, year, gestation_weeks FROM publicdata:samples.natality ORDER BY weight_pounds DESC LIMIT 20;').to_hash"
{"kind"=>"bigquery#queryResponse",
"schema"=>
{"fields"=>
[{"name"=>"weight_pounds", "type"=>"FLOAT", "mode"=>"NULLABLE"},
{"name"=>"state", "type"=>"STRING", "mode"=>"NULLABLE"},
{"name"=>"year", "type"=>"INTEGER", "mode"=>"NULLABLE"},
{"name"=>"gestation_weeks", "type"=>"INTEGER", "mode"=>"NULLABLE"}]},
"jobReference"=>
{"projectId"=>"bq-takeru-test", "jobId"=>"job_R_ejXGskMARhRxExo_FAxVlaj1Q"},
"totalRows"=>"20",
"rows"=>
[{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2006"}, {"v"=>nil}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2007"}, {"v"=>"41"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>"KY"}, {"v"=>"2004"}, {"v"=>"47"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>"KY"}, {"v"=>"2004"}, {"v"=>"47"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>"WY"}, {"v"=>"1979"}, {"v"=>"35"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2007"}, {"v"=>"38"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>"KY"}, {"v"=>"2004"}, {"v"=>"38"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2006"}, {"v"=>"37"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>"UT"}, {"v"=>"1982"}, {"v"=>"33"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2006"}, {"v"=>"40"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>"KY"}, {"v"=>"2004"}, {"v"=>"47"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>"KY"}, {"v"=>"2004"}, {"v"=>"41"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2005"}, {"v"=>"40"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2007"}, {"v"=>"38"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2005"}, {"v"=>"37"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2005"}, {"v"=>"39"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2005"}, {"v"=>nil}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>"KY"}, {"v"=>"2004"}, {"v"=>"43"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>nil}, {"v"=>"2007"}, {"v"=>"39"}]},
{"f"=>[{"v"=>"18.0007436923"}, {"v"=>"OR"}, {"v"=>"1972"}, {"v"=>"40"}]}],
"totalBytesProcessed"=>"3751143548",
"jobComplete"=>true,
"cacheHit"=>false}
bundle exec ruby -rpp -e 0.83s user 0.13s system 30% cpu 3.223 total
"totalBytesProcessed"=>"3751143548"
3.7GBのgrep的フルスキャンなクエリ実行を数秒でできているということです。
(で、もっとでかいデータでもこんな具合なんですよね?)
これはまじはかい的テクノロジーなのではないか!
bq_example.rb
require "pp"
require "json"
require "bq"
require "terminal-table"
project_id = nil
project_id ||= ENV["bq_project_id"]
raise "please set your project_id" unless project_id
token_file = ".bq_secret_token.json"
bq = Bq::InstalledApp.new(:project_id=>project_id, :token_storage=>token_file)
unless bq.authorized?
unless bq.authorize
raise "failed to authorize."
end
end
q = DATA.read.strip
result = bq.query(q).to_hash
if result["error"] || result["rows"].nil?
pp result
exit
end
["kind","jobReference","totalRows","totalBytesProcessed","jobComplete","cacheHit"].each do |k|
puts "%20s : %s" % [k, result[k]]
end
# pp [result["rows"], result["schema"]]
rows = result["rows"]
fields = result["schema"]["fields"]
headings = [""] + fields.map{|f| f["name"] + "\n" + f["type"] + "\n" + f["mode"] }
table = Terminal::Table.new :headings=>headings do |t|
rows.each_with_index do |r,i|
t.add_row [i] + r['f'].map{|hash| hash['v'] }
end
end
table.align_column(0, :right)
fields.each_with_index do |s,i|
if %w(INTEGER FLOAT).include?(s["type"])
table.align_column(i+1, :right)
end
end
puts table
__END__
--- SELECT weight_pounds, state, year, gestation_weeks FROM publicdata:samples.natality ORDER BY weight_pounds DESC LIMIT 10;
--- SELECT word FROM publicdata:samples.shakespeare WHERE word="huzzah";
--- SELECT corpus FROM publicdata:samples.shakespeare GROUP BY corpus;
SELECT corpus, sum(word_count) AS wordcount FROM publicdata:samples.shakespeare GROUP BY corpus ORDER BY wordcount DESC;
bq% bq_project_id=bq-takeru-test bundle exec ruby examples/bq_example.rb
kind : bigquery#queryResponse
jobReference : {"projectId"=>"bq-takeru-test", "jobId"=>"job_49OOvoD3eaJ0NiMLybw6G-VMY4c"}
totalRows : 42
totalBytesProcessed : 3781873
jobComplete : true
cacheHit : false
+----+-----------------------+-----------+
| | corpus | wordcount |
| | STRING | INTEGER |
| | NULLABLE | NULLABLE |
+----+-----------------------+-----------+
| 0 | hamlet | 32446 |
| 1 | kingrichardiii | 31868 |
| 2 | coriolanus | 29535 |
| 3 | cymbeline | 29231 |
| 4 | 2kinghenryiv | 28241 |
| 5 | othello | 28076 |
| 6 | kinglear | 28001 |
| 7 | kinghenryv | 27894 |
| 8 | troilusandcressida | 27837 |
| 9 | antonyandcleopatra | 27310 |
| 10 | 2kinghenryvi | 27165 |
| 11 | 1kinghenryiv | 26401 |
| 12 | 3kinghenryvi | 26288 |
| 13 | kinghenryviii | 26265 |
| 14 | romeoandjuliet | 26192 |
| 15 | winterstale | 26181 |
| 16 | allswellthatendswell | 24622 |
| 17 | kingrichardii | 24150 |
| 18 | merrywivesofwindsor | 24033 |
| 19 | measureforemeasure | 23303 |
| 20 | 1kinghenryvi | 23272 |
| 21 | loveslabourslost | 23189 |
| 22 | asyoulikeit | 23087 |
| 23 | muchadoaboutnothing | 22760 |
| 24 | merchantofvenice | 22448 |
| 25 | tamingoftheshrew | 22358 |
| 26 | kingjohn | 21983 |
| 27 | titusandronicus | 21911 |
| 28 | twelfthnight | 21633 |
| 29 | juliuscaesar | 21052 |
| 30 | periclesprinceoftyre | 19846 |
| 31 | timonofathens | 19841 |
| 32 | twogentlemenofverona | 18493 |
| 33 | macbeth | 18439 |
| 34 | sonnets | 17805 |
| 35 | tempest | 17593 |
| 36 | midsummersnightsdream | 17348 |
| 37 | comedyoferrors | 16361 |
| 38 | rapeoflucrece | 15221 |
| 39 | venusandadonis | 10035 |
| 40 | various | 3545 |
| 41 | loverscomplaint | 2586 |
+----+-----------------------+-----------+
これらを参考にしました。
http://d.hatena.ne.jp/a-know/20140506/1399386097
https://github.com/abronte/BigQuery
https://github.com/a-know/ruby-bigquery-test
https://github.com/google/google-api-ruby-client
http://ja.asciicasts.com/episodes/245-new-gem-with-bundler