やろうと思ったきっかけ
過去に SQLServer でビットマップ画像の2値化をやったから。
最近業務で Impala を触り始めたから書き直した。
(そういえば、去年 Hive 版も書いてた)
実装
with
-- シーケンス 0 - 1
seq2 as (
select 0 as x
union all select 1 as x
),
-- シーケンス 0 - 3
seq4 as (
select row_number() over(order by a.x) - 1 as x from seq2 as a cross join seq2 as b
),
-- シーケンス 0 - 15
seq16 as (
select row_number() over(order by a.x) - 1 as x from seq4 as a cross join seq4 as b
),
-- シーケンス 0 - 255
seq256 as (
select row_number() over(order by a.x) - 1 as x from seq16 as a cross join seq16 as b
),
-- シーケンス 0 - 65535
seq65536 as (
select row_number() over(order by a.x) - 1 as x from seq256 as a cross join seq256 as b
),
-- 入力パラメータ
input_param as (
select
50 as threshold,
base64decode(
"ここは base64 エンコードされたビットマップ画像(24bit)を指定する"
) as img
),
--BITMAPFILEHEADER
bitmap_file_header as (
select 'bfType' as name, 1 as pos, 2 as size
union all select 'bfSize' as name, 3 as pos, 4 as size
union all select 'bfReserved1' as name, 7 as pos, 2 as size
union all select 'bfReserved2' as name, 9 as pos, 2 as size
union all select 'bfOffBits' as name, 11 as pos, 4 as size
),
--BITMAPINFOHEADER
bitmap_info_header as (
select 'biSize' as name, 15 as pos, 4 as size
union all select 'biWidth' as name, 19 as pos, 4 as size
union all select 'biHeight' as name, 23 as pos, 4 as size
union all select 'biPlanes' as name, 27 as pos, 2 as size
union all select 'biBitCount' as name, 29 as pos, 2 as size
union all select 'biCopmression' as name, 31 as pos, 2 as size
union all select 'biSizeImage' as name, 33 as pos, 4 as size
union all select 'biXPixPerMeter' as name, 37 as pos, 4 as size
union all select 'biYPixPerMeter' as name, 41 as pos, 4 as size
union all select 'biClrUsed' as name, 45 as pos, 4 as size
union all select 'biCirImportant' as name, 49 as pos, 4 as size
),
--ファイルヘッダのパース
parsed_file_header as (
select
'file_header' as id,
h.index,
h.name,
h.size,
h.pos,
(case
when h.size = 1 then
(cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
when h.size = 2 then
(cast(conv(hex(substr(p.img, h.pos + 1, 1)), 16, 10) as int) +
cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
when h.size = 4 then
(cast(conv(hex(substr(p.img, h.pos + 3, 1)), 16, 10) as int) +
cast(conv(hex(substr(p.img, h.pos + 2, 1)), 16, 10) as int) +
cast(conv(hex(substr(p.img, h.pos + 1, 1)), 16, 10) as int) +
cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
else
0
end
) as _data
from
input_param as p,
(select
row_number() over(order by bfh.pos) - 1 as index,
bfh.name,
bfh.pos,
bfh.size
from
bitmap_file_header as bfh
) as h
),
--情報ヘッダのパース
parsed_info_header as (
select
'info_header' as id,
h.index,
h.name,
h.size,
h.pos,
(case
when h.size = 1 then
(cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
when h.size = 2 then
(cast(conv(hex(substr(p.img, h.pos + 1, 1)), 16, 10) as int) +
cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
when h.size = 4 then
(cast(conv(hex(substr(p.img, h.pos + 3, 1)), 16, 10) as int) +
cast(conv(hex(substr(p.img, h.pos + 2, 1)), 16, 10) as int) +
cast(conv(hex(substr(p.img, h.pos + 1, 1)), 16, 10) as int) +
cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
else
0
end
) as _data
from
input_param as p,
(select
row_number() over(order by bih.pos) - 1 as index,
bih.name,
bih.pos,
bih.size
from
bitmap_info_header as bih
) as h
),
--データ取り出し
bitmap_info as (
select
substr(p.img, b.offbits) as _data,
a.pixel_count,
a.width,
a.height
from
input_param as p,
--総ピクセル数, 幅, 高さ
(select
exp(sum(ln(_data))) as pixel_count,
max(case name when 'biWidth' then _data else 0 end) as width,
max(case name when 'biHeight' then _data else 0 end) as height
from
parsed_info_header
where
name in ('biWidth', 'biHeight')
) as a,
--ファイル先頭から画像データまでのオフセット
(select
_data as offbits
from
parsed_file_header
where name = 'bfOffBits'
) as b
),
make_pix_sequence as (
select
cast(seq.x as int) as row_index
from
seq65536 as seq, bitmap_info as i
where
seq.x < (i.pixel_count)
),
load_pixels as (
select
s.row_index as pos,
cast(trunc(s.row_index / i.height) as int) as row_index,
cast(s.row_index % i.height as int) as col_index,
cast(conv(hex((substr(i._data, cast((s.row_index * 3) + 0 as int), 1))), 16, 10) as int) as red,
cast(conv(hex((substr(i._data, cast((s.row_index * 3) + 1 as int), 1))), 16, 10) as int) as green,
cast(conv(hex((substr(i._data, cast((s.row_index * 3) + 2 as int), 1))), 16, 10) as int) as blue,
i.pixel_count
from
make_pix_sequence as s,
bitmap_info as i
order by
pos,
row_index,
col_index
),
binarize as (
select
p.pos,
p.row_index,
p.col_index,
(case
when p.red < param.threshold then 0
when p.green < param.threshold then 0
when p.blue < param.threshold then 0
else 1
end) as pix
from
input_param as param,
load_pixels as p
order by
pos,
row_index,
col_index
),
--可視化
visualize as (
select
b.row_index,
group_concat(case b.pix when 1 then '■' else '□' end, '') as a
from
binarize as b
group by
b.row_index
order by
row_index
)
select
*
from
visualize
order by
row_index desc
Hive 版よりかは標準の SQL に近くて、こっちの方がすっきり見やすい。
Hive 版は lateral view とか map とか使ってしまった