More than 5 years have passed since last update.
Impala で画像処理

Posted at 2019-08-02
やろうと思ったきっかけ

過去に SQLServer でビットマップ画像の2値化をやったから。
最近業務で Impala を触り始めたから書き直した。
（そういえば、去年 Hive 版も書いてた）
実装

with
    -- シーケンス 0 - 1
    seq2 as (
        select 0 as x
        union all select 1 as x
    ),
    -- シーケンス 0 - 3
    seq4 as (
        select row_number() over(order by a.x) - 1 as x from seq2 as a cross join seq2 as b
    ),
    -- シーケンス 0 - 15
    seq16 as (
        select row_number() over(order by a.x) - 1 as x from seq4 as a cross join seq4 as b
    ),
    -- シーケンス 0 - 255
    seq256 as (
        select row_number() over(order by a.x) - 1 as x from seq16 as a cross join seq16 as b
    ),
    -- シーケンス 0 - 65535
    seq65536 as (
        select row_number() over(order by a.x) - 1 as x from seq256 as a cross join seq256 as b
    ),
    -- 入力パラメータ
    input_param as (
        select
            50 as threshold,
            base64decode(
"ここは base64 エンコードされたビットマップ画像(24bit)を指定する"
            ) as img
    ),
    --BITMAPFILEHEADER
    bitmap_file_header as (
        select 'bfType' as name,                 1 as pos, 2 as size
        union all select 'bfSize' as name,       3 as pos, 4 as size
        union all select 'bfReserved1' as name,  7 as pos, 2 as size
        union all select 'bfReserved2' as name,  9 as pos, 2 as size
        union all select 'bfOffBits' as name,   11 as pos, 4 as size
    ),
    --BITMAPINFOHEADER
    bitmap_info_header as (
        select 'biSize' as name,                    15 as pos, 4 as size
        union all select 'biWidth' as name,         19 as pos, 4 as size
        union all select 'biHeight' as name,        23 as pos, 4 as size
        union all select 'biPlanes' as name,        27 as pos, 2 as size
        union all select 'biBitCount' as name,      29 as pos, 2 as size
        union all select 'biCopmression' as name,   31 as pos, 2 as size
        union all select 'biSizeImage' as name,     33 as pos, 4 as size
        union all select 'biXPixPerMeter' as name,  37 as pos, 4 as size
        union all select 'biYPixPerMeter' as name,  41 as pos, 4 as size
        union all select 'biClrUsed' as name,       45 as pos, 4 as size
        union all select 'biCirImportant' as name,  49 as pos, 4 as size
    ),
    --ファイルヘッダのパース
    parsed_file_header as (
        select
            'file_header' as id,
            h.index,
            h.name,
            h.size,
            h.pos,
            (case
                when h.size = 1 then
                    (cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
                when h.size = 2 then
                    (cast(conv(hex(substr(p.img, h.pos + 1, 1)), 16, 10) as int) +
                     cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
                when h.size = 4 then
                    (cast(conv(hex(substr(p.img, h.pos + 3, 1)), 16, 10) as int) +
                     cast(conv(hex(substr(p.img, h.pos + 2, 1)), 16, 10) as int) +
                     cast(conv(hex(substr(p.img, h.pos + 1, 1)), 16, 10) as int) +
                     cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
                else
                    0
             end
            ) as _data
        from
            input_param as p,
            (select
                row_number() over(order by bfh.pos) - 1 as index,
                bfh.name,
                bfh.pos,
                bfh.size
             from
                bitmap_file_header as bfh
            ) as h
    ),
    --情報ヘッダのパース
    parsed_info_header as (
        select
            'info_header' as id,
            h.index,
            h.name,
            h.size,
            h.pos,
            (case
                when h.size = 1 then
                    (cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
                when h.size = 2 then
                    (cast(conv(hex(substr(p.img, h.pos + 1, 1)), 16, 10) as int) +
                     cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
                when h.size = 4 then
                    (cast(conv(hex(substr(p.img, h.pos + 3, 1)), 16, 10) as int) +
                     cast(conv(hex(substr(p.img, h.pos + 2, 1)), 16, 10) as int) +
                     cast(conv(hex(substr(p.img, h.pos + 1, 1)), 16, 10) as int) +
                     cast(conv(hex(substr(p.img, h.pos + 0, 1)), 16, 10) as int))
                else
                    0
             end
            ) as _data
        from
            input_param as p,
            (select
                row_number() over(order by bih.pos) - 1 as index,
                bih.name,
                bih.pos,
                bih.size
             from
                bitmap_info_header as bih
            ) as h
    ),
    --データ取り出し
    bitmap_info as (
        select
            substr(p.img, b.offbits) as _data,
            a.pixel_count,
            a.width,
            a.height
        from
            input_param as p,
            --総ピクセル数, 幅, 高さ
            (select
                exp(sum(ln(_data))) as pixel_count,
                max(case name when 'biWidth' then _data else 0 end) as width,
                max(case name when 'biHeight' then _data else 0 end) as height
            from
                parsed_info_header
            where
                name in ('biWidth', 'biHeight')
            ) as a,
            --ファイル先頭から画像データまでのオフセット
            (select
                _data as offbits
            from
                parsed_file_header
            where name = 'bfOffBits'
            ) as b
    ),
    make_pix_sequence as (
        select
            cast(seq.x as int) as row_index
        from
            seq65536 as seq, bitmap_info as i
        where
            seq.x < (i.pixel_count)
    ),
    load_pixels as (
        select
            s.row_index as pos,
            cast(trunc(s.row_index / i.height) as int) as row_index,
            cast(s.row_index % i.height as int) as col_index,
            cast(conv(hex((substr(i._data, cast((s.row_index * 3) + 0 as int), 1))), 16, 10) as int) as red,
            cast(conv(hex((substr(i._data, cast((s.row_index * 3) + 1 as int), 1))), 16, 10) as int) as green,
            cast(conv(hex((substr(i._data, cast((s.row_index * 3) + 2 as int), 1))), 16, 10) as int) as blue,
            i.pixel_count
        from
            make_pix_sequence as s,
            bitmap_info as i
        order by
            pos,
            row_index,
            col_index
    ),
    binarize as (
        select
            p.pos,
            p.row_index,
            p.col_index,
            (case
                when p.red < param.threshold then 0
                when p.green < param.threshold then 0
                when p.blue < param.threshold then 0
                else 1
            end) as pix
        from
            input_param as param,
            load_pixels as p
        order by
            pos,
            row_index,
            col_index
    ),
    --可視化
    visualize as (
        select
            b.row_index,
            group_concat(case b.pix when 1 then '■' else '□' end, '') as a
        from
            binarize as b
        group by
            b.row_index
        order by
            row_index
    )
select
    *
from
    visualize
order by
    row_index desc
Hive 版よりかは標準の SQL に近くて、こっちの方がすっきり見やすい。
Hive 版は lateral view とか map とか使ってしまった
実行結果

Hue の画面
テキストエディタに貼ったもの
You get articles that match your needs
You can efficiently read back useful information
You can use dark theme
What you can do with signing up