参考資料:
https://github.com/cloudera/impala-tpcds-kit
1. テストデータを事前に準備
sudo yum -y install java-1.8.0-openjdk-devel maven
sudo yum -y install git gcc make flex bison byacc curl unzip patch
git clone https://github.com/cloudera/impala-tpcds-kit.git
cd impala-tpcds-kit/
cd tpcds-gen/
make
sudo -u hdfs hdfs dfs -mkdir /user/root
sudo -u hdfs hdfs dfs -chown root:root /user/root
hadoop jar target/tpcds-gen-1.0-SNAPSHOT.jar -d /tmp/tpc-ds/sf10000/ -p 10 -s 10
aws s3 cp store_sales s3://秘密のBucket/store_sales/ --recursive --exclude "*" --include "data*"
aws s3 cp catalog_sales s3://秘密のBucket/catalog_sales/ --recursive --exclude "*" --include "data*"
2. 準備したTextデータをImpalaのスキーマに取り込む
-- Create the database that holds the text-format TPC-DS tables (no-op if it
-- already exists), then make it the current database for the statements below.
-- In Impala, DATABASE and SCHEMA are interchangeable keywords.
create database if not exists zzeng_tpcds_60_text;

use zzeng_tpcds_60_text;
-- catalog_sales: external table over the generated text files.
--   * Fields are separated by the pipe character and stored as plain text.
--   * serialization.null.format is set to the empty string, so an empty field
--     is read as NULL.
--   * "if not exists" makes the script safe to re-run. Note that an existing
--     table with a different definition is left untouched.
-- NOTE(review): the location has no scheme, so it resolves against the default
-- filesystem. If the table should read the copy uploaded to S3 instead, the
-- location needs an s3a:// URI - confirm which one is intended.
create external table if not exists catalog_sales (
    cs_sold_date_sk          int,
    cs_sold_time_sk          int,
    cs_ship_date_sk          int,
    cs_bill_customer_sk      int,
    cs_bill_cdemo_sk         int,
    cs_bill_hdemo_sk         int,
    cs_bill_addr_sk          int,
    cs_ship_customer_sk      int,
    cs_ship_cdemo_sk         int,
    cs_ship_hdemo_sk         int,
    cs_ship_addr_sk          int,
    cs_call_center_sk        int,
    cs_catalog_page_sk       int,
    cs_ship_mode_sk          int,
    cs_warehouse_sk          int,
    cs_item_sk               int,
    cs_promo_sk              int,
    cs_order_number          bigint,
    cs_quantity              int,
    cs_wholesale_cost        decimal(7,2),
    cs_list_price            decimal(7,2),
    cs_sales_price           decimal(7,2),
    cs_ext_discount_amt      decimal(7,2),
    cs_ext_sales_price       decimal(7,2),
    cs_ext_wholesale_cost    decimal(7,2),
    cs_ext_list_price        decimal(7,2),
    cs_ext_tax               decimal(7,2),
    cs_coupon_amt            decimal(7,2),
    cs_ext_ship_cost         decimal(7,2),
    cs_net_paid              decimal(7,2),
    cs_net_paid_inc_tax      decimal(7,2),
    cs_net_paid_inc_ship     decimal(7,2),
    cs_net_paid_inc_ship_tax decimal(7,2),
    cs_net_profit            decimal(7,2)
)
row format delimited fields terminated by '|'
stored as textfile
location '/tmp/tpc-ds/sf10000/catalog_sales'
tblproperties ('serialization.null.format'='')
;
-- store_sales: external table over the generated text files.
--   * Fields are separated by the pipe character and stored as plain text.
--   * serialization.null.format is set to the empty string, so an empty field
--     is read as NULL.
--   * "if not exists" makes the script safe to re-run. Note that an existing
--     table with a different definition is left untouched.
-- NOTE(review): the location has no scheme, so it resolves against the default
-- filesystem. If the table should read the copy uploaded to S3 instead, the
-- location needs an s3a:// URI - confirm which one is intended.
create external table if not exists store_sales (
    ss_sold_date_sk       int,
    ss_sold_time_sk       int,
    ss_item_sk            int,
    ss_customer_sk        int,
    ss_cdemo_sk           int,
    ss_hdemo_sk           int,
    ss_addr_sk            int,
    ss_store_sk           int,
    ss_promo_sk           int,
    ss_ticket_number      bigint,
    ss_quantity           int,
    ss_wholesale_cost     decimal(7,2),
    ss_list_price         decimal(7,2),
    ss_sales_price        decimal(7,2),
    ss_ext_discount_amt   decimal(7,2),
    ss_ext_sales_price    decimal(7,2),
    ss_ext_wholesale_cost decimal(7,2),
    ss_ext_list_price     decimal(7,2),
    ss_ext_tax            decimal(7,2),
    ss_coupon_amt         decimal(7,2),
    ss_net_paid           decimal(7,2),
    ss_net_paid_inc_tax   decimal(7,2),
    ss_net_profit         decimal(7,2)
)
row format delimited fields terminated by '|'
stored as textfile
location '/tmp/tpc-ds/sf10000/store_sales'
tblproperties ('serialization.null.format'='')
;