TPC-DS tools for Apache Impala

Last updated at Posted at 2022-12-13


1. テストデータを事前に準備

# Install OpenJDK 8 (devel package) and Maven, then git, the C build tools and misc utilities.
sudo yum -y install java-1.8.0-openjdk-devel maven
sudo yum -y install git gcc make flex bison byacc curl unzip patch

# Fetch the Impala TPC-DS kit and enter its tpcds-gen/ directory.
# NOTE(review): target/tpcds-gen-1.0-SNAPSHOT.jar is used below, but no build step is
# shown here -- presumably the generator has to be built in this directory first; confirm.
git clone https://github.com/cloudera/impala-tpcds-kit.git
cd impala-tpcds-kit/
cd tpcds-gen/

# As the hdfs user, create /user/root in HDFS and hand ownership to root:root.
sudo -u hdfs hdfs dfs -mkdir /user/root
sudo -u hdfs hdfs dfs -chown root:root /user/root

# Run the generator jar with output directory /tmp/tpc-ds/sf10000/.
# NOTE(review): -p / -s are presumably parallelism and scale factor; the "sf10000"
# directory name does not match "-s 10" -- confirm which one is intended.
hadoop jar target/tpcds-gen-1.0-SNAPSHOT.jar -d /tmp/tpc-ds/sf10000/ -p 10 -s 10

# Recursively upload each table directory to S3, excluding everything and then
# re-including only names matching "data*".
# "秘密のBucket" ("secret bucket") appears to be a placeholder for the real bucket name.
# NOTE(review): the sources are relative local paths, while the generator above wrote
# to /tmp/tpc-ds/sf10000/ -- confirm where these commands are meant to be run.
aws s3 cp store_sales s3://秘密のBucket/store_sales/ --recursive --exclude "*" --include "data*"
aws s3 cp catalog_sales s3://秘密のBucket/catalog_sales/ --recursive --exclude "*" --include "data*"

2. 準備したTextデータをImpala Schemaに入れる

-- Create the database for the text-format tables if it is missing, then make it
-- the current database so unqualified table names resolve to it.
create database if not exists zzeng_tpcds_60_text;
use zzeng_tpcds_60_text;

-- External table over the catalog_sales text files.
-- The data is read in place from the given location as '|'-delimited text;
-- an empty field is interpreted as NULL (serialization.null.format = '').
-- "if not exists" keeps the script re-runnable, matching the schema creation.
create external table if not exists catalog_sales (
  cs_sold_date_sk int,
  cs_sold_time_sk int,
  cs_ship_date_sk int,
  cs_bill_customer_sk int,
  cs_bill_cdemo_sk int,
  cs_bill_hdemo_sk int,
  cs_bill_addr_sk int,
  cs_ship_customer_sk int,
  cs_ship_cdemo_sk int,
  cs_ship_hdemo_sk int,
  cs_ship_addr_sk int,
  cs_call_center_sk int,
  cs_catalog_page_sk int,
  cs_ship_mode_sk int,
  cs_warehouse_sk int,
  cs_item_sk int,
  cs_promo_sk int,
  cs_order_number bigint,
  cs_quantity int,
  -- Monetary columns use exact decimals rather than floating point.
  cs_wholesale_cost decimal(7,2),
  cs_list_price decimal(7,2),
  cs_sales_price decimal(7,2),
  cs_ext_discount_amt decimal(7,2),
  cs_ext_sales_price decimal(7,2),
  cs_ext_wholesale_cost decimal(7,2),
  cs_ext_list_price decimal(7,2),
  cs_ext_tax decimal(7,2),
  cs_coupon_amt decimal(7,2),
  cs_ext_ship_cost decimal(7,2),
  cs_net_paid decimal(7,2),
  cs_net_paid_inc_tax decimal(7,2),
  cs_net_paid_inc_ship decimal(7,2),
  cs_net_paid_inc_ship_tax decimal(7,2),
  cs_net_profit decimal(7,2)
)
row format delimited fields terminated by '|'
stored as textfile
location '/tmp/tpc-ds/sf10000/catalog_sales'
tblproperties ('serialization.null.format'='');

-- External table over the store_sales text files.
-- The data is read in place from the given location as '|'-delimited text;
-- an empty field is interpreted as NULL (serialization.null.format = '').
-- "if not exists" keeps the script re-runnable, matching the schema creation.
create external table if not exists store_sales (
  ss_sold_date_sk int,
  ss_sold_time_sk int,
  ss_item_sk int,
  ss_customer_sk int,
  ss_cdemo_sk int,
  ss_hdemo_sk int,
  ss_addr_sk int,
  ss_store_sk int,
  ss_promo_sk int,
  ss_ticket_number bigint,
  ss_quantity int,
  -- Monetary columns use exact decimals rather than floating point.
  ss_wholesale_cost decimal(7,2),
  ss_list_price decimal(7,2),
  ss_sales_price decimal(7,2),
  ss_ext_discount_amt decimal(7,2),
  ss_ext_sales_price decimal(7,2),
  ss_ext_wholesale_cost decimal(7,2),
  ss_ext_list_price decimal(7,2),
  ss_ext_tax decimal(7,2),
  ss_coupon_amt decimal(7,2),
  ss_net_paid decimal(7,2),
  ss_net_paid_inc_tax decimal(7,2),
  ss_net_profit decimal(7,2)
)
row format delimited fields terminated by '|'
stored as textfile
location '/tmp/tpc-ds/sf10000/store_sales'
tblproperties ('serialization.null.format'='');

