What this article covers
Create Athena queries for CloudFront logs, WAF logs, VPC Flow Logs, and S3 logs.
Prerequisites
- Specify the S3 location where queries are saved:
s3://xxxxx/Athena/
Querying CloudFront logs
Create the table
title: CloudFront Make Table
CREATE EXTERNAL TABLE IF NOT EXISTS default.cloudfront_logs (
`date` DATE,
time STRING,
x_edge_location STRING,
sc_bytes BIGINT,
c_ip STRING,
cs_method STRING,
cs_host STRING,
cs_uri_stem STRING,
sc_status INT,
cs_referrer STRING,
cs_user_agent STRING,
cs_uri_query STRING,
cs_cookie STRING,
x_edge_result_type STRING,
x_edge_request_id STRING,
x_host_header STRING,
cs_protocol STRING,
cs_bytes BIGINT,
time_taken FLOAT,
x_forwarded_for STRING,
ssl_protocol STRING,
ssl_cipher STRING,
x_edge_response_result_type STRING,
cs_protocol_version STRING,
fle_status STRING,
fle_encrypted_fields INT,
c_port INT,
time_to_first_byte FLOAT,
x_edge_detailed_result_type STRING,
sc_content_type STRING,
sc_content_len BIGINT,
sc_range_start BIGINT,
sc_range_end BIGINT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION 's3://s3-cloudfront-log/'
TBLPROPERTIES ( 'skip.header.line.count'='2' )
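Once the table is created, a quick sanity check like the one below (a minimal sketch; it assumes CloudFront standard logs have already been delivered to the LOCATION bucket) confirms that the tab-delimited fields line up with the schema:
-- Verify that the column mapping matches the log format
SELECT *
FROM default.cloudfront_logs
LIMIT 10;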
Query the logs
title: CloudFront Watch Logs
SELECT "date" AS log_date, time as log_time, x_edge_location as edge_location, sc_bytes, c_ip,
cs_method, cs_uri_stem, sc_status,
x_edge_result_type as result_type, cs_bytes, time_taken, x_forwarded_for as forwarded, ssl_protocol,
ssl_cipher, x_edge_response_result_type, cs_protocol_version, c_port, time_to_first_byte,
x_edge_detailed_result_type, sc_content_type, sc_content_len
FROM cloudfront_logs
WHERE "date" BETWEEN DATE '2024-01-09' AND DATE '2024-01-11'
ORDER BY "date" desc, time desc
LIMIT 1000;
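As a further example against the same table, the query below (a sketch using only the columns defined above; the date range is illustrative) aggregates requests by status code, which is handy for spotting error spikes:
-- Request counts per HTTP status code over the same date range
SELECT sc_status, count(*) AS request_count
FROM cloudfront_logs
WHERE "date" BETWEEN DATE '2024-01-09' AND DATE '2024-01-11'
GROUP BY sc_status
ORDER BY request_count DESC;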
Querying WAF logs
Create the table
title: WAF Make Table
CREATE EXTERNAL TABLE default.waf_logs(
`timestamp` bigint,
`formatversion` int,
`webaclid` string,
`terminatingruleid` string,
`terminatingruletype` string,
`action` string,
`terminatingrulematchdetails` array < struct < conditiontype: string,
sensitivitylevel: string,
location: string,
matcheddata: array < string > > >,
`httpsourcename` string,
`httpsourceid` string,
`rulegrouplist` array < struct < rulegroupid: string,
terminatingrule: struct < ruleid: string,
action: string,
rulematchdetails: array < struct < conditiontype: string,
sensitivitylevel: string,
location: string,
matcheddata: array < string > > > >,
nonterminatingmatchingrules: array < struct < ruleid: string,
action: string,
overriddenaction: string,
rulematchdetails: array < struct < conditiontype: string,
sensitivitylevel: string,
location: string,
matcheddata: array < string > > > > >,
excludedrules: string > >,
`ratebasedrulelist` array < struct < ratebasedruleid: string,
limitkey: string,
maxrateallowed: int > >,
`nonterminatingmatchingrules` array < struct < ruleid: string,
action: string,
rulematchdetails: array < struct < conditiontype: string,
sensitivitylevel: string,
location: string,
matcheddata: array < string > > >,
captcharesponse: struct < responsecode: string,
solvetimestamp: string > > >,
`requestheadersinserted` array < struct < name: string,
value: string > >,
`responsecodesent` string,
`httprequest` struct < clientip: string,
country: string,
headers: array < struct < name: string,
value: string > >,
uri: string,
args: string,
httpversion: string,
httpmethod: string,
requestid: string >,
`labels` array < struct < name: string > >,
`captcharesponse` struct < responsecode: string,
solvetimestamp: string,
failureReason: string >,
`challengeresponse` struct < responsecode: string,
solvetimestamp: string,
failureReason: string >,
`ja3Fingerprint` string
)
PARTITIONED BY (`region` string, `date` string)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://aws-waf-logs-s3/AWSLogs/xxx(account ID)/WAFLogs/cloudfront/WAF-WebACL/'
TBLPROPERTIES(
'projection.enabled' = 'true',
'projection.region.type' = 'enum',
'projection.region.values' = 'cloudfront,us-east-1,ap-northeast-1',
'projection.date.type' = 'date',
'projection.date.range' = '2021/01/01,NOW',
'projection.date.format' = 'yyyy/MM/dd',
'projection.date.interval' = '1',
'projection.date.interval.unit' = 'DAYS',
'storage.location.template' = 's3://aws-waf-logs-s3/AWSLogs/xxx(account ID)/WAFLogs/${region}/WAF-WebACL/${date}/'
)
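Because this table relies on partition projection, filtering on the region and date partition columns limits the S3 prefixes that Athena scans. A minimal sketch, assuming the projection settings above (note that date values are strings in the 'yyyy/MM/dd' form declared in projection.date.format; the date used here is illustrative):
-- Count requests in a single projected partition
SELECT count(*) AS request_count
FROM waf_logs
WHERE region = 'cloudfront'
  AND "date" = '2024/01/10';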
Query the logs
title: WAF Watch Logs
SELECT timestamp, formatversion, webaclid, terminatingruleid, terminatingruletype, action,
       terminatingrulematchdetails, rulegrouplist, ratebasedrulelist, nonterminatingmatchingrules,
       requestheadersinserted, responsecodesent, httprequest, labels, captcharesponse,
       challengeresponse, ja3fingerprint
FROM waf_logs
WHERE from_unixtime(timestamp/1000) > now() - interval '1' day
LIMIT 1000;
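Building on the same schema, nested fields such as httprequest are structs and can be read with dot notation. A sketch that groups the last day's blocked requests by client IP (the 'BLOCK' literal and the time window are illustrative assumptions):
-- Top client IPs among blocked requests in the last 24 hours
SELECT httprequest.clientip AS client_ip, count(*) AS blocked_count
FROM waf_logs
WHERE action = 'BLOCK'
  AND from_unixtime(timestamp/1000) > now() - interval '1' day
GROUP BY 1
ORDER BY blocked_count DESC
LIMIT 20;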
VPC Flow Logs
Create the table
title: VPC Flowlogs Make Table
CREATE EXTERNAL TABLE default.vpc_flow_logs (
`date` date,
version int,
account_id string,
interface_id string,
srcaddr string,
dstaddr string,
srcport int,
dstport int,
protocol bigint,
packets bigint,
bytes bigint,
start bigint,
`end` bigint,
action string,
log_status string,
vpc_id string,
subnet_id string,
instance_id string,
tcp_flags int,
type string,
pkt_srcaddr string,
pkt_dstaddr string,
region string,
az_id string,
sublocation_type string,
sublocation_id string,
pkt_src_aws_service string,
pkt_dst_aws_service string,
flow_direction string,
traffic_path int
)
PARTITIONED BY (dt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
LOCATION 's3://s3-vpc-log/AWSLogs/xxx(account ID)/vpcflowlogs/ap-northeast-1/'
TBLPROPERTIES ("skip.header.line.count" = "1")
S3 logs
Create the table
title: S3log Make Table
CREATE EXTERNAL TABLE `default.s3_xxx_logs`(
`bucketowner` STRING,
`bucket_name` STRING,
`requestdatetime` STRING,
`remoteip` STRING,
`requester` STRING,
`requestid` STRING,
`operation` STRING,
`key` STRING,
`request_uri` STRING,
`httpstatus` STRING,
`errorcode` STRING,
`bytessent` BIGINT,
`objectsize` BIGINT,
`totaltime` STRING,
`turnaroundtime` STRING,
`referrer` STRING,
`useragent` STRING,
`versionid` STRING,
`hostid` STRING,
`sigv` STRING,
`ciphersuite` STRING,
`authtype` STRING,
`endpoint` STRING,
`tlsversion` STRING)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'input.regex'='([^ ]*) ([^ ]*) \\[(.*?)\\] ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) (\"[^\"]*\"|-) (-|[0-9]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) (\"[^\"]*\"|-) ([^ ]*)(?: ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*))?.*$')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://s3-log/S3/s3-xxx/'
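This section only creates the table, so here is a query sketch in the same style (it assumes the table name above; requestdatetime is stored as a string, so it is parsed with the S3 server access log time format before the date comparison):
-- Recent 4xx responses recorded in the S3 server access logs
SELECT requestdatetime, remoteip, requester, operation, key, httpstatus
FROM default.s3_xxx_logs
WHERE httpstatus LIKE '4%'
  AND parse_datetime(requestdatetime, 'dd/MMM/yyyy:HH:mm:ss Z') > now() - interval '7' day
ORDER BY requestdatetime DESC
LIMIT 100;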
Summary
I hope this article helps anyone who wants to write Athena queries for their AWS logs.
I also publish articles on my personal blog aimed at IT engineers and at people working toward AWS certifications,
so please have a look at that as well.
ことなの学校 (personal blog)
More advanced AWS-related content goes out on Zenn;
feel free to follow and check that out too.
Zenn personal account
I also share useful information on Twitter, so follows are appreciated
→ @tomo_ITBizBlog