はじめに
🎄 本記事は ZOZO Advent Calendar 2024 シリーズ4の24日目です
BigQueryでは、下記の画像のようにテーブルのスキーマ情報をJSONで出力できる。
(画像はJOBS_BY_USERの例)
簡単なテーブル場合は、自身でカラム名を列挙すれば問題ないが、RECORD型を多く持っていたりすると、SELECTのカラム列挙時に苦労する。
Audit Logなどはカラム数が多すぎて、ネストした値までSELECTするのが大変。
今回は、JSON Schemaを引数として渡すと、SELECT文を生成するUDFを紹介する。
UDFの作成
再帰的な処理が必要なため、Javascript UDFを用いた。
CREATE TEMPORARY FUNCTION generate_select(schema STRING, is_null BOOL)
RETURNS STRING
LANGUAGE js AS """
function generateSQL(schema_json, is_null) {
function processField(field, parent) {
const name = parent ? parent + '.' + field.name : field.name;
const nullValue = field.type === 'FLOAT' ? 'CAST(NULL AS FLOAT64)':
`CAST(NULL AS ${field.type})`;
if (field.type === 'RECORD') {
let fields = field.fields.map(f => processField(f, field.mode === 'REPEATED' ? field.name + '_alias' : name)).join(', ');
if (field.mode === 'REPEATED') {
if (is_null) {
return `ARRAY(SELECT AS STRUCT ${fields} FROM UNNEST([STRUCT()])) AS ${field.name}`;
} else {
return `ARRAY(SELECT AS STRUCT ${fields} FROM UNNEST(${name}) AS ${field.name}_alias) AS ${field.name}`;
}
} else {
return is_null ? `STRUCT(${fields}) AS ${field.name}` : `STRUCT(${fields}) AS ${field.name}`;
}
} else if (field.mode === 'REPEATED') {
if (is_null) {
return `ARRAY<${field.type}>[] AS ${field.name}`;
} else {
return `${name} AS ${field.name}`;
}
} else {
return is_null ? `${nullValue} AS ${field.name}` : `${name} AS ${field.name}`;
}
}
const selectFields = schema_json.map(f => processField(f, null)).join(', ');
return `SELECT ${selectFields}`;
}
const schema_json = JSON.parse(schema);
return generateSQL(schema_json, is_null);
第二引数のis_null
にtrue
を渡せば、NULL埋めした結果を表示する。
false
を渡せば、そのままカラム名を指定するので、FROM句に参照したいテーブルを指定すればよい。
実行例
下記はJOBS_BY_USERを用いた場合の例。
実行例
CREATE TEMPORARY FUNCTION generate_select(schema STRING, is_null BOOL)
RETURNS STRING
LANGUAGE js AS """
function generateSQL(schema_json, is_null) {
function processField(field, parent) {
const name = parent ? parent + '.' + field.name : field.name;
const nullValue = field.type === 'FLOAT' ? 'CAST(NULL AS FLOAT64)':
`CAST(NULL AS ${field.type})`;
if (field.type === 'RECORD') {
let fields = field.fields.map(f => processField(f, field.mode === 'REPEATED' ? field.name + '_alias' : name)).join(', ');
if (field.mode === 'REPEATED') {
if (is_null) {
return `ARRAY(SELECT AS STRUCT ${fields} FROM UNNEST([STRUCT()])) AS ${field.name}`;
} else {
return `ARRAY(SELECT AS STRUCT ${fields} FROM UNNEST(${name}) AS ${field.name}_alias) AS ${field.name}`;
}
} else {
return is_null ? `STRUCT(${fields}) AS ${field.name}` : `STRUCT(${fields}) AS ${field.name}`;
}
} else if (field.mode === 'REPEATED') {
if (is_null) {
return `ARRAY<${field.type}>[] AS ${field.name}`;
} else {
return `${name} AS ${field.name}`;
}
} else {
return is_null ? `${nullValue} AS ${field.name}` : `${name} AS ${field.name}`;
}
}
const selectFields = schema_json.map(f => processField(f, null)).join(', ');
return `SELECT ${selectFields}`;
}
const schema_json = JSON.parse(schema);
return generateSQL(schema_json, is_null);
""";
WITH schema_json AS (
SELECT '''
[
{
"name": "creation_time",
"mode": "NULLABLE",
"type": "TIMESTAMP",
"description": "",
"fields": []
},
{
"name": "project_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "project_number",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "user_email",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "job_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "job_type",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "statement_type",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "priority",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "start_time",
"mode": "NULLABLE",
"type": "TIMESTAMP",
"description": "",
"fields": []
},
{
"name": "end_time",
"mode": "NULLABLE",
"type": "TIMESTAMP",
"description": "",
"fields": []
},
{
"name": "query",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "state",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "reservation_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "total_bytes_processed",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "total_slot_ms",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "error_result",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "reason",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "location",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "debug_info",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "message",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "cache_hit",
"mode": "NULLABLE",
"type": "BOOLEAN",
"description": "",
"fields": []
},
{
"name": "destination_table",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "project_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "dataset_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "table_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "referenced_tables",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "project_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "dataset_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "table_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "labels",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "key",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "value",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "timeline",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "elapsed_ms",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "total_slot_ms",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "pending_units",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "completed_units",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "active_units",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "estimated_runnable_units",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
}
]
},
{
"name": "job_stages",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "name",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "id",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "start_ms",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "end_ms",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "input_stages",
"mode": "REPEATED",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "wait_ratio_avg",
"mode": "NULLABLE",
"type": "FLOAT",
"description": "",
"fields": []
},
{
"name": "wait_ms_avg",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "wait_ratio_max",
"mode": "NULLABLE",
"type": "FLOAT",
"description": "",
"fields": []
},
{
"name": "wait_ms_max",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "read_ratio_avg",
"mode": "NULLABLE",
"type": "FLOAT",
"description": "",
"fields": []
},
{
"name": "read_ms_avg",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "read_ratio_max",
"mode": "NULLABLE",
"type": "FLOAT",
"description": "",
"fields": []
},
{
"name": "read_ms_max",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "compute_ratio_avg",
"mode": "NULLABLE",
"type": "FLOAT",
"description": "",
"fields": []
},
{
"name": "compute_ms_avg",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "compute_ratio_max",
"mode": "NULLABLE",
"type": "FLOAT",
"description": "",
"fields": []
},
{
"name": "compute_ms_max",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "write_ratio_avg",
"mode": "NULLABLE",
"type": "FLOAT",
"description": "",
"fields": []
},
{
"name": "write_ms_avg",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "write_ratio_max",
"mode": "NULLABLE",
"type": "FLOAT",
"description": "",
"fields": []
},
{
"name": "write_ms_max",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "shuffle_output_bytes",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "shuffle_output_bytes_spilled",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "records_read",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "records_written",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "parallel_inputs",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "completed_parallel_inputs",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "status",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "steps",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "kind",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "substeps",
"mode": "REPEATED",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "slot_ms",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "compute_mode",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "total_bytes_billed",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "transaction_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "parent_job_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "session_info",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "session_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "dml_statistics",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "inserted_row_count",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "deleted_row_count",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "updated_row_count",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
}
]
},
{
"name": "total_modified_partitions",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "bi_engine_statistics",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "bi_engine_mode",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "bi_engine_reasons",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "code",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "message",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "acceleration_mode",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "query_info",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "resource_warning",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "optimization_details",
"mode": "NULLABLE",
"type": "JSON",
"description": "",
"fields": []
},
{
"name": "query_hashes",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "normalized_literals",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "performance_insights",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "avg_previous_execution_ms",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "stage_performance_standalone_insights",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "stage_id",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "slot_contention",
"mode": "NULLABLE",
"type": "BOOLEAN",
"description": "",
"fields": []
},
{
"name": "insufficient_shuffle_quota",
"mode": "NULLABLE",
"type": "BOOLEAN",
"description": "",
"fields": []
},
{
"name": "bi_engine_reasons",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "code",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "message",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "high_cardinality_joins",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "left_rows",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "right_rows",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "output_rows",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "step_index",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
}
]
}
]
},
{
"name": "stage_performance_change_insights",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "stage_id",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "input_data_change",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "records_read_diff_percentage",
"mode": "NULLABLE",
"type": "FLOAT",
"description": "",
"fields": []
}
]
}
]
}
]
}
]
},
{
"name": "transferred_bytes",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "materialized_view_statistics",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "materialized_view",
"mode": "REPEATED",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "table_reference",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "project_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "dataset_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "table_id",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
},
{
"name": "chosen",
"mode": "NULLABLE",
"type": "BOOLEAN",
"description": "",
"fields": []
},
{
"name": "estimated_bytes_saved",
"mode": "NULLABLE",
"type": "INTEGER",
"description": "",
"fields": []
},
{
"name": "rejected_reason",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
}
]
},
{
"name": "edition",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
},
{
"name": "job_creation_reason",
"mode": "NULLABLE",
"type": "RECORD",
"description": "",
"fields": [
{
"name": "code",
"mode": "NULLABLE",
"type": "STRING",
"description": "",
"fields": []
}
]
}
]
''' AS json_schema
)
SELECT generate_select(json_schema, true) AS generated_query
FROM schema_json;
生成されたSELECT文。
SELECT
CAST(NULL AS TIMESTAMP) AS creation_time,
CAST(NULL AS STRING) AS project_id,
CAST(NULL AS INTEGER) AS project_number,
CAST(NULL AS STRING) AS user_email,
CAST(NULL AS STRING) AS job_id,
CAST(NULL AS STRING) AS job_type,
CAST(NULL AS STRING) AS statement_type,
CAST(NULL AS STRING) AS priority,
CAST(NULL AS TIMESTAMP) AS start_time,
CAST(NULL AS TIMESTAMP) AS end_time,
CAST(NULL AS STRING) AS query,
CAST(NULL AS STRING) AS state,
CAST(NULL AS STRING) AS reservation_id,
CAST(NULL AS INTEGER) AS total_bytes_processed,
CAST(NULL AS INTEGER) AS total_slot_ms,
STRUCT(CAST(NULL AS STRING) AS reason,
CAST(NULL AS STRING) AS location,
CAST(NULL AS STRING) AS debug_info,
CAST(NULL AS STRING) AS message) AS error_result,
CAST(NULL AS BOOLEAN) AS cache_hit,
STRUCT(CAST(NULL AS STRING) AS project_id,
CAST(NULL AS STRING) AS dataset_id,
CAST(NULL AS STRING) AS table_id) AS destination_table,
ARRAY(
SELECT
AS STRUCT CAST(NULL AS STRING) AS project_id, CAST(NULL AS STRING) AS dataset_id, CAST(NULL AS STRING) AS table_id
FROM
UNNEST([STRUCT()])) AS referenced_tables,
ARRAY(
SELECT
AS STRUCT CAST(NULL AS STRING) AS KEY, CAST(NULL AS STRING) AS value
FROM
UNNEST([STRUCT()])) AS labels,
ARRAY(
SELECT
AS STRUCT CAST(NULL AS INTEGER) AS elapsed_ms, CAST(NULL AS INTEGER) AS total_slot_ms, CAST(NULL AS INTEGER) AS pending_units, CAST(NULL AS INTEGER) AS completed_units, CAST(NULL AS INTEGER) AS active_units, CAST(NULL AS INTEGER) AS estimated_runnable_units
FROM
UNNEST([STRUCT()])) AS timeline,
ARRAY(
SELECT
AS STRUCT CAST(NULL AS STRING) AS name, CAST(NULL AS INTEGER) AS id, CAST(NULL AS INTEGER) AS start_ms, CAST(NULL AS INTEGER) AS end_ms, ARRAY<INTEGER>[] AS input_stages, CAST(NULL AS FLOAT64) AS wait_ratio_avg, CAST(NULL AS INTEGER) AS wait_ms_avg, CAST(NULL AS FLOAT64) AS wait_ratio_max, CAST(NULL AS INTEGER) AS wait_ms_max, CAST(NULL AS FLOAT64) AS read_ratio_avg, CAST(NULL AS INTEGER) AS read_ms_avg, CAST(NULL AS FLOAT64) AS read_ratio_max, CAST(NULL AS INTEGER) AS read_ms_max, CAST(NULL AS FLOAT64) AS compute_ratio_avg, CAST(NULL AS INTEGER) AS compute_ms_avg, CAST(NULL AS FLOAT64) AS compute_ratio_max, CAST(NULL AS INTEGER) AS compute_ms_max, CAST(NULL AS FLOAT64) AS write_ratio_avg, CAST(NULL AS INTEGER) AS write_ms_avg, CAST(NULL AS FLOAT64) AS write_ratio_max, CAST(NULL AS INTEGER) AS write_ms_max, CAST(NULL AS INTEGER) AS shuffle_output_bytes, CAST(NULL AS INTEGER) AS shuffle_output_bytes_spilled, CAST(NULL AS INTEGER) AS records_read, CAST(NULL AS INTEGER) AS records_written, CAST(NULL AS INTEGER) AS parallel_inputs, CAST(NULL AS INTEGER) AS completed_parallel_inputs, CAST(NULL AS STRING) AS status, ARRAY(
SELECT
AS STRUCT CAST(NULL AS STRING) AS kind, ARRAY<STRING>[] AS substeps
FROM
UNNEST([STRUCT()])) AS steps, CAST(NULL AS INTEGER) AS slot_ms, CAST(NULL AS STRING) AS compute_mode
FROM
UNNEST([STRUCT()])) AS job_stages,
CAST(NULL AS INTEGER) AS total_bytes_billed,
CAST(NULL AS STRING) AS transaction_id,
CAST(NULL AS STRING) AS parent_job_id,
STRUCT(CAST(NULL AS STRING) AS session_id) AS session_info,
STRUCT(CAST(NULL AS INTEGER) AS inserted_row_count,
CAST(NULL AS INTEGER) AS deleted_row_count,
CAST(NULL AS INTEGER) AS updated_row_count) AS dml_statistics,
CAST(NULL AS INTEGER) AS total_modified_partitions,
STRUCT(CAST(NULL AS STRING) AS bi_engine_mode,
ARRAY(
SELECT
AS STRUCT CAST(NULL AS STRING) AS code, CAST(NULL AS STRING) AS message
FROM
UNNEST([STRUCT()])) AS bi_engine_reasons,
CAST(NULL AS STRING) AS acceleration_mode) AS bi_engine_statistics,
STRUCT(CAST(NULL AS STRING) AS resource_warning,
CAST(NULL AS JSON) AS optimization_details,
STRUCT(CAST(NULL AS STRING) AS normalized_literals) AS query_hashes,
STRUCT(CAST(NULL AS INTEGER) AS avg_previous_execution_ms,
ARRAY(
SELECT
AS STRUCT CAST(NULL AS INTEGER) AS stage_id, CAST(NULL AS BOOLEAN) AS slot_contention, CAST(NULL AS BOOLEAN) AS insufficient_shuffle_quota, ARRAY(
SELECT
AS STRUCT CAST(NULL AS STRING) AS code, CAST(NULL AS STRING) AS message
FROM
UNNEST([STRUCT()])) AS bi_engine_reasons, ARRAY(
SELECT
AS STRUCT CAST(NULL AS INTEGER) AS left_rows, CAST(NULL AS INTEGER) AS right_rows, CAST(NULL AS INTEGER) AS output_rows, CAST(NULL AS INTEGER) AS step_index
FROM
UNNEST([STRUCT()])) AS high_cardinality_joins
FROM
UNNEST([STRUCT()])) AS stage_performance_standalone_insights,
ARRAY(
SELECT
AS STRUCT CAST(NULL AS INTEGER) AS stage_id, STRUCT(CAST(NULL AS FLOAT64) AS records_read_diff_percentage) AS input_data_change
FROM
UNNEST([STRUCT()])) AS stage_performance_change_insights) AS performance_insights) AS query_info,
CAST(NULL AS INTEGER) AS transferred_bytes,
STRUCT(ARRAY(
SELECT
AS STRUCT STRUCT(CAST(NULL AS STRING) AS project_id,
CAST(NULL AS STRING) AS dataset_id,
CAST(NULL AS STRING) AS table_id) AS table_reference, CAST(NULL AS BOOLEAN) AS chosen, CAST(NULL AS INTEGER) AS estimated_bytes_saved, CAST(NULL AS STRING) AS rejected_reason
FROM
UNNEST([STRUCT()])) AS materialized_view) AS materialized_view_statistics,
CAST(NULL AS STRING) AS edition,
STRUCT(CAST(NULL AS STRING) AS code) AS job_creation_reason
どんな時に使えるか?
UNION ALLなどで、カラムが意図せず追加された場合
例えば、下記のようなテーブルがあったとする。
WITH sample_data_20240101 AS (
SELECT
"example_value" AS col1,
STRUCT(
"sub_value1" AS sub_col1,
"sub_value2" AS sub_col2
) AS record1
)
SELECT
col1,
record1
FROM
sample_data_20240101
このテーブルが毎日、TABLE_SUFFIXが出力日のテーブルがあるとして、
SELECT
col1,
record1
FROM
sample_data_all
UNION ALL
SELECT
col1,
record1
FROM
sample_data_*
WHERE
_TABLE_SUFFIX = CURRENT_DATE()
の実行結果を、再びsample_data_allに上書きするバッチ処理があったとする。
scheduled queryなどがよく用いられる。
次に、2024/2/1分のテーブルにおいて下記のようにカラムが追加されたとする。
sample_data_20240201 AS (
SELECT
"example_value" AS col1,
STRUCT(
"sub_value1" AS sub_col1,
"sub_value2" AS sub_col2,
"sub_value3" AS sub_col3 -- 追加されたカラム
) AS record1
)
SELECT
col1,
record1
FROM
sample_data_20240201
この時、バッチ処理のクエリはエラーになってしまう。UNION ALL時のカラムの構成が違うためである。
Column 2 in UNION ALL has incompatible types: STRUCT<sub_col1 STRING, sub_col2 STRING>, STRUCT<sub_col1 STRING, sub_col2 STRING, sub_col3 STRING> at [26:1]
このような場合に、リカバリ方法として上記のUDFが役立つ。
SELECT
col1,
STRUCT(
record1.sub_col1 AS sub_col1,
record1.sub_col2 AS sub_col2,
CAST(NULL AS STRING) AS sub_col3
) AS record1
FROM
sample_data_all
UNION ALL
SELECT
col1,
record1.sub_col1,
record1.sub_col2,
record1.sub_col3
FROM
sample_data_20240201
おわりに
バックアップなどを取り残したい場合、スキーマ構造を理解する必要があるが、自身がコントロールできないスキーマのテーブルは結構多い。
何度か「追加カラムをNULL埋めしたSELECT文」を考える機会があったので、このUDFを考えた。