6
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

ZOZOAdvent Calendar 2024

Day 24

[BigQuery] JSON SchemaからSELECT文を生成するUDF

Posted at

はじめに

🎄 本記事は ZOZO Advent Calendar 2024 シリーズ4の24日目です

BigQueryでは、下記の画像のようにテーブルのスキーマ情報をJSONで出力できる。

BigQuery_–zozo-datapool-prd–_Google_Cloud_console.png
(画像はJOBS_BY_USERの例)

簡単なテーブル場合は、自身でカラム名を列挙すれば問題ないが、RECORD型を多く持っていたりすると、SELECTのカラム列挙時に苦労する。

Audit Logなどはカラム数が多すぎて、ネストした値までSELECTするのが大変。

今回は、JSON Schemaを引数として渡すと、SELECT文を生成するUDFを紹介する。

UDFの作成

再帰的な処理が必要なため、Javascript UDFを用いた。

CREATE TEMPORARY FUNCTION generate_select(schema STRING, is_null BOOL)
RETURNS STRING
LANGUAGE js AS """
function generateSQL(schema_json, is_null) {
    function processField(field, parent) {
        const name = parent ? parent + '.' + field.name : field.name;
        const nullValue = field.type === 'FLOAT' ? 'CAST(NULL AS FLOAT64)':
                          `CAST(NULL AS ${field.type})`;
        if (field.type === 'RECORD') {
            let fields = field.fields.map(f => processField(f, field.mode === 'REPEATED' ? field.name + '_alias' : name)).join(', ');
            if (field.mode === 'REPEATED') {
                if (is_null) {
                    return `ARRAY(SELECT AS STRUCT ${fields} FROM UNNEST([STRUCT()])) AS ${field.name}`;
                } else {
                    return `ARRAY(SELECT AS STRUCT ${fields} FROM UNNEST(${name}) AS ${field.name}_alias) AS ${field.name}`;
                }
            } else {
                return is_null ? `STRUCT(${fields}) AS ${field.name}` : `STRUCT(${fields}) AS ${field.name}`;
            }
        } else if (field.mode === 'REPEATED') {
            if (is_null) {
                return `ARRAY<${field.type}>[] AS ${field.name}`;
            } else {
                return `${name} AS ${field.name}`;
            }
        } else {
            return is_null ? `${nullValue} AS ${field.name}` : `${name} AS ${field.name}`;
        }
    }
    const selectFields = schema_json.map(f => processField(f, null)).join(', ');
    return `SELECT ${selectFields}`;
}
const schema_json = JSON.parse(schema);
return generateSQL(schema_json, is_null);

第二引数のis_nulltrueを渡せば、NULL埋めした結果を表示する。
falseを渡せば、そのままカラム名を指定するので、FROM句に参照したいテーブルを指定すればよい。

実行例

下記はJOBS_BY_USERを用いた場合の例。

実行例
CREATE TEMPORARY FUNCTION generate_select(schema STRING, is_null BOOL)
RETURNS STRING
LANGUAGE js AS """
function generateSQL(schema_json, is_null) {
    function processField(field, parent) {
        const name = parent ? parent + '.' + field.name : field.name;
        const nullValue = field.type === 'FLOAT' ? 'CAST(NULL AS FLOAT64)':
                          `CAST(NULL AS ${field.type})`;
        if (field.type === 'RECORD') {
            let fields = field.fields.map(f => processField(f, field.mode === 'REPEATED' ? field.name + '_alias' : name)).join(', ');
            if (field.mode === 'REPEATED') {
                if (is_null) {
                    return `ARRAY(SELECT AS STRUCT ${fields} FROM UNNEST([STRUCT()])) AS ${field.name}`;
                } else {
                    return `ARRAY(SELECT AS STRUCT ${fields} FROM UNNEST(${name}) AS ${field.name}_alias) AS ${field.name}`;
                }
            } else {
                return is_null ? `STRUCT(${fields}) AS ${field.name}` : `STRUCT(${fields}) AS ${field.name}`;
            }
        } else if (field.mode === 'REPEATED') {
            if (is_null) {
                return `ARRAY<${field.type}>[] AS ${field.name}`;
            } else {
                return `${name} AS ${field.name}`;
            }
        } else {
            return is_null ? `${nullValue} AS ${field.name}` : `${name} AS ${field.name}`;
        }
    }
    const selectFields = schema_json.map(f => processField(f, null)).join(', ');
    return `SELECT ${selectFields}`;
}
const schema_json = JSON.parse(schema);
return generateSQL(schema_json, is_null);
""";

WITH schema_json AS (
    SELECT '''
      [
  {
    "name": "creation_time",
    "mode": "NULLABLE",
    "type": "TIMESTAMP",
    "description": "",
    "fields": []
  },
  {
    "name": "project_id",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "project_number",
    "mode": "NULLABLE",
    "type": "INTEGER",
    "description": "",
    "fields": []
  },
  {
    "name": "user_email",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "job_id",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "job_type",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "statement_type",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "priority",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "start_time",
    "mode": "NULLABLE",
    "type": "TIMESTAMP",
    "description": "",
    "fields": []
  },
  {
    "name": "end_time",
    "mode": "NULLABLE",
    "type": "TIMESTAMP",
    "description": "",
    "fields": []
  },
  {
    "name": "query",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "state",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "reservation_id",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "total_bytes_processed",
    "mode": "NULLABLE",
    "type": "INTEGER",
    "description": "",
    "fields": []
  },
  {
    "name": "total_slot_ms",
    "mode": "NULLABLE",
    "type": "INTEGER",
    "description": "",
    "fields": []
  },
  {
    "name": "error_result",
    "mode": "NULLABLE",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "reason",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "location",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "debug_info",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "message",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      }
    ]
  },
  {
    "name": "cache_hit",
    "mode": "NULLABLE",
    "type": "BOOLEAN",
    "description": "",
    "fields": []
  },
  {
    "name": "destination_table",
    "mode": "NULLABLE",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "project_id",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "dataset_id",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "table_id",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      }
    ]
  },
  {
    "name": "referenced_tables",
    "mode": "REPEATED",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "project_id",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "dataset_id",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "table_id",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      }
    ]
  },
  {
    "name": "labels",
    "mode": "REPEATED",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "key",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "value",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      }
    ]
  },
  {
    "name": "timeline",
    "mode": "REPEATED",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "elapsed_ms",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "total_slot_ms",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "pending_units",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "completed_units",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "active_units",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "estimated_runnable_units",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      }
    ]
  },
  {
    "name": "job_stages",
    "mode": "REPEATED",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "name",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "id",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "start_ms",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "end_ms",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "input_stages",
        "mode": "REPEATED",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "wait_ratio_avg",
        "mode": "NULLABLE",
        "type": "FLOAT",
        "description": "",
        "fields": []
      },
      {
        "name": "wait_ms_avg",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "wait_ratio_max",
        "mode": "NULLABLE",
        "type": "FLOAT",
        "description": "",
        "fields": []
      },
      {
        "name": "wait_ms_max",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "read_ratio_avg",
        "mode": "NULLABLE",
        "type": "FLOAT",
        "description": "",
        "fields": []
      },
      {
        "name": "read_ms_avg",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "read_ratio_max",
        "mode": "NULLABLE",
        "type": "FLOAT",
        "description": "",
        "fields": []
      },
      {
        "name": "read_ms_max",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "compute_ratio_avg",
        "mode": "NULLABLE",
        "type": "FLOAT",
        "description": "",
        "fields": []
      },
      {
        "name": "compute_ms_avg",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "compute_ratio_max",
        "mode": "NULLABLE",
        "type": "FLOAT",
        "description": "",
        "fields": []
      },
      {
        "name": "compute_ms_max",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "write_ratio_avg",
        "mode": "NULLABLE",
        "type": "FLOAT",
        "description": "",
        "fields": []
      },
      {
        "name": "write_ms_avg",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "write_ratio_max",
        "mode": "NULLABLE",
        "type": "FLOAT",
        "description": "",
        "fields": []
      },
      {
        "name": "write_ms_max",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "shuffle_output_bytes",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "shuffle_output_bytes_spilled",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "records_read",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "records_written",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "parallel_inputs",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "completed_parallel_inputs",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "status",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "steps",
        "mode": "REPEATED",
        "type": "RECORD",
        "description": "",
        "fields": [
          {
            "name": "kind",
            "mode": "NULLABLE",
            "type": "STRING",
            "description": "",
            "fields": []
          },
          {
            "name": "substeps",
            "mode": "REPEATED",
            "type": "STRING",
            "description": "",
            "fields": []
          }
        ]
      },
      {
        "name": "slot_ms",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "compute_mode",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      }
    ]
  },
  {
    "name": "total_bytes_billed",
    "mode": "NULLABLE",
    "type": "INTEGER",
    "description": "",
    "fields": []
  },
  {
    "name": "transaction_id",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "parent_job_id",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "session_info",
    "mode": "NULLABLE",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "session_id",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      }
    ]
  },
  {
    "name": "dml_statistics",
    "mode": "NULLABLE",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "inserted_row_count",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "deleted_row_count",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      },
      {
        "name": "updated_row_count",
        "mode": "NULLABLE",
        "type": "INTEGER",
        "description": "",
        "fields": []
      }
    ]
  },
  {
    "name": "total_modified_partitions",
    "mode": "NULLABLE",
    "type": "INTEGER",
    "description": "",
    "fields": []
  },
  {
    "name": "bi_engine_statistics",
    "mode": "NULLABLE",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "bi_engine_mode",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "bi_engine_reasons",
        "mode": "REPEATED",
        "type": "RECORD",
        "description": "",
        "fields": [
          {
            "name": "code",
            "mode": "NULLABLE",
            "type": "STRING",
            "description": "",
            "fields": []
          },
          {
            "name": "message",
            "mode": "NULLABLE",
            "type": "STRING",
            "description": "",
            "fields": []
          }
        ]
      },
      {
        "name": "acceleration_mode",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      }
    ]
  },
  {
    "name": "query_info",
    "mode": "NULLABLE",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "resource_warning",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      },
      {
        "name": "optimization_details",
        "mode": "NULLABLE",
        "type": "JSON",
        "description": "",
        "fields": []
      },
      {
        "name": "query_hashes",
        "mode": "NULLABLE",
        "type": "RECORD",
        "description": "",
        "fields": [
          {
            "name": "normalized_literals",
            "mode": "NULLABLE",
            "type": "STRING",
            "description": "",
            "fields": []
          }
        ]
      },
      {
        "name": "performance_insights",
        "mode": "NULLABLE",
        "type": "RECORD",
        "description": "",
        "fields": [
          {
            "name": "avg_previous_execution_ms",
            "mode": "NULLABLE",
            "type": "INTEGER",
            "description": "",
            "fields": []
          },
          {
            "name": "stage_performance_standalone_insights",
            "mode": "REPEATED",
            "type": "RECORD",
            "description": "",
            "fields": [
              {
                "name": "stage_id",
                "mode": "NULLABLE",
                "type": "INTEGER",
                "description": "",
                "fields": []
              },
              {
                "name": "slot_contention",
                "mode": "NULLABLE",
                "type": "BOOLEAN",
                "description": "",
                "fields": []
              },
              {
                "name": "insufficient_shuffle_quota",
                "mode": "NULLABLE",
                "type": "BOOLEAN",
                "description": "",
                "fields": []
              },
              {
                "name": "bi_engine_reasons",
                "mode": "REPEATED",
                "type": "RECORD",
                "description": "",
                "fields": [
                  {
                    "name": "code",
                    "mode": "NULLABLE",
                    "type": "STRING",
                    "description": "",
                    "fields": []
                  },
                  {
                    "name": "message",
                    "mode": "NULLABLE",
                    "type": "STRING",
                    "description": "",
                    "fields": []
                  }
                ]
              },
              {
                "name": "high_cardinality_joins",
                "mode": "REPEATED",
                "type": "RECORD",
                "description": "",
                "fields": [
                  {
                    "name": "left_rows",
                    "mode": "NULLABLE",
                    "type": "INTEGER",
                    "description": "",
                    "fields": []
                  },
                  {
                    "name": "right_rows",
                    "mode": "NULLABLE",
                    "type": "INTEGER",
                    "description": "",
                    "fields": []
                  },
                  {
                    "name": "output_rows",
                    "mode": "NULLABLE",
                    "type": "INTEGER",
                    "description": "",
                    "fields": []
                  },
                  {
                    "name": "step_index",
                    "mode": "NULLABLE",
                    "type": "INTEGER",
                    "description": "",
                    "fields": []
                  }
                ]
              }
            ]
          },
          {
            "name": "stage_performance_change_insights",
            "mode": "REPEATED",
            "type": "RECORD",
            "description": "",
            "fields": [
              {
                "name": "stage_id",
                "mode": "NULLABLE",
                "type": "INTEGER",
                "description": "",
                "fields": []
              },
              {
                "name": "input_data_change",
                "mode": "NULLABLE",
                "type": "RECORD",
                "description": "",
                "fields": [
                  {
                    "name": "records_read_diff_percentage",
                    "mode": "NULLABLE",
                    "type": "FLOAT",
                    "description": "",
                    "fields": []
                  }
                ]
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "name": "transferred_bytes",
    "mode": "NULLABLE",
    "type": "INTEGER",
    "description": "",
    "fields": []
  },
  {
    "name": "materialized_view_statistics",
    "mode": "NULLABLE",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "materialized_view",
        "mode": "REPEATED",
        "type": "RECORD",
        "description": "",
        "fields": [
          {
            "name": "table_reference",
            "mode": "NULLABLE",
            "type": "RECORD",
            "description": "",
            "fields": [
              {
                "name": "project_id",
                "mode": "NULLABLE",
                "type": "STRING",
                "description": "",
                "fields": []
              },
              {
                "name": "dataset_id",
                "mode": "NULLABLE",
                "type": "STRING",
                "description": "",
                "fields": []
              },
              {
                "name": "table_id",
                "mode": "NULLABLE",
                "type": "STRING",
                "description": "",
                "fields": []
              }
            ]
          },
          {
            "name": "chosen",
            "mode": "NULLABLE",
            "type": "BOOLEAN",
            "description": "",
            "fields": []
          },
          {
            "name": "estimated_bytes_saved",
            "mode": "NULLABLE",
            "type": "INTEGER",
            "description": "",
            "fields": []
          },
          {
            "name": "rejected_reason",
            "mode": "NULLABLE",
            "type": "STRING",
            "description": "",
            "fields": []
          }
        ]
      }
    ]
  },
  {
    "name": "edition",
    "mode": "NULLABLE",
    "type": "STRING",
    "description": "",
    "fields": []
  },
  {
    "name": "job_creation_reason",
    "mode": "NULLABLE",
    "type": "RECORD",
    "description": "",
    "fields": [
      {
        "name": "code",
        "mode": "NULLABLE",
        "type": "STRING",
        "description": "",
        "fields": []
      }
    ]
  }
]
    ''' AS json_schema
)

SELECT generate_select(json_schema, true) AS generated_query
FROM schema_json;

生成されたSELECT文。


SELECT
  CAST(NULL AS TIMESTAMP) AS creation_time,
  CAST(NULL AS STRING) AS project_id,
  CAST(NULL AS INTEGER) AS project_number,
  CAST(NULL AS STRING) AS user_email,
  CAST(NULL AS STRING) AS job_id,
  CAST(NULL AS STRING) AS job_type,
  CAST(NULL AS STRING) AS statement_type,
  CAST(NULL AS STRING) AS priority,
  CAST(NULL AS TIMESTAMP) AS start_time,
  CAST(NULL AS TIMESTAMP) AS end_time,
  CAST(NULL AS STRING) AS query,
  CAST(NULL AS STRING) AS state,
  CAST(NULL AS STRING) AS reservation_id,
  CAST(NULL AS INTEGER) AS total_bytes_processed,
  CAST(NULL AS INTEGER) AS total_slot_ms,
  STRUCT(CAST(NULL AS STRING) AS reason,
    CAST(NULL AS STRING) AS location,
    CAST(NULL AS STRING) AS debug_info,
    CAST(NULL AS STRING) AS message) AS error_result,
  CAST(NULL AS BOOLEAN) AS cache_hit,
  STRUCT(CAST(NULL AS STRING) AS project_id,
    CAST(NULL AS STRING) AS dataset_id,
    CAST(NULL AS STRING) AS table_id) AS destination_table,
  ARRAY(
  SELECT
    AS STRUCT CAST(NULL AS STRING) AS project_id, CAST(NULL AS STRING) AS dataset_id, CAST(NULL AS STRING) AS table_id
  FROM
    UNNEST([STRUCT()])) AS referenced_tables,
  ARRAY(
  SELECT
    AS STRUCT CAST(NULL AS STRING) AS KEY, CAST(NULL AS STRING) AS value
  FROM
    UNNEST([STRUCT()])) AS labels,
  ARRAY(
  SELECT
    AS STRUCT CAST(NULL AS INTEGER) AS elapsed_ms, CAST(NULL AS INTEGER) AS total_slot_ms, CAST(NULL AS INTEGER) AS pending_units, CAST(NULL AS INTEGER) AS completed_units, CAST(NULL AS INTEGER) AS active_units, CAST(NULL AS INTEGER) AS estimated_runnable_units
  FROM
    UNNEST([STRUCT()])) AS timeline,
  ARRAY(
  SELECT
    AS STRUCT CAST(NULL AS STRING) AS name, CAST(NULL AS INTEGER) AS id, CAST(NULL AS INTEGER) AS start_ms, CAST(NULL AS INTEGER) AS end_ms, ARRAY<INTEGER>[] AS input_stages, CAST(NULL AS FLOAT64) AS wait_ratio_avg, CAST(NULL AS INTEGER) AS wait_ms_avg, CAST(NULL AS FLOAT64) AS wait_ratio_max, CAST(NULL AS INTEGER) AS wait_ms_max, CAST(NULL AS FLOAT64) AS read_ratio_avg, CAST(NULL AS INTEGER) AS read_ms_avg, CAST(NULL AS FLOAT64) AS read_ratio_max, CAST(NULL AS INTEGER) AS read_ms_max, CAST(NULL AS FLOAT64) AS compute_ratio_avg, CAST(NULL AS INTEGER) AS compute_ms_avg, CAST(NULL AS FLOAT64) AS compute_ratio_max, CAST(NULL AS INTEGER) AS compute_ms_max, CAST(NULL AS FLOAT64) AS write_ratio_avg, CAST(NULL AS INTEGER) AS write_ms_avg, CAST(NULL AS FLOAT64) AS write_ratio_max, CAST(NULL AS INTEGER) AS write_ms_max, CAST(NULL AS INTEGER) AS shuffle_output_bytes, CAST(NULL AS INTEGER) AS shuffle_output_bytes_spilled, CAST(NULL AS INTEGER) AS records_read, CAST(NULL AS INTEGER) AS records_written, CAST(NULL AS INTEGER) AS parallel_inputs, CAST(NULL AS INTEGER) AS completed_parallel_inputs, CAST(NULL AS STRING) AS status, ARRAY(
    SELECT
      AS STRUCT CAST(NULL AS STRING) AS kind, ARRAY<STRING>[] AS substeps
    FROM
      UNNEST([STRUCT()])) AS steps, CAST(NULL AS INTEGER) AS slot_ms, CAST(NULL AS STRING) AS compute_mode
  FROM
    UNNEST([STRUCT()])) AS job_stages,
  CAST(NULL AS INTEGER) AS total_bytes_billed,
  CAST(NULL AS STRING) AS transaction_id,
  CAST(NULL AS STRING) AS parent_job_id,
  STRUCT(CAST(NULL AS STRING) AS session_id) AS session_info,
  STRUCT(CAST(NULL AS INTEGER) AS inserted_row_count,
    CAST(NULL AS INTEGER) AS deleted_row_count,
    CAST(NULL AS INTEGER) AS updated_row_count) AS dml_statistics,
  CAST(NULL AS INTEGER) AS total_modified_partitions,
  STRUCT(CAST(NULL AS STRING) AS bi_engine_mode,
    ARRAY(
    SELECT
      AS STRUCT CAST(NULL AS STRING) AS code, CAST(NULL AS STRING) AS message
    FROM
      UNNEST([STRUCT()])) AS bi_engine_reasons,
    CAST(NULL AS STRING) AS acceleration_mode) AS bi_engine_statistics,
  STRUCT(CAST(NULL AS STRING) AS resource_warning,
    CAST(NULL AS JSON) AS optimization_details,
    STRUCT(CAST(NULL AS STRING) AS normalized_literals) AS query_hashes,
    STRUCT(CAST(NULL AS INTEGER) AS avg_previous_execution_ms,
      ARRAY(
      SELECT
        AS STRUCT CAST(NULL AS INTEGER) AS stage_id, CAST(NULL AS BOOLEAN) AS slot_contention, CAST(NULL AS BOOLEAN) AS insufficient_shuffle_quota, ARRAY(
        SELECT
          AS STRUCT CAST(NULL AS STRING) AS code, CAST(NULL AS STRING) AS message
        FROM
          UNNEST([STRUCT()])) AS bi_engine_reasons, ARRAY(
        SELECT
          AS STRUCT CAST(NULL AS INTEGER) AS left_rows, CAST(NULL AS INTEGER) AS right_rows, CAST(NULL AS INTEGER) AS output_rows, CAST(NULL AS INTEGER) AS step_index
        FROM
          UNNEST([STRUCT()])) AS high_cardinality_joins
      FROM
        UNNEST([STRUCT()])) AS stage_performance_standalone_insights,
      ARRAY(
      SELECT
        AS STRUCT CAST(NULL AS INTEGER) AS stage_id, STRUCT(CAST(NULL AS FLOAT64) AS records_read_diff_percentage) AS input_data_change
      FROM
        UNNEST([STRUCT()])) AS stage_performance_change_insights) AS performance_insights) AS query_info,
  CAST(NULL AS INTEGER) AS transferred_bytes,
  STRUCT(ARRAY(
    SELECT
      AS STRUCT STRUCT(CAST(NULL AS STRING) AS project_id,
        CAST(NULL AS STRING) AS dataset_id,
        CAST(NULL AS STRING) AS table_id) AS table_reference, CAST(NULL AS BOOLEAN) AS chosen, CAST(NULL AS INTEGER) AS estimated_bytes_saved, CAST(NULL AS STRING) AS rejected_reason
    FROM
      UNNEST([STRUCT()])) AS materialized_view) AS materialized_view_statistics,
  CAST(NULL AS STRING) AS edition,
  STRUCT(CAST(NULL AS STRING) AS code) AS job_creation_reason

どんな時に使えるか?

UNION ALLなどで、カラムが意図せず追加された場合

例えば、下記のようなテーブルがあったとする。

WITH sample_data_20240101 AS (
  SELECT
    "example_value" AS col1,
    STRUCT(
      "sub_value1" AS sub_col1,
      "sub_value2" AS sub_col2
    ) AS record1
)

SELECT
  col1,
  record1
FROM
  sample_data_20240101

このテーブルが毎日、TABLE_SUFFIXが出力日のテーブルがあるとして、

SELECT
    col1,
    record1
FROM
    sample_data_all
UNION ALL
SELECT
    col1,
    record1
FROM
    sample_data_*
WHERE
    _TABLE_SUFFIX = CURRENT_DATE()

の実行結果を、再びsample_data_allに上書きするバッチ処理があったとする。
scheduled queryなどがよく用いられる。

次に、2024/2/1分のテーブルにおいて下記のようにカラムが追加されたとする。

sample_data_20240201 AS (
  SELECT
    "example_value" AS col1,
    STRUCT(
      "sub_value1" AS sub_col1,
      "sub_value2" AS sub_col2,
      "sub_value3" AS sub_col3    -- 追加されたカラム
    ) AS record1
)

SELECT
  col1,
  record1
FROM
  sample_data_20240201

この時、バッチ処理のクエリはエラーになってしまう。UNION ALL時のカラムの構成が違うためである。

Column 2 in UNION ALL has incompatible types: STRUCT<sub_col1 STRING, sub_col2 STRING>, STRUCT<sub_col1 STRING, sub_col2 STRING, sub_col3 STRING> at [26:1]

このような場合に、リカバリ方法として上記のUDFが役立つ。

SELECT
    col1,
    STRUCT(
        record1.sub_col1 AS sub_col1,
        record1.sub_col2 AS sub_col2,
        CAST(NULL AS STRING) AS sub_col3
    ) AS record1
FROM
    sample_data_all
UNION ALL
SELECT
    col1,
    record1.sub_col1,
    record1.sub_col2,
    record1.sub_col3
FROM
  sample_data_20240201

おわりに

バックアップなどを取り残したい場合、スキーマ構造を理解する必要があるが、自身がコントロールできないスキーマのテーブルは結構多い。
何度か「追加カラムをNULL埋めしたSELECT文」を考える機会があったので、このUDFを考えた。

6
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
6
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?