概要
Databricks にて Azure SQL Dataabse のデータ型と対応する Spark Dataframe のデータ型の調査方法を共有します。Azure SQL Database にて一時テーブルを作成し、そのテーブルから Spark Dataframe を作成する方法を実施することで容易に確認できます。下記が Spark Dataframe のデータ型です。float(24) と float(25) で Spark のデータ型が変わるなどの仕様があり、すべてのデータ型の調査しているわけではないです。今後の調査を継続的に実施するために実施手順を共有します。
root
|-- bit_col: boolean (nullable = true)
|-- tinyint_col: integer (nullable = true)
|-- smallint_col: short (nullable = true)
|-- int_col: integer (nullable = true)
|-- bigint_col: long (nullable = true)
|-- numeric_col: decimal(18,0) (nullable = true)
|-- smallmoney_col: decimal(10,4) (nullable = true)
|-- money_col: decimal(19,4) (nullable = true)
|-- decimal_col: decimal(18,6) (nullable = true)
|-- float_24_col: float (nullable = true)
|-- float_25_col: double (nullable = true)
|-- float_53_col: double (nullable = true)
|-- real_col: float (nullable = true)
|-- char_col: string (nullable = true)
|-- varchar_col: string (nullable = true)
|-- text_col: string (nullable = true)
|-- nchar_col: string (nullable = true)
|-- nvarchar_col: string (nullable = true)
|-- ntext_col: string (nullable = true)
|-- date_col: date (nullable = true)
|-- datetimeoffset_col: string (nullable = true)
|-- datetime_col: timestamp (nullable = true)
|-- datetime2_col: timestamp (nullable = true)
|-- time_col: timestamp (nullable = true)
|-- binary_col: binary (nullable = true)
|-- varbinary_col: binary (nullable = true)
|-- image_col: binary (nullable = true)
|-- varchar_max_col: string (nullable = true)
|-- nvarchar_max_col: string (nullable = true)
|-- varbinary_max_col: binary (nullable = true)
|-- hierarchyid_col: binary (nullable = true)
|-- xml_col: string (nullable = true)
|-- uniqueidentifier_col: string (nullable = true)
検証方法
Azure SQL Database の接続情報を定義
server = "ads-01" ## サーバー名
database = "ads-01" ## データベース名
username = dbutils.secrets.get(scope="qiita", key="sql_authentificate_user") ## Azure SQL DB のユーザー名
password = dbutils.secrets.get(scope="qiita", key="sql_authentificate_password") ## Azure SQL DB のパスワード
Azure SQL Database にて一時テーブルを作成後に Spark Dataframe を作成
# CREATE TABLE の実行
sql = """
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'#create_tble_01') AND type in (N'U'))
DROP TABLE #create_tble_01
;
CREATE TABLE #create_tble_01
(
-- 正確な数値型 (Exact numerics)
bit_col BIT,
tinyint_col TINYINT,
smallint_col SMALLINT,
int_col INT,
bigint_col BIGINT,
numeric_col NUMERIC(18, 0),
smallmoney_col SMALLMONEY,
money_col MONEY,
decimal_col DECIMAL(18, 6),
-- 近似数値型 (Approximate numerics):
float_24_col FLOAT(24),
float_25_col FLOAT(25),
float_53_col FLOAT(53),
real_col real,
-- 文字列型 (Character strings):
char_col CHAR(10),
varchar_col VARCHAR(100),
text_col text,
-- Unicode 文字列型 (Unicode character strings)
nchar_col NCHAR(10),
nvarchar_col NVARCHAR(100),
ntext_col ntext,
-- 日付と時刻型 (Date and time):
date_col DATE,
datetimeoffset_col DATETIMEOFFSET,
datetime_col DATETIME,
datetime2_col DATETIME2,
time_col TIME,
-- バイナリ文字列型 (Binary strings):
binary_col BINARY(50),
varbinary_col VARBINARY(50),
image_col IMAGE,
-- 大きな値のデータ型 (Large value data types)
varchar_max_col VARCHAR(MAX),
nvarchar_max_col NVARCHAR(MAX),
-- 大きなオブジェクトのデータ型 (Large object data types)
varbinary_max_col VARBINARY(MAX),
-- その他のデータ型 (Other data types):
hierarchyid_col HIERARCHYID,
xml_col XML,
uniqueidentifier_col UNIQUEIDENTIFIER NOT NULL PRIMARY KEY DEFAULT NEWID()
);
"""
df = (
spark.read.format("sqlserver")
.option("host", f"{server}.database.windows.net")
.option("user", username)
.option("password", password)
.option("database", database)
.option("prepareQuery", sql)
.option("dbTable", "#create_tble_01")
.load()
)
df.printSchema()
root
|-- bit_col: boolean (nullable = true)
|-- tinyint_col: integer (nullable = true)
|-- smallint_col: short (nullable = true)
|-- int_col: integer (nullable = true)
|-- bigint_col: long (nullable = true)
|-- numeric_col: decimal(18,0) (nullable = true)
|-- smallmoney_col: decimal(10,4) (nullable = true)
|-- money_col: decimal(19,4) (nullable = true)
|-- decimal_col: decimal(18,6) (nullable = true)
|-- float_24_col: float (nullable = true)
|-- float_25_col: double (nullable = true)
|-- float_53_col: double (nullable = true)
|-- real_col: float (nullable = true)
|-- char_col: string (nullable = true)
|-- varchar_col: string (nullable = true)
|-- text_col: string (nullable = true)
|-- nchar_col: string (nullable = true)
|-- nvarchar_col: string (nullable = true)
|-- ntext_col: string (nullable = true)
|-- date_col: date (nullable = true)
|-- datetimeoffset_col: string (nullable = true)
|-- datetime_col: timestamp (nullable = true)
|-- datetime2_col: timestamp (nullable = true)
|-- time_col: timestamp (nullable = true)
|-- binary_col: binary (nullable = true)
|-- varbinary_col: binary (nullable = true)
|-- image_col: binary (nullable = true)
|-- varchar_max_col: string (nullable = true)
|-- nvarchar_max_col: string (nullable = true)
|-- varbinary_max_col: binary (nullable = true)
|-- hierarchyid_col: binary (nullable = true)
|-- xml_col: string (nullable = true)
|-- uniqueidentifier_col: string (nullable = true)