11
3

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 5 years have passed since last update.

カーディナリティの低いカラムに作成したインデックスの有効性について

Posted at

概要

[MySQL(InnoDB)でカーディナリティの低いカラムにINDEXを張る] (https://qiita.com/hmatsu47/items/2d44c173a9114fd06853)という記事で詳しく解説されています(参考にさせて頂きました)が、仕事上で実際に確認してみる必要が出てきましたので、この記事の方法で確認してみました。

環境

  • Windows 10 Professional
  • MySQL 8.0.13 Community Edition

参考

サンプルデータ

userテーブル

DROP TABLE IF EXISTS user;
CREATE TABLE user (
  id INT AUTO_INCREMENT                       COMMENT 'ユーザーID',
  nick_name VARCHAR(60) NOT NULL              COMMENT 'ニックネーム',
  sex CHAR(1) NOT NULL                        COMMENT '性別 M:男性 F:女性',
  prefecture_id TINYINT(1) NOT NULL DEFAULT 0 COMMENT '地方ID 0:不明、1:北海道 2:東北 - 8:九州・沖縄',
  email VARCHAR(120)                          COMMENT 'メールアドレス',
  create_at DATETIME NOT NULL DEFAULT NOW(),
  update_at DATETIME NOT NULL DEFAULT NOW(),
  PRIMARY KEY (id)
)
ENGINE = INNODB
DEFAULT CHARSET = UTF8MB4
COMMENT = 'ユーザーテーブル';

index

CREATE INDEX idx_sex ON user (sex) USING BTREE;
CREATE INDEX idx_pref ON user (prefecture_id) USING BTREE;

テストデータ作成

プロシージャで700万件のテストデータを作成します。性別と地方IDカラムはカーディナリティが低く、また特定の値に偏るようにしています。

DELIMITER //

/*******************************************************************************
 * ランダムで性別を返す関数
 * 割合は男性が98%、女性が2%という想定。
 *
 ******************************************************************************/
SELECT 'create function v3_func_get_sex' as 'progress'//

DROP FUNCTION IF EXISTS v3_func_get_sex//
SHOW WARNINGS//

CREATE FUNCTION v3_func_get_sex() RETURNS CHAR(1)
DETERMINISTIC NO SQL
BEGIN
  DECLARE n INT;
  DECLARE v_sex CHAR(1);

  /* ランダムで性別を選択 (0-99) */
  SET n = FLOOR(RAND() * 100);

  IF n < 1 THEN
      SET v_sex = 'F';
  ELSE
      SET v_sex = 'M';
  END IF;

  RETURN v_sex;

END;
//
SHOW WARNINGS//


/*******************************************************************************
 * ランダムでメールアドレスを返す関数
 *
 *******************************************************************************/
SELECT 'create function v3_func_get_email' as 'progress'//

DROP FUNCTION IF EXISTS v3_func_get_email//
SHOW WARNINGS//

CREATE FUNCTION v3_func_get_email() RETURNS VARCHAR(120)
DETERMINISTIC NO SQL
BEGIN
  DECLARE n INT;
  DECLARE v_email VARCHAR(120);
  DECLARE v_domain VARCHAR(30);

  /* ランダムでドメインを選択 (0-6) */
  SET n = FLOOR(RAND() * 7);

  CASE n
    WHEN 0 THEN
      SET v_domain = 'example.jp';
    WHEN 1 THEN
      SET v_domain = 'example.co.jp';
    WHEN 2 THEN
      SET v_domain = 'example.ne.jp';
    WHEN 3 THEN
      SET v_domain = 'example.com';
    WHEN 4 THEN
      SET v_domain = 'example.net';
    WHEN 5 THEN
      SET v_domain = 'example.info';
    ELSE
      SET v_domain = 'example.biz';
  END CASE;

  SET v_email = CONCAT(SUBSTRING(MD5(RAND()), 1, 4), '.', SUBSTRING(MD5(RAND()), 1, 5), '@', v_domain);

  RETURN v_email;

END;
//
SHOW WARNINGS//


/*******************************************************************************
 * ランダムで地方を返す関数
 * 
 ******************************************************************************/
SELECT 'create function v3_func_get_prefecture' as 'progress'//

DROP FUNCTION IF EXISTS v3_func_get_prefecture//
SHOW WARNINGS//

CREATE FUNCTION v3_func_get_prefecture() RETURNS TINYINT(1)
DETERMINISTIC NO SQL
BEGIN
  DECLARE n INT;
  DECLARE v_prefecture_id TINYINT(1);

  /* 0-999 */
  SET n = FLOOR(RAND() * 1000);

  /*
   * 1:北海道 2%
   * 2:東北   4%
   * 3:関東  70%
   * 4:中部   5%
   * 5:近畿   9%
   * 6:中国   3%
   * 7:四国   2%
   * 8:九州・沖縄 5%
   */

  IF n >= 0 AND n <= 19 THEN
    /* 北海道 */
    SET v_prefecture_id = 1;

  ELSEIF n >= 20 AND n <= 59 THEN
    /* 東北地方 */
    SET v_prefecture_id = 2;

  ELSEIF n >= 60 AND n <= 759 THEN
    /* 関東地方 */
    SET v_prefecture_id = 3;

  ELSEIF n >= 760 AND n <= 809 THEN
    /* 中部地方 */
    SET v_prefecture_id = 4;

  ELSEIF n >= 810 AND n <= 899 THEN
    /* 近畿地方 */
    SET v_prefecture_id = 5;

  ELSEIF n >= 900 AND n <= 929 THEN
    /* 中国地方 */
    SET v_prefecture_id = 6;

  ELSEIF n >= 930 AND n <= 949 THEN
    /* 四国地方 */
    SET v_prefecture_id = 7;

  ELSE
    /* 九州・沖縄地方 */
    SET v_prefecture_id = 8;

  END IF;

  RETURN v_prefecture_id;

END;
//
SHOW WARNINGS//


/*******************************************************************************
 * テストデータを生成するプロシージャ
 *
 * call v3_proc_create_test_user();
 *
 ******************************************************************************/
SELECT 'create procedure v3_proc_create_test_user' as 'progress'//

DROP PROCEDURE IF EXISTS v3_proc_create_test_user//
SHOW WARNINGS//

CREATE PROCEDURE v3_proc_create_test_user()
BEGIN
  DECLARE done INT DEFAULT 0;

  DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1;

  /**
   * テストデータ初期化
   */
  BEGIN
    SET foreign_key_checks = 0;
    TRUNCATE TABLE user;
    SET foreign_key_checks = 1;
  END;

  SELECT NOW() AS "START CREATE USER DATA";

  create_user: BEGIN
    DECLARE v_nick_name VARCHAR(60);
    DECLARE v_sex CHAR(1);
    DECLARE v_prefecture_id TINYINT(1);
    DECLARE v_email VARCHAR(120);

    DECLARE v_max_user, v_cnt_user INT;

    START TRANSACTION;

    /* テストユーザーの作成件数 */
    SET v_max_user = 7000000;
    SET v_cnt_user = 0;

    WHILE v_max_user > v_cnt_user DO

      /* ユーザー名をランダムで決定 */
      SET v_nick_name = CONCAT('TESTUSER-', SUBSTRING(MD5(RAND()), 1, 20));
      /* 性別をランダムで決定 */
      SET v_sex = v3_func_get_sex();
      /* メールアドレスをランダムで決定 */
      SET v_email = v3_func_get_email();
      /* 住所をランダムで決定 */
      SET v_prefecture_id = v3_func_get_prefecture();

      INSERT INTO user (
        nick_name,
        sex,
        prefecture_id,
        email)
      VALUES (
        v_nick_name,
        v_sex,
        v_prefecture_id,
        v_email);

      /* 1000件ごとにcommit */
      IF v_cnt_user % 1000 = 0 THEN
        COMMIT;
        START TRANSACTION;
      END IF;

      SET v_cnt_user = v_cnt_user + 1;
    END WHILE;

    COMMIT;

  END create_user;

  SELECT NOW() AS "END CREATE USER DATA";

END;
//

SHOW WARNINGS//

DELIMITER ;

コンパイルと実行

> source v3_proc_create_test_user.sql

> call v3_proc_create_test_user();

作成したデータ件数を確認

> select count(*) from user;
+----------+
| count(*) |
+----------+
|  7000000 |
+----------+
1 row in set (1.36 sec)

カーディナリティの確認

> show index from user\G;
*************************** 1. row ***************************
        Table: user
   Non_unique: 0
     Key_name: PRIMARY
 Seq_in_index: 1
  Column_name: id
    Collation: A
  Cardinality: 6429148
     Sub_part: NULL
       Packed: NULL
         Null:
   Index_type: BTREE
      Comment:
Index_comment:
      Visible: YES
   Expression: NULL
*************************** 2. row ***************************
        Table: user
   Non_unique: 1
     Key_name: idx_sex
 Seq_in_index: 1
  Column_name: sex
    Collation: A
  Cardinality: 1
     Sub_part: NULL
       Packed: NULL
         Null:
   Index_type: BTREE
      Comment:
Index_comment:
      Visible: YES
   Expression: NULL
*************************** 3. row ***************************
        Table: user
   Non_unique: 1
     Key_name: idx_pref
 Seq_in_index: 1
  Column_name: prefecture_id
    Collation: A
  Cardinality: 7
     Sub_part: NULL
       Packed: NULL
         Null:
   Index_type: BTREE
      Comment:
Index_comment:
      Visible: YES
   Expression: NULL
3 rows in set (0.25 sec)

ERROR:
No query specified

インデックスの効果を確認

インデックス有りの場合

プロファイルを有効にします。

> set profiling = 1;
Query OK, 0 rows affected, 1 warning (0.00 sec)

Warning (Code 1287): '@@profiling' is deprecated and will be removed in a future release.
> select count(*) from user where sex = 'F';
+----------+
| count(*) |
+----------+
|    69373 |
+----------+
1 row in set (0.07 sec)

> select count(*) from user where sex = 'M';
+----------+
| count(*) |
+----------+
|  6930627 |
+----------+
1 row in set (4.49 sec)
> explain select count(*) from user where sex = 'F';
+----+-------------+-------+------------+------+---------------+---------+---------+-------+--------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key     | key_len | ref   | rows   | filtered | Extra       |
+----+-------------+-------+------------+------+---------------+---------+---------+-------+--------+----------+-------------+
|  1 | SIMPLE      | user  | NULL       | ref  | idx_sex       | idx_sex | 4       | const | 136944 |   100.00 | Using index |
+----+-------------+-------+------------+------+---------------+---------+---------+-------+--------+----------+-------------+
1 row in set, 1 warning (0.02 sec)

> explain select count(*) from user where sex = 'M';
+----+-------------+-------+------------+------+---------------+---------+---------+-------+---------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key     | key_len | ref   | rows    | filtered | Extra       |
+----+-------------+-------+------------+------+---------------+---------+---------+-------+---------+----------+-------------+
|  1 | SIMPLE      | user  | NULL       | ref  | idx_sex       | idx_sex | 4       | const | 3214799 |   100.00 | Using index |
+----+-------------+-------+------------+------+---------------+---------+---------+-------+---------+----------+-------------+
1 row in set, 1 warning (0.01 sec)
> show profiles;
+----------+-------------+-----------------------------------------------------------+
| Query_ID | Duration    | Query                                                     |
+----------+-------------+-----------------------------------------------------------+
|        1 |  0.07124875 | select count(*) from user where sex = 'F'                 |
|        2 |  4.49132500 | select count(*) from user where sex = 'M'                 |
|        3 |  0.02003875 | explain select count(*) from user where sex = 'F'         |
|        4 |  0.00013800 | show warnings                                             |
|        5 |  0.01165050 | explain select count(*) from user where sex = 'M'         |
|        6 |  0.00023750 | show warnings                                             |
+----------+-------------+-----------------------------------------------------------+
6 rows in set, 1 warning (0.00 sec)

Warning (Code 1287): 'SHOW PROFILES' is deprecated and will be removed in a future release. Please use Performance Schema instead

インデックス無しの場合

DROP INDEX idx_sex ON user;
DROP INDEX idx_pref ON user;
> select count(*) from user where sex = 'F';
+----------+
| count(*) |
+----------+
|    69373 |
+----------+
1 row in set (10.34 sec)

> select count(*) from user where sex = 'M';
+----------+
| count(*) |
+----------+
|  6930627 |
+----------+
1 row in set (10.39 sec)
> explain select count(*) from user where sex = 'F';
+----+-------------+-------+------------+------+---------------+------+---------+------+---------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key  | key_len | ref  | rows    | filtered | Extra       |
+----+-------------+-------+------------+------+---------------+------+---------+------+---------+----------+-------------+
|  1 | SIMPLE      | user  | NULL       | ALL  | NULL          | NULL | NULL    | NULL | 6429598 |    10.00 | Using where |
+----+-------------+-------+------------+------+---------------+------+---------+------+---------+----------+-------------+
1 row in set, 1 warning (0.00 sec)

> explain select count(*) from user where sex = 'M';
+----+-------------+-------+------------+------+---------------+------+---------+------+---------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key  | key_len | ref  | rows    | filtered | Extra       |
+----+-------------+-------+------------+------+---------------+------+---------+------+---------+----------+-------------+
|  1 | SIMPLE      | user  | NULL       | ALL  | NULL          | NULL | NULL    | NULL | 6429598 |    10.00 | Using where |
+----+-------------+-------+------------+------+---------------+------+---------+------+---------+----------+-------------+
1 row in set, 1 warning (0.00 sec)

かんたんなまとめ

この記事の例では700万件あるユーザーテーブルを使い、性別カラムに対してインデックスを張って効果を確認してみました。
性別カラムはM(男性)とF(女性)という2値を取るカーディナリティ―が低いカラムですが、データの分布でいえばMが6,930,627件、Fが69,373件と極端に偏りがあります。

結果として、カーディナリティが低いカラムでも値の分布が極端に偏っていて、分布の少ない値を検索する場合にインデックスは有効だということがわかりました。

11
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
11
3

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?