md RAID maintenance notes

Posted at 2016-09-26

Purpose:

  • Personal reminder
  • Personal work procedure

Audience:

  • Myself
  • md RAID users
  • Environments where aging disks fail frequently

Environment:

  • OS: Ubuntu 14.04 Server
  • Disk: SATA HDDs
# mdadm --version
mdadm - v3.2.5 - 18th May 2012
# smartctl --version
smartctl 6.2 2013-07-26 r3841 [x86_64-linux-3.13.0-29-generic] (local build)

Procedure:

1. Identifying the failed disk

1-1. From syslog

# grep -P 'md\d*:.*disabling' /var/log/syslog*
/var/log/syslog.1:Sep 25 16:15:24 fs2 kernel: [25914184.413892] md/raid1:md1: Disk failure on sda3, disabling device.
/var/log/syslog.1:Sep 25 19:00:10 fs2 kernel: [25924078.611045] md/raid1:md2: Disk failure on sda5, disabling device.
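
As a side note, these failures do not have to be discovered by grepping syslog after the fact: the mdadm monitor can mail them as they happen. A minimal sketch, assuming a MAILADDR line is set in /etc/mdadm/mdadm.conf and the mdadm monitor daemon is running (the Ubuntu package normally starts one):

# grep MAILADDR /etc/mdadm/mdadm.conf
# mdadm --monitor --scan --oneshot --test

The second command sends a TestMessage alert for every array it finds, which is an easy way to confirm the mail path before a real disk failure.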

1-2. From /proc

# cat /proc/mdstat | grep '(F)'
md2 : active raid1 sda5[0](F) sdc5[2]
md1 : active raid1 sda3[3](F) sdc3[2]

This shows that sda3 in md1 and sda5 in md2 have been detached from their arrays.
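
A degraded mirror also shows up in /proc/mdstat as an underscore in the status string ([U_] or [_U]). A quick way to list only degraded arrays, assuming raid1-style status strings; -B1 pulls in the preceding line, which carries the md device name:

# grep -B1 -E '\[[U_]*_[U_]*\]' /proc/mdstat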

2. Checking the disk's condition

2-1. Is the disk still visible to fdisk?

# fdisk -l /dev/sda

Disk /dev/sda: 2000.4 GB, 2000398934016 bytes
255 heads, 63 sectors/track, 243201 cylinders, total 3907029168 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk identifier: 0x00070f5f
(snip)
/dev/sda3        16209920    35741695     9765888   fd  Linux raid autodetect
/dev/sda5        35743744    74803199    19529728   fd  Linux raid autodetect
(snip)
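
If you only need a quick overview of sizes and partitions, lsblk gives a more compact view of the same disk; a minimal alternative, assuming util-linux's lsblk is available (it is on a default Ubuntu 14.04 install):

# lsblk -o NAME,SIZE,TYPE,MOUNTPOINT /dev/sda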

2-2. S.M.A.R.T. information

# smartctl --all /dev/sda
# smartctl --attributes /dev/sda
# smartctl --attributes /dev/sda | grep -E "_Err|Unco|Power_On_Hours|Temperature_Celsius"
  1 Raw_Read_Error_Rate     0x002f   154   154   051    Pre-fail  Always       -       79071
  7 Seek_Error_Rate         0x002e   200   200   000    Old_age   Always       -       0
  9 Power_On_Hours          0x0032   063   063   000    Old_age   Always       -       27525
194 Temperature_Celsius     0x0022   110   090   000    Old_age   Always       -       40
198 Offline_Uncorrectable   0x0030   200   196   000    Old_age   Offline      -       100
199 UDMA_CRC_Error_Count    0x0032   200   200   000    Old_age   Always       -       0
200 Multi_Zone_Error_Rate   0x0008   001   001   000    Old_age   Offline      -       125766
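
When the attribute values look this bad, a SMART self-test gives a second opinion; a short sketch (the short test normally finishes within a few minutes, and the result is read back with the second command):

# smartctl -t short /dev/sda
# smartctl -l selftest /dev/sda

A full surface scan with -t long can take many hours on a 2 TB drive, so schedule it while the array is otherwise idle.
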
  • Checking the serial number
$ udevadm info --query=all --name=/dev/sdi | grep ID_SERIAL
E: ID_SERIAL=WDC_WD30EZRX-00MMMB0_WD-WCAWZ0624076
E: ID_SERIAL_SHORT=WD-WCAWZ0624076

$ sudo smartctl -i /dev/sdi | grep -E "Serial Number|Device Model"
Device Model:     WDC WD30EZRX-00MMMB0
Serial Number:    WD-WCAWZ0624076
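
The same serial number is embedded in the persistent device names under /dev/disk/by-id, which makes it easy to confirm which physical drive sits behind a given sdX node before pulling it from the chassis; a minimal sketch:

$ ls -l /dev/disk/by-id/ | grep sdi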

3. Detaching the disk from the RAID

3-1. Detach

# mdadm /dev/md1 --remove /dev/sda3
mdadm: hot removed /dev/sda3 from /dev/md1

# mdadm /dev/md2 --remove /dev/sda5
mdadm: hot removed /dev/sda5 from /dev/md2

3-2. Confirming the detach

# mdadm --detail /dev/md1
(snip)
    Number   Major   Minor   RaidDevice State
       0       0        0        0      removed
       2       8       35        1      active sync   /dev/sdc3
# mdadm --detail /dev/md2
(snip)
    Number   Major   Minor   RaidDevice State
       0       0        0        0      removed
       2       8       37        1      active sync   /dev/sdc5
syslog
Sep 27 00:57:44 fs2 kernel: [26032025.115138] md: unbind<sda3>
Sep 27 00:57:44 fs2 kernel: [26032025.133178] md: export_rdev(sda3)
Sep 27 00:58:08 fs2 kernel: [26032048.442038] md: unbind<sda5>
Sep 27 00:58:08 fs2 kernel: [26032048.460046] md: export_rdev(sda5)
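
If the detached partitions are going to be reused elsewhere (or the disk is being retired), it is worth wiping the old md metadata so nothing auto-assembles it later; a hedged sketch, only for partitions already removed from their arrays, and unnecessary if you simply plan to --add them back as in section 5:

# mdadm --zero-superblock /dev/sda3
# mdadm --zero-superblock /dev/sda5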

4. fdisk (if needed)
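
When the failed drive has been swapped for a blank one, the quickest way to recreate matching partitions on these MBR-partitioned disks is to copy the partition table from the surviving mirror member. A hedged sketch, assuming /dev/sdc is the healthy member and /dev/sda is the new, empty disk (double-check the device names first; this overwrites sda's partition table):

# sfdisk -d /dev/sdc > sdc.partition-table
# sfdisk /dev/sda < sdc.partition-table

Afterwards, fdisk -l /dev/sda should show the same fd (Linux raid autodetect) partitions as in section 2-1.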

5. Putting the disk back into the RAID

5-1. Re-add

# mdadm --manage  /dev/md1 --add /dev/sda3
mdadm: added /dev/sda3

# mdadm --manage  /dev/md2 --add /dev/sda5
mdadm: added /dev/sda5

5-2. Confirming the re-add

rebuilt
# mdadm --detail /dev/md1
(snip)
    Number   Major   Minor   RaidDevice State
       3       8        3        0      active sync   /dev/sda3
       2       8       35        1      active sync   /dev/sdc3
rebuilding
# mdadm --detail /dev/md2
(snip)
    Number   Major   Minor   RaidDevice State
       3       8        5        0      spare rebuilding   /dev/sda5
       2       8       37        1      active sync   /dev/sdc5
syslog
Sep 27 01:16:27 fs2 kernel: [26033148.773364] md: recovery of RAID array md1
Sep 27 01:16:27 fs2 kernel: [26033148.773374] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
Sep 27 01:16:33 fs2 kernel: [26033154.668741] md: delaying recovery of md2 until md1 has finished (they share one or more physical units)
Sep 27 01:17:49 fs2 kernel: [26033230.326188] md: md1: recovery done.
Sep 27 01:17:49 fs2 kernel: [26033230.339303] md: recovery of RAID array md2
Sep 27 01:17:49 fs2 kernel: [26033230.339311] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
Sep 27 01:20:29 fs2 kernel: [26033390.864021] md: md2: recovery done.
proc
# cat /proc/mdstat
md1 : active raid1 sda3[3] sdc3[2]
      9757568 blocks super 1.2 [2/2] [UU]
(snip)
md2 : active raid1 sda5[3] sdc5[2]
      19513216 blocks super 1.2 [2/2] [UU]
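
The "not more than 200000 KB/sec" line in the syslog above is the kernel's rebuild speed ceiling. Both limits can be inspected, and the minimum raised, while a recovery is running; a hedged sketch (a higher minimum finishes the rebuild sooner at the cost of foreground I/O):

# sysctl dev.raid.speed_limit_min dev.raid.speed_limit_max
# sysctl -w dev.raid.speed_limit_min=50000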

6. Manually failing and removing a device

root@fs2:~# mdadm --detail /dev/md2
(snip)
    Number   Major   Minor   RaidDevice State
       3       8        5        0      active sync   /dev/sda5
       2       8       37        1      active sync   /dev/sdc5

root@fs2:~# mdadm /dev/md2 --fail /dev/sda5
mdadm: set /dev/sda5 faulty in /dev/md2

root@fs2:~# mdadm /dev/md2 --remove /dev/sda5
mdadm: hot removed /dev/sda5 from /dev/md2

root@fs2:~# mdadm --detail /dev/md2
(snip)
    Number   Major   Minor   RaidDevice State
       0       0        0        0      removed
       2       8       37        1      active sync   /dev/sdc5
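
mdadm also accepts both operations in a single invocation, which is convenient when scripting this; a minimal equivalent of the two commands above:

# mdadm /dev/md2 --fail /dev/sda5 --remove /dev/sda5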

7. Done
