はじめに
Excel VBAで日本語テキストの文字コード判定をする方法を調べたところ、
UTF8(BOM有り):先頭3バイトのBOMで判定可能
UTF16:先頭2バイトのBOMで判定可能
で判定可能だが、Shift_JISとUTF8(BOMなし)についてはテキストファイル内の文字コードの使用範囲で判定する方法しかないことが分かった。
NKFなどが導入できない環境向けにVBAコードを作成した。
文字コード判定については、ファイル操作の練習もかねて2種類作成した。
Shift_JISとUTF8Nの判定のロジックはどちらも同じです。
デバッグとロジック最適化のため、ChatGPTさんにソースコードを添削してもらいました。
adodbのバイナリ読み込み .Position を使用
DetectTextEncode.bas
Option Explicit
Private Const adSaveCreateNotExist = 1
Private Const adSaveCreateOverWrite = 2
Private Const adWriteChar = 0
Private Const adWriteLine = 1
Private Const adReadLine = -2
Private Const adReadAll = -1
Private Const adTypeBinary = 1
Private Const adTypeText = 2
Private Const adCRLF = -1
Private Const adCR = 13
Private Const adLF = 10
'Const MaxReadSize = 1024 ^ 2 * 10 '上限10MBにしておく
Enum Encode
ENC_UNKNOWN = 0
ENC_UTF8 = 1
ENC_UTF8N = 2
ENC_UTF16LE = 3
ENC_UTF16BE = 4
ENC_SHIFT_JIS = 5
End Enum
Function DetectTextEncode(ByVal FilePath As String) As Encode
Dim ByteData() As Byte
DetectTextEncode = ENC_UNKNOWN
With CreateObject("ADODB.Stream")
.Open
.Type = adTypeBinary
.LoadFromFile FilePath
'BOMで判定
If .Size > 3 Then
.Position = 0
ByteData = .Read(1)
Select Case ByteData(0)
Case &HEF '■UTF8(BOM有り)
ByteData = .Read(2)
If ByteData(0) = &HBB And ByteData(1) = &HBF Then
DetectTextEncode = ENC_UTF8
End If
Case &HFF '■UTF16LE
ByteData = .Read(1)
If ByteData(0) = &HFE Then
DetectTextEncode = ENC_UTF16LE
End If
Case &HFE '■UTF16BE
ByteData = .Read(1)
If ByteData(0) = &HFF Then
DetectTextEncode = ENC_UTF16BE
End If
Case Else
'BOMなしファイル判定に移行
End Select
End If
If DetectTextEncode = ENC_UNKNOWN Then
'■Shift_JIS判定
Dim isSJIS As Boolean: isSJIS = True
.Position = 0
Do While .Position < .Size
ByteData = .Read(1)
If ByteData(0) <= &H7F Or (ByteData(0) >= &HA1 And ByteData(0) <= &HDF) Then
'Single Byte (ASCII or half-width Katakana)
ElseIf (ByteData(0) >= &H81 And ByteData(0) <= &H9F) Or (ByteData(0) >= &HE0 And ByteData(0) <= &HFC) Then
If .Position < .Size Then
ByteData = .Read(1)
If ((ByteData(0) >= &H40 And ByteData(0) <= &H7E) Or (ByteData(0) >= &H80 And ByteData(0) <= &HFC)) Then
'First byte of a double-byte character
Else
isSJIS = False
Exit Do
End If
Else
isSJIS = False
Exit Do
End If
Else
isSJIS = False
Exit Do
End If
Loop
If isSJIS Then
DetectTextEncode = ENC_SHIFT_JIS
.Close
Exit Function
End If
'■UTF8N判定
Dim isUTF8N As Boolean: isUTF8N = True
.Position = 0
Do While .Position < .Size
ByteData = .Read(1)
If ByteData(0) <= &H7F Then
'Single Byte (ASCII)
ElseIf &HC2 <= ByteData(0) And ByteData(0) <= &HDF Then
If .Position < .Size Then
ByteData = .Read(1)
If &H80 <= ByteData(0) And ByteData(0) <= &HBF Then
'Two-byte sequence
Else
isUTF8N = False
Exit Do
End If
Else
isUTF8N = False
Exit Do
End If
ElseIf &HE0 <= ByteData(0) And ByteData(0) <= &HEF Then
If .Position + 1 < .Size Then
ByteData = .Read(2)
If &H80 <= ByteData(0) And ByteData(0) <= &HBF And _
&H80 <= ByteData(1) And ByteData(1) <= &HBF Then
'Three-byte sequence
Else
isUTF8N = False
Exit Do
End If
Else
isUTF8N = False
Exit Do
End If
ElseIf &HF0 <= ByteData(0) And ByteData(0) <= &HF4 Then
If .Position + 2 < .Size Then
ByteData = .Read(3)
If &H80 <= ByteData(0) And ByteData(0) <= &HBF And _
&H80 <= ByteData(1) And ByteData(1) <= &HBF And _
&H80 <= ByteData(2) And ByteData(2) <= &HBF Then
'Four-byte sequence
Else
isUTF8N = False
Exit Do
End If
Else
isUTF8N = False
Exit Do
End If
Else
isUTF8N = False
Exit Do
End If
Loop
If isUTF8N Then
DetectTextEncode = ENC_UTF8N
.Close
Exit Function
End If
End If
.Close
End With
End Function
adodbのバイナリで一括でByte配列に読み込む
DetectTextEncoding.bas
Option Explicit
Private Const adSaveCreateNotExist = 1
Private Const adSaveCreateOverWrite = 2
Private Const adWriteChar = 0
Private Const adWriteLine = 1
Private Const adReadLine = -2
Private Const adReadAll = -1
Private Const adTypeBinary = 1
Private Const adTypeText = 2
Private Const adCRLF = -1
Private Const adCR = 13
Private Const adLF = 10
Enum Encoding
ENC_UNKNOWN = 0
ENC_UTF8 = 1
ENC_UTF8N = 2
ENC_UTF16LE = 3
ENC_UTF16BE = 4
ENC_SHIFT_JIS = 5
End Enum
Function DetectTextEncoding(ByVal FilePath As String) As Encoding
Dim FSO As Object: Set FSO = CreateObject("Scripting.FileSystemObject")
DetectTextEncoding = ENC_UNKNOWN
If FSO.FileExists(FilePath) Then
Dim FileToByte() As Byte
Dim i As Long
With CreateObject("ADODB.Stream")
.Open
.Type = adTypeBinary
.LoadFromFile FilePath
.Position = 0
FileToByte = .Read()
.Close
End With
'■UTF8(BOM有り)
If UBound(FileToByte) >= 3 Then
If FileToByte(0) = &HEF And FileToByte(1) = &HBB And FileToByte(2) = &HBF Then
DetectTextEncoding = ENC_UTF8
Exit Function
End If
End If
'■UTF16
If UBound(FileToByte) >= 2 Then
If FileToByte(0) = &HFF And FileToByte(1) = &HFE Then
DetectTextEncoding = ENC_UTF16LE
Exit Function
ElseIf FileToByte(0) = &HFE And FileToByte(1) = &HFF Then
DetectTextEncoding = ENC_UTF16BE
Exit Function
End If
End If
'■Shift_JIS
If isSJIS(FileToByte) Then
DetectTextEncoding = ENC_SHIFT_JIS
Exit Function
End If
'■UTF8(BOMなし)
If isUTF8N(FileToByte) Then
DetectTextEncoding = ENC_UTF8
Exit Function
End If
End If
End Function
Function isSJIS(ByRef FileToByte() As Byte) As Boolean
On Error GoTo ErrorHandler ' エラー処理の追加
Dim i As Long
isSJIS = True
i = 0 ' 初期化
Do While i <= UBound(FileToByte)
If FileToByte(i) <= &H7F Or (FileToByte(i) >= &HA1 And FileToByte(i) <= &HDF) Then
' Single Byte (ASCII or half-width Katakana)
i = i + 1
ElseIf (FileToByte(i) >= &H81 And FileToByte(i) <= &H9F) Or (FileToByte(i) >= &HE0 And FileToByte(i) <= &HFC) Then
' First byte of a double-byte character
If i + 1 <= UBound(FileToByte) And _
((FileToByte(i + 1) >= &H40 And FileToByte(i + 1) <= &H7E) Or (FileToByte(i + 1) >= &H80 And FileToByte(i + 1) <= &HFC)) Then
i = i + 2
Else
isSJIS = False
Exit Do
End If
Else
isSJIS = False
Exit Do
End If
Loop
ErrorHandler:
isSJIS = False
MsgBox "エラーが発生しました: " & Err.Description, vbCritical
End Function
Function isUTF8N(ByRef FileToByte() As Byte) As Boolean
On Error GoTo ErrorHandler ' エラー処理の追加
Dim i As Long
isUTF8N = True
i = 0 ' 初期化
Do While i <= UBound(FileToByte)
If FileToByte(i) <= &H7F Then
' Single Byte (ASCII)
i = i + 1
ElseIf &HC2 <= FileToByte(i) And FileToByte(i) <= &HDF Then
' Two-byte sequence
If i + 1 <= UBound(FileToByte) And _
&H80 <= FileToByte(i + 1) And FileToByte(i + 1) <= &HBF Then
i = i + 2
Else
isUTF8N = False
Exit Do
End If
ElseIf &HE0 <= FileToByte(i) And FileToByte(i) <= &HEF Then
' Three-byte sequence
If i + 2 <= UBound(FileToByte) And _
&H80 <= FileToByte(i + 1) And FileToByte(i + 1) <= &HBF And _
&H80 <= FileToByte(i + 2) And FileToByte(i + 2) <= &HBF Then
i = i + 3
Else
isUTF8N = False
Exit Do
End If
ElseIf &HF0 <= FileToByte(i) And FileToByte(i) <= &HF4 Then
' Four-byte sequence
If i + 3 <= UBound(FileToByte) And _
&H80 <= FileToByte(i + 1) And FileToByte(i + 1) <= &HBF And _
&H80 <= FileToByte(i + 2) And FileToByte(i + 2) <= &HBF And _
&H80 <= FileToByte(i + 3) And FileToByte(i + 3) <= &HBF Then
i = i + 4
Else
isUTF8N = False
Exit Do
End If
Else
isUTF8N = False
Exit Do
End If
Loop
Exit Function
ErrorHandler:
isUTF8N = False
MsgBox "エラーが発生しました: " & Err.Description, vbCritical
End Function
使用方法(基本的に使用方法も戻り値もどちらも同じ)
TestFunction.bas
Sub TestFunction()
Dim WSH As Object: Set WSH = CreateObject("WScript.Shell")
Dim FilePath As String
FilePath = WSH.SpecialFolders("Desktop") & "\TEST_UTF8N.txt"
Select Case DetectTextEncode(FilePath)
Case ENC_UTF8
Debug.Print "UTF8"
Case ENC_UTF8N
Debug.Print "UTF8N"
Case ENC_UTF16LE
Debug.Print "UTF16LE"
Case ENC_UTF16BE
Debug.Print "UTF16BE"
Case ENC_SHIFT_JIS
Debug.Print "SHIFT_JIS"
Case Else
Debug.Print "UNKNOWN"
End Select
End Sub
おまけ
改行コード判定も作ってみた。
DetectLineFeedCode.bas
Function DetectLineFeedCode(ByVal FilePath As String) As String
Const MaxReadSize As Long = 1024 ^ 2 ' 1MB に修正
Const adTypeBinary = 1
Dim ByteData() As Byte
Dim ReadByte As Integer
DetectLineFeedCode = vbNullString
With CreateObject("ADODB.Stream")
.Open
.Type = adTypeBinary
.LoadFromFile FilePath
.Position = 0
Do While .Position < .Size
ByteData = .Read(1)
ReadByte = ByteData(0)
If ReadByte = AscW(vbCr) Then
If .Position < .Size Then
ByteData = .Read(1)
ReadByte = ByteData(0)
If ReadByte = AscW(vbLf) Then
DetectLineFeedCode = vbCrLf
Else
DetectLineFeedCode = vbCr
End If
Else
DetectLineFeedCode = vbCr
End If
Exit Do
ElseIf ReadByte = AscW(vbLf) Then
DetectLineFeedCode = vbLf
Exit Do
End If
If .Position >= MaxReadSize Then Exit Do
Loop
.Close
End With
End Function