LoginSignup
0
1

VBAで日本語テキストの文字コード判定

Last updated at Posted at 2024-06-15

はじめに

Excel VBAで日本語テキストの文字コード判定をする方法を調べたところ、

UTF8(BOM有り):先頭3バイトのBOMで判定可能
UTF16:先頭2バイトのBOMで判定可能

で判定可能だが、Shift_JISとUTF8(BOMなし)についてはテキストファイル内の文字コードの使用範囲で判定する方法しかないことが分かった。

NKFなどが導入できない環境向けにVBAコードを作成した。

文字コード判定については、ファイル操作の練習もかねて2種類作成した。
Shift_JISとUTF8Nの判定のロジックはどちらも同じです。
デバッグとロジック最適化のため、ChatGPTさんにソースコードを添削してもらいました。

adodbのバイナリ読み込み .Position を使用

DetectTextEncode.bas
Option Explicit
Private Const adSaveCreateNotExist = 1
Private Const adSaveCreateOverWrite = 2
Private Const adWriteChar = 0
Private Const adWriteLine = 1
Private Const adReadLine = -2
Private Const adReadAll = -1
Private Const adTypeBinary = 1
Private Const adTypeText = 2
Private Const adCRLF = -1
Private Const adCR = 13
Private Const adLF = 10
'Const MaxReadSize = 1024 ^ 2 * 10 '上限10MBにしておく
Enum Encode
    ENC_UNKNOWN = 0
    ENC_UTF8 = 1
    ENC_UTF8N = 2
    ENC_UTF16LE = 3
    ENC_UTF16BE = 4
    ENC_SHIFT_JIS = 5
End Enum
Function DetectTextEncode(ByVal FilePath As String) As Encode
    Dim ByteData() As Byte
    DetectTextEncode = ENC_UNKNOWN
    With CreateObject("ADODB.Stream")
        .Open
        .Type = adTypeBinary
        .LoadFromFile FilePath
        'BOMで判定
        If .Size > 3 Then
            .Position = 0
            ByteData = .Read(1)
            Select Case ByteData(0)
                Case &HEF '■UTF8(BOM有り)
                    ByteData = .Read(2)
                    If ByteData(0) = &HBB And ByteData(1) = &HBF Then
                        DetectTextEncode = ENC_UTF8
                    End If
                Case &HFF '■UTF16LE
                    ByteData = .Read(1)
                    If ByteData(0) = &HFE Then
                        DetectTextEncode = ENC_UTF16LE
                    End If
                Case &HFE '■UTF16BE
                    ByteData = .Read(1)
                    If ByteData(0) = &HFF Then
                        DetectTextEncode = ENC_UTF16BE
                    End If
                Case Else
                    'BOMなしファイル判定に移行
            End Select
        End If
        If DetectTextEncode = ENC_UNKNOWN Then
            '■Shift_JIS判定
            Dim isSJIS As Boolean: isSJIS = True
            .Position = 0
            Do While .Position < .Size
                ByteData = .Read(1)
                If ByteData(0) <= &H7F Or (ByteData(0) >= &HA1 And ByteData(0) <= &HDF) Then
                    'Single Byte (ASCII or half-width Katakana)
                ElseIf (ByteData(0) >= &H81 And ByteData(0) <= &H9F) Or (ByteData(0) >= &HE0 And ByteData(0) <= &HFC) Then
                    If .Position < .Size Then
                        ByteData = .Read(1)
                        If ((ByteData(0) >= &H40 And ByteData(0) <= &H7E) Or (ByteData(0) >= &H80 And ByteData(0) <= &HFC)) Then
                            'First byte of a double-byte character
                        Else
                            isSJIS = False
                            Exit Do
                        End If
                    Else
                        isSJIS = False
                        Exit Do
                    End If
                Else
                    isSJIS = False
                    Exit Do
                End If
            Loop
            If isSJIS Then
                DetectTextEncode = ENC_SHIFT_JIS
                .Close
                Exit Function
            End If
            '■UTF8N判定
            Dim isUTF8N As Boolean: isUTF8N = True
            .Position = 0
            Do While .Position < .Size
                ByteData = .Read(1)
                If ByteData(0) <= &H7F Then
                    'Single Byte (ASCII)
                ElseIf &HC2 <= ByteData(0) And ByteData(0) <= &HDF Then
                    If .Position < .Size Then
                        ByteData = .Read(1)
                        If &H80 <= ByteData(0) And ByteData(0) <= &HBF Then
                            'Two-byte sequence
                        Else
                            isUTF8N = False
                            Exit Do
                        End If
                    Else
                        isUTF8N = False
                        Exit Do
                    End If
                ElseIf &HE0 <= ByteData(0) And ByteData(0) <= &HEF Then
                    If .Position + 1 < .Size Then
                        ByteData = .Read(2)
                        If &H80 <= ByteData(0) And ByteData(0) <= &HBF And _
                           &H80 <= ByteData(1) And ByteData(1) <= &HBF Then
                            'Three-byte sequence
                        Else
                            isUTF8N = False
                            Exit Do
                        End If
                    Else
                        isUTF8N = False
                        Exit Do
                    End If
                ElseIf &HF0 <= ByteData(0) And ByteData(0) <= &HF4 Then
                    If .Position + 2 < .Size Then
                        ByteData = .Read(3)
                        If &H80 <= ByteData(0) And ByteData(0) <= &HBF And _
                           &H80 <= ByteData(1) And ByteData(1) <= &HBF And _
                           &H80 <= ByteData(2) And ByteData(2) <= &HBF Then
                            'Four-byte sequence
                        Else
                            isUTF8N = False
                            Exit Do
                        End If
                    Else
                        isUTF8N = False
                        Exit Do
                    End If
                Else
                    isUTF8N = False
                    Exit Do
                End If
            Loop
            If isUTF8N Then
                DetectTextEncode = ENC_UTF8N
                .Close
                Exit Function
            End If
        End If
        .Close
    End With
End Function

adodbのバイナリで一括でByte配列に読み込む

DetectTextEncoding.bas
Option Explicit
Private Const adSaveCreateNotExist = 1
Private Const adSaveCreateOverWrite = 2
Private Const adWriteChar = 0
Private Const adWriteLine = 1
Private Const adReadLine = -2
Private Const adReadAll = -1
Private Const adTypeBinary = 1
Private Const adTypeText = 2
Private Const adCRLF = -1
Private Const adCR = 13
Private Const adLF = 10
Enum Encoding
    ENC_UNKNOWN = 0
    ENC_UTF8 = 1
    ENC_UTF8N = 2
    ENC_UTF16LE = 3
    ENC_UTF16BE = 4
    ENC_SHIFT_JIS = 5
End Enum
Function DetectTextEncoding(ByVal FilePath As String) As Encoding
    Dim FSO As Object: Set FSO = CreateObject("Scripting.FileSystemObject")
    DetectTextEncoding = ENC_UNKNOWN
    If FSO.FileExists(FilePath) Then
        Dim FileToByte() As Byte
        Dim i As Long
        With CreateObject("ADODB.Stream")
            .Open
            .Type = adTypeBinary
            .LoadFromFile FilePath
            .Position = 0
            FileToByte = .Read()
            .Close
        End With
        '■UTF8(BOM有り)
        If UBound(FileToByte) >= 3 Then
            If FileToByte(0) = &HEF And FileToByte(1) = &HBB And FileToByte(2) = &HBF Then
                DetectTextEncoding = ENC_UTF8
                Exit Function
            End If
        End If
        '■UTF16
        If UBound(FileToByte) >= 2 Then
            If FileToByte(0) = &HFF And FileToByte(1) = &HFE Then
                DetectTextEncoding = ENC_UTF16LE
                Exit Function
            ElseIf FileToByte(0) = &HFE And FileToByte(1) = &HFF Then
                DetectTextEncoding = ENC_UTF16BE
                Exit Function
            End If
        End If
        '■Shift_JIS
        If isSJIS(FileToByte) Then
            DetectTextEncoding = ENC_SHIFT_JIS
            Exit Function
        End If
        '■UTF8(BOMなし)
        If isUTF8N(FileToByte) Then
            DetectTextEncoding = ENC_UTF8
            Exit Function
        End If
    End If
End Function
Function isSJIS(ByRef FileToByte() As Byte) As Boolean
    On Error GoTo ErrorHandler ' エラー処理の追加
    Dim i As Long
    isSJIS = True
    i = 0 ' 初期化
    Do While i <= UBound(FileToByte)
        If FileToByte(i) <= &H7F Or (FileToByte(i) >= &HA1 And FileToByte(i) <= &HDF) Then
            ' Single Byte (ASCII or half-width Katakana)
            i = i + 1
        ElseIf (FileToByte(i) >= &H81 And FileToByte(i) <= &H9F) Or (FileToByte(i) >= &HE0 And FileToByte(i) <= &HFC) Then
            ' First byte of a double-byte character
            If i + 1 <= UBound(FileToByte) And _
               ((FileToByte(i + 1) >= &H40 And FileToByte(i + 1) <= &H7E) Or (FileToByte(i + 1) >= &H80 And FileToByte(i + 1) <= &HFC)) Then
                i = i + 2
            Else
                isSJIS = False
                Exit Do
            End If
        Else
            isSJIS = False
            Exit Do
        End If
    Loop
ErrorHandler:
    isSJIS = False
    MsgBox "エラーが発生しました: " & Err.Description, vbCritical
End Function
Function isUTF8N(ByRef FileToByte() As Byte) As Boolean
    On Error GoTo ErrorHandler ' エラー処理の追加
    Dim i As Long
    isUTF8N = True
    i = 0 ' 初期化
    Do While i <= UBound(FileToByte)
        If FileToByte(i) <= &H7F Then
            ' Single Byte (ASCII)
            i = i + 1
        ElseIf &HC2 <= FileToByte(i) And FileToByte(i) <= &HDF Then
            ' Two-byte sequence
            If i + 1 <= UBound(FileToByte) And _
                &H80 <= FileToByte(i + 1) And FileToByte(i + 1) <= &HBF Then
                i = i + 2
            Else
                isUTF8N = False
                Exit Do
            End If
        ElseIf &HE0 <= FileToByte(i) And FileToByte(i) <= &HEF Then
            ' Three-byte sequence
            If i + 2 <= UBound(FileToByte) And _
               &H80 <= FileToByte(i + 1) And FileToByte(i + 1) <= &HBF And _
               &H80 <= FileToByte(i + 2) And FileToByte(i + 2) <= &HBF Then
                i = i + 3
            Else
                isUTF8N = False
                Exit Do
            End If
        ElseIf &HF0 <= FileToByte(i) And FileToByte(i) <= &HF4 Then
            ' Four-byte sequence
            If i + 3 <= UBound(FileToByte) And _
               &H80 <= FileToByte(i + 1) And FileToByte(i + 1) <= &HBF And _
               &H80 <= FileToByte(i + 2) And FileToByte(i + 2) <= &HBF And _
               &H80 <= FileToByte(i + 3) And FileToByte(i + 3) <= &HBF Then
                i = i + 4
            Else
                isUTF8N = False
                Exit Do
            End If
        Else
            isUTF8N = False
            Exit Do
        End If
    Loop
    Exit Function
ErrorHandler:
    isUTF8N = False
    MsgBox "エラーが発生しました: " & Err.Description, vbCritical
End Function

使用方法(基本的に使用方法も戻り値もどちらも同じ)

TestFunction.bas
Sub TestFunction()
    Dim WSH As Object: Set WSH = CreateObject("WScript.Shell")
    Dim FilePath As String
    FilePath = WSH.SpecialFolders("Desktop") & "\TEST_UTF8N.txt"
    Select Case DetectTextEncode(FilePath)
        Case ENC_UTF8
            Debug.Print "UTF8"
        Case ENC_UTF8N
            Debug.Print "UTF8N"
        Case ENC_UTF16LE
            Debug.Print "UTF16LE"
        Case ENC_UTF16BE
            Debug.Print "UTF16BE"
        Case ENC_SHIFT_JIS
            Debug.Print "SHIFT_JIS"
        Case Else
            Debug.Print "UNKNOWN"
    End Select
End Sub

おまけ

改行コード判定も作ってみた。

DetectLineFeedCode.bas
Function DetectLineFeedCode(ByVal FilePath As String) As String
    Const MaxReadSize As Long = 1024 ^ 2 ' 1MB に修正
    Const adTypeBinary = 1
    Dim ByteData() As Byte
    Dim ReadByte As Integer
    DetectLineFeedCode = vbNullString
    With CreateObject("ADODB.Stream")
        .Open
        .Type = adTypeBinary
        .LoadFromFile FilePath
        .Position = 0
        Do While .Position < .Size
            ByteData = .Read(1)
            ReadByte = ByteData(0)
            If ReadByte = AscW(vbCr) Then
                If .Position < .Size Then
                    ByteData = .Read(1)
                    ReadByte = ByteData(0)
                    If ReadByte = AscW(vbLf) Then
                        DetectLineFeedCode = vbCrLf
                    Else
                        DetectLineFeedCode = vbCr
                    End If
                Else
                    DetectLineFeedCode = vbCr
                End If
                Exit Do
            ElseIf ReadByte = AscW(vbLf) Then
                DetectLineFeedCode = vbLf
                Exit Do
            End If
            If .Position >= MaxReadSize Then Exit Do
        Loop
        .Close
    End With
End Function
0
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
1