0
2

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

VBAでRAG

Posted at

VBAでBM25アルゴリズムを実装したコードです:

' BM25検索アルゴリズム実装(VBA版)
Option Explicit

' BM25のパラメータ
Private Const k1 As Double = 1.5
Private Const b As Double = 0.75

' ドキュメント情報を格納する型
Private Type DocInfo
    Terms() As String       ' 分かち書きされた単語
    TermFreq As Object      ' 単語の出現頻度(Dictionary)
    DocLength As Long       ' ドキュメントの長さ(単語数)
End Type

' 検索エンジンクラス
Private Type BM25Engine
    Documents() As DocInfo      ' ドキュメント配列
    AvgDocLength As Double      ' 平均ドキュメント長
    DocCount As Long            ' ドキュメント数
    Vocabulary As Object        ' 語彙(全単語のDictionary)
    IDF As Object               ' 逆文書頻度(Dictionary)
    IsInitialized As Boolean    ' 初期化済みフラグ
End Type

' エンジンのグローバルインスタンス
Private Engine As BM25Engine

' エンジンを初期化する
Public Sub InitializeBM25Engine()
    ' 辞書オブジェクトの作成
    Set Engine.Vocabulary = CreateObject("Scripting.Dictionary")
    Set Engine.IDF = CreateObject("Scripting.Dictionary")
    
    Engine.DocCount = 0
    Engine.AvgDocLength = 0
    Engine.IsInitialized = True
End Sub

' ドキュメントを追加する
Public Sub AddDocument(docText As String, docID As Long)
    If Not Engine.IsInitialized Then
        InitializeBM25Engine
    End If
    
    ' ドキュメント数を増やす
    Engine.DocCount = Engine.DocCount + 1
    
    ' 必要に応じて配列をリサイズ
    ReDim Preserve Engine.Documents(1 To Engine.DocCount)
    
    ' テキストを単語に分割
    Dim terms() As String
    terms = SplitTextIntoTerms(docText)
    
    ' 単語の頻度辞書を作成
    Dim termFreq As Object
    Set termFreq = CreateObject("Scripting.Dictionary")
    
    Dim i As Long
    For i = LBound(terms) To UBound(terms)
        ' 単語頻度を更新
        If termFreq.Exists(terms(i)) Then
            termFreq(terms(i)) = termFreq(terms(i)) + 1
        Else
            termFreq.Add terms(i), 1
        End If
        
        ' 全体語彙に追加
        If Not Engine.Vocabulary.Exists(terms(i)) Then
            Engine.Vocabulary.Add terms(i), 1
        End If
    Next i
    
    ' ドキュメント情報を格納
    With Engine.Documents(Engine.DocCount)
        .Terms = terms
        Set .TermFreq = termFreq
        .DocLength = UBound(terms) - LBound(terms) + 1
    End With
    
    ' 平均ドキュメント長を更新
    Dim totalLength As Long
    For i = 1 To Engine.DocCount
        totalLength = totalLength + Engine.Documents(i).DocLength
    Next i
    Engine.AvgDocLength = totalLength / Engine.DocCount
    
    ' IDFを計算
    CalculateIDF
End Sub

' 単語ごとのIDFを計算
Private Sub CalculateIDF()
    Set Engine.IDF = CreateObject("Scripting.Dictionary")
    
    Dim term As Variant
    For Each term In Engine.Vocabulary.Keys
        ' この単語を含むドキュメント数をカウント
        Dim docFreq As Long
        docFreq = 0
        
        Dim i As Long
        For i = 1 To Engine.DocCount
            If Engine.Documents(i).TermFreq.Exists(term) Then
                docFreq = docFreq + 1
            End If
        Next i
        
        ' IDF計算: log((N-n+0.5)/(n+0.5) + 1)
        Dim idfValue As Double
        idfValue = Log10((Engine.DocCount - docFreq + 0.5) / (docFreq + 0.5) + 1)
        
        ' 負の値は0にする(まれなケース)
        If idfValue < 0 Then idfValue = 0
        
        ' IDFを保存
        If Engine.IDF.Exists(term) Then
            Engine.IDF(term) = idfValue
        Else
            Engine.IDF.Add term, idfValue
        End If
    Next term
End Sub

' 検索を実行し、ドキュメントのランキングを返す
Public Function Search(query As String) As Variant
    If Not Engine.IsInitialized Or Engine.DocCount = 0 Then
        MsgBox "検索エンジンが初期化されていないか、ドキュメントがありません"
        Exit Function
    End If
    
    ' クエリを単語に分割
    Dim queryTerms() As String
    queryTerms = SplitTextIntoTerms(query)
    
    ' 各ドキュメントのスコアを計算
    Dim scores() As Double
    ReDim scores(1 To Engine.DocCount)
    
    Dim i As Long, j As Long
    Dim term As String
    
    For j = LBound(queryTerms) To UBound(queryTerms)
        term = queryTerms(j)
        
        ' クエリ内の単語が語彙にある場合のみ処理
        If Engine.Vocabulary.Exists(term) Then
            For i = 1 To Engine.DocCount
                Dim doc As DocInfo
                doc = Engine.Documents(i)
                
                ' 単語がドキュメントに存在する場合のみスコアを計算
                If doc.TermFreq.Exists(term) Then
                    Dim tf As Long
                    tf = doc.TermFreq(term)
                    
                    ' BM25スコア計算
                    Dim idf As Double
                    idf = Engine.IDF(term)
                    
                    Dim numerator As Double
                    numerator = tf * (k1 + 1)
                    
                    Dim denominator As Double
                    denominator = tf + k1 * (1 - b + b * doc.DocLength / Engine.AvgDocLength)
                    
                    Dim termScore As Double
                    termScore = idf * numerator / denominator
                    
                    ' ドキュメントのスコアに加算
                    scores(i) = scores(i) + termScore
                End If
            Next i
        End If
    Next j
    
    ' ランク付けされた結果を返す(スコアと文書ID)
    Dim results() As Variant
    ReDim results(1 To Engine.DocCount, 1 To 2)
    
    For i = 1 To Engine.DocCount
        results(i, 1) = i            ' ドキュメントID
        results(i, 2) = scores(i)    ' スコア
    Next i
    
    ' スコアでソート(簡易版バブルソート)
    Dim temp(1 To 2) As Variant
    For i = 1 To Engine.DocCount - 1
        For j = i + 1 To Engine.DocCount
            If results(i, 2) < results(j, 2) Then
                ' スワップ
                temp(1) = results(i, 1)
                temp(2) = results(i, 2)
                
                results(i, 1) = results(j, 1)
                results(i, 2) = results(j, 2)
                
                results(j, 1) = temp(1)
                results(j, 2) = temp(2)
            End If
        Next j
    Next i
    
    Search = results
End Function

' テキストを単語に分割(簡易的な実装)
Private Function SplitTextIntoTerms(text As String) As String()
    ' 大文字を小文字に変換
    text = LCase(text)
    
    ' 記号を空白に置換
    text = Replace(text, ".", " ")
    text = Replace(text, ",", " ")
    text = Replace(text, ";", " ")
    text = Replace(text, ":", " ")
    text = Replace(text, "!", " ")
    text = Replace(text, "?", " ")
    text = Replace(text, "(", " ")
    text = Replace(text, ")", " ")
    text = Replace(text, "[", " ")
    text = Replace(text, "]", " ")
    text = Replace(text, "{", " ")
    text = Replace(text, "}", " ")
    text = Replace(text, """", " ")
    text = Replace(text, "'", " ")
    text = Replace(text, "-", " ")
    text = Replace(text, "_", " ")
    
    ' 連続する空白を単一の空白に変換
    Do While InStr(text, "  ") > 0
        text = Replace(text, "  ", " ")
    Loop
    
    ' 先頭と末尾の空白を削除
    text = Trim(text)
    
    ' 分割して返す
    SplitTextIntoTerms = Split(text, " ")
End Function

' 対数関数(VBAにはLog10がないため)
Private Function Log10(x As Double) As Double
    Log10 = Log(x) / Log(10#)
End Function

' 使用例
Public Sub TestBM25()
    ' エンジンを初期化
    InitializeBM25Engine
    
    ' ドキュメントを追加
    AddDocument "Excelは表計算ソフトウェアです。", 1
    AddDocument "VBAはマクロプログラミング言語です。", 2
    AddDocument "ExcelでVBAを使うと業務を自動化できます。", 3
    AddDocument "ビジネスアプリケーションの開発にはVBAが便利です。", 4
    
    ' 検索を実行
    Dim results As Variant
    results = Search("VBA Excel 自動化")
    
    ' 結果を表示
    Dim i As Long
    For i = 1 To UBound(results, 1)
        Debug.Print "ドキュメントID: " & results(i, 1) & ", スコア: " & results(i, 2)
    Next i
End Sub

使用方法:

  1. Excel VBAエディタを開きます(Alt + F11)
  2. 新しいモジュールを挿入します(挿入 > モジュール)
  3. 上記のコードを貼り付けます
  4. TestBM25 サブルーチンを実行して動作確認ができます

実際の活用例:

Public Sub 雪だるま開発ツール用BM25検索()
    ' エンジンを初期化
    InitializeBM25Engine
    
    ' ドキュメントのロード(ここではセルからデータを取得する例)
    Dim ws As Worksheet
    Set ws = ThisWorkbook.Worksheets("ナレッジベース")
    
    Dim lastRow As Long
    lastRow = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row
    
    Dim i As Long
    For i = 2 To lastRow ' A列に格納されているドキュメントをロード
        AddDocument ws.Cells(i, "A").Value, i - 1
    Next i
    
    ' ユーザー入力で検索
    Dim query As String
    query = InputBox("検索キーワードを入力してください:", "雪だるま開発ツール検索")
    
    If query = "" Then Exit Sub
    
    ' 検索実行
    Dim results As Variant
    results = Search(query)
    
    ' 結果をシートに出力
    Dim outputWs As Worksheet
    Set outputWs = ThisWorkbook.Worksheets("検索結果")
    
    ' 結果をクリア
    outputWs.Range("A1:C1000").ClearContents
    
    ' ヘッダー設定
    outputWs.Range("A1").Value = "ドキュメントID"
    outputWs.Range("B1").Value = "スコア"
    outputWs.Range("C1").Value = "内容"
    
    ' 結果出力
    For i = 1 To UBound(results, 1)
        If results(i, 2) > 0 Then ' スコアが0より大きい場合のみ表示
            outputWs.Cells(i + 1, "A").Value = results(i, 1)
            outputWs.Cells(i + 1, "B").Value = results(i, 2)
            outputWs.Cells(i + 1, "C").Value = ws.Cells(results(i, 1) + 1, "A").Value
        End If
    Next i
End Sub

このVBA実装は、セキュリティ制約のある環境でも安全に使用できる軽量な代替アルゴリズムです。日本語テキストの場合は、分かち書き(単語分割)処理をより洗練されたものに置き換えることで精度を向上させることができます。

0
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
2

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?