6
Help us understand the problem. What are the problem?

More than 1 year has passed since last update.

posted at

updated at

ExcelでN-gramってみる

Bigramしかないけど。

コード

出典:
https://www.reddit.com/r/excel/comments/377vxb/matching_slightly_different_string_variables/


Function CreateNGram(strInput, intN)
    Dim arrNGram, intBound, i, j, strGram, didInc, arrTemp

    If Len(strInput) = 0 Then Exit Function

    ReDim arrNGram(Len(strInput) + 1, 1)
    strInput = Chr(0) & UCase(Trim(strInput)) & Chr(0)
    intBound = -1

    For i = 1 To Len(strInput) - intN + 1
        strGram = Mid(strInput, i, intN)
        didInc = False

        For j = 0 To intBound
            If strGram = arrNGram(j, 0) Then
                arrNGram(j, 1) = arrNGram(j, 1) + 1
                didInc = True
                Exit For
            End If
        Next

        If Not didInc Then
            intBound = intBound + 1
            arrNGram(intBound, 0) = strGram
            arrNGram(intBound, 1) = 1
        End If
    Next

    ReDim arrTemp(intBound, 1)
    For i = 0 To intBound
        arrTemp(i, 0) = arrNGram(i, 0)
        arrTemp(i, 1) = arrNGram(i, 1)
    Next

    CreateNGram = arrTemp
End Function

Function CompareNGram(arr1(), arr2())
    Dim i, j, intMatches, intCount1, intCount2

    intMatches = 0
    intCount1 = 0

    For i = 0 To UBound(arr1)
        intCount1 = intCount1 + arr1(i, 1)
        intCount2 = 0

        For j = 0 To UBound(arr2)
            intCount2 = intCount2 + arr2(j, 1)

            If arr1(i, 0) = arr2(j, 0) Then
                If arr1(i, 1) >= arr2(j, 1) Then
                    intMatches = intMatches + arr2(j, 1)
                Else
                    intMatches = intMatches + arr1(i, 1)
                End If
            End If
        Next
    Next

    CompareNGram = 2 * intMatches / (intCount1 + intCount2)
End Function

Function Bigram(ByVal str1 As String, ByVal str2 As String) As Double
    Dim x(), y()
    x = CreateNGram(str1, 2)
    y = CreateNGram(str2, 2)
    Bigram = CompareNGram(x, y)
End Function

渡した文字列が壊されないようByValを追加しました。:wink:

感想

文字列に空白が含まれていて且つ空白の前後が入れ替わることが発生する場合("姓 名"と"名 姓"みたいな)はレーベンシュタイン距離よりいい塩梅に判定できているようです。
どう標本するかは一考の余地があるかも。

Why not register and get more from Qiita?
  1. We will deliver articles that match you
    By following users and tags, you can catch up information on technical fields that you are interested in as a whole
  2. you can read useful information later efficiently
    By "stocking" the articles you like, you can search right away
Sign upLogin
6
Help us understand the problem. What are the problem?