英単語の類似度（一致文字数）を計算して類似語を検索するCプログラム

#下書き

Last updated at 2025-07-18 / Posted at 2025-07-18

テストです

はじめに

英語学習や単語リスト管理の場面で、
似ている単語を探したいことはありませんか？

たとえば「advertisement」と「advertising」のように、
文字が部分的に一致する単語を見つけたい場合、
単純な文字列一致ではなく「どれくらい似ているか」の判定が必要です。

本記事では「二つの単語の文字列を左から比較し、
一致している文字数と一致率（％）を算出する」
C言語プログラムを紹介します。

さらに、CSV形式の単語帳を読み込んで、次の2種類の検索ができるように設計しています。

- 全単語ペアの類似度検索
- 指定単語との類似語検索
GitHubリポジトリ

プログラムの特徴

- 一致している文字数をカウントしてスコア化
- スコアをもとに一致率（％）を計算（短いほうの単語の文字数に対する割合）
- 一致文字を `*`、不一致文字を `_` で可視化表示
- CSV（英単語,日本語訳）ファイルから単語リストを読み込み
- 類似度の閾値（`--minscore`、一致文字数の下限）を指定して類似単語を抽出
- 特定単語のみの検索（`--search`）も可能
- 同じ単語同士の比較は行わない（重複表示回避）
- 出力先は標準出力またはファイル（`-o`）を指定可能

ソースコード全文

#define _CRT_SECURE_NO_WARNINGS

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_WORDS 1000
#define MAX_LINE 256

/* One vocabulary record: an English word paired with its Japanese translation. */
typedef struct {
    /* English word, stored as a C string in a fixed 64-byte buffer. */
    char english[64];
    /* Japanese translation, stored as a C string in a fixed 128-byte buffer. */
    char japanese[128];
} WordEntry;

/*
 * Count the positions at which two words hold the same character.
 *
 * The words are compared position by position from the left, and the
 * comparison stops at the end of the shorter word.
 *
 * w1, w2: NUL-terminated strings to compare.
 * Returns the number of matching positions (0 if either word is empty).
 */
int similarity_score(const char* w1, const char* w2) {
    int matches = 0;
    const char* a = w1;
    const char* b = w2;

    // Walk both words in lockstep until either one runs out.
    while(*a != '\0' && *b != '\0') {
        if(*a == *b) {
            matches++;
        }
        a++;
        b++;
    }
    return matches;
}

/*
 * Express the similarity of two words as a percentage.
 *
 * The rate is the similarity_score() of the pair divided by the length of
 * the shorter word, times 100.
 *
 * w1, w2: NUL-terminated strings to compare.
 * Returns the percentage, or 0.0 when either word is empty.
 */
double similarity_rate(const char* w1, const char* w2) {
    size_t len1 = strlen(w1);
    size_t len2 = strlen(w2);
    int shorter = (int)((len1 < len2) ? len1 : len2);

    // Nothing to compare against: avoid dividing by zero.
    if(shorter == 0) {
        return 0.0;
    }
    return (double)similarity_score(w1, w2) / shorter * 100.0;
}

/*
 * Write a three-line visual comparison of two words to a stream.
 *
 * Output: w1 on one line, w2 on the next, then a marker line with '*' where
 * the characters at the same position match and '_' where they differ. The
 * marker line is as long as the shorter word and is followed by a blank line.
 *
 * out:    stream to write to.
 * w1, w2: NUL-terminated strings to compare.
 */
void print_similarity_visual(FILE* out, const char* w1, const char* w2) {
    fprintf(out, "%s\n%s\n", w1, w2);

    // One marker per position, stopping at the end of the shorter word.
    for(const char *a = w1, *b = w2; *a != '\0' && *b != '\0'; a++, b++) {
        char mark;
        if(*a == *b) {
            mark = '*';
        } else {
            mark = '_';
        }
        fputc(mark, out);
    }
    fputs("\n\n", out);
}

/*
 * Load "english,japanese" records from a CSV file into entries.
 *
 * Each line is split at its first comma: the text before it becomes the
 * English word and the text after it the Japanese translation. Lines with no
 * comma are skipped. Trailing CR/LF characters are removed, and fields that
 * do not fit the WordEntry buffers are truncated (always NUL-terminated).
 * Reading stops after MAX_WORDS records.
 *
 * filename: path of the CSV file to read.
 * entries:  caller-provided array with room for at least MAX_WORDS records.
 * Returns the number of records stored, or -1 if the file cannot be opened
 * (the reason is reported with perror).
 */
int read_csv(const char* filename, WordEntry* entries) {
    FILE* fp = fopen(filename, "r");
    if(!fp) {
        perror("ファイルオープンエラー");
        return -1;
    }
    char line[MAX_LINE];
    int count = 0;
    while(count < MAX_WORDS && fgets(line, sizeof(line), fp)) {
        size_t len = strlen(line);

        // The line did not fit in the buffer: discard the unread tail so it
        // is not parsed as a separate record on the next iteration.
        if(len > 0 && line[len - 1] != '\n' && !feof(fp)) {
            int ch;
            while((ch = fgetc(fp)) != EOF && ch != '\n') {
                // skip
            }
        }

        // Drop the line terminator, including the '\r' of CRLF files.
        while(len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r')) {
            line[--len] = '\0';
        }

        char* comma = strchr(line, ',');
        if(!comma) continue;
        *comma = '\0';

        // snprintf always NUL-terminates, unlike strncpy, which leaves the
        // destination unterminated when the source fills the buffer.
        WordEntry* entry = &entries[count];
        snprintf(entry->english, sizeof(entry->english), "%s", line);
        snprintf(entry->japanese, sizeof(entry->japanese), "%s", comma + 1);
        count++;
    }
    fclose(fp);
    return count;
}

/*
 * Report every pair of distinct words whose similarity score reaches minscore.
 *
 * Each unordered pair of entries is examined once. Pairs whose English words
 * are identical are skipped. For every reported pair a summary line (score,
 * rate, both words with their translations) and the visual comparison are
 * written to out.
 *
 * out:      stream to write results to.
 * entries:  array of word records.
 * count:    number of valid records in entries.
 * minscore: minimum number of matching characters required to report a pair.
 */
void compare_all_pairs(FILE* out, WordEntry* entries, int count, int minscore) {
    for(int first = 0; first < count; first++) {
        const WordEntry* lhs = &entries[first];

        for(int second = first + 1; second < count; second++) {
            const WordEntry* rhs = &entries[second];

            // Identical spellings are not interesting as "similar" words.
            if(strcmp(lhs->english, rhs->english) == 0) {
                continue;
            }

            int matched = similarity_score(lhs->english, rhs->english);
            if(matched < minscore) {
                continue;
            }

            double percent = similarity_rate(lhs->english, rhs->english);
            fprintf(out, "[一致:%d文字, 類似度:%.1f%%] %s (%s) <-> %s (%s)\n",
                matched, percent,
                lhs->english, lhs->japanese,
                rhs->english, rhs->japanese);
            print_similarity_visual(out, lhs->english, rhs->english);
        }
    }
}

/*
 * Report every entry whose similarity score against target reaches minscore.
 *
 * Entries spelled exactly like target are skipped. For every reported entry a
 * summary line (score, rate, target, the entry and its translation) and the
 * visual comparison are written to out.
 *
 * out:      stream to write results to.
 * entries:  array of word records.
 * count:    number of valid records in entries.
 * target:   NUL-terminated word to search for.
 * minscore: minimum number of matching characters required to report an entry.
 */
void search_single_word(FILE* out, WordEntry* entries, int count, const char* target, int minscore) {
    for(int idx = 0; idx < count; idx++) {
        const WordEntry* candidate = &entries[idx];

        // The target itself is not reported as its own match.
        if(strcmp(candidate->english, target) == 0) {
            continue;
        }

        int matched = similarity_score(candidate->english, target);
        if(matched < minscore) {
            continue;
        }

        double percent = similarity_rate(candidate->english, target);
        fprintf(out, "[一致:%d文字, 類似度:%.1f%%] %s <-> %s (%s)\n",
            matched, percent,
            target,
            candidate->english, candidate->japanese);
        print_similarity_visual(out, target, candidate->english);
    }
}

/*
 * Parse the argument of --minscore as a non-negative decimal integer.
 *
 * Returns the parsed value, or -1 when the text is not a number, has trailing
 * characters, is negative, or does not fit in an int.
 */
static int parse_minscore(const char* text) {
    char* end = NULL;
    errno = 0;
    long value = strtol(text, &end, 10);
    if(end == text || *end != '\0' || errno == ERANGE || value < 0 || value > INT_MAX) {
        return -1;
    }
    return (int)value;
}

/*
 * Command-line entry point.
 *
 * Usage: prog INPUT --minscore N [--search WORD] [-o OUTPUT]
 *
 * Reads the word list from INPUT, then either lists the entries similar to
 * WORD (when --search is given) or all similar pairs, writing to OUTPUT or to
 * standard output. The first argument that is not an option or an option
 * value is taken as INPUT; later unrecognized arguments are ignored.
 *
 * Returns 0 on success and 1 on a usage error or an I/O failure.
 */
int main(int argc, char* argv[]) {
    if(argc < 3) {
        printf("使い方: %s 入力ファイル --minscore N [--search word] [-o 出力ファイル]\n", argv[0]);
        return 1;
    }

    const char *input_file = NULL;
    const char *search_word = NULL;
    const char *output_file = NULL;
    int minscore = -1;

    for(int i=1; i<argc; i++) {
        if(strcmp(argv[i], "--minscore") == 0 && i+1 < argc) {
            // Reject non-numeric input instead of silently treating it as 0.
            minscore = parse_minscore(argv[++i]);
            if(minscore < 0) {
                fprintf(stderr, "--minscore には0以上の整数を指定してください。\n");
                return 1;
            }
        } else if(strcmp(argv[i], "--search") == 0 && i+1 < argc) {
            search_word = argv[++i];
        } else if(strcmp(argv[i], "-o") == 0 && i+1 < argc) {
            output_file = argv[++i];
        } else if(!input_file) {
            input_file = argv[i];
        }
    }

    if(!input_file || minscore < 0) {
        fprintf(stderr, "入力ファイルと --minscore を指定してください。\n");
        return 1;
    }

    // Static storage keeps this large table off the stack and zero-initialized.
    static WordEntry entries[MAX_WORDS];

    // Read the input before touching the output file, so a bad input path
    // neither truncates an existing output file nor leaks an open handle.
    int count = read_csv(input_file, entries);
    if(count < 0) return 1;

    FILE* out = stdout;
    if(output_file) {
        out = fopen(output_file, "w");
        if(!out) {
            perror("出力ファイルオープンエラー");
            return 1;
        }
    }

    if(search_word) {
        search_single_word(out, entries, count, search_word, minscore);
    } else {
        compare_all_pairs(out, entries, count, minscore);
    }

    // fclose flushes buffered output; a failure here means lost results.
    if(out != stdout && fclose(out) != 0) {
        perror("出力ファイルクローズエラー");
        return 1;
    }
    return 0;
}

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up