JavaScript で２つの PDF のテキストの差分を検出

Posted at 2025-03-16

JavaScript で２つの PDF のテキストの差分を検出する方法のメモ。

編集距離

２つの文字列の差分は動的計画法を用いた編集距離で計算することができます。
https://qiita.com/sinchir0/items/02697cae7f960356eaba

PDF のテキストの差分

動的計画法では、２つの系列の対応づけを行うことができます。
２つの文字列の編集距離の計算では動的計画法で文字単位で対応づけを行いますが、PDF では PDF の要素を単位とすると、動的計画法で２つの PDF のテキストの対応づけを行うことができ、対応づけができなかった要素を差分として検出します。

プログラム

プログラム本体

長さ M と N の２つの文字列の編集距離を計算する際に (M+1) × (N+1) のサイズの２次元配列を作成しますが、ここではメモリを節約するため、(M+1) の配列を２つ使って計算を行っています。
メモリを節約しましたが、大きなファイルでは実行時間がかかりすぎて、終わらない可能性があります。

pdf_text_matcher.mjs

//
// pdf text matcher
//
// caller needs to import pdf.js
// import * as pdfjsLib from './pdfjs/build/pdf.mjs';
//


export class PdfTextObj {
  constructor(page, item) {
    this.page = page;
    this.item = item;

    this.x = item.transform[4];
    this.y = item.transform[5];
    this.str = this.item.str;
    this.id = `id_${this.page}_${this.x}_${this.y}`;
  }
}

export class PdfTextMatcher {
  constructor(pdf1, pdf2) {
    this.pdf1 = pdf1;
    this.pdf2 = pdf2;

    this.text_objs1 = [];
    this.text_objs2 = [];

    this.matches = [];
    this.matches1 = {};
    this.matches2 = {};

    this.unmatches1 = [];
    this.unmatches2 = [];
  }

  async get_text_objs_from_page(pdf, page_id) {
    const text_objs = [];

    const pdf_page = await pdf.getPage(page_id);
    const content = await pdf_page.getTextContent({ disableCombineTextItem: true, includeMarkedContent: false });
    content.items.map((item) => {
      if (item.str) {
        const text_obj = new PdfTextObj(page_id, item)
        text_objs.push(text_obj);
      }
    });

    return text_objs;
  }

  async get_text_objs(pdf) {
    const text_objs = [];
    
    for (let i = 1; i <= pdf.numPages; i++) {
      const page_text_objs = await this.get_text_objs_from_page(pdf, i);
      text_objs.push(...page_text_objs);
    }

    return text_objs;
  }

  async diff(id_progress=null) {
    if (id_progress && document.getElementById(id_progress)) document.getElementById(id_progress).innerText = 'parsing pdf1';
    const text_objs1 = this.text_objs1 = await this.get_text_objs(this.pdf1);

    if (id_progress && document.getElementById(id_progress)) document.getElementById(id_progress).innerText = 'parsing pdf2';
    const text_objs2 = this.text_objs2 = await this.get_text_objs(this.pdf2);

    //   _ a b c ...
    // _ 0 1 2 3
    // a 1 0 1 2
    // c 2 1 2 1
    // ...
    
    let len1 = text_objs1.length;
    let len2 = text_objs2.length;

    console.log(len1, len2);

    let col_prev = Array(len1+1);
    let col_cur = Array(len1+1);

    col_prev[0] = { cost: 0, matches: [] };
    for (let s = 1; s <= len1; s++) {
      col_prev[s] = { cost: s, matches: [] };
    }

    for (let t = 1; t <= len2; t++) {
      col_cur[0] = { cost: t, matches: [] };
      for (let s = 1; s <= len1; s++) {
        // col_cur[s] を決定
        // 上
        const cost1 = col_cur[s-1].cost + 1;
        // 左
        const cost2 = col_prev[s].cost + 1;
        // 左上
        let cost3 = col_prev[s-1].cost + 1;
        let matches3 = [...col_prev[s-1].matches];
        if (text_objs1[s-1].str == text_objs2[t-1].str) {
          cost3 = col_prev[s-1].cost;
          matches3.push([s-1, t-1]);
        }

        if (cost1 <= cost2 && cost1 <= cost3) {
          col_cur[s] = {
            cost: cost1,
            //path: [...col_cur[s-1].path, ...[[s-1, t-1]]],
            matches: [...col_cur[s-1].matches],
          };
        }
        else if (cost2 <= cost1 && cost2 <= cost3) {
          col_cur[s] = {
            cost: cost2,
            //path: [...col_prev[s].path, ...[[s-1, t-1]]],
            matches: [...col_prev[s].matches],
          };
        }
        else {
          col_cur[s] = {
            cost: cost3,
            //path: [...col_prev[s-1].path, [(s-1, t-1)]],
            matches: matches3,
          };
        }
      }

      if (t % 10 == 1) {
        const progress = `${t} / ${len2} (${Math.floor(t/len2*100)} %)`;
        console.log(progress);
        if (id_progress && document.getElementById(id_progress)) document.getElementById(id_progress).innerText = progress;
      }

      if (t < len2) {
        const tmp = col_prev;
        col_prev = col_cur;
        col_cur = tmp;
      }
    }

    const result = col_cur[len1];
    this.matches = result.matches;

    console.log(col_cur);
    console.log(this.matches);

    // match
    for (let i = 0; i < this.matches.length; i++) {
      const m = this.matches[i];
      const obj1 = text_objs1[m[0]];
      const obj2 = text_objs2[m[1]];
      this.matches1[obj1.id] = obj2;
      this.matches2[obj2.id] = obj1;
    }

    // unmatch
    for (let i = 0; i < this.text_objs1.length; i++) {
      const obj = this.text_objs1[i];
      if (this.matches1[obj.id]) continue;
      this.unmatches1.push(obj);
    }

    for (let i = 0; i < this.text_objs2.length; i++) {
      const obj = this.text_objs2[i];
      if (this.matches2[obj.id]) continue;
      this.unmatches2.push(obj);
    }
  }

  log_matches() {
    for (let i = 0; i < this.matches.length; i++) {
      const m = this.matches[i];
      const obj1 = this.text_objs1[m[0]];
      const obj2 = this.text_objs2[m[1]];
      console.log(`${m[0]}: ${obj1.str} -- ${m[1]}: ${obj2.str}`);
    }
  }

}

呼び出し元 HTML

HTML 内で呼び出している SimplePdfViewer は以下の記事に記載しました。
https://qiita.com/dakikd/items/27da27acf5abff97207e

pdf_text_diff_viewer.html

<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>pdf text diff</title>
  </head>
  <body>
    <div>pdf の text の diff</div><br>
    <div>
      <form id="id_form" name="form_name">
        <input id="id_pdf1_file" type="file" accept="pdf" /><br />
        <br />
        <input id="id_pdf2_file" type="file" accept="pdf" /><br />
        <br>
        <input id="id_submit" type="button" value="diff" />
      </form>
    </div>

    <div id="id_progress"></div>

    <div>
      <div style="border: solid 1px;">
        <ul id="id_diff_list1"></ul>
      </div>
    </div>

    <div>
      <div style="border: solid 1px;">
        <ul id="id_diff_list2"></ul>
      </div>
    </div>

    <div>
      <div style="display: inline-block; border: solid 1px;">
        <div style="display: flex;">
          <input id="id_pdf1_prev" type="button" value="&lt;" />
          <p id="id_pdf1_page_num"> </p>
          <input id="id_pdf1_next" type="button" value="&gt;" />
        </div>
        <canvas id="id_pdf1_canvas"></canvas>
      </div>

      <div style="display: inline-block; border: solid 1px;">
        <div style="display: flex;">
          <input id="id_pdf2_prev" type="button" value="&lt;" />
          <p id="id_pdf2_page_num"> </p>
          <input id="id_pdf2_next" type="button" value="&gt;" />
        </div>
        <canvas id="id_pdf2_canvas"></canvas>
      </div>
    </div>
  </body>
  <script type="text/javascript" src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
  <script type="module">
import * as pdfjsLib from './pdfjs/build/pdf.mjs';
pdfjsLib.GlobalWorkerOptions.workerSrc = './pdfjs/build/pdf.worker.mjs';
import { SimplePdfViewer } from './simple_pdf_viewer.mjs';
import { PdfTextMatcher } from './pdf_text_matcher.mjs';


async function diff() {
  const pdf1 = pdf_viewer1.pdf;
  const pdf2 = pdf_viewer2.pdf;

  const m = new PdfTextMatcher(pdf1, pdf2);
  await m.diff();
  m.log_matches();

  for (let i = 0; i < m.unmatches1.length; i++) {
    const obj = m.unmatches1[i];
    $('#id_diff_list1').append(`<li>pdf1: p.${obj.page}, "${obj.str}"</li>`);
  }

  for (let i = 0; i < m.unmatches2.length; i++) {
    const obj = m.unmatches2[i];
    $('#id_diff_list2').append(`<li>pdf2: p.${obj.page}, "${obj.str}"</li>`);
  }
}


const pdf1_params = {
  id_canvas: 'id_pdf1_canvas',
  id_file: 'id_pdf1_file',
  id_button_prev: 'id_pdf1_prev',
  id_button_next: 'id_pdf1_next',
  id_text_page_num: 'id_pdf1_page_num',
  scale: 0.75,
};

const pdf2_params = {
  id_canvas: 'id_pdf2_canvas',
  id_file: 'id_pdf2_file',
  id_button_prev: 'id_pdf2_prev',
  id_button_next: 'id_pdf2_next',
  id_text_page_num: 'id_pdf2_page_num',
  scale: 0.75,
};

const pdf_viewer1 = new SimplePdfViewer(pdf1_params);
pdf_viewer1.render_file(pdf1_params.id_file);

const pdf_viewer2 = new SimplePdfViewer(pdf2_params);
pdf_viewer2.render_file(pdf2_params.id_file);

$('#id_submit').on('click', (e) => { diff(); });

  </script>
</html>

実行例

以下の２つのファイルの差分を検出します。

ファイル1

あああああ

いいいいい

ううううう

えええええ

おおおおお

かかかかか

ききききき

くくくくく

けけけけけ

こここここ

さささささ

ししししし

すすすすす

せせせせせ

そそそそそ

たたたたた

ちちちちち

つつつつつ

ててててて

ととととと

ファイル２

あああああ

いいいいい

ううううう

えええええ

おおおおお

＃＃＃＃＃

かかかかか

ききききき

くくくくく

けけけけけ

さささささ

ししししし

すすすすす

せせせせせ

そそそそそ

＃＃＃＃＃

たたたたた

ちちちちち

つつつつつ

ててててて

ととととと

差分

上記の html でファイル１、ファイル２とでマッチしなかった箇所が検出されます。

pdf1: p.1, "こここここ"

pdf2: p.1, "＃＃＃＃＃"
pdf2: p.2, "＃＃＃＃＃"

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up