
Recently, I learned about a library called Unstructured, and I also watched this YouTube video about it.

There was a sample notebook available, so I walked through it.

Setup

This was my first time seeing the %%capture magic command. It suppresses the cell output.

%%capture
%pip install "unstructured[all-docs]" unstructured-client watermark
dbutils.library.restartPython()
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
from IPython.display import JSON

import json

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements, elements_to_json
%load_ext watermark
import unstructured
%watermark --iversions
unstructured       : 0.14.6
json               : 2.0.9
unstructured_client: 0.23.5

Initial exploration (extracting elements)

import unstructured.partition

help(unstructured.partition)
Help on package unstructured.partition in unstructured:

NAME
    unstructured.partition

PACKAGE CONTENTS
    api
    auto
    common
    csv
    doc
    docx
    email
    epub
    html
    image
    json
    lang
    md
    model_init
    msg
    odt
    org
    pdf
    pdf_image (package)
    ppt
    pptx
    rst
    rtf
    strategies
    text
    text_type
    tsv
    utils (package)
    xlsx
    xml

FILE
    /local_disk0/.ephemeral_nfs/envs/pythonEnv-1d54fac3-205e-4212-b4a8-f9a731bb4d57/lib/python3.10/site-packages/unstructured/partition/__init__.py

Store this PDF in the workspace as data/gpt4all.pdf.

from unstructured.partition.pdf import partition_pdf

# Path to the PDF file
filename = "data/gpt4all.pdf"

# Call partition_pdf
# It returns the List[Element] found on the pages of the parsed PDF document
elements = partition_pdf(filename)

# elements now holds a list of all elements found on the pages of the parsed PDF document
elements
[<unstructured.documents.elements.Text at 0x7f5be4700a30>,
 <unstructured.documents.elements.Title at 0x7f5be4700df0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4701930>,
 <unstructured.documents.elements.Text at 0x7f5be47019c0>,
 <unstructured.documents.elements.Title at 0x7f5be4702710>,
 <unstructured.documents.elements.Title at 0x7f5be4701a50>,
 <unstructured.documents.elements.Title at 0x7f5be4701ae0>,
 <unstructured.documents.elements.Title at 0x7f5be4701ba0>,
 <unstructured.documents.elements.Title at 0x7f5be4701c90>,
 <unstructured.documents.elements.Title at 0x7f5be4701d50>,
 <unstructured.documents.elements.Title at 0x7f5be4701de0>,
 <unstructured.documents.elements.Title at 0x7f5be4702020>,
 <unstructured.documents.elements.Title at 0x7f5be4701f30>,
 <unstructured.documents.elements.Title at 0x7f5be4702740>,
 <unstructured.documents.elements.Title at 0x7f5be47021a0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4702290>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be47023b0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4702b60>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4702c50>,
 <unstructured.documents.elements.Title at 0x7f5be4702d40>,
 <unstructured.documents.elements.Title at 0x7f5be4702c20>,
 <unstructured.documents.elements.Text at 0x7f5be4702380>,
 <unstructured.documents.elements.Title at 0x7f5be4702650>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be47027d0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4702f80>,
 <unstructured.documents.elements.Title at 0x7f5be47029e0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4702f50>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4700070>,
 <unstructured.documents.elements.Title at 0x7f5be47001f0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be7f68130>,
 <unstructured.documents.elements.Title at 0x7f5be7f6beb0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be7f6b640>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be7f6b8b0>,
 <unstructured.documents.elements.Title at 0x7f5be7f6b5e0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be7f6bac0>,
 <unstructured.documents.elements.Title at 0x7f5be7f6b580>,
 <unstructured.documents.elements.Title at 0x7f5be7f6be20>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be7f6aef0>,
 <unstructured.documents.elements.Title at 0x7f5be7f6bb50>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be7f6ada0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be7f6ac80>,
 <unstructured.documents.elements.Title at 0x7f5be7f6ac20>,
 <unstructured.documents.elements.Title at 0x7f5be7f6b160>,
 <unstructured.documents.elements.NarrativeText at 0x7f5bec1239a0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5bec123760>,
 <unstructured.documents.elements.Title at 0x7f5bec1231c0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5bec1235e0>,
 <unstructured.documents.elements.Text at 0x7f5be21bd3f0>,
 <unstructured.documents.elements.Text at 0x7f5be21bd450>,
 <unstructured.documents.elements.Text at 0x7f5be21bd540>,
 <unstructured.documents.elements.Text at 0x7f5be21bd630>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be21bd870>,
 <unstructured.documents.elements.Title at 0x7f5be21bda20>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be21bdae0>,
 <unstructured.documents.elements.Text at 0x7f5be7f6b7f0>,
 <unstructured.documents.elements.Text at 0x7f5be21bdc00>,
 <unstructured.documents.elements.Text at 0x7f5be21bdf00>,
 <unstructured.documents.elements.Text at 0x7f5be21be0e0>,
 <unstructured.documents.elements.Text at 0x7f5be21be2c0>,
 <unstructured.documents.elements.Text at 0x7f5be21be4a0>,
 <unstructured.documents.elements.Text at 0x7f5be21be770>,
 <unstructured.documents.elements.Text at 0x7f5be21be950>,
 <unstructured.documents.elements.Title at 0x7f5be21bda80>,
 <unstructured.documents.elements.Text at 0x7f5be21bdcf0>,
 <unstructured.documents.elements.Text at 0x7f5be21bdff0>,
 <unstructured.documents.elements.Text at 0x7f5be21be1d0>,
 <unstructured.documents.elements.Text at 0x7f5be21be3b0>,
 <unstructured.documents.elements.Text at 0x7f5be21be590>,
 <unstructured.documents.elements.Text at 0x7f5be21be860>,
 <unstructured.documents.elements.Text at 0x7f5be21bea40>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be21bee60>,
 <unstructured.documents.elements.Text at 0x7f5be21beb30>,
 <unstructured.documents.elements.Text at 0x7f5be21bec20>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be46843a0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4684490>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4684940>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4684550>,
 <unstructured.documents.elements.Title at 0x7f5be46846a0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4684040>,
 <unstructured.documents.elements.Title at 0x7f5be4684ac0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4684bb0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be4684ca0>,
 <unstructured.documents.elements.Title at 0x7f5be2234160>,
 <unstructured.documents.elements.Title at 0x7f5be22340a0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2234340>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2234430>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2234400>,
 <unstructured.documents.elements.Title at 0x7f5be2234490>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2234730>,
 <unstructured.documents.elements.Title at 0x7f5be22347f0>,
 <unstructured.documents.elements.Title at 0x7f5be2234670>,
 <unstructured.documents.elements.Title at 0x7f5be22348e0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2234af0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2234be0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2234cd0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2234dc0>,
 <unstructured.documents.elements.Title at 0x7f5be2234e80>,
 <unstructured.documents.elements.Title at 0x7f5be2234ee0>,
 <unstructured.documents.elements.Title at 0x7f5be2235060>,
 <unstructured.documents.elements.Title at 0x7f5be2235120>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2235330>,
 <unstructured.documents.elements.Title at 0x7f5be2235090>,
 <unstructured.documents.elements.Title at 0x7f5be2235420>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be22355d0>,
 <unstructured.documents.elements.Title at 0x7f5be2235630>,
 <unstructured.documents.elements.Text at 0x7f5be2235780>,
 <unstructured.documents.elements.Title at 0x7f5be22356f0>,
 <unstructured.documents.elements.Title at 0x7f5be2235990>,
 <unstructured.documents.elements.Title at 0x7f5be22358a0>,
 <unstructured.documents.elements.Text at 0x7f5be2235b40>,
 <unstructured.documents.elements.Title at 0x7f5be22359f0>,
 <unstructured.documents.elements.Title at 0x7f5be2235c90>,
 <unstructured.documents.elements.Title at 0x7f5be2235db0>,
 <unstructured.documents.elements.Title at 0x7f5be2235e10>,
 <unstructured.documents.elements.Title at 0x7f5be2235f60>,
 <unstructured.documents.elements.Title at 0x7f5be2236080>,
 <unstructured.documents.elements.Text at 0x7f5be2236110>,
 <unstructured.documents.elements.Title at 0x7f5be22361d0>,
 <unstructured.documents.elements.Title at 0x7f5be22362c0>,
 <unstructured.documents.elements.Title at 0x7f5be2236440>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be22365c0>,
 <unstructured.documents.elements.Title at 0x7f5be22364d0>,
 <unstructured.documents.elements.Title at 0x7f5be22368c0>,
 <unstructured.documents.elements.Title at 0x7f5be22367a0>,
 <unstructured.documents.elements.Title at 0x7f5be22366b0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2236aa0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2236bc0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be2236ad0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be22351e0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be1f7e800>,
 <unstructured.documents.elements.Title at 0x7f5be1f7f880>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be1f7ead0>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be1f7eb90>,
 <unstructured.documents.elements.NarrativeText at 0x7f5be1f7ece0>]
len(elements)
134

134 elements were extracted.
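Each entry is an Element object. As a quick sanity check (a sketch I added, not in the original notebook), you can inspect the category, text, and metadata of individual elements directly:

# Inspect the first few elements: category (e.g. Title, NarrativeText),
# the extracted text, and the page number from the element metadata.
for el in elements[:5]:
    print(el.category, "|", el.text[:60], "| page:", el.metadata.page_number)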
Let's check the extracted elements.

element_dict = [el.to_dict() for el in elements]
output = json.dumps(element_dict, indent=2)
print(output)
[
  {
    "type": "UncategorizedText",
    "element_id": "b0c5cfcf93a217591e27d5c97845f59b",
    "text": "3 2 0 2",
    "metadata": {
      "coordinates": {
        "points": [
          [
            16.34,
            263.81000000000006
          ],
          [
            16.34,
            303.81000000000006
          ],
          [
            36.34,
            303.81000000000006
          ],
          [
            36.34,
            263.81000000000006
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "d71e9973e25dde0d96dc422b5a8fd429",
    "text": "v o N 6",
    "metadata": {
      "coordinates": {
        "points": [
          [
            16.34,
            308.81000000000006
          ],
          [
            16.34,
            358.25
          ],
          [
            36.34,
            358.25
          ],
          [
            36.34,
            308.81000000000006
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "da9a4b336f710784f847aa01becae2d8",
    "text": "] L C . s c [",
    "metadata": {
      "coordinates": {
        "points": [
          [
            16.34,
            368.24999999999994
          ],
          [
            16.34,
            428.78999999999996
          ],
          [
            36.34,
            428.78999999999996
          ],
          [
            36.34,
            368.24999999999994
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "parent_id": "d71e9973e25dde0d96dc422b5a8fd429",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "UncategorizedText",
    "element_id": "cf466a5a422c76d228e6f9c56a8428ce",
    "text": "1 v 1 3 9 4 0 . 1 1 3 2 : v i X r a",
    "metadata": {
      "coordinates": {
        "points": [
          [
            16.34,
            438.78999999999996
          ],
          [
            16.34,
            604.89
          ],
          [
            36.34,
            604.89
          ],
          [
            36.34,
            438.78999999999996
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "parent_id": "d71e9973e25dde0d96dc422b5a8fd429",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "741c25a4fb94e81aa7239a1c0534a9e8",
    "text": "GPT4All: An Ecosystem of Open Source Compressed Language Models",
    "metadata": {
      "coordinates": {
        "points": [
          [
            77.497,
            78.40315579999992
          ],
          [
            77.497,
            92.74935579999999
          ],
          [
            517.7818779999999,
            92.74935579999999
          ],
          [
            517.7818779999999,
            78.40315579999992
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "3f394fcdfd3c020fa260d031c7aa3551",
    "text": "Yuvanesh Anand Nomic AI yuvanesh@nomic.ai",
    "metadata": {
      "coordinates": {
        "points": [
          [
            69.06,
            110.08451589999993
          ],
          [
            69.06,
            150.07418739999991
          ],
          [
            170.67834999999997,
            150.07418739999991
          ],
          [
            170.67834999999997,
            110.08451589999993
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "c9f10a02ba1baf5861871dea2427ea13",
    "text": "Zach Nussbaum Nomic AI zach@nomic.ai",
    "metadata": {
      "coordinates": {
        "points": [
          [
            196.531,
            110.08451589999993
          ],
          [
            196.531,
            150.07418739999991
          ],
          [
            279.2363818,
            150.07418739999991
          ],
          [
            279.2363818,
            110.08451589999993
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "1fb7ac4ca22ad79ab5acc099895231b9",
    "text": "Adam Treat Nomic AI adam@nomic.ai",
    "metadata": {
      "coordinates": {
        "points": [
          [
            318.538,
            110.08451589999993
          ],
          [
            318.538,
            150.07418739999991
          ],
          [
            396.24615,
            150.07418739999991
          ],
          [
            396.24615,
            110.08451589999993
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "c3528c06b42170925c8845cd0516afaf",
    "text": "Aaron Miller Nomic AI aaron@nomic.ai",
    "metadata": {
      "coordinates": {
        "points": [
          [
            435.058,
            110.08451589999993
          ],
          [
            435.058,
            150.07418739999991
          ],
          [
            518.7436999999999,
            150.07418739999991
          ],
          [
            518.7436999999999,
            110.08451589999993
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "87aa6b53bb4ad01ca3786c2082164fa2",
    "text": "Richard Guo Nomic AI richard@nomic.ai",
    "metadata": {
      "coordinates": {
        "points": [
          [
            100.43099999999997,
            173.28351589999988
          ],
          [
            100.43099999999997,
            213.27318739999987
          ],
          [
            196.07179999999994,
            213.27318739999987
          ],
          [
            196.07179999999994,
            173.28351589999988
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "0f9ab7485cd89cf606234ade290a822f",
    "text": "Ben Schmidt Nomic AI ben@nomic.ai",
    "metadata": {
      "coordinates": {
        "points": [
          [
            261.77199999999993,
            173.28351589999988
          ],
          [
            261.77199999999993,
            213.27318739999987
          ],
          [
            333.5026,
            213.27318739999987
          ],
          [
            333.5026,
            173.28351589999988
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "9eace5e83f6c7f55784da14b30801a6e",
    "text": "GPT4All Community Planet Earth",
    "metadata": {
      "coordinates": {
        "points": [
          [
            392.05999999999995,
            173.28351589999988
          ],
          [
            392.05999999999995,
            199.54930159999992
          ],
          [
            501.98714449999994,
            199.54930159999992
          ],
          [
            501.98714449999994,
            173.28351589999988
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "adefd49403515997446e8633e38d94f7",
    "text": "Brandon Duderstadt\u2217 Nomic AI brandon@nomic.ai",
    "metadata": {
      "coordinates": {
        "points": [
          [
            145.07499999999993,
            235.17709939999986
          ],
          [
            145.07499999999993,
            276.4731873999999
          ],
          [
            255.25451412999993,
            276.4731873999999
          ],
          [
            255.25451412999993,
            235.17709939999986
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": ".",
          "url": "Hfootnote.1",
          "start_index": 42
        }
      ],
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "aa800040abcb3a68245ea9e047eb25a1",
    "text": "Andriy Mulyar\u2217 Nomic AI andriy@nomic.ai",
    "metadata": {
      "coordinates": {
        "points": [
          [
            352.39699999999993,
            235.17709939999997
          ],
          [
            352.39699999999993,
            276.47318740000003
          ],
          [
            442.06024999999994,
            276.47318740000003
          ],
          [
            442.06024999999994,
            235.17709939999997
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "e083963f2380b8c5311d045f988e23fa",
    "text": "Abstract",
    "metadata": {
      "coordinates": {
        "points": [
          [
            157.75799999999992,
            304.80951590000006
          ],
          [
            157.75799999999992,
            316.7646159000001
          ],
          [
            202.24292709999992,
            316.7646159000001
          ],
          [
            202.24292709999992,
            304.80951590000006
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "a90d3e5cedb83aa21ccdb130e575542b",
    "text": "Large language models (LLMs) have recently achieved human-level performance on a range of professional and academic benchmarks. The accessibility of these models has lagged behind their performance. State-of-the-art LLMs re- quire costly infrastructure; are only accessible via rate-limited, geo-locked, and censored web interfaces; and lack publicly available code and technical reports.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            87.625,
            331.47432159999994
          ],
          [
            87.625,
            437.07792159999997
          ],
          [
            273.7749204880001,
            437.07792159999997
          ],
          [
            273.7749204880001,
            331.47432159999994
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "parent_id": "e083963f2380b8c5311d045f988e23fa",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "3f27ae624f926eb91c661791f642f9e3",
    "text": "In this paper, we tell the story of GPT4All, a popular open source repository that aims to democratize access to LLMs. We outline the technical details of the original GPT4All model family, as well as the evolution of the GPT4All project from a single model into a fully fledged open source ecosystem. It is our hope that this paper acts as both a technical overview of the original GPT4All models as well as a case study on the subsequent growth of the GPT4All open source ecosystem.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            87.874,
            444.3423216
          ],
          [
            87.874,
            573.8569216
          ],
          [
            272.1287004640001,
            573.8569216
          ],
          [
            272.1287004640001,
            444.3423216
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "parent_id": "e083963f2380b8c5311d045f988e23fa",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "185bfce5f50e0f150589bc1d066df5a9",
    "text": "variety of queries, responding only with the now infa- mous \"As an AI Language Model, I cannot...\" prefix (Vincent, 2023). These transparency and accessibility concerns spurred several developers to begin creating open source large language model (LLM) alternatives. Several grassroots efforts focused on fine tuning Meta\u2019s open code LLaMA model (Touvron et al., 2023; McMil- lan, 2023), whose weights were leaked on BitTorrent less than a week prior to the release of GPT-4 (Verge, 2023). GPT4All started as one of these variants.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            305.813,
            306.45532159999993
          ],
          [
            305.813,
            424.01392159999995
          ],
          [
            526.1474706388001,
            424.01392159999995
          ],
          [
            526.1474706388001,
            306.45532159999993
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "Vincent",
          "url": "cite.verge2023ai",
          "start_index": 106
        },
        {
          "text": "2023",
          "url": "cite.verge2023ai",
          "start_index": 115
        },
        {
          "text": "Touvronetal .,",
          "url": "cite.touvron2023llama",
          "start_index": 345
        },
        {
          "text": "2023",
          "url": "cite.touvron2023llama",
          "start_index": 361
        },
        {
          "text": "McMil",
          "url": "cite.wsj_llama",
          "start_index": 367
        },
        {
          "text": "lan",
          "url": "cite.wsj_llama",
          "start_index": 374
        },
        {
          "text": "2023",
          "url": "cite.wsj_llama",
          "start_index": 379
        },
        {
          "text": "Verge",
          "url": "cite.verge-meta-ai-leak-2023",
          "start_index": 474
        },
        {
          "text": "2023",
          "url": "cite.verge-meta-ai-leak-2023",
          "start_index": 481
        }
      ],
      "page_number": 1,
      "parent_id": "e083963f2380b8c5311d045f988e23fa",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "c4a4c71ac956e6c69d6d761d32e81e0f",
    "text": "In this paper, we tell the story of GPT4All. We com- ment on the technical details of the original GPT4All model (Anand et al., 2023), as well as the evolution of GPT4All from a single model to an ecosystem of several models. We remark on the impact that the project has had on the open source community, and discuss future directions. It is our hope that this paper acts as both a technical overview of the original GPT4All models as well as a case study on the subsequent growth of the GPT4All open source ecosystem.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            305.783,
            426.5143216
          ],
          [
            305.783,
            544.0729216
          ],
          [
            526.0674912600001,
            544.0729216
          ],
          [
            526.0674912600001,
            426.5143216
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "Anandetal .,",
          "url": "cite.gpt4all",
          "start_index": 114
        },
        {
          "text": "2023",
          "url": "cite.gpt4all",
          "start_index": 128
        }
      ],
      "page_number": 1,
      "parent_id": "e083963f2380b8c5311d045f988e23fa",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "4c756886e4dbf688c1568c8e65bda77d",
    "text": "2 The Original GPT4All Model",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            557.3085159
          ],
          [
            306.142,
            569.2636159
          ],
          [
            474.5295835,
            569.2636159
          ],
          [
            474.5295835,
            557.3085159
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "49fcd165ead3de8d30afc73d0ac33943",
    "text": "2.1 Data Collection and Curation",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            578.7575833999999
          ],
          [
            306.142,
            588.7201834
          ],
          [
            454.4751514,
            588.7201834
          ],
          [
            454.4751514,
            578.7575833999999
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "UncategorizedText",
    "element_id": "b06f3763a242682a426d25678eee97da",
    "text": "1",
    "metadata": {
      "coordinates": {
        "points": [
          [
            70.866,
            589.5385159
          ],
          [
            70.866,
            601.4936159
          ],
          [
            76.84355,
            601.4936159
          ],
          [
            76.84355,
            589.5385159
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "parent_id": "49fcd165ead3de8d30afc73d0ac33943",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "ce826364cf996377d784570e339ddbcd",
    "text": "Introduction",
    "metadata": {
      "coordinates": {
        "points": [
          [
            88.79865,
            589.5385159
          ],
          [
            88.79865,
            601.4936159
          ],
          [
            153.6789777,
            601.4936159
          ],
          [
            153.6789777,
            589.5385159
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "3abe933a09cfef67aa4b1b9a3a889472",
    "text": "On March 14 2023, OpenAI released GPT-4, a large language model capable of achieving human level per- formance on a variety of professional and academic benchmarks. Despite the popularity of the release, the GPT-4 technical report (OpenAI, 2023) contained virtually no details regarding the architecture, hard- ware, training compute, dataset construction, or training method used to create the model. Moreover, users could only access the model through the internet interface at chat.openai.com, which was severely rate limited and unavailable in several locales (e.g. Italy) (BBC News, 2023). Additionally, GPT-4 refused to answer a wide",
    "metadata": {
      "coordinates": {
        "points": [
          [
            70.507,
            611.3753216
          ],
          [
            70.507,
            752.8449216
          ],
          [
            290.7879641256002,
            752.8449216
          ],
          [
            290.7879641256002,
            611.3753216
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "OpenAI",
          "url": "cite.openai2023gpt4",
          "start_index": 232
        },
        {
          "text": "2023",
          "url": "cite.openai2023gpt4",
          "start_index": 240
        },
        {
          "text": "BBCNews",
          "url": "cite.bbc2023chatgpt",
          "start_index": 578
        }
      ],
      "page_number": 1,
      "parent_id": "ce826364cf996377d784570e339ddbcd",
      "filetype": "application/pdf"
    }
  },
  {
    "type":

*** WARNING: max output size exceeded, skipping output. ***

996
          ],
          [
            421.855008724,
            212.44092160000002
          ],
          [
            526.1562576520001,
            212.44092160000002
          ],
          [
            526.1562576520001,
            202.47832159999996
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "81055478269ce81b5561884c98424677",
    "text": "https://huggingface.co/NousResearch/ Nous-Hermes-llama-2-7b. Model on Hugging Face.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            321.086,
            213.0188923999999
          ],
          [
            321.086,
            245.31792159999998
          ],
          [
            524.408199856,
            245.31792159999998
          ],
          [
            524.408199856,
            213.0188923999999
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "https :// huggingface . co / NousResearch",
          "url": "https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b",
          "start_index": 0
        },
        {
          "text": "Nous - Hermes - llama - 2 - 7b",
          "url": "https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b",
          "start_index": 37
        }
      ],
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "95738f8b6f648f2d97a991304ccb420e",
    "text": "Nous-Research.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            256.2593215999999
          ],
          [
            306.142,
            266.2219216
          ],
          [
            370.76121686799996,
            266.2219216
          ],
          [
            370.76121686799996,
            256.2593215999999
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "UncategorizedText",
    "element_id": "2484431e07b97b4f98a6ac8d59303d97",
    "text": "2023d.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            381.268571836,
            256.2593215999999
          ],
          [
            381.268571836,
            266.2219216
          ],
          [
            409.213664836,
            266.2219216
          ],
          [
            409.213664836,
            256.2593215999999
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 5,
      "parent_id": "95738f8b6f648f2d97a991304ccb420e",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "0d0a73807f488d425d129e474ce8114a",
    "text": "Redmond-puffin-13b.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            438.459474892,
            256.2593215999999
          ],
          [
            438.459474892,
            266.2219216
          ],
          [
            526.1562576520001,
            266.2219216
          ],
          [
            526.1562576520001,
            256.2593215999999
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "c3c419152e30de95467de68011b36673",
    "text": "https://huggingface.co/NousResearch/ Redmond-Puffin-13B. Model on Hugging Face.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            321.086,
            266.7988924
          ],
          [
            321.086,
            288.1389216
          ],
          [
            517.2897843999999,
            288.1389216
          ],
          [
            517.2897843999999,
            266.7988924
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "https :// huggingface . co / NousResearch Redmond - Puffin - 13B . ModelonHuggingFace",
          "url": "https://huggingface.co/NousResearch/Redmond-Puffin-13B",
          "start_index": 0
        }
      ],
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "f4154c37f41040e7f7b5dd83d6bb9f7d",
    "text": "OpenAI. 2023. Gpt-4 technical report.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            299.08032159999993
          ],
          [
            306.142,
            309.0429216
          ],
          [
            459.6855912000001,
            309.0429216
          ],
          [
            459.6855912000001,
            299.08032159999993
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "Gpt - 4technicalreport",
          "url": "http://arxiv.org/abs/2303.08774",
          "start_index": 14
        }
      ],
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "0a537655c8ca57a623a7fa87b3193a18",
    "text": "Victor Sanh, Albert Webson, Colin Raffel, Stephen H. Bach, Lintang Sutawika, Zaid Alyafeai, Antoine Chaffin, Arnaud Stiegler, Teven Le Scao, Arun Raja, Manan Dey, M Saiful Bari, Canwen Xu, Urmish Thakker, Shanya Sharma Sharma, Eliza Szczechla, Taewoon Kim, Gunjan Chhablani, Nihal Nayak, Debajyoti Datta, Jonathan Chang, Mike Tian-Jian Jiang, Han Wang, Matteo Manica, Sheng Shen, Zheng Xin Yong, Harshit Pandey, Rachel Bawden, Thomas Wang, Trishala Neeraj, Jos Rozen, Ab- heesht Sharma, Andrea Santilli, Thibault Fevry, Ja- son Alan Fries, Ryan Teehan, Stella Biderman, Leo Gao, Tali Bers, Thomas Wolf, and Alexander M. Rush. 2021. Multitask prompted training enables zero-shot task generalization.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            319.98332159999995
          ],
          [
            306.142,
            483.3709216
          ],
          [
            526.1562576520001,
            483.3709216
          ],
          [
            526.1562576520001,
            319.98332159999995
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": ".",
          "url": "http://arxiv.org/abs/2110.08207",
          "start_index": 629
        },
        {
          "text": "zero - shottaskgeneralization",
          "url": "http://arxiv.org/abs/2110.08207",
          "start_index": 667
        }
      ],
      "page_number": 5,
      "parent_id": "f4154c37f41040e7f7b5dd83d6bb9f7d",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "886a7f467ba741ee5d07336e15bca148",
    "text": "Stability-AI. 2023. Stablelm. https://github.com/ Stability-AI/StableLM. GitHub repository.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            493.89389239999997
          ],
          [
            306.142,
            515.2339216
          ],
          [
            525.4057,
            515.2339216
          ],
          [
            525.4057,
            493.89389239999997
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "https :// github . com",
          "url": "https://github.com/Stability-AI/StableLM",
          "start_index": 30
        },
        {
          "text": "Stability - AI / StableLM",
          "url": "https://github.com/Stability-AI/StableLM",
          "start_index": 50
        }
      ],
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "6161d90c7111121c9df590576b63d8d2",
    "text": "https://github.com/",
    "metadata": {
      "coordinates": {
        "points": [
          [
            430.761,
            525.7558924
          ],
          [
            430.761,
            535.7184924000001
          ],
          [
            525.4057,
            535.7184924000001
          ],
          [
            525.4057,
            525.7558924
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "14880807bec39d9525429193f10ca671",
    "text": "StanGirard. 2023.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            526.1743216
          ],
          [
            306.142,
            536.1369216
          ],
          [
            381.563265544,
            536.1369216
          ],
          [
            381.563265544,
            526.1743216
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "819962e277b97813ce073fd44e85f56f",
    "text": "quivr. StanGirard/quivr. GitHub repository.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            321.086,
            526.1743216
          ],
          [
            321.086,
            547.0959216
          ],
          [
            481.48406000000006,
            547.0959216
          ],
          [
            481.48406000000006,
            526.1743216
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "page_number": 5,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "f9127289b648f97d5468d9c534385556",
    "text": "Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. Stanford alpaca: An instruction-following llama model. https:// github.com/tatsu-lab/stanford_alpaca.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            558.0373216
          ],
          [
            306.142,
            611.8349216
          ],
          [
            525.7912084368,
            611.8349216
          ],
          [
            525.7912084368,
            558.0373216
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "https",
          "url": "https://github.com/tatsu-lab/stanford_alpaca",
          "start_index": 189
        },
        {
          "text": "github . com / tatsu - lab / stanford _ alpaca",
          "url": "https://github.com/tatsu-lab/stanford_alpaca",
          "start_index": 198
        }
      ],
      "page_number": 5,
      "parent_id": "819962e277b97813ce073fd44e85f56f",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "1e03777b59eb46b371e2c0ec4e04c04c",
    "text": "Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. Llama: Open and efficient foundation language models.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            622.7763216
          ],
          [
            306.142,
            698.4919216
          ],
          [
            526.1521733600001,
            698.4919216
          ],
          [
            526.1521733600001,
            622.7763216
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "Llama",
          "url": "http://arxiv.org/abs/2302.13971",
          "start_index": 238
        },
        {
          "text": "models",
          "url": "http://arxiv.org/abs/2302.13971",
          "start_index": 283
        }
      ],
      "page_number": 5,
      "parent_id": "819962e277b97813ce073fd44e85f56f",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "cb3d4cb151a83b755fbb5d192919a87a",
    "text": "The Verge. 2023. Meta\u2019s powerful ai language model has leaked online \u2014 what happens now? The Verge.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            709.4333216
          ],
          [
            306.142,
            730.3549216
          ],
          [
            526.15332765,
            730.3549216
          ],
          [
            526.15332765,
            709.4333216
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "Meta \u2019 hasleakedonline \u2014 whathappensnow ? TheVerge",
          "url": "https://www.theverge.com/2023/3/8/23629362/meta-ai-language-model-llama-leak-online-misuse",
          "start_index": 17
        },
        {
          "text": "hasleakedonline \u2014 whathappensnow ?",
          "url": "https://www.theverge.com/2023/3/8/23629362/meta-ai-language-model-llama-leak-online-misuse",
          "start_index": 51
        }
      ],
      "page_number": 5,
      "parent_id": "819962e277b97813ce073fd44e85f56f",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "e3c968af341f5c57d0f600fc18d74a87",
    "text": "James Vincent. 2023. As an ai generated language model: The phrase that shows how ai is polluting the web. The Verge.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            306.142,
            741.2953216
          ],
          [
            306.142,
            773.1759216
          ],
          [
            524.4084191080001,
            773.1759216
          ],
          [
            524.4084191080001,
            741.2953216
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": ". model : theweb . TheVerge",
          "url": "https://www.theverge.com/2023/4/25/23697218/ai-generated-spam-fake-user-reviews-as-an-ai-language-model",
          "start_index": 19
        },
        {
          "text": "model : theweb . TheVerge",
          "url": "https://www.theverge.com/2023/4/25/23697218/ai-generated-spam-fake-user-reviews-as-an-ai-language-model",
          "start_index": 49
        },
        {
          "text": "theweb",
          "url": "https://www.theverge.com/2023/4/25/23697218/ai-generated-spam-fake-user-reviews-as-an-ai-language-model",
          "start_index": 98
        }
      ],
      "page_number": 5,
      "parent_id": "819962e277b97813ce073fd44e85f56f",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "056688ab6012c536a7138e0807ad0938",
    "text": "Ben Wang and Aran Komatsuzaki. 2021. GPT-J-6B: A 6 Billion Parameter Autoregressive Language https://github.com/kingoflolz/ Model. mesh-transformer-jax.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            70.866,
            74.0143215999999
          ],
          [
            70.866,
            116.85392159999992
          ],
          [
            290.5144309800001,
            116.85392159999992
          ],
          [
            290.5144309800001,
            74.0143215999999
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "https :// github . com / kingoflolz",
          "url": "https://github.com/kingoflolz/mesh-transformer-jax",
          "start_index": 93
        },
        {
          "text": "mesh - transformer - jax",
          "url": "https://github.com/kingoflolz/mesh-transformer-jax",
          "start_index": 131
        }
      ],
      "page_number": 6,
      "parent_id": "819962e277b97813ce073fd44e85f56f",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "363eb135576e9f6c12c1dd7f912c5941",
    "text": "Eric J. Wang. 2023. alpaca-lora. https://github. com/tloen/alpaca-lora. GitHub repository.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            70.866,
            125.40089239999998
          ],
          [
            70.866,
            146.74092159999998
          ],
          [
            292.6205,
            146.74092159999998
          ],
          [
            292.6205,
            125.40089239999998
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "https :// github",
          "url": "https://github.com/tloen/alpaca-lora",
          "start_index": 33
        },
        {
          "text": "com / tloen / alpaca - lora",
          "url": "https://github.com/tloen/alpaca-lora",
          "start_index": 49
        }
      ],
      "page_number": 6,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "9e1872a7627d65000001f6a1a6cdd620",
    "text": "Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A. Smith, Daniel Khashabi, and Han- naneh Hajishirzi. 2023. Self-instruct: Aligning lan- guage models with self-generated instructions.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            70.866,
            155.7063215999999
          ],
          [
            70.866,
            198.54592160000004
          ],
          [
            290.784716692,
            198.54592160000004
          ],
          [
            290.784716692,
            155.7063215999999
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "Self - instruct : Aligninglan",
          "url": "http://arxiv.org/abs/2212.10560",
          "start_index": 121
        },
        {
          "text": "guagemodelswithself - generatedinstructions",
          "url": "http://arxiv.org/abs/2212.10560",
          "start_index": 150
        }
      ],
      "page_number": 6,
      "parent_id": "363eb135576e9f6c12c1dd7f912c5941",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "1d4f3165609e78d1d6fb3065ae898420",
    "text": "Can Xu, Qingfeng Sun, Kai Zheng, Xiubo Geng, Pu Zhao, Jiazhan Feng, Chongyang Tao, and Daxin Jiang. 2023. Wizardlm: Empowering large language models to follow complex instructions.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            70.866,
            207.51132159999997
          ],
          [
            70.866,
            250.3509216
          ],
          [
            290.3823269040001,
            250.3509216
          ],
          [
            290.3823269040001,
            207.51132159999997
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": "Wizardlm",
          "url": "http://arxiv.org/abs/2304.12244",
          "start_index": 106
        },
        {
          "text": "modelstofollowcomplexinstructions",
          "url": "http://arxiv.org/abs/2304.12244",
          "start_index": 142
        }
      ],
      "page_number": 6,
      "parent_id": "363eb135576e9f6c12c1dd7f912c5941",
      "filetype": "application/pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "7ef5091817209ce37c6cfb895b7c8acd",
    "text": "Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric. P Xing, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023. Judging llm-as-a-judge with mt-bench and chatbot arena.",
    "metadata": {
      "coordinates": {
        "points": [
          [
            70.866,
            259.3163215999999
          ],
          [
            70.866,
            313.1139216
          ],
          [
            290.37824261200007,
            313.1139216
          ],
          [
            290.37824261200007,
            259.3163215999999
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 595.276,
        "layout_height": 841.89
      },
      "file_directory": "data",
      "filename": "gpt4all.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-06-17T01:44:00",
      "links": [
        {
          "text": ".",
          "url": "http://arxiv.org/abs/2306.05685",
          "start_index": 184
        },
        {
          "text": "llm - as - a - judgewithmt - benchandchatbotarena",
          "url": "http://arxiv.org/abs/2306.05685",
          "start_index": 194
        }
      ],
      "page_number": 6,
      "parent_id": "363eb135576e9f6c12c1dd7f912c5941",
      "filetype": "application/pdf"
    }
  }
]

Let's check which element types were extracted.

unique_types = set()

for item in element_dict:
    unique_types.add(item['type'])

print(unique_types)
{'Title', 'UncategorizedText', 'NarrativeText'}
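As an optional variant (a small sketch I added, not part of the original notebook), collections.Counter shows not just which types appear but how many elements of each type were extracted:

from collections import Counter

# tally the extracted element types; element_dict is the list of
# dictionaries built earlier with el.to_dict()
type_counts = Counter(item["type"] for item in element_dict)
print(type_counts)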

There is no Image type. Image information was not extracted as expected, so let's use a different strategy.

Extracting images from the PDF

  • Assume the target PDF contains tables and that we want to save the image data.
  • The strategy parameter must be set to hi_res. To extract tables while preserving their structure, this strategy combines computer vision with Optical Character Recognition (OCR).

Note: To further improve image extraction, Unstructured provides an API that builds on and improves the existing open-source models.
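For reference, here is a minimal sketch of calling that hosted API with the unstructured_client package imported during setup. The API key placeholder is an assumption on my part, and the request parameters may differ slightly between client versions, so treat this as a sketch rather than the definitive usage:

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

# assumption: you have an Unstructured API key available
client = UnstructuredClient(api_key_auth="YOUR_API_KEY")

# read the same PDF used above and wrap it for the request
with open(filename, "rb") as f:
    files = shared.Files(content=f.read(), file_name=filename)

req = shared.PartitionParameters(files=files, strategy="hi_res")

try:
    # send the PDF to the hosted partition endpoint
    resp = client.general.partition(req)
    print(len(resp.elements))
except SDKError as e:
    print(e)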

Depending on your machine, you may run into various module/library issues. The following links may help.

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename=filename,
    strategy="hi_res",
)

However, this raised the error PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

Following this reference, I installed it from the web terminal.

apt-get install poppler-utils
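After installation, a quick sanity check (my own addition, not part of the original notebook) confirms that the poppler utilities are visible from the notebook's Python process:

import shutil

# pdfinfo ships with poppler-utils; a non-None path means it is on PATH
print(shutil.which("pdfinfo"))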

It is convenient that the web terminal can now be accessed directly from the notebook screen.
Screenshot 2024-06-17 at 11.53.40.png

element_dict = [el.to_dict() for el in elements]

unique_types = set()

for item in element_dict:
    unique_types.add(item['type'])

print(unique_types)

This time, Image elements have also been extracted.

{'FigureCaption', 'ListItem', 'NarrativeText', 'Footer', 'Image', 'Table', 'UncategorizedText', 'Header', 'Title'}
images = [el for el in elements if el.category == "Image"]

print(images[5].text)
print(images[5].metadata.text_as_html)
Github Repo Growth — opraall — UamMa — Alpaca 50000 40000 30000 Github Stars 20000 10000 0 20 40 60 80 100 120 140 Days Since Launch
None
images
[<unstructured.documents.elements.Image at 0x7f5b7b617d60>,
 <unstructured.documents.elements.Image at 0x7f5b7b614340>,
 <unstructured.documents.elements.Image at 0x7f5b7b6170d0>,
 <unstructured.documents.elements.Image at 0x7f5b7b615d80>,
 <unstructured.documents.elements.Image at 0x7f5b7b617f10>,
 <unstructured.documents.elements.Image at 0x7f5b7b370df0>]
len(images)

Six images were extracted.

6

Another approach (extracting / displaying images + elements)

partition_pdf??
Signature: def partition_pdf(filename: Optional[str]=None, file: Optional[IO[bytes]]=None, include_page_breaks: bool=False, strategy: str=PartitionStrategy.AUTO, infer_table_structure: bool=False, ocr_languages: Optional[str]=None, languages: Optional[list[str]]=None, include_metadata: bool=True, metadata_filename: Optional[str]=None, metadata_last_modified: Optional[str]=None, chunking_strategy: Optional[str]=None, hi_res_model_name: Optional[str]=None, extract_images_in_pdf: bool=False, extract_image_block_types: Optional[list[str]]=None, extract_image_block_output_dir: Optional[str]=None, extract_image_block_to_payload: bool=False, date_from_file_object: bool=False, starting_page_number: int=1, extract_forms: bool=False, form_extraction_skip_tables: bool=True, **kwargs: Any) -> list[Element]
Source:   
@process_metadata()
@add_metadata_with_filetype(FileType.PDF)
@add_chunking_strategy
def partition_pdf(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    include_page_breaks: bool = False,
    strategy: str = PartitionStrategy.AUTO,
    infer_table_structure: bool = False,
    ocr_languages: Optional[str] = None,  # changing to optional for deprecation
    languages: Optional[list[str]] = None,
    include_metadata: bool = True,  # used by decorator
    metadata_filename: Optional[str] = None,  # used by decorator
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,  # used by decorator
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
    extract_image_block_types: Optional[list[str]] = None,
    extract_image_block_output_dir: Optional[str] = None,
    extract_image_block_to_payload: bool = False,
    date_from_file_object: bool = False,
    starting_page_number: int = 1,
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    **kwargs: Any,
) -> list[Element]:
    """Parses a pdf document into a list of interpreted elements.
    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
    strategy
        The strategy to use for partitioning the PDF. Valid strategies are "hi_res",
        "ocr_only", and "fast". When using the "hi_res" strategy, the function uses
        a layout detection model to identify document elements. When using the
        "ocr_only" strategy, partition_pdf simply extracts the text from the
        document using OCR and processes it. If the "fast" strategy is used, the text
        is extracted directly from the PDF. The default strategy `auto` will determine
        when a page can be extracted using `fast` mode, otherwise it will fall back to `hi_res`.
    infer_table_structure
        Only applicable if `strategy=hi_res`.
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        The languages present in the document, for use in partitioning and/or OCR. To use a language
        with Tesseract, you'll first need to install the appropriate Tesseract language pack.
    metadata_last_modified
        The last modified date for the document.
    hi_res_model_name
        The layout detection model used when partitioning strategy is set to `hi_res`.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
        If True, any detected images will be saved in the path specified by
        'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
        'extract_image_block_types' for broader extraction capabilities.
    extract_image_block_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
        saved in the path specified by 'extract_image_block_output_dir' or stored as base64
        encoded data within metadata fields.
    extract_image_block_to_payload
        Only applicable if `strategy=hi_res`.
        If True, images of the element type(s) defined in 'extract_image_block_types' will be
        encoded as base64 data and stored in two metadata fields: 'image_base64' and
        'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
    extract_image_block_output_dir
        Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
        The filesystem path for saving images of the element type(s)
        specified in 'extract_image_block_types'.
    date_from_file_object
        Applies only when providing file via `file` parameter. If this option is True, attempt
        infer last_modified metadata from bytes, otherwise set it to None.
    extract_forms
        Whether the form extraction logic should be run
        (results in adding FormKeysValues elements to output).
    form_extraction_skip_tables
        Whether the form extraction logic should ignore regions designated as Tables.
    """

    exactly_one(filename=filename, file=file)

    languages = check_language_args(languages or [], ocr_languages) or ["eng"]

    return partition_pdf_or_image(
        filename=filename,
        file=file,
        include_page_breaks=include_page_breaks,
        strategy=strategy,
        infer_table_structure=infer_table_structure,
        languages=languages,
        metadata_last_modified=metadata_last_modified,
        hi_res_model_name=hi_res_model_name,
        extract_images_in_pdf=extract_images_in_pdf,
        extract_image_block_types=extract_image_block_types,
        extract_image_block_output_dir=extract_image_block_output_dir,
        extract_image_block_to_payload=extract_image_block_to_payload,
        date_from_file_object=date_from_file_object,
        starting_page_number=starting_page_number,
        extract_forms=extract_forms,
        form_extraction_skip_tables=form_extraction_skip_tables,
        **kwargs,
    )
File:      /local_disk0/.ephemeral_nfs/envs/pythonEnv-1d54fac3-205e-4212-b4a8-f9a731bb4d57/lib/python3.10/site-packages/unstructured/partition/pdf.py
Line:      101
Type:      function
# Retrieve the elements
path = "images"
raw_pdf_elements = partition_pdf(
    filename=filename,
    # Unstructured first identifies the embedded image blocks
    # Only applies when strategy="hi_res"
    extract_images_in_pdf=True,
    strategy="hi_res",
    infer_table_structure=True,
    # Only applies when strategy="hi_res"
    extract_image_block_output_dir=path,
)
element_dict = [el.to_dict() for el in raw_pdf_elements]

unique_types = set()

for item in element_dict:
    unique_types.add(item['type'])

print(unique_types)
{'FigureCaption', 'ListItem', 'NarrativeText', 'Footer', 'Image', 'Table', 'UncategorizedText', 'Header', 'Title'}
images = [el for el in raw_pdf_elements if el.category == "Image"]

print(images[5].text)
print(images[5].metadata.text_as_html)
Github Repo Growth — opraall — UamMa — Alpaca 50000 40000 30000 Github Stars 20000 10000 0 20 40 60 80 100 120 140 Days Since Launch
None

Let's display the extracted images.

from IPython.display import Image, display
import glob

# Define the path to the folder that holds the images
folder_path = "images/*.jpg"  # update the file type as needed

# Use glob to find the JPG files in the specified folder
image_files = glob.glob(folder_path)

# Iterate over the list of image files and display each one inline
for image_file in image_files:
    display(Image(filename=image_file))

Screenshot 2024-06-17 at 11.58.39.png
Screenshot 2024-06-17 at 11.58.51.png
Screenshot 2024-06-17 at 11.59.05.png

The images have been extracted cleanly.
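Incidentally, the partition_pdf docstring shown above also mentions extract_image_block_to_payload. As a hedged sketch of that variant (my own addition, not from the original notebook), detected image blocks can be kept as base64 data in each element's metadata instead of being written to disk, which can be handy when passing them to a downstream model:

# keep detected Image/Table blocks as base64 in metadata instead of files
payload_elements = partition_pdf(
    filename=filename,
    strategy="hi_res",
    extract_image_block_types=["Image", "Table"],
    extract_image_block_to_payload=True,
)

imgs = [el for el in payload_elements if el.category == "Image"]
# per the docstring, the base64 data and MIME type land in these metadata fields
print(imgs[0].metadata.image_mime_type)
print(imgs[0].metadata.image_base64[:50])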

I will try the last part, which uses a multimodal model, after getting Ollama running.
