0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

Posted at

import streamlit as st
import pdfplumber
import pandas as pd
import re
import mistune # 高速Markdownパーサ

st.set_page_config(page_title="PDF → Markdown 抽出ツール", layout="wide")
st.title("📄 PDFをMarkdownに変換してテキスト・表を抽出")

# Markdownに整形する関数

# A numbered section heading: ASCII or full-width digits followed by a
# period (ASCII or full-width), an ideographic comma, or a closing
# parenthesis (ASCII or full-width). Code points are written as escapes so
# the pattern survives Unicode normalisation of the source file.
_HEADING_RE = re.compile(r"^[0-9\uFF10-\uFF19]+[.\uFF0E\u3001)\uFF09]")

# A column separator: one or more tabs, or a run of two or more whitespace
# characters.
_CELL_SEP_RE = re.compile(r"\t+|\s{2,}")


def text_to_markdown(text: str) -> str:
    """Convert plain text extracted from a PDF into simple Markdown.

    Each non-blank line is stripped and classified, in this order:

    * a line starting with a section number (e.g. ``1.``, ``2)``) becomes a
      level-2 heading (``## ...``);
    * a line containing a tab or a run of two or more whitespace characters
      is treated as a table row and rendered as ``| a | b |``;
    * anything else is emitted unchanged.

    Consecutive table rows form one table. The first row of each table is
    followed by a ``| --- |`` delimiter row, because pipe tables are only
    recognised by Markdown parsers when that delimiter is present. A blank
    line is placed before and after every table so neighbouring text is not
    absorbed into it. Literal ``|`` characters inside a cell are escaped.

    Args:
        text: Raw text; lines are separated by ``"\n"``.

    Returns:
        The Markdown text, lines joined by ``"\n"`` (empty string when the
        input has no non-blank lines).
    """
    md_lines = []
    in_table = False

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        is_heading = _HEADING_RE.match(line) is not None
        if not is_heading and _CELL_SEP_RE.search(line):
            cells = [
                cell.strip().replace("|", "\\|")
                for cell in _CELL_SEP_RE.split(line)
            ]
            row = "| " + " | ".join(cells) + " |"
            if in_table:
                md_lines.append(row)
            else:
                # Start of a new table: separate it from preceding content
                # and emit the mandatory header delimiter row.
                if md_lines:
                    md_lines.append("")
                md_lines.append(row)
                md_lines.append("| " + " | ".join("---" for _ in cells) + " |")
                in_table = True
            continue

        if in_table:
            # Close the table; otherwise the next line would be parsed as
            # one more table row.
            md_lines.append("")
            in_table = False
        md_lines.append(f"## {line}" if is_heading else line)

    return "\n".join(md_lines)

# Markdown ASTからテーブルとテキストを分離

def _node_text(node):
    """Return the plain text of an AST node, recursing into its children."""
    if node.get("type") in ("linebreak", "softbreak"):
        return "\n"
    children = node.get("children")
    if isinstance(children, list):
        return "".join(_node_text(child) for child in children)
    # NOTE(review): leaf text is expected under 'raw' (mistune 3.x) or
    # 'text' (mistune 2.x) - confirm against the installed version.
    return node.get("raw", node.get("text", "")) or ""


def _table_to_dataframe(table_node):
    """Build a DataFrame from a ``table`` AST node (head + body sections)."""
    header = []
    rows = []
    for section in table_node.get("children", []):
        kind = section.get("type")
        if kind == "table_head":
            header = [_node_text(cell) for cell in section.get("children", [])]
        elif kind == "table_body":
            for row in section.get("children", []):
                rows.append([_node_text(cell) for cell in row.get("children", [])])

    # Pad or truncate every row to the header width so the DataFrame
    # constructor never fails on ragged rows.
    width = len(header)
    normalized = [(row + [""] * width)[:width] for row in rows]
    return pd.DataFrame(normalized, columns=header)


def parse_markdown(md_text):
    """Split Markdown into plain-text blocks and tables.

    The text is parsed into an AST with mistune. The table plugin is enabled
    explicitly; without it pipe tables are never emitted as ``table`` blocks.
    The renderer is selected by the name ``"ast"`` rather than by
    instantiating a renderer class.

    Args:
        md_text: Markdown source text.

    Returns:
        A ``(texts, tables)`` tuple: ``texts`` is a list of strings, one per
        top-level paragraph or heading, in document order; ``tables`` is a
        list of ``pandas.DataFrame`` objects, one per top-level table, with
        the table's header row as column names.
    """
    markdown = mistune.create_markdown(renderer="ast", plugins=["table"])
    ast = markdown(md_text)

    tables = []
    texts = []

    for block in ast:
        block_type = block.get("type")
        if block_type == "table":
            tables.append(_table_to_dataframe(block))
        elif block_type in ("paragraph", "heading"):
            texts.append(_node_text(block))

    return texts, tables

# メイン処理

# Main flow: upload a PDF, extract its text, convert it to Markdown, then
# show and offer downloads for the extracted text and tables.
uploaded_file = st.file_uploader("PDFファイルをアップロードしてください", type=["pdf"])

if uploaded_file:
    # Rewind the upload buffer before handing it to pdfplumber.
    uploaded_file.seek(0)
    with pdfplumber.open(uploaded_file) as pdf:
        extracted_pages = [page.extract_text() for page in pdf.pages]
    # Pages that yielded no text are skipped; every kept page ends with a
    # newline.
    all_text = "".join(chunk + '\n' for chunk in extracted_pages if chunk)

    if all_text.strip():
        md_text = text_to_markdown(all_text)
        st.subheader("📝 Markdown形式に変換されたテキスト")
        st.code(md_text, language="markdown")

        st.download_button(
            "Markdownをダウンロード",
            data=md_text,
            file_name="extracted.md",
            mime="text/markdown"
        )

        # Parse the generated Markdown and display its text blocks.
        st.subheader("📃 テキスト抽出結果")
        texts, tables = parse_markdown(md_text)

        for i, txt in enumerate(texts):
            st.markdown(f"**段落 {i+1}**\n\n{txt}")

        st.subheader("📊 表抽出結果")
        if not tables:
            st.info("表は見つかりませんでした。")
        else:
            for i, df in enumerate(tables):
                st.markdown(f"**表 {i+1}**")
                st.dataframe(df)
                # Encoded as "utf-8-sig", i.e. UTF-8 with a leading BOM.
                csv_bytes = df.to_csv(index=False).encode("utf-8-sig")
                st.download_button(
                    f"表 {i+1} をCSVでダウンロード",
                    data=csv_bytes,
                    file_name=f"table_{i+1}.csv",
                    mime="text/csv",
                )
    else:
        st.warning("PDFからテキストを抽出できませんでした。画像型PDFの可能性があります。")
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?