どれ使うとええのん?
概要
Excelのみ : duckdb
PDFのみ sqlite3
PDFとExcel両方 : pfdplumber + sqlite3
いるもん
pip install duckdb
pdfのみ
import pdfplumber
import pandas as pd
import sqlite3
# PDFの表を抽出
with pdfplumber.open("data.pdf") as pdf:
table = pdf.pages[0].extract_table()
df = pd.DataFrame(table[1:], columns=table[0]) # 最初の行をヘッダーに
# SQLiteでSQL実行
conn = sqlite3.connect(":memory:")
df.to_sql("table_name", conn, index=False, if_exists="replace")
query = "SELECT * FROM table_name WHERE age > 30"
result = pd.read_sql(query, conn)
print(result)
excelのみ
import duckdb
query = "SELECT * FROM read_excel('data.xlsx', sheet='Sheet1') WHERE age > 30"
result = duckdb.query(query).to_df()
print(result)
か、
import sqlite3
import pandas as pd
df = pd.read_excel("data.xlsx")
conn = sqlite3.connect(":memory:")
df.to_sql("table_name", conn, index=False, if_exists="replace")
query = "SELECT * FROM table_name WHERE age > 30"
result = pd.read_sql(query, conn)
print(result)