import pandas as pd
import os
import logging
import json
import boto3
from botocore.config import Config
import subprocess
from pdf2image import convert_from_path
import tempfile
import base64
from PIL import Image
import io
class ExcelStructuredConverter:
def __init__(self, input_bucket, input_prefix, output_bucket, output_prefix):
"""
初期化
Args:
input_bucket (str): 入力S3バケット名
input_prefix (str): 入力S3プレフィックス
output_bucket (str): 出力S3バケット名
output_prefix (str): 出力S3プレフィックス
"""
self.input_bucket = input_bucket
self.input_prefix = input_prefix
self.output_bucket = output_bucket
self.output_prefix = output_prefix
self.logger = logging.getLogger(__name__)
self.logger.setLevel(logging.INFO)
self.s3 = boto3.client('s3')
config = Config(
retries=dict(max_attempts=10),
read_timeout=300,
connect_timeout=300
)
self.bedrock = boto3.client(
service_name='bedrock-runtime',
config=config,
region_name='us-east-1'
)
self.temp_dir = tempfile.mkdtemp()
def download_from_s3(self, key, local_path):
"""S3からファイルをダウンロード"""
try:
self.s3.download_file(self.input_bucket, key, local_path)
self.logger.info(f"S3からダウンロード完了: {key}")
except Exception as e:
self.logger.error(f"S3ダウンロードエラー: {str(e)}")
raise
def upload_to_s3(self, local_path, key):
"""S3にファイルをアップロード"""
try:
self.s3.upload_file(local_path, self.output_bucket, key)
self.logger.info(f"S3へアップロード完了: {key}")
except Exception as e:
self.logger.error(f"S3アップロードエラー: {str(e)}")
raise
def convert_excel_to_pdf(self, excel_path):
"""ExcelファイルをPDFに変換"""
try:
pdf_path = os.path.splitext(excel_path)[0] + '.pdf'
subprocess.run([
'soffice',
'--headless',
'--convert-to', 'pdf',
'--outdir', os.path.dirname(pdf_path),
excel_path
], check=True)
return pdf_path
except Exception as e:
self.logger.error(f"PDF変換エラー: {str(e)}")
raise
def convert_pdf_to_png(self, pdf_path):
"""PDFをPNG画像に変換"""
try:
images = convert_from_path(pdf_path)
png_paths = []
for i, image in enumerate(images):
png_path = os.path.splitext(pdf_path)[0] + f'_page_{i+1}.png'
image.save(png_path, 'PNG')
png_paths.append(png_path)
return png_paths
except Exception as e:
self.logger.error(f"PNG変換エラー: {str(e)}")
raise
def encode_image_base64(self, image_path):
"""画像をbase64エンコード"""
with open(image_path, 'rb') as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
def process_with_claude(self, image_path):
"""ClaudeでPNG画像を解析してJSON形式で出力"""
try:
base64_image = self.encode_image_base64(image_path)
prompt = """
この画像はエクセルデータを画像にしたものです。
すべての本文を以下の形式でJSONとして構造化してください:
{
"headers": ["列名1", "列名2", ...],
"data": [
{"列名1": "値1", "列名2": "値2", ...},
...
]
}
正確にエクセルのデータを抽出してください。
"""
body = json.dumps({
"prompt": prompt,
"max_tokens_to_sample": 4096,
"temperature": 0.0,
"top_p": 1,
"anthropic_version": "bedrock-2023-05-31",
"image": base64_image
})
response = self.bedrock.invoke_model(
modelId="anthropic.claude-3-5-sonnet-20240620-v1:0",
body=body
)
response_body = json.loads(response.get('body').read())
return json.loads(response_body.get('completion'))
except Exception as e:
self.logger.error(f"Claude APIエラー: {str(e)}")
return None
def save_to_csv(self, data, output_path):
"""JSON形式のデータをCSVとして保存"""
try:
df = pd.DataFrame(data['data'])
df.to_csv(output_path, index=False, encoding='utf-8')
self.logger.info(f"CSV保存完了: {output_path}")
except Exception as e:
self.logger.error(f"CSV保存エラー: {str(e)}")
raise
def convert_all_excel_files(self):
"""S3のExcelファイルを全て処理"""
try:
response = self.s3.list_objects_v2(
Bucket=self.input_bucket,
Prefix=self.input_prefix
)
if 'Contents' not in response:
self.logger.warning(f"Excelファイルが見つかりません: {self.input_prefix}")
return
for obj in response['Contents']:
if obj['Key'].endswith(('.xlsx', '.xls')):
self.process_single_file(obj['Key'])
except Exception as e:
self.logger.error(f"変換処理中のエラー: {str(e)}")
raise
def process_single_file(self, s3_key):
"""S3の単一のExcelファイルを処理"""
try:
filename = os.path.basename(s3_key)
base_name = os.path.splitext(filename)[0]
excel_path = os.path.join(self.temp_dir, filename)
self.download_from_s3(s3_key, excel_path)
pdf_path = self.convert_excel_to_pdf(excel_path)
png_paths = self.convert_pdf_to_png(pdf_path)
for i, png_path in enumerate(png_paths):
structured_data = self.process_with_claude(png_path)
if structured_data:
csv_path = os.path.join(self.temp_dir, f"{base_name}_page_{i+1}.csv")
df = pd.DataFrame(structured_data['data'])
df.to_csv(csv_path, index=False, encoding='utf-8')
s3_csv_key = f"{self.output_prefix}/{base_name}_page_{i+1}.csv"
self.upload_to_s3(csv_path, s3_csv_key)
os.remove(csv_path)
os.remove(png_path)
os.remove(pdf_path)
os.remove(excel_path)
except Exception as e:
self.logger.error(f"ファイル {s3_key} の処理中にエラー: {str(e)}")
raise
def main():
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
input_bucket = "input-bucket-name"
input_prefix = "excel-files/"
output_bucket = "output-bucket-name"
output_prefix = "csv-files/"
converter = ExcelStructuredConverter(
input_bucket, input_prefix,
output_bucket, output_prefix
)
converter.convert_all_excel_files()
if __name__ == "__main__":
main()
excel_to_csv_converter.py
Last updated at Posted at 2025-01-31
Register as a new user and use Qiita more conveniently
- You get articles that match your needs
- You can efficiently read back useful information
- You can use dark theme