0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

excel_to_csv_converter.py

Last updated at Posted at 2025-01-31
import pandas as pd
import os
import logging
import json
import boto3
from botocore.config import Config
import subprocess
from pdf2image import convert_from_path
import tempfile
import base64
from PIL import Image
import io

class ExcelStructuredConverter:
    def __init__(self, input_bucket, input_prefix, output_bucket, output_prefix):
        """
        初期化
        Args:
            input_bucket (str): 入力S3バケット名
            input_prefix (str): 入力S3プレフィックス
            output_bucket (str): 出力S3バケット名
            output_prefix (str): 出力S3プレフィックス
        """
        self.input_bucket = input_bucket
        self.input_prefix = input_prefix
        self.output_bucket = output_bucket
        self.output_prefix = output_prefix
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        self.s3 = boto3.client('s3')
        
        config = Config(
            retries=dict(max_attempts=10),
            read_timeout=300,
            connect_timeout=300
        )
        self.bedrock = boto3.client(
            service_name='bedrock-runtime',
            config=config,
            region_name='us-east-1'
        )
        
        self.temp_dir = tempfile.mkdtemp()
            
    def download_from_s3(self, key, local_path):
        """S3からファイルをダウンロード"""
        try:
            self.s3.download_file(self.input_bucket, key, local_path)
            self.logger.info(f"S3からダウンロード完了: {key}")
        except Exception as e:
            self.logger.error(f"S3ダウンロードエラー: {str(e)}")
            raise
            
    def upload_to_s3(self, local_path, key):
        """S3にファイルをアップロード"""
        try:
            self.s3.upload_file(local_path, self.output_bucket, key)
            self.logger.info(f"S3へアップロード完了: {key}")
        except Exception as e:
            self.logger.error(f"S3アップロードエラー: {str(e)}")
            raise

    def convert_excel_to_pdf(self, excel_path):
        """ExcelファイルをPDFに変換"""
        try:
            pdf_path = os.path.splitext(excel_path)[0] + '.pdf'
            
            subprocess.run([
                'soffice',
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', os.path.dirname(pdf_path),
                excel_path
            ], check=True)
            
            return pdf_path
        except Exception as e:
            self.logger.error(f"PDF変換エラー: {str(e)}")
            raise

    def convert_pdf_to_png(self, pdf_path):
        """PDFをPNG画像に変換"""
        try:
            images = convert_from_path(pdf_path)
            png_paths = []
            
            for i, image in enumerate(images):
                png_path = os.path.splitext(pdf_path)[0] + f'_page_{i+1}.png'
                image.save(png_path, 'PNG')
                png_paths.append(png_path)
                
            return png_paths
        except Exception as e:
            self.logger.error(f"PNG変換エラー: {str(e)}")
            raise

    def encode_image_base64(self, image_path):
        """画像をbase64エンコード"""
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def process_with_claude(self, image_path):
        """ClaudeでPNG画像を解析してJSON形式で出力"""
        try:
            base64_image = self.encode_image_base64(image_path)
            
            prompt = """
            この画像はエクセルデータを画像にしたものです。
            すべての本文を以下の形式でJSONとして構造化してください:
            
            {
                "headers": ["列名1", "列名2", ...],
                "data": [
                    {"列名1": "値1", "列名2": "値2", ...},
                    ...
                ]
            }
            
            正確にエクセルのデータを抽出してください。
            """
            
            body = json.dumps({
                "prompt": prompt,
                "max_tokens_to_sample": 4096,
                "temperature": 0.0,
                "top_p": 1,
                "anthropic_version": "bedrock-2023-05-31",
                "image": base64_image
            })
            
            response = self.bedrock.invoke_model(
                modelId="anthropic.claude-3-5-sonnet-20240620-v1:0",
                body=body
            )
            
            response_body = json.loads(response.get('body').read())
            return json.loads(response_body.get('completion'))
            
        except Exception as e:
            self.logger.error(f"Claude APIエラー: {str(e)}")
            return None
            
    def save_to_csv(self, data, output_path):
        """JSON形式のデータをCSVとして保存"""
        try:
            df = pd.DataFrame(data['data'])
            df.to_csv(output_path, index=False, encoding='utf-8')
            self.logger.info(f"CSV保存完了: {output_path}")
        except Exception as e:
            self.logger.error(f"CSV保存エラー: {str(e)}")
            raise

    def convert_all_excel_files(self):
        """S3のExcelファイルを全て処理"""
        try:
            response = self.s3.list_objects_v2(
                Bucket=self.input_bucket,
                Prefix=self.input_prefix
            )
            
            if 'Contents' not in response:
                self.logger.warning(f"Excelファイルが見つかりません: {self.input_prefix}")
                return
                
            for obj in response['Contents']:
                if obj['Key'].endswith(('.xlsx', '.xls')):
                    self.process_single_file(obj['Key'])
                    
        except Exception as e:
            self.logger.error(f"変換処理中のエラー: {str(e)}")
            raise
            
    def process_single_file(self, s3_key):
        """S3の単一のExcelファイルを処理"""
        try:
            filename = os.path.basename(s3_key)
            base_name = os.path.splitext(filename)[0]
            excel_path = os.path.join(self.temp_dir, filename)
            
            self.download_from_s3(s3_key, excel_path)

            pdf_path = self.convert_excel_to_pdf(excel_path)
            
            png_paths = self.convert_pdf_to_png(pdf_path)
            
            for i, png_path in enumerate(png_paths):
                structured_data = self.process_with_claude(png_path)
                
                if structured_data:
                    csv_path = os.path.join(self.temp_dir, f"{base_name}_page_{i+1}.csv")
                    df = pd.DataFrame(structured_data['data'])
                    df.to_csv(csv_path, index=False, encoding='utf-8')
                    
                    s3_csv_key = f"{self.output_prefix}/{base_name}_page_{i+1}.csv"
                    self.upload_to_s3(csv_path, s3_csv_key)
                    
                    os.remove(csv_path)
                
                os.remove(png_path)
            
            os.remove(pdf_path)
            os.remove(excel_path)
            
        except Exception as e:
            self.logger.error(f"ファイル {s3_key} の処理中にエラー: {str(e)}")
            raise

def main():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    
    input_bucket = "input-bucket-name"
    input_prefix = "excel-files/"
    output_bucket = "output-bucket-name"
    output_prefix = "csv-files/"
    
    converter = ExcelStructuredConverter(
        input_bucket, input_prefix,
        output_bucket, output_prefix
    )
    converter.convert_all_excel_files()

if __name__ == "__main__":
    main()
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?