More than 3 years have passed since last update.

Officeファイルを下位ディレクトリまで一括PDF変換

Posted at 2021-08-25

更新履歴

2021/08/25 新規作成

はじめに

Officeファイルの資料はタブ管理ができるブラウザで見たいので、私はいつもPDFエクスポートしていました。
しかし、先日、上司から業務で使う資料のディレクトリパスを渡されて見てみると...

なんと使う資料の総数 524 !!!!

資料すべてに目を通すわけではないですが、それでもかなりの数の資料を常時開いておかなければスムーズに業務を行えません。
そこで、Pythonを使って指定したディレクトリおよび下位ディレクトリ内のOfficeファイルを一括でPDFに変換するプログラムを作成しました。私はなんとか乗り切ることができましたが、こういった需要は私以外にも少なからずあるのかなと考えて記事にしました。

仕様

動作環境

OS : Windows 10 Pro
Anaconda : conda 4.9.2
Office 2016

comtypesモジュールはWindowsにのみ対応しているため、注意が必要です。

機能

変換元のディレクトリ構造を変換先ディレクトリに生成
一括PDF変換
- 下位ディレクトリまで再帰的にファイルを検索して取得
- 取得したファイルをすべてPDFに変換して変換先ディレクトリに保存
オプション：既に存在するファイルの変換をしない

処理時間

Officeファイルを開いて処理を行うので、そこまで速くありません...
1ファイルあたり十数秒ほどですので手作業よりは速いと思います。
何より面倒なPDF変換を肩代わりしてくれるのでPythonに感謝です！

ソースコード

長いので折りたたんでます

import os
import comtypes.client
import glob as g

class OfficePdfConverter :
    """Batch-convert Office files under a directory tree to PDF.

    The source tree is searched with a glob pattern, the same directory
    layout is created under the destination directory, and every file is
    converted by driving the matching Office application through COM
    (``comtypes``), so this only works on Windows with Office installed.

    Supported values for ``office`` are the keys of ``converterList``:
    ``"docx"``, ``"xlsx"`` and ``"pptx"``.
    """

    def __init__(self) :
        # Source paths selected by the most recent toPdf() call (sorted list).
        self.filelist = None
        # Number of entries in filelist.
        self.num = None
        # Root of the source tree, i.e. source[0] of the last toPdf() call.
        self.sourceHome = None
        # Dispatch table: `office` argument of toPdf() -> converter method.
        self.converterList = {
            "docx": self._docxToPdf,
            "xlsx": self._xlsxToPdf,
            "pptx": self._pptxToPdf
        }

    def toPdf(self, source, destination, office, exclude_exist=False) :
        """Convert every file matched by ``source`` and store the PDFs.

        Args:
            source: Two-element sequence ``[home, pattern]``. ``home`` is the
                root of the source tree; ``home + pattern`` is passed to
                ``glob.glob(..., recursive=True)``.
            destination: Root directory that receives the mirrored tree.
            office: Key of ``converterList`` selecting the converter.
            exclude_exist: When true, skip sources whose PDF is already
                present under ``destination``.

        Returns:
            None. Progress and failed files are reported on stdout. An
            unsupported ``office`` is printed and nothing is touched.
        """
        # Validate first so a bad key never creates directories, and so a
        # KeyError raised inside a converter is not mistaken for a bad key.
        try :
            convert = self.converterList[office]
        except KeyError as e :
            print(e)
            return

        self.sourceHome = source[0]
        found = set(g.glob(source[0] + source[1], recursive=True))
        self.num = len(found)

        # PDFs already present under the destination, keyed in a normalised
        # form so separator and (on Windows) case differences do not matter.
        pattern = os.path.join(g.escape(destination), "**", "*.pdf")
        existing = {
            self._pathKey(pdf) for pdf in g.glob(pattern, recursive=True)
        }

        print(f"src--all   : {self.num}")
        print(f"dest-exist : {len(existing)}")

        # exclude already existing file
        if exclude_exist :
            found = {
                src for src in found
                if self._pathKey(self._destPath(src, destination))
                not in existing
            }

        # Sorted so the conversion order is reproducible between runs.
        self.filelist = sorted(found)
        self.num = len(self.filelist)

        # show list of subdirectory
        print("-"*5+" subdirectory:source "+"-"*80)
        for srcDir in sorted({os.path.split(src)[0] for src in self.filelist}) :
            print(srcDir)

        # show path of destination subdirectory and create it
        print("-"*5+" subdirectory:destination "+"-"*80)
        destDirs = {
            os.path.split(self._destPath(src, destination))[0]
            for src in self.filelist
        }
        for destDir in sorted(destDirs) :
            print(destDir)
            os.makedirs(destDir, exist_ok=True)

        # convert files to pdf
        print("-"*5+" Convert to PDF "+"-"*80)
        failure = convert(destination)

        print("-"*100)
        for src in sorted(failure) :
            print("fail : " + src)
        print("-"*100)

        return

    def _docxToPdf(self, destination) :
        """Convert ``self.filelist`` with Word; return the set of failed sources."""
        return self._convertWith(
            'Word.Application', destination, 'docx',
            lambda app, src : app.Documents.Open(src),
            # 17 = wdFormatPDF
            lambda doc, dest : doc.SaveAs(dest, 17),
            # 0 = wdDoNotSaveChanges: never prompt, never modify the source
            lambda doc : doc.Close(0),
        )

    def _xlsxToPdf(self, destination) :
        """Convert ``self.filelist`` with Excel; return the set of failed sources."""
        return self._convertWith(
            'Excel.Application', destination, 'xlsx',
            lambda app, src : app.Workbooks.Open(src),
            # Type 0 = xlTypePDF, Quality 1 = xlQualityMinimum,
            # IncludeDocProperties 0 = False
            lambda book, dest : book.ExportAsFixedFormat(0, dest, 1, 0),
            # SaveChanges=False: a hidden "save changes?" prompt would
            # otherwise block the invisible application.
            lambda book : book.Close(False),
        )

    def _pptxToPdf(self, destination) :
        """Convert ``self.filelist`` with PowerPoint; return the set of failed sources."""
        # NOTE(review): some PowerPoint versions reportedly refuse
        # Visible = False -- confirm on the target Office version.
        return self._convertWith(
            'Powerpoint.Application', destination, 'pptx',
            lambda app, src : app.Presentations.Open(src),
            # 32 = ppSaveAsPDF
            lambda deck, dest : deck.SaveAs(dest, 32),
            lambda deck : deck.Close(),
        )

    def _convertWith(self, progId, destination, office,
                     openFile, exportPdf, closeFile) :
        """Run one Office application over ``self.filelist``.

        Args:
            progId: COM ProgID of the application to start.
            destination: Root directory that receives the PDFs.
            office: Extension key, forwarded to ``_setup``.
            openFile: ``(app, src) -> handle`` opening one source file.
            exportPdf: ``(handle, dest) -> None`` writing the PDF.
            closeFile: ``(handle) -> None`` closing the opened file.

        Returns:
            Set of source paths that could not be converted or closed.
        """
        failure = set()

        # Nothing to do: do not start an Office process at all.
        if not self.filelist :
            return failure

        app = comtypes.client.CreateObject(progId)
        try :
            app.Visible = False

            for src in self.filelist :
                handle, dest = self._setup(destination, src, office)

                # convert file to pdf
                try :
                    handle = openFile(app, src)
                    exportPdf(handle, dest)
                except Exception as e :
                    print("!!! Error !!!")
                    print(e)
                    failure.add(src)
                finally :
                    if handle is not None :
                        # A failing Close() must not abort the whole batch.
                        try :
                            closeFile(handle)
                        except Exception as e :
                            print("!!! Error !!!")
                            print(e)
                            failure.add(src)
        finally :
            # Always shut the application down, even on unexpected errors,
            # so no invisible Office process is left behind.
            app.Quit()

        return failure

    @staticmethod
    def _pathKey(path) :
        """Return ``path`` normalised for equality comparison."""
        return os.path.normcase(os.path.normpath(path))

    def _destPath(self, src, destination) :
        """Map a source file path to the path of its PDF under ``destination``.

        Only the leading ``self.sourceHome`` prefix is swapped and only the
        final extension is replaced, so directory or file names that happen
        to contain "pdf", "xlsx", etc. are left untouched.
        """
        home = self.sourceHome
        if src.startswith(home) :
            separators = os.sep + (os.altsep or "")
            relative = src[len(home):].lstrip(separators)
            moved = os.path.join(destination, relative)
        else :
            # Path did not come from the glob on sourceHome; swap the first
            # occurrence only.
            moved = src.replace(home, destination, 1)
        return os.path.splitext(moved)[0] + ".pdf"

    def _setup(self, destination, src, office) :
        """Print the source/destination of one file and return its PDF path.

        Args:
            destination: Root directory that receives the PDFs.
            src: Path of the source file.
            office: Extension key; kept for backward compatibility, the
                extension is now taken from ``src`` itself.

        Returns:
            ``(None, dest)``. The ``None`` is a placeholder for the document
            handle so the caller's ``finally`` block can test it safely when
            opening the file fails.
        """
        # avoid error in finally-statement
        extension = None

        # path of destination file
        dest = self._destPath(src, destination)

        # show information of current file
        srcDir, srcName = os.path.split(src)
        destDir, destName = os.path.split(dest)
        strInfo = "\n".join([
            "-"*100,
            "@source",
            " + " + srcDir,
            " + " + srcName,
            "@destination",
            " + " + destDir,
            " + " + destName,
            "-"*100,
        ])

        print(strInfo)

        return extension, dest

使用例

# source[0]: root directory of the files to convert.
# source[1]: glob pattern appended to source[0]; the two strings are
#            concatenated and passed to glob with recursive=True.
source = ["C:\\src_home", "\\**\\[!~]*.xlsx"]
# Root directory under which the source directory layout is re-created.
destination = "C:\\dest_home"

# Key into OfficePdfConverter.converterList ("docx", "xlsx" or "pptx").
office = "xlsx"

converter = OfficePdfConverter()
# exclude_exist=True skips files whose PDF already exists under destination.
converter.toPdf(source,destination,office,exclude_exist=True)

ここで、toPdfの第一引数sourceはリストで与えるので注意が必要です。
下の図を使って説明していきます。

まずは、source[0]について説明します。
上の図のように変換元のディレクトリ構造を変換先ディレクトリに生成するため、変換したPDFのパスはsourceをつかって取得したパスのsource[0]をdestinationに置き換えたものになります。
変換元のパスを取得した後ではどこからが変換元のディレクトリ構造と同じであるかわからなくなってしまうため、このようにリストの形で与えて変換元から変換先へのパスの置換を簡単にしています。

続いて、source[1]の特殊文字について説明します。
特殊文字**はC:\src_home以下のディレクトリおよびサブディレクトリを検索するために必要です。
これを行うために、OfficePdfConverter.toPdf()内のglob()のrecursive引数をTrueにしています。
また、特殊文字[!~]はファイル名の先頭が~であるものを取得するパスから除外しています。
これは、Officeファイルを開いている際に生成される一時ファイルを除外するためです。
変換元ディレクトリがNASなどの共有フォルダの場合、ファイルをローカルにコピーせずそのまま開いている人がいると、プログラムの実行時にその一時ファイルのパスまで取得してしまいます。[!~]を指定しているのは、このような余計なファイルパスの取得を防ぐためです。

特殊文字について詳しく知りたい方は、こちらをご覧ください。

おわりに

本記事をご覧いただきありがとうございました。
誰かの役に立っていれば幸いです。

参考

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up