pdf_read.py
#! /usr/bin/python
# -*- coding: utf-8 -*-
#
# pdf_read.py
#
# Oct/02/2018
#
import sys
import pdftotext
#
# ------------------------------------------------------------------
def dict_display_proc(dict_aa):
    """Print every record of *dict_aa* to stdout, one per line.

    Records are written in ascending key order.  Each line holds the key
    followed by the record's 'name', 'population' and 'date_mod' values,
    all converted with ``str`` and separated by tab characters.

    Args:
        dict_aa: Mapping of key -> record, where each record can be indexed
            by 'name', 'population' and 'date_mod'.
    """
    for record_key in sorted(dict_aa):
        record = dict_aa[record_key]
        fields = (
            record_key,
            record['name'],
            record['population'],
            record['date_mod'],
        )
        print("\t".join(str(field) for field in fields))
# ------------------------------------------------------------------
def dict_append_proc(dict_aa, key, name, population, date_mod):
    """Store one record in *dict_aa* under *key* and return the dictionary.

    The dictionary is modified in place; an existing entry with the same
    key is replaced.

    Args:
        dict_aa: Dictionary that receives the record.
        key: Key under which the record is stored.
        name: Value stored under the record's 'name' entry.
        population: Value stored under the record's 'population' entry.
        date_mod: Value stored under the record's 'date_mod' entry.

    Returns:
        The same *dict_aa* object that was passed in.
    """
    record = dict(name=name, population=population, date_mod=date_mod)
    dict_aa[key] = record
    return dict_aa
#
# ------------------------------------------------------------------
def pdf_to_dict_proc(file_pdf):
    """Build a dictionary of records from the first page of a PDF file.

    The text of the first page is extracted with ``pdftotext`` and split
    into lines.  Every line longer than five characters is split on
    whitespace; when its first column starts with "t", the first four
    columns are stored through ``dict_append_proc`` as key, name,
    population and date_mod.  All values are kept as the strings that were
    extracted from the PDF.

    Lines with fewer than four columns (titles, headers, stray text) are
    skipped instead of raising ``IndexError``.

    Args:
        file_pdf: Path of the PDF file to read.

    Returns:
        dict: The records found on the first page, keyed by the first
        column.  Empty when the document has no pages or no matching line.
    """
    dict_aa = {}

    # The context manager closes the file even if pdftotext raises.
    with open(file_pdf, "rb") as fp_in:
        pdf = pdftotext.PDF(fp_in)

    # A document without pages has nothing to index at position 0.
    if len(pdf) == 0:
        return dict_aa

    # Only the first page is parsed, as in the original implementation.
    for raw_line in pdf[0].split("\n"):
        line = raw_line.rstrip()
        if len(line) <= 5:
            continue
        cols = line.split()
        # Guard against lines that are long enough but are not full records.
        if len(cols) < 4:
            continue
        key, name, population, date_mod = cols[:4]
        if key.startswith("t"):
            dict_aa = dict_append_proc(
                dict_aa, key, name, population, date_mod)

    return dict_aa
# ------------------------------------------------------------------
def main():
    """Read the PDF named on the command line and print its records.

    Start and end markers are written to stderr; the records themselves
    are written to stdout by ``dict_display_proc``.  When no file argument
    is given, a usage message is written to stderr and the process exits
    with status 1 instead of failing with an ``IndexError`` traceback.
    """
    sys.stderr.write("*** 開始 ***\n")
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: pdf_read.py file.pdf\n")
        sys.exit(1)
    file_pdf = sys.argv[1]
    dict_aa = pdf_to_dict_proc(file_pdf)

    dict_display_proc(dict_aa)
    sys.stderr.write("*** 終了 ***\n")


# Run only when executed as a script, so importing the module has no
# side effects.
if __name__ == "__main__":
    main()
# ------------------------------------------------------------------
実行方法
./pdf_read.py cities.pdf
実行結果
$ ./pdf_read.py cities.pdf
*** 開始 ***
t2531 大津 92168 2003-9-30
t2532 草津 17524 2003-2-10
t2533 守山 73651 2003-6-14
t2534 栗東 62963 2003-9-9
t2535 野洲 42531 2003-8-4
t2536 甲賀 35287 2003-1-21
t2537 湖南 82956 2003-7-23
t2538 近江八幡 23784 2003-10-26
t2539 彦根 72813 2003-12-15
*** 終了 ***
Arch Linux でのライブラリーのインストール方法
yay -S python-pdftotext
次のバージョンで確認しました。
$ python --version
Python 3.8.2