こちらで行ったのと同じことを Julia で行いました。
Python3: 診療・検査医療機関の PDF を JSON に変換
次で公表されている PDF を JSON に変換します。
診療・検査医療機関の公表について
小山地区のデータを取得します。
wget https://www.pref.tochigi.lg.jp/e04/documents/20220112225313.pdf
変換
pdftotext -layout 20220112225313.pdf
#
./medical_to_json.jl 20220112225313.txt oyama_medical.json > tmp01
#
medical_to_json.jl
#!/usr/bin/julia
#
# medical_to_json.jl
#
# Jan/24/2022
# --------------------------------------------------------------------
using JSON
# --------------------------------------------------------------------
"""
    true_false_convert(ss_in)

Return `true` when `ss_in` compares equal to the string `"○"`, otherwise `false`.
"""
function true_false_convert(ss_in)
    return ss_in == "○" ? true : false
end
# --------------------------------------------------------------------
"""
    convert_to_dict(ss)

Build one record `Dict` from the positional tokens `ss` of a table row.

Tokens are mapped by position: 1 `"name"`, 2 `"postal"`, 3 `"address"`,
4 `"phone"`, 5 `"consult"`, 6 `"inspect"`, 7 `"only"`, 8 `"others"`,
9 `"comment"`. Positions 5-8 are converted to `Bool` with
`true_false_convert`; the others are stored as given.

A key is only set when its token exists, so a short row yields a partial
record instead of raising `BoundsError`.

# Throws
- `BoundsError` if `ss` is empty (there is no name token).
"""
function convert_to_dict(ss)
    unit_aa = Dict()
    unit_aa["name"] = ss[1]
    ntok = length(ss)
    # Text columns: stored verbatim when present.
    for (pos, key) in ((2, "postal"), (3, "address"), (4, "phone"))
        if pos <= ntok
            unit_aa[key] = ss[pos]
        end
    end
    # Flag columns: "○" becomes true, anything else false.
    for (pos, key) in ((5, "consult"), (6, "inspect"), (7, "only"), (8, "others"))
        if pos <= ntok
            unit_aa[key] = true_false_convert(ss[pos])
        end
    end
    if 9 <= ntok
        unit_aa["comment"] = ss[9]
    end
    return unit_aa
end
# --------------------------------------------------------------------
"""
    omit_check_proc(line)

Return `false` when `line` contains any of the fixed header/footer keywords,
and `true` when it contains none of them.
"""
function omit_check_proc(line)
    omit_words = ("診療・検査医療機関", "実施内容", "電話番号", "医療機関名",
                  "(代表)", "のみ可", "検査", "ページ", ":")
    return !any(word -> occursin(word, line), omit_words)
end
# --------------------------------------------------------------------
"""
    line_proc(line, list_aa)

Parse one text line and, if it is a data row, append its record to `list_aa`.

A line is skipped when it is empty, when its first character is `'【'` or
`'0'`, when `omit_check_proc` rejects it, or when it holds no
whitespace-separated tokens. Otherwise the first token is printed to stdout
and the `Dict` built by `convert_to_dict` is pushed onto `list_aa`.

Returns `list_aa` when a record was appended and `nothing` when the line was
skipped.
"""
function line_proc(line, list_aa)
    # Empty input has no first character to inspect.
    isempty(line) && return nothing
    chx = first(line)
    (chx == '【' || chx == '0') && return nothing
    omit_check_proc(line) || return nothing
    ss = split(line)
    # A whitespace-only line splits into zero tokens; there is nothing to record.
    isempty(ss) && return nothing
    println(ss[1])
    return push!(list_aa, convert_to_dict(ss))
end
# --------------------------------------------------------------------
# Script entry point: read the text file named by ARGS[1], turn every line
# longer than 10 characters into a record via line_proc, and write the
# collected records as JSON to the file named by ARGS[2].
# Progress messages, both file names and the line count go to stderr.
println(stderr, "*** 開始 ***")
# Fail with a usage hint instead of a bare BoundsError when arguments are missing.
length(ARGS) >= 2 || error("usage: medical_to_json.jl <input.txt> <output.json>")
file_in = ARGS[1]
file_json = ARGS[2]
println(stderr, file_in)
println(stderr, file_json)
#
# read(path, String) opens and closes the file itself; the result is kept in
# `text_in` rather than `string` so Base.string is not shadowed.
text_in = read(file_in, String)
lines = split(text_in, "\n")
#
llx = length(lines)
println(stderr, llx)
#
list_aa = []
for line in lines
    if 10 < length(line)
        line_proc(line, list_aa)
    end
end
#
# The do-block form closes the output file even if serialization or writing throws.
open(file_json, "w") do fp_out
    write(fp_out, JSON.json(list_aa))
end
#
println(stderr, "*** 終了 ***")
# --------------------------------------------------------------------