LoginSignup
0
0

More than 1 year has passed since last update.

名古屋市 子宮頸がん予防接種調査の調査回答データのPDFをCSV変換

Last updated at Posted at 2021-11-14

名古屋市 子宮頸がん予防接種調査の調査回答データ（PDF）をCSVに変換

資料

PDFのテキスト変換方法が違うため、セルからはみ出した文字（文字オーバー）を取得できず、

上記の資料のテキストとは次の3箇所が異なります

  • 2068 聞いた「た」りした 文字重複
  • 6216 過換気になったよう「だ」 文字欠損
  • 22789 しかし痛みは「消」 文字欠損

プログラム


# プログラム

import pdfplumber
import pandas as pd
import requests
from tqdm.notebook import tqdm

import pathlib
from urllib.parse import urljoin

# Download settings

# Base URL of the directory that hosts the survey-response PDFs.
url = "http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/"

# One entry per PDF: "name" is the file name joined onto `url`, and "page" is
# the number of consecutive pages that share one set of vertical lines.
files = [
    {"name": f"kaitodeta{number}.pdf", "page": pages}
    for number, pages in enumerate((155, 155, 155, 155, 175), start=1)
]

def fetch_file(url, dir=".", timeout=30):
    """Download *url* into *dir* unless a file of that name already exists.

    Args:
        url: Address of the file; its last path component is used as the
            local file name.
        dir: Directory to store the file in; created when missing.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        pathlib.Path of the local (cached or freshly downloaded) file.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Already downloaded: reuse the local copy without touching the network.
    if p.exists():
        return p

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Write to a temporary sibling and rename it afterwards, so an interrupted
    # write never leaves a truncated file that later runs would treat as cached.
    tmp = p.with_name(p.name + ".part")
    tmp.write_bytes(r.content)
    tmp.replace(p)

    return p

# Extraction

# NOTE: this loop reads the module-level `varticals` and `horizontals` tables,
# so they must be defined before it runs.
result = []

for i, file in enumerate(files):

    link = urljoin(url, file["name"])
    f = fetch_file(link)

    # One bucket per vertical-line layout of this PDF.
    dfs = [[] for _ in varticals[i]]

    # Open the PDF in a `with` block so its file handle is released as soon as
    # all pages have been processed.
    with pdfplumber.open(f) as pdf:

        for p, page in tqdm(enumerate(pdf.pages), desc="page", total=len(pdf.pages)):

            # q selects the vertical-line layout; mod is the page's position
            # inside the current run of file["page"] pages.
            q, mod = divmod(p, file["page"])

            table_settings = {
                "explicit_vertical_lines": varticals[i][q],
                "explicit_horizontal_lines": horizontals[-1],
                "snap_tolerance": 3,
                "intersection_tolerance": 15,
            }

            if mod == 0:
                # First page of a run: use this PDF's own horizontal lines and
                # extract only from the bounding box (0, 345, width, height).
                table_settings["explicit_horizontal_lines"] = horizontals[i]
                crop = page.within_bbox((0, 345, page.width, page.height))
                temp = pd.DataFrame(crop.extract_table(table_settings))
            else:
                temp = pd.DataFrame(page.extract_table(table_settings))

            dfs[q].append(temp)

    # Stack each bucket's pages vertically, then place the buckets side by side.
    columns = [pd.concat(d).reset_index(drop=True) for d in dfs]
    # Rebuilding from .values discards the per-bucket column labels.
    result.append(pd.DataFrame(pd.concat(columns, axis=1).values))

# 2429 2389 3874 2584 2799
# (presumably the page counts of the five PDFs - TODO confirm)

# Stack the per-PDF tables; rebuilding from .values renumbers rows and columns.
df0 = pd.DataFrame(pd.concat(result).values)

# Keep only rows whose first cell is a numeric string. The mask is built
# explicitly so that non-string cells (None / NaN) count as False instead of
# producing NaN entries, which boolean indexing rejects.
is_data_row = df0[0].map(lambda v: isinstance(v, str) and v.isnumeric())
df1 = df0[is_data_row].reset_index(drop=True).copy()

# Append column 275 to column 274 (missing cells count as empty strings),
# then remove the now-redundant column 275.
df1[274] = df1[274].fillna("") + df1[275].fillna("")

df1 = df1.drop(columns=275)

df1.to_csv("result.csv", index=False, header=False)

PDFの縦・横の区切り線（上記プログラムの抽出処理より前に定義しておく必要があります）

区切り線抽出

# PDF separator lines

# Horizontal lines: entries 0-4 are used only on the table title page of the
# corresponding PDF (only kaitodeta1 differs); the last entry is shared by the
# second and later pages.
horizontals = [
    # kaitodeta1
    [348.226, 362.506, 374.746, 386.986, 399.226, 411.466, 423.706, 435.946,
     448.186, 460.426, 472.666, 484.906, 497.146, 509.386, 521.626, 532.666],
    # kaitodeta2 to kaitodeta5 share the same values; a separate list is built
    # for each of them.
    *(
        [346.186, 358.426, 370.666, 382.906, 395.146, 407.386, 419.626, 431.866,
         444.106, 456.346, 468.586, 480.826, 493.066, 505.306, 517.546, 528.586]
        for _ in range(4)
    ),
    # Common to all remaining pages
    [55.186, 67.426, 79.666, 91.906, 104.146, 116.386, 128.626, 140.866,
     153.106, 165.346, 177.586, 189.826, 202.066, 214.306, 226.546, 238.786,
     251.026, 263.266, 275.506, 287.746, 299.986, 312.226, 324.466, 336.706,
     348.946, 361.186, 373.426, 385.666, 397.906, 410.146, 422.386, 434.626,
     446.866, 459.106, 471.346, 483.586, 495.826, 508.066, 520.306, 531.346],
]

# Vertical lines (they differ from page to page).
# varticals[i] belongs to the i-th PDF; varticals[i][q] holds the x positions
# used for the q-th run of pages of that PDF.


def _leading_column_groups():
    """Return fresh copies of the five leading line sets shared by every PDF."""
    return [
        [50.871,92.271,108.111,123.951,139.821,155.631,171.471,187.311,203.151,218.991,236.991,255.021,272.990,303.351,321.351,339.351,357.351,375.351,393.351,411.351,441.711,459.711,477.711,495.710,513.710,531.710,549.710,580.071,598.071,616.071,634.071,652.071,670.071,688.071,718.431,736.431,754.431,772.431,790.471,],
        [51.031,68.871,86.871,117.231,135.231,153.231,171.231,189.231,207.231,225.231,255.591,273.591,291.591,309.591,327.591,345.591,363.591,393.951,411.951,429.951,447.951,465.951,483.951,501.950,532.311,550.311,568.311,586.311,604.311,622.311,640.311,670.670,688.671,706.671,724.671,742.671,760.671,778.711,],
        [51.031,81.231,99.231,117.231,135.231,153.231,171.231,189.231,219.591,237.591,255.591,273.591,291.591,309.591,327.591,357.951,375.951,393.951,411.951,429.951,447.951,465.951,496.311,514.311,532.311,554.391,576.470,598.550,620.631,650.990,673.071,695.151,717.230,739.311,761.391,783.510,],
        [51.031,81.231,103.311,125.391,147.471,169.551,191.631,213.711,244.071,266.151,288.231,310.311,332.391,354.471,376.551,406.911,428.991,451.071,473.151,495.230,517.311,539.391,569.751,591.831,613.910,635.990,658.071,680.151,702.230,732.590,754.671,776.791,],
        [51.031,72.951,95.031,117.111,139.191,169.551,191.631,213.711,235.791,257.871,279.951,302.031,332.391,354.471,376.551,398.631,420.711,442.791,464.871,495.230,517.311,539.391,561.470,583.550,605.631,627.710,658.071,680.151,702.230,724.311,746.391,768.470,790.590,],
    ]


varticals = [
    # kaitodeta1.pdf
    _leading_column_groups() + [
        [51.031,81.231,103.311,125.391,147.471,169.551,191.631,393.111,415.191,445.551,467.631,489.711,511.790,533.871,555.991,],
        [51.031,324.111,346.191,376.551,398.631,420.711,442.791,464.871,486.981,509.031,531.110,553.191,575.271,597.350,619.471,],
        [51.031,253.071,486.351,508.431,530.511,552.590,574.670,596.751,758.911,],
        [51.031,437.271,459.351,481.431,503.511,525.590,547.670,709.831],
        [51.031,284.151,306.231,332.511,469.791,491.871,594.710,616.790,638.871,741.750,],
        [51.031,113.631,135.710,337.911,626.391,648.510],
        [51.031,273.111,407.061,429.111,459.471,481.551,511.910,533.990,564.350,586.431,608.511,638.871,660.951,691.311,713.391,743.751,765.830,787.951,],
        [51.031,81.231,103.311,125.391,155.751,177.831,199.911,230.271,252.351,274.431,304.791,326.871,348.951,379.311,401.391,423.511,],
        [51.031,429.711,451.791,482.151,504.230,526.311,556.670,578.780,600.831,622.910,644.990,667.071,689.190,],
        [51.031,792.150],
        [51.031,792.150],
    ],
    # kaitodeta2.pdf
    _leading_column_groups() + [
        [51.031,81.231,103.311,125.391,147.471,169.551,191.631,395.871,417.951,448.311,470.391,492.471,514.550,536.631,558.710,737.391,759.470,789.871,],
        [51.031,72.951,95.031,117.111,139.191,161.301,183.351,205.431,227.511,249.591,271.671,293.751,495.950,755.391,777.510,],
        [51.031,72.951,95.031,117.111,139.191,341.391,600.831,622.910,644.990,667.071,689.151,711.270,],
        [51.031,253.071,512.511,534.590,637.431,659.511,681.590,784.471],
        [51.031,100.551,122.631,324.831,480.111,502.191,704.431],
        [51.031,792.150],
        [51.031,72.951,235.071,442.101,464.151,494.510,516.590,546.951,569.031,599.391,621.470,643.550,673.911,695.990,726.350,748.431,778.831,],
        [51.031,72.951,95.031,125.391,147.471,169.551,199.911,221.991,244.071,274.431,296.511,318.591,348.951,371.031,393.111,423.471,445.551,467.631,758.871,780.991,],
        [51.031,81.231,103.311,125.391,155.751,177.871,199.911,221.991,244.071,266.151,288.231,],
        [51.031,792.150],
        [51.031,792.150],
    ],
    # kaitodeta3.pdf
    _leading_column_groups() + [
        [51.031,81.231,103.311,125.391,147.471,169.551,191.631,509.031,531.110,561.470,583.550,605.631,627.710,649.790,671.911,],
        [51.031,187.551,209.631,239.991,262.071,284.151,306.231,328.311,350.421,372.471,394.551,416.631,438.711,460.791,482.911,],
        [51.031,372.511],
        [51.031,766.431,788.551],
        [51.031,72.951,95.031,117.111,139.191,460.831],
        [51.031,766.431,788.551],
        [51.031,72.951,95.031,117.111,139.191,460.831],
        [51.031,766.431,788.551],
        [51.031,153.750],
        [51.031,766.431,788.551],
        [51.031,233.071],
        [51.031,766.431,788.551],
        [51.031,213.031],
        [51.031,766.431,788.551],
        [51.031,372.511],
        [51.031,792.150],
        [51.031,72.951,394.551,594.020,616.071,646.470],
        [51.031,212.991,243.351,265.431,295.791,317.871,339.951,370.311,392.391,422.751,444.831,475.191,497.270,519.350,549.710,571.790,593.871,624.230,646.311,668.391,698.751,720.830,742.911,773.311,],
        [51.031,72.951,95.031,125.391,147.471,169.551,491.151,513.230,543.590,565.670,587.751,618.110,640.220,662.271,684.350,706.431,728.511,750.630,],
        [51.031,792.150],
    ],
    # kaitodeta4.pdf
    _leading_column_groups() + [
        [51.031,81.231,103.311,125.391,147.471,169.551,191.631,453.111,475.191,505.550,527.631,549.710,571.790,593.871,615.951,785.071,],
        [51.031,72.951,103.311,125.391,147.471,169.551,191.631,213.741,235.791,257.871,279.951,302.031,324.111,346.191,607.710,],
        [51.031,236.511,258.591,280.671,302.751,324.831,346.911,608.431],
        [51.031,319.311,341.391,363.471,385.551,407.631,429.711,691.231],
        [51.031,236.511,258.591,400.791,435.351,457.431,599.631,681.710,703.831,],
        [51.031,392.391,495.230,517.311,778.831],
        [51.031,792.150],
        [51.031,72.951,334.431,533.900,555.951,586.311,608.391,638.751,660.831,691.191,713.271,735.350,765.710,787.830,],
        [51.031,81.231,103.311,133.671,155.751,177.831,208.191,230.271,252.351,282.711,304.791,326.871,357.231,379.311,401.391,431.751,453.831,475.911,506.270,528.350,550.471,],
        [51.031,357.951,380.031,410.391,432.471,454.551,484.911,507.020,529.071,551.151,573.230,595.311,617.431,],
        [51.031,792.150],
        [51.031,792.150],
    ],
    # kaitodeta5.pdf
    _leading_column_groups() + [
        [51.031,81.231,103.311,125.391,147.471,169.551,191.631,452.511,474.591,504.950,527.030,549.110,571.191,593.271,615.350,775.471,],
        [51.031,72.951,103.311,125.391,147.471,169.551,191.631,213.741,235.791,257.871,279.951,302.031,324.111,346.191,627.750,],
        [51.031,443.511,465.591,487.671,509.751,531.831,553.910,756.150],
        [51.031,199.911,221.991,244.071,266.151,288.231,310.311,532.550,658.191,680.311,],
        [51.031,233.031,512.511,534.590,716.791],
        [51.031,212.991,235.071,516.590,677.391,699.510],
        [51.031,273.111,602.990,625.111],
        [51.031,253.071,682.341,704.391,734.751,756.830,787.231],
        [51.031,72.951,103.311,125.391,147.471,177.831,199.911,230.271,252.351,282.711,304.791,326.871,357.231,379.311,401.391,431.751,453.831,475.911,506.270,528.350,550.431,580.790,602.871,624.951,655.311,677.391,699.510,],
        [51.031,311.031,333.111,363.471,385.551,407.631,437.991,460.101,482.151,504.230,526.311,548.391,570.510,],
        [51.031,615.390],
    ],
]
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0