LoginSignup
0
0

More than 3 years have passed since last update.

言語処理100本ノック(2020): 23

Posted at
"""
23. セクション構造
記事中に含まれるセクション名とそのレベル(例えば”== セクション名 ==”なら1)を表示せよ.
"""

import json
import re


def get_uk_text(path):
    """Return the text of the "イギリス" article from a JSON Lines file.

    Args:
        path: Path to a file containing one JSON object per line. Each
            object is read for its "title" key, and the matching object
            for its "text" key.

    Returns:
        The "text" value of the first object whose "title" equals
        "イギリス".

    Raises:
        ValueError: If no object in the file has the title "イギリス".
    """
    # Decode explicitly as UTF-8: the titles are compared against Japanese
    # text, and the platform default encoding is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line_data = json.loads(line)
            if line_data["title"] == "イギリス":
                # Stop at the first match; the rest of the file is not read.
                return line_data["text"]
    # Falling out of the loop means no article matched. Fail with a clear
    # message instead of referencing a variable that was never assigned.
    raise ValueError(f'no article titled "イギリス" found in {path!r}')


# Load the text of the "イギリス" article once, at module level; it is the
# input passed to get_section() further down in this script.
uk_text = get_uk_text("jawiki-country.json")
# See uk_text.txt


# ans23
# Compiled once at import time. Padding is matched with [^\S\n]* (any
# whitespace except a newline) so a heading can never span two lines.
_SECTION_PATTERN = re.compile(
    r"""
        ^            # start of a line (re.MULTILINE)
        (={2,})      # group 1: opening marker, two or more '='
        [^\S\n]*     # optional padding on the same line (some names are padded)
        (.+?)        # group 2: section name, non-greedy so it stops before the closing marker
        [^\S\n]*     # optional padding on the same line
        \1           # backreference: closing marker identical to group 1
        .*           # anything else up to the end of the line
        $            # end of the line (re.MULTILINE)
    """,
    re.MULTILINE | re.VERBOSE,
)


def get_section(string: str) -> list:
    """Extract section headings written in wiki markup from *string*.

    A heading is a line that starts with two or more '=' characters,
    followed by the section name and then the same run of '=' again,
    for example ``===主要都市===``. Whitespace around the name is dropped.

    Regular-expression notes
    (https://docs.python.org/3/library/re.html#regular-expression-syntax):

    - re.VERBOSE    allows whitespace and comments inside the pattern.
    - re.MULTILINE  makes '^' and '$' match at every line boundary.
    - (...)         capturing group.
    - \\1            backreference to the text matched by group 1.
    - +?            non-greedy form of '+': matches as few characters as
                    possible (e.g. ``<.*?>`` on ``'<a> b <c>'`` matches only
                    ``'<a>'``, whereas ``<.*>`` matches the whole string).

    Args:
        string: Text to scan; it may contain many lines.

    Returns:
        A list of ``(marker, name)`` tuples in order of appearance, where
        ``marker`` is the opening run of '=' and ``name`` is the section
        name, e.g. ``[('==', '脚注'), ...]``. Empty if nothing matches.
    """
    return _SECTION_PATTERN.findall(string)


def get_level(sections: list):
    """Convert ``(marker, name)`` pairs into ``(level, name)`` pairs.

    The level is the length of the marker minus one, so ``'=='`` becomes 1
    and ``'==='`` becomes 2.

    Args:
        sections: Iterable of two-item pairs whose first item is the '='
            marker string and whose second item is the section name.

    Returns:
        A new list of ``(level, name)`` tuples in the same order.
    """
    return [(len(marker) - 1, name) for marker, name in sections]


# Step 1: pull the raw (marker, name) pairs out of the article text.
sections = get_section(uk_text)  # e.g. [('==', '脚注'), ...]
# Step 2: turn each '=' marker into a numeric level.
result = get_level(sections)  # e.g. [(1, '脚注'), ...]

# Print every section name, indented two spaces per level below the first.
for level, value in result:
    print("  " * (level - 1), value, sep="")
# Output listed by the author (beginning only):
# 国名
# 歴史
# 地理
#   主要都市
#   気候
# 政治
#   元首
#   法
#   内政
#   地方行政区分
#   外交・軍事

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0