LoginSignup
0
0

More than 3 years have passed since last update.

言語処理100本ノック(2020): 26

Posted at
"""
26. 強調マークアップの除去
25の処理時に,テンプレートの値からMediaWikiの強調マークアップ(弱い強調,強調,強い強調のすべて)を除去してテキストに変換せよ(参考: マークアップ早見表).
"""

import json
import re

import utils


def get_uk_text(path):
    """Return the ``text`` field of the record titled "イギリス".

    The file at *path* is read as JSON Lines: every line is parsed as one
    JSON object that is expected to have ``title`` and ``text`` keys.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        The value stored under ``"text"`` in the first record whose
        ``"title"`` equals "イギリス".

    Raises:
        ValueError: If no record in the file has that title.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line_data = json.loads(line)
            if line_data["title"] == "イギリス":
                return line_data["text"]
    # Reaching this point means the loop never found the article; fail with a
    # clear message instead of an unbound-variable error.
    raise ValueError(f"no article titled 'イギリス' found in {path!r}")


def get_basic_info(string: str) -> str:
    """Extract the body of the first ``{{基礎情報 ...}}`` template in *string*.

    The result is the text located between the end of the line that opens
    the template and the line consisting solely of ``}}``.  An ``IndexError``
    is raised when *string* contains no such template.
    """
    # MULTILINE: '^' / '$' anchor at every line; DOTALL: '.' also matches
    # newlines so the capture can span lines; VERBOSE: allows the layout and
    # inline comments inside the pattern.
    flags = re.MULTILINE | re.DOTALL | re.VERBOSE
    template_re = re.compile(
        r"""
            ^\{\{基礎情報.*?$   # '{{基礎情報'で始まる行
            (.*?)       # キャプチャ対象、任意の0文字以上、非貪欲
            ^\}\}$      # '}}'で終わる行
        """,
        flags,
    )

    bodies = template_re.findall(string)
    return bodies[0]


def get_content(string: str) -> dict:
    r"""Parse ``|name = value`` template fields into a dict.

    A field starts with ``|`` at the beginning of a line.  Its value runs up
    to (but not including) the newline that precedes the next ``|`` field, a
    newline that is followed by the end of a line/the text, or the end of the
    text itself.  Values may therefore span several lines.  If a field name
    occurs more than once, the last occurrence wins.

    Regular-expression notes
    (https://docs.python.org/3/library/re.html#regular-expression-syntax):

        - re.X (re.VERBOSE)     Whitespace and '#' comments inside the pattern
                                are ignored, so the pattern can be annotated.
        - re.M (re.MULTILINE)   '^' and '$' also match at the start and end
                                of every line, not only of the whole string.
        - re.S (re.DOTALL)      '.' also matches '\n'.
        - ^\|       A '|' at the beginning of a line.
        - *? / +?   Non-greedy qualifiers: match as few characters as
                    possible.  E.g. against '<a> b <c>', <.*> matches the
                    whole string while <.*?> matches only '<a>'.
        - (...)     Capturing group.
        - (?=...)   Lookahead assertion: succeeds if ... matches next, but
                    consumes nothing.  E.g. 'Isaac (?=Asimov)' matches
                    'Isaac ' only if it is followed by 'Asimov'.
        - \Z        Matches only at the very end of the string.

    Args:
        string: Template body text containing the fields.

    Returns:
        A dict mapping each field name to its raw value, in order of
        appearance (dicts preserve insertion order from Python 3.7 on).

    Example:
        Input:
            '|国章リンク =([[イギリスの国章|国章]])'
        Return:
            {'国章リンク': '([[イギリスの国章|国章]])'}
    """
    pattern = re.compile(
        r"""
            ^\|             # field marker at the beginning of a line
            (.+?)           # field name, non-greedy
            \s*
            =
            \s*
            (.+?)           # field value, non-greedy, may span lines
            (?=             # stop right before one of:
                \n\|        #   the next field
                |
                \n$         #   a newline followed by end of line / end of text
                |
                \Z          #   the end of the text (no trailing newline)
            )
        """,
        re.MULTILINE | re.DOTALL | re.VERBOSE,
    )
    # findall yields (name, value) tuples, which dict() consumes directly.
    return dict(pattern.findall(string))


def remove_markup(text: str) -> str:
    """Return *text* with MediaWiki emphasis quote marks deleted.

    Each match of two to five consecutive apostrophes (matched greedily) is
    replaced by the empty string; single apostrophes are left untouched.
    """
    emphasis_re = re.compile(
        r"""
            \'{2,5} # 2〜5個の'
        """,
        re.VERBOSE,
    )
    return emphasis_re.sub("", text)


# Wikitext of the target article (see uk_text.txt).
uk_text = get_uk_text("jawiki-country.json")

# ans25: template body, then its fields as a dict (see 25_en_basic_info.json).
basic_info = get_basic_info(uk_text)
fields = get_content(basic_info)

# ans26: same field names, values passed through remove_markup.
result = dict(zip(fields, map(remove_markup, fields.values())))
utils.save_json(result, "26_no_markup.json")
# Example of the effect on one field:
# ('確立形態4', "現在の国号「'''グレートブリテン及び北アイルランド連合王国'''」に変更")
# ->
# ('確立形態4', '現在の国号「グレートブリテン及び北アイルランド連合王国」に変更')

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
0
0