More than 3 years have passed since the last update.

言語処理100本ノック(2020): 24

Posted at 2020-09-30

"""
24. ファイル参照の抽出
記事から参照されているメディアファイルをすべて抜き出せ．
"""

import json
import re


def get_uk_text(path):
    """Return the ``text`` field of the article titled "イギリス".

    The file at ``path`` is read as UTF-8 JSON Lines: one JSON object per
    line, each providing at least the ``title`` and ``text`` keys.

    Args:
        path: Location of the JSON Lines file to scan.

    Returns:
        The ``text`` value of the first object whose ``title`` is "イギリス".

    Raises:
        ValueError: If no object in the file has that title.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # the titles/text are Japanese.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            article = json.loads(line)
            if article["title"] == "イギリス":
                return article["text"]
    # Previously this fell through to an unbound local variable.
    raise ValueError(f"no article titled 'イギリス' found in {path!r}")


# Load the "text" field of the イギリス article once, at module level.
# The relative path means jawiki-country.json must be in the working directory.
uk_text = get_uk_text("jawiki-country.json")
# See uk_text.txt


# ans24
_MEDIA_FILE_PATTERN = re.compile(
    r"""
    (?:File|ファイル)    # namespace prefix, English or Japanese (not captured)
    :
    [ \t]*              # optional padding after the colon
    (                   # captured: the file name
        [^|\]\s]        #   first character is neither blank nor a delimiter
        [^|\]\n]*       #   then everything up to '|', ']' or the end of the line
    )
    """,
    re.VERBOSE,
)


def get_file(string: str) -> list:
    """Return the media file names referenced in a piece of wiki markup.

    A reference is the text following ``File:`` or ``ファイル:`` up to the
    first ``|``, the first ``]`` or the end of the line, whichever comes
    first. References without any options (``[[File:a.svg]]``) and bare
    ``File:name`` lines are therefore found too, and a name never runs on
    into later markup on the same line. Padding around the name is removed.

    Regular-expression notes
    (https://docs.python.org/3/library/re.html#regular-expression-syntax):
        - re.X (re.VERBOSE)    lets the pattern contain whitespace and comments.
        - re.M (re.MULTILINE)  makes '^' and '$' match at every line boundary;
                               it does not affect patterns without anchors.
        - re.S (re.DOTALL)     makes '.' match a newline as well.
        - (...)       capturing group.
        - (?:...)     non-capturing group.
        - [^...]      any single character except the listed ones.
        - *           zero or more repetitions, as many as possible (greedy).

    Args:
        string: Wiki markup to scan.

    Returns:
        A list of file names in order of appearance (duplicates are kept);
        an empty list when nothing matches.

    Example:
        Input:  '[[ファイル:2019 Greenwich Peninsula & Canary Wharf.jpg|150px]]'
        Return: ['2019 Greenwich Peninsula & Canary Wharf.jpg']
    """
    # The capture may end in blanks when the name is padded before '|' or ']]'.
    return [name.rstrip() for name in _MEDIA_FILE_PATTERN.findall(string)]


# Collect the media file names that get_file finds in the article text.
files = get_file(uk_text)

# Print one file name per line.
for f in files:
    print(f)
# Sample output:
# Royal Coat of Arms of the United Kingdom.svg
# Descriptio Prime Tabulae Europae.jpg
# Lenepveu, Jeanne d'Arc au siège d'Orléans.jpg

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do by signing up