"""
24. Extracting file references
Extract every media file referenced by the article.
"""
import json
import re
def get_uk_text(path):
    """Return the text of the record titled "イギリス" from a JSON Lines file.

    The file is read line by line; each non-blank line is parsed as one JSON
    object and the scan stops at the first object whose "title" value is
    "イギリス".

    Args:
        path: Path of the file to read (one JSON object per line).

    Returns:
        The "text" value of the matching record.

    Raises:
        ValueError: If no record in the file has the title "イギリス".
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding, which would garble the Japanese text on some systems.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            line_data = json.loads(line)
            if line_data.get("title") == "イギリス":
                return line_data["text"]
    raise ValueError(f'no record titled "イギリス" found in {path!r}')
# Load the "イギリス" article text once; the code below extracts file names from it.
uk_text = get_uk_text("jawiki-country.json")
# See uk_text.txt
# ans24: answer to exercise 24 (see the module docstring)
# Compiled once at import time. re.VERBOSE makes the regex engine ignore
# whitespace and '#' comments inside the pattern, so it can be annotated inline.
_MEDIA_FILE_RE = re.compile(
    r"""
    (?:File|ファイル)   # non-capturing group: the 'File' or 'ファイル' prefix
    :
    (.+?)              # captured group: the file name, matched non-greedily
    (?:\||\]\])        # terminator: a pipe (options follow) or closing brackets
    """,
    re.VERBOSE,
)


def get_file(string: str) -> list:
    """Return the names of all media files referenced in wiki markup.

    A reference is the text between a 'File:' or 'ファイル:' prefix and the
    first following pipe character or closing double bracket on the same
    line. Accepting the closing bracket as a terminator means a link without
    options, such as '[[File:a.jpg]]', yields 'a.jpg' instead of being missed
    or running on to the next pipe on the line.

    Regular-expression notes (see the Python ``re`` documentation):
      - (...) captures what it matches; (?:...) groups without capturing.
      - '+?' is the non-greedy form of '+': it matches as few characters as
        possible, so the name ends at the first terminator.
      - '.' does not match a newline because re.DOTALL is not set, so a
        reference never spans lines.
      - re.MULTILINE only changes '^' and '$'; the pattern uses neither, so
        that flag is not needed.

    Args:
        string: The wiki markup to scan.

    Returns:
        A list of file names in order of appearance; empty if none are found.

    Example:
        >>> get_file('[[ファイル:2019 Greenwich Peninsula & Canary Wharf.jpg|150px]]')
        ['2019 Greenwich Peninsula & Canary Wharf.jpg']
        >>> get_file('[[File:a.jpg]] and [[b|c]]')
        ['a.jpg']
    """
    return _MEDIA_FILE_RE.findall(string)
# Extract every referenced media file name from the article and print one per line.
files = get_file(uk_text)
for f in files:
    print(f)
# Sample of the printed output:
# Royal Coat of Arms of the United Kingdom.svg
# Descriptio Prime Tabulae Europae.jpg
# Lenepveu, Jeanne d'Arc au siège d'Orléans.jpg
# (Web page footer captured with the article; not part of the program.)
# More than 3 years have passed since last update.
# Register as a new user and use Qiita more conveniently
# - You get articles that match your needs
# - You can efficiently read back useful information
# - You can use dark theme