Wordファイルからハイパーリンクだけ取り出したい。

初心者

Last updated at 2024-08-27 / Posted at 2024-07-22

from docx import Document

def extract_hyperlinks_with_text(docx_path, *, include_external=False):
    """Collect the hyperlinks found in the main body of a Word document.

    Every ``w:hyperlink`` element in the document part's XML is visited.
    For each one, the text of all descendant ``w:t`` elements is
    concatenated and paired with the link destination.

    Args:
        docx_path: Path (or file-like object) passed straight to
            ``Document``.
        include_external: When False (the default) only hyperlinks that
            carry a ``w:anchor`` attribute (links to a bookmark inside the
            document) are returned. When True, hyperlinks without an anchor
            are additionally resolved through their ``r:id`` relationship,
            and the relationship target (typically a URL) is used as the
            link.

    Returns:
        A list of ``(text, link)`` tuples in document order. Hyperlinks for
        which no destination could be determined are skipped.
    """
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    r_ns = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'

    # Open the Word document and grab the root XML element of its main part.
    doc = Document(docx_path)
    doc_xml = doc.part._element

    hyperlinks = []

    # Walk the XML tree and look at every hyperlink element.
    for hyperlink in doc_xml.findall(f'.//{{{w_ns}}}hyperlink'):
        # An empty <w:t/> element has text None; treat it as an empty
        # string so that join() does not raise TypeError.
        text = ''.join(
            el.text or '' for el in hyperlink.findall(f'.//{{{w_ns}}}t')
        )

        # w:anchor holds the bookmark name for in-document links.
        link = hyperlink.get(f'{{{w_ns}}}anchor')

        if not link and include_external:
            # External links store only a relationship id on the element;
            # the actual target lives in the part's relationships.
            rel_id = hyperlink.get(f'{{{r_ns}}}id')
            rels = doc.part.rels
            if rel_id and rel_id in rels:
                link = rels[rel_id].target_ref

        # Only record hyperlinks whose destination is known.
        if link:
            hyperlinks.append((text, link))

    return hyperlinks

# Usage example. Guarded so that importing this module does not try to open
# the placeholder document path below.
if __name__ == "__main__":
    docx_path = 'path_to_your_document.docx'
    hyperlinks_with_text = extract_hyperlinks_with_text(docx_path)
    # Print each hyperlink's visible text together with its destination.
    for text, link in hyperlinks_with_text:
        print(f'Text: {text}, Hyperlink: {link}')

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up