wikidataからnetworkxをかましてmermaid記法で紫式部の系図を作成

Posted at 2024-08-12

概要

　標題のとおりですが、以下の内容になります。

wikidataから紫式部の先祖、子孫のデータをjsonで抽出
jsonから配列データを作成し、networkxに入力、距離の近い（〇親等）データを抽出
mermaid記法で紫式部の系図を作成

wikidata

wikidataから、紫式部の先祖、子孫のデータをjsonで抽出します。
Wikidata Query Service を使って、以下のクエリを入力すると、
135件のデータが出力できます（2024/8/12現在）。
Q81731 が紫式部で、P40が子、P22が父、P25が母、P21が性別です。「?human wdt:P31 wd:Q5」は、対象を「ヒト」(Q5) の項目に限定する条件です。述語の最後にアスタリスクをつけているのが、プロパティパスの ZeroOrMorePath（0回以上の繰り返し）というやつです。

SELECT DISTINCT  ?human ?father ?mother ?humanLabel ?fatherLabel ?motherLabel ?sex
WHERE
{
{wd:Q81731 wdt:P40* ?human .
    ?human wdt:P31 wd:Q5 .      
    OPTIONAL{?human wdt:P22 ?father .}
    OPTIONAL{?human wdt:P25 ?mother .}
    OPTIONAL{?human wdt:P21 ?sex .}
}
UNION
{
wd:Q81731 wdt:P22* ?human .
    ?human wdt:P31 wd:Q5 .      
    OPTIONAL{?human wdt:P22 ?father .}
    OPTIONAL{?human wdt:P25 ?mother .}
    OPTIONAL{?human wdt:P21 ?sex .}
}
UNION
{
wd:Q81731 wdt:P25* ?human .
    ?human wdt:P31 wd:Q5 .      
    OPTIONAL{?human wdt:P22 ?father .}
    OPTIONAL{?human wdt:P25 ?mother .}
    OPTIONAL{?human wdt:P21 ?sex .}
  }

 SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],ja" }
}

ちなみに、以下のURLで、上記のクエリにアクセスできます。
https://w.wiki/Atks

これを、APIでjsonで取り出し、配列を作ります。

# wikidata API からJSONデータを取得
import requests
import re
import pandas as pd

def wikidata2json(loc, timeout=30):
    """Fetch a SPARQL result set over HTTP and return it as parsed JSON.

    Args:
        loc: Full request URL, including the URL-encoded query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    # A GET request carries no body, so the wanted format is negotiated with
    # Accept; Content-Type would describe a payload that does not exist.
    headers = {"Accept": "application/sparql-results+json"}
    response = requests.get(loc, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()
    
# JSONからリストを生成
def json2list(data):
    """Convert a SPARQL JSON response into pedigree rows.

    Args:
        data: Decoded JSON with the bindings under ``data['results']['bindings']``.
            Each binding must carry ``human`` and ``humanLabel``; ``father``,
            ``fatherLabel``, ``mother``, ``motherLabel`` and ``sex`` are optional.

    Returns:
        A list of rows shaped as
        ``['FA00', human, father, mother, sex, 0, human_id, father_id, mother_id]``.
        ``sex`` is 2 for Q6581072, 1 for Q6581097 and 0 otherwise.
        A missing parent gets a placeholder label (``<label>の父`` / ``<label>の母``)
        and a placeholder id (``<human_id>_father`` / ``<human_id>_mother``).
    """
    entity_prefix = 'http://www.wikidata.org/entity/'
    sex_codes = {
        entity_prefix + 'Q6581072': 2,
        entity_prefix + 'Q6581097': 1,
    }
    # Parentheses (ASCII and full-width) and spaces are replaced because they
    # cannot be used in Mermaid node names. Raw string: no escapes are needed
    # inside a character class, and '\(' is an invalid str escape.
    unsafe_chars = re.compile(r'[()（） ]')

    list_human = []
    for binding in data['results']['bindings']:
        label = binding['humanLabel']['value']
        human_id = binding['human']['value'].replace(entity_prefix, '')

        # Optional keys: fall back to a placeholder derived from the person.
        father = binding.get('fatherLabel', {}).get('value', label + 'の父')
        father_id = binding.get('father', {}).get(
            'value', human_id + '_father').replace(entity_prefix, '')
        mother = binding.get('motherLabel', {}).get('value', label + 'の母')
        mother_id = binding.get('mother', {}).get(
            'value', human_id + '_mother').replace(entity_prefix, '')

        human = unsafe_chars.sub('//', label)
        father = unsafe_chars.sub('//', father)
        mother = unsafe_chars.sub('//', mother)

        sex = sex_codes.get(binding.get('sex', {}).get('value'), 0)

        list_human.append(
            ['FA00', human, father, mother, sex, 0, human_id, father_id, mother_id])
    return list_human

# 父、母の行が存在しない場合に作成して追加
def single_parent(df_human):
    """Add a row for every parent that has no row of their own.

    For each row, the values in ``Maternal_ID`` and ``Paternal_ID`` are looked
    up in ``Individual_ID``. A parent that is not found gets a new row
    ``['FA00', <parent>, 0, 0, <sex>, 0, 0, 0, 0]`` with sex 2 for a mother and
    1 for a father. The new rows are labelled ``<index>_nomaternal`` and
    ``<index>_nopaternal``.

    Args:
        df_human: Nine-column pedigree frame containing at least the columns
            ``Individual_ID``, ``Paternal_ID`` and ``Maternal_ID``.

    Returns:
        A new frame with the extra parent rows appended and every row whose
        ``Individual_ID`` equals 0 removed. The input frame is not modified.
    """
    # Set lookup instead of rescanning the whole column for every row.
    known = set(df_human['Individual_ID'])
    parent_specs = (
        ('Maternal_ID', 2, '_nomaternal'),
        ('Paternal_ID', 1, '_nopaternal'),
    )

    extra_rows = {}
    for index, row in df_human.iterrows():
        for column, sex, suffix in parent_specs:
            parent = row[column]
            if parent in known:
                continue
            # The mother row must use Maternal_ID (it used Paternal_ID before,
            # which registered the father as a female individual).
            extra_rows[str(index) + suffix] = ['FA00', parent, 0, 0, sex, 0, 0, 0, 0]
            # Remember the parent so siblings do not add the same row again.
            known.add(parent)

    if extra_rows:
        extra = pd.DataFrame.from_dict(
            extra_rows, orient='index', columns=df_human.columns)
        df_human = pd.concat([df_human, extra])

    # Drop placeholder rows whose individual is the 0 sentinel.
    return df_human[df_human['Individual_ID'] != 0]

# spouse(配偶者)列追加
def add_spouse(df_human):
    """Add a ``spouse`` column that identifies each row's parent couple.

    The value is ``"<father>-<mother>"``, built from the ``father`` and
    ``mother`` columns. The column is added to ``df_human`` in place and the
    same frame is returned.
    """
    couple_keys = [
        f"{father}-{mother}"
        for father, mother in zip(df_human['father'], df_human['mother'])
    ]
    df_human['spouse'] = couple_keys
    return df_human

# Build the request URL from a readable query instead of a hand-encoded string,
# so the query (e.g. the start item wd:Q81731) can be edited in one place.
from urllib.parse import urlencode

SPARQL_ENDPOINT = 'https://query.wikidata.org/sparql'
SPARQL_QUERY = """\
SELECT DISTINCT ?human ?father ?mother ?humanLabel ?fatherLabel ?motherLabel ?sex
WHERE
{
  {
    wd:Q81731 wdt:P40* ?human .
    ?human wdt:P31 wd:Q5 .
    OPTIONAL { ?human wdt:P22 ?father . }
    OPTIONAL { ?human wdt:P25 ?mother . }
    OPTIONAL { ?human wdt:P21 ?sex . }
  }
  UNION
  {
    wd:Q81731 wdt:P22* ?human .
    ?human wdt:P31 wd:Q5 .
    OPTIONAL { ?human wdt:P22 ?father . }
    OPTIONAL { ?human wdt:P25 ?mother . }
    OPTIONAL { ?human wdt:P21 ?sex . }
  }
  UNION
  {
    wd:Q81731 wdt:P25* ?human .
    ?human wdt:P31 wd:Q5 .
    OPTIONAL { ?human wdt:P22 ?father . }
    OPTIONAL { ?human wdt:P25 ?mother . }
    OPTIONAL { ?human wdt:P21 ?sex . }
  }

  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],ja" }
}
"""
loc = SPARQL_ENDPOINT + '?' + urlencode({'query': SPARQL_QUERY, 'format': 'json'})

# Fetch the JSON result from Wikidata.
data = wikidata2json(loc)
# Convert the JSON bindings into a list of rows.
list_human = json2list(data)

# Load the rows into a pandas DataFrame.
df_human = pd.DataFrame(
    list_human,
    columns=['#Family_ID', 'Individual_ID', 'Paternal_ID', 'Maternal_ID',
             'Sex', 'Phenotype', 'human', 'father', 'mother'],
)
# Add rows for parents that do not have a row of their own.
df_human = single_parent(df_human)
# Add the spouse (parent couple) column.
df_human = add_spouse(df_human)
# Remove duplicated rows.
df_human.drop_duplicates(inplace=True)

# dataframeから配列の作成
#条件にマッチしたIndexを取得
def drop_index(df):
    """Return ``df`` without the rows whose ``spouse`` value is ``'0-0'``.

    ``'0-0'`` is the couple key of a row that has no parent ids. Rows are
    removed by index label; the input frame itself is left unchanged.
    """
    is_placeholder = df['spouse'].eq('0-0')
    return df.drop(index=df.index[is_placeholder])

# Edges from each parent-couple node (spouse) to the child (Individual_ID).
person_spouse = drop_index(df_human[['spouse', 'Individual_ID']])
person_spouse_list = person_spouse.values.tolist()

# Edges from the father to the parent-couple node.
paternal_spouse = drop_index(df_human[['Paternal_ID', 'spouse']])
paternal_spouse_list = paternal_spouse.values.tolist()

# Edges from the mother to the parent-couple node.
maternal_spouse = drop_index(df_human[['Maternal_ID', 'spouse']])
maternal_spouse_list = maternal_spouse.values.tolist()

# All edges of the pedigree: couple -> child, father -> couple, mother -> couple.
pedigree_list = [
    *person_spouse_list,
    *paternal_spouse_list,
    *maternal_spouse_list,
]

networkx

このままだと、系図を描画したときに、細かすぎるので、networkx に入れて、抽出します。

# networkx is used as ``nx`` below but was never imported.
import networkx as nx

# Keep only the rows where both parents are set (0 marks "no parent").
df_human.columns
has_both_parents = (df_human['Paternal_ID'] != 0) & (df_human['Maternal_ID'] != 0)
# Copy so that adding a column below does not write into a slice of df_human.
df_human_2 = df_human[has_both_parents].copy()
df_human_2[['Individual_ID', 'Paternal_ID', 'Maternal_ID']]

# Pair up (Paternal_ID, Maternal_ID) and map each individual to that pair.
df_human_2['tuple'] = list(zip(df_human_2['Paternal_ID'], df_human_2['Maternal_ID']))
adjacency_dict = dict(zip(df_human_2['Individual_ID'], df_human_2['tuple']))

# Directed graph: every individual points to their father and mother.
H = nx.DiGraph(adjacency_dict)
list(H.edges())

nx.draw(H, node_color='lightblue', with_labels=True, font_family='Meiryo')

ちなみにそのまま描画すると、以下のように意味不明です。

エッジをソースから幅優先探索で反復する。

import itertools
# Collect relatives within two steps of 紫式部: once following the edges
# backwards (reverse=True) and once following them forwards (reverse=False).
bfs_G1 = list(nx.bfs_edges(H, source="紫式部", reverse=True, depth_limit=2))
bfs_G2 = list(nx.bfs_edges(H, source="紫式部", reverse=False, depth_limit=2))
bfs_G = [*bfs_G1, *bfs_G2]
# Flatten the (from, to) edge pairs into one list of node names.
duplication = list(itertools.chain(*bfs_G))
# A set removes the names that occur in more than one edge.
family_2 = list(set(duplication))

こんなリストができます。

['藤原雅正', '大弐三位', '高階成章の娘////藤原通宗の妻//', '藤原為信////藤原文範の子//', '藤原定方の娘////藤原雅正正室//', '藤原為時', '藤原為信の娘////藤原為時の妻//', '藤原為信の娘////藤原為時の妻//の母', '源良宗の妻', '高階為家', '紫式部']

ちなみに、こういう抽出の仕方をすると、もう少し広めのデータになります。

# Extract with nx.ego_graph: the induced subgraph of all nodes within the given
# radius of the centre node; undirected=True ignores the edge direction.
ego_G = nx.ego_graph(H, '紫式部', radius=6, center=True, undirected=True)
family = list(ego_G.nodes)

mermaid

networkxで抽出したデータから、mermaid記法のデータを作ります。

# Keep only the rows whose individual is in the extracted relatives list.
is_close_relative = df_human_2['Individual_ID'].isin(family_2)
df_human_3 = df_human_2[is_close_relative]

# dataframe から mermaid用のリストを作る
def person2list(df):
    """Build the edge list for the Mermaid chart from a pedigree frame.

    Three kinds of ``[from, to]`` pairs are produced, in this order:
    couple -> child, father -> couple, mother -> couple. Rows whose ``spouse``
    value is ``'0-0'`` are left out by ``drop_index``.

    Args:
        df: Frame with the columns ``spouse``, ``Individual_ID``,
            ``Paternal_ID`` and ``Maternal_ID``.

    Returns:
        A list of two-element lists.
    """
    edge_columns = (
        ['spouse', 'Individual_ID'],   # couple -> child
        ['Paternal_ID', 'spouse'],     # father -> couple
        ['Maternal_ID', 'spouse'],     # mother -> couple
    )
    edges = []
    for columns in edge_columns:
        edges.extend(drop_index(df[columns]).values.tolist())
    return edges

pedigree_list = person2list(df_human_3)

# Couple nodes are named after ids that start with 'Q'. Give them an empty
# label and the transparent class so only the connecting lines are visible.
for edge in pedigree_list:
    target = edge[1]
    if isinstance(target, str) and target.startswith('Q'):
        edge[1] = target + '( ):::class1'

# Remove duplicated edges. dict.fromkeys keeps the first occurrence in order,
# so the generated chart is identical from run to run (a set is not ordered).
pedigree_list = [list(edge) for edge in dict.fromkeys(map(tuple, pedigree_list))]

# One Mermaid statement ("A --> B") per edge.
pedigree_list_arrow = [f'{source} --> {target}' for source, target in pedigree_list]

def generate_report(export_path, lines=None):
    """Write the pedigree as a Mermaid flowchart inside a Markdown file.

    Args:
        export_path: Path of the Markdown file to create (overwritten).
        lines: Iterable of Mermaid statements such as ``"A --> B"``.
            Defaults to the module-level ``pedigree_list_arrow``.

    Returns:
        The fixed header text that is written before the edge statements.
    """
    if lines is None:
        lines = pedigree_list_arrow

    # Fixed part: fence opening, Mermaid front matter and the class definition.
    text = """\
```mermaid
---
title: familly tree
config:
  theme: base
  themeCSS:
    .node circle {fill: #FAFAFA;}
    .label text {fill: #FAFBF9 !important;} .output {font-size:60px;}
---
graph TD
    classDef class1 fill:#fff,fill-opacity:0
"""

    with open(export_path, 'w', encoding='utf-8') as f:
        f.write(text)
        # One edge statement per line.
        f.writelines(f"{line}\n" for line in lines)
        # Close the fenced block that the header opened.
        f.write("```\n")

    return text

# Write the report to test.md, relative to the current working directory.
path = "test.md"
generate_report(path)

できた系図は以下のとおりです。
wikidataにデータがある限りですが、一応、あっているようです。

ちなみに、複雑な方でmermaidを作って、貼ってみたところ、以下のようなエラーになりました。仕様どおりです。

ERROR Mermaidをレンダリングできませんでした
文字数は2000文字以内にしてください

nx.ego_graphを使ったバージョンはここに貼ってあるので、ご興味のある方は見てみてください。

おまけ

せっかくnetworkxに入れたので、平重盛（清盛の子）と紫式部のパスをみてみました。

# H maps each individual to their (father, mother), so a directed path from
# 平重盛 to 紫式部 is a chain of parents leading up to her.
print(nx.shortest_path(H, source='平重盛', target='紫式部'))
['平重盛', '高階基章の女////平清盛正室//', '高階基章', '高階為家の娘////源家実の妻//', '高階為家', '大弐三位', '紫式部']

意外と、近いですね。

おわりに

最初は、バイオインフォマティクス分野で使うpedファイルを作って、そこからgraphvizで系図が書けるかと試行錯誤しましたが、なかなかむずかしかったので、mermaidにしてしまいました。
ちなみに、こんなdotファイルを作ると、系図が書けます。

// Family tree sketch: box/oval nodes are people; the invisible point nodes
// (spouse_*, children_*) only serve to route the connecting lines.
graph G {
        edge [dir=none];
        graph [splines=ortho concentrate=true];
        "紫式部" [shape=box, regular=0, color="black", style="filled" fillcolor="white"];
        "大弐三位" [shape=box, regular=0, color="black", style="filled" fillcolor="white"];
        "高階成章" [shape=box, regular=0, color="black", style="filled" fillcolor="white"];
        {spouse_紫式部_藤原宣孝 [shape=point width=0 style=invis]};
        spouse_紫式部_藤原宣孝 -- 大弐三位
        {rank=same; 紫式部 -- spouse_紫式部_藤原宣孝 -- 藤原宣孝};
        {spouse_大弐三位_高階成章 [shape=point width=0 style=invis]};
        {rank=same; 大弐三位 -- spouse_大弐三位_高階成章 -- 高階成章};
        {children_大弐三位_高階成章 [shape=point width=0 style=invis]};
        spouse_大弐三位_高階成章 -- children_大弐三位_高階成章
        "高階為家" [shape=box, regular=0, color="black", style="filled" fillcolor="white"];
        children_大弐三位_高階成章 -- 高階為家
        // Quoted: an unquoted "//" would start a comment and cut the name short.
        children_大弐三位_高階成章 -- "高階成章の娘////藤原通宗の妻//"
        "源良宗の妻" [shape=oval, regular=0, color="black", style="filled" fillcolor="white"];
        children_大弐三位_高階成章 -- 源良宗の妻
        {spouse_藤原兼隆_大弐三位 [shape=point width=0 style=invis]};
        {rank=same; 藤原兼隆 -- spouse_藤原兼隆_大弐三位 -- 大弐三位};
        {children_藤原兼隆_大弐三位 [shape=point width=0 style=invis]};
        spouse_藤原兼隆_大弐三位 -- children_藤原兼隆_大弐三位
}

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up