More than 5 years have passed since last update.

HDF5へ様々なファイルの格納

Posted at 2020-03-20

はじめに

様々な実験データファイルをまとめる必要があり、階層構造をもつHDF形式でのファイルの格納方法について備忘録として記載します。

参考情報

・HDF一般
Pythonを使いHDFファイルの階層構造を把握してデータを読み込む
 意外と奥が深い、HDFの世界（Python・h5py入門）

・text、jsonファイルの保存の参考
HDF5のpython実装で文字列データを圧縮して保存するだけ
 How to use HDF5 files in Python

・excelファイルの保存
pyconjp 発表「知ろう！使おう！HDF5ファイル！」の落ち穂拾い

環境

Win10　Pro
Anaconda Python3.7

確認のためにHDFView-3.1.0を利用
HDFView

テスト内容

HDFのデータ構造（超簡単に）

HDF5 グループ = フォルダと同じ
データセット = 1つのファイル（CSVなど）と同じ

テストで作るもの

HDFファイルの下に、以下のグループ（フォルダと同じ）を作成する。
aaa、bbb、ccc、ddd、eee

それぞれのフォルダに
aaaにはCSVデータ、bbbにはnp.arrayデータ、 cccにはimageデータ、 dddにはtxt、JSONデータ、eeeにはexcelデータ
を入れる。
HDFView-3.1.0で確認する。

コード

import json
import numpy as np
import h5py
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

試しに入れるデータ作成

def base_func(x, a, b, c):
    """Evaluate the shifted parabola ``c + a*(x - b)**2``.

    Args:
        x: Point(s) at which to evaluate; scalars and NumPy arrays both work
            because only arithmetic operators are applied.
        a: Quadratic coefficient.
        b: Horizontal shift (x position of the vertex).
        c: Vertical offset (value at the vertex).

    Returns:
        The parabola's value at ``x``.
    """
    shifted = x - b
    return c + a * shifted**2

# Integer sample points covering [-30, 30).
x = np.arange(-30, 30)
# Parabola parameters in base_func order: a, b, c.
para = [2.0, 5.0, 10.0]
# Fix the seed so the noisy samples are reproducible.
np.random.seed(seed=10)
# Ideal curve plus Gaussian noise (mean 0, standard deviation 60).
y = base_func(x, *para) + np.random.normal(0, 60, len(x))

# Quick visual check of the generated samples.
plt.scatter(x, y)
plt.show()

# Put x and y side by side as the two columns of one 2-D array.
xy = np.column_stack((x, y))

# Hold the same samples in a DataFrame so pandas can export them to files.
df = pd.DataFrame(dict(x=x, y=y))

参考：
pandasのファイル出力形式まとめ

# Write the DataFrame out in every format used later in this article.
# CSV without the index column.
df.to_csv('csvdata.csv', index=False)
# JSON, once with a .json extension ...
df.to_json('jsondata2.json')
# ... and once more as a .txt file, used later as the plain-text sample.
df.to_json('jsondata.txt')
# Excel workbook without the index column.
df.to_excel('excel.xlsx', index=False)

groupの作成

フォルダを作るのとほぼ同じ

hdf_file_name = 'test1.h5'
# Open the file in 'w' mode and create three top-level groups in it.
with h5py.File(hdf_file_name, mode='w') as f:
    g_aaa, g_bbb, g_ccc = (
        f.create_group(group_name) for group_name in ('aaa', 'bbb', 'ccc')
    )

csvを格納

CSVを一度Pandasで読み込んでnp.arrayにして書き込む

hdf_file_name = 'test_csv.h5'
with h5py.File(hdf_file_name, mode='w') as f:
    g_aaa = f.create_group('aaa')

    # The CSV is parsed by pandas first; the DataFrame's underlying NumPy
    # array is what gets written as the dataset 'aaa/csvdata'.
    df2 = pd.read_csv('csvdata.csv')
    g_aaa_data = g_aaa.create_dataset('csvdata', data=df2.values)

attributeをつける

hdf_file_name = 'test_csv.h5'
with h5py.File(hdf_file_name, mode='w') as f:
    g_aaa = f.create_group('aaa')

    # Same CSV -> DataFrame -> NumPy array path as the plain CSV example.
    df2 = pd.read_csv('csvdata.csv')
    g_aaa_data = g_aaa.create_dataset('csvdata', data=df2.values)

    # Added in this step: attach descriptive attributes to the dataset.
    csv_attrs = g_aaa_data.attrs
    csv_attrs['year'] = '2020'
    csv_attrs['features'] = 'Quadratic curve'

np.arrayを格納

hdf_file_name = 'test_np.h5'
with h5py.File(hdf_file_name, mode='w') as f:
    g_bbb = f.create_group('bbb')
    # A NumPy array can be handed to create_dataset directly.
    g_bbb_data = g_bbb.create_dataset('npdata', data=xy)

画像を格納

matplotlib,　opencv,　pillowなどを使って画像を読み込んでnumpy形式にします。
Pythonの画像読み込み: PIL, OpenCV, scikit-image

# Load the image with matplotlib, display it, and print the loaded object's type.
img = plt.imread('lena.jpeg')
plt.imshow(img)
print(type(img))

# Load the same image with Pillow; it must be converted to np.array afterwards.
from PIL import Image
img2 = Image.open('lena.jpeg')
plt.imshow(img2)
print(type(img2))
# A Pillow image is not an np.array, so convert it explicitly.
img3= np.array(img2)
print(type(img3))

    <class 'PIL.JpegImagePlugin.JpegImageFile'>
    <class 'numpy.ndarray'>

hdf_file_name = 'test_img.h5'
with h5py.File(hdf_file_name, mode='w') as f:
    g_ccc = f.create_group('ccc')

    # Image loaded via matplotlib.
    g_ccc_data = g_ccc.create_dataset('lena', data=img)
    g_ccc_data.attrs['name'] = 'lena'

    # Image loaded via Pillow and converted with np.array.
    g_ccc_data2 = g_ccc.create_dataset('lena2', data=img3)
    g_ccc_data2.attrs['name'] = 'lena2'

TextとJSONを格納

参考：
HDF5のpython実装で文字列データを圧縮して保存するだけ
 How to use HDF5 files in Python

# Read the text file in full and print it to check what will be stored.
with open('jsondata.txt') as f:
    s = f.read()

print(s)

# Store the text file's contents as a variable-length string dataset.
with h5py.File("test_text.h5", "w") as f:
    with open('jsondata.txt') as ff:
        ss = ff.read()

    g_ddd = f.create_group("ddd")
    # Variable-length str dtype for the dataset elements.
    dt = h5py.special_dtype(vlen=str)
    # One-element dataset; its single slot receives the whole text.
    g_ddd_data = g_ddd.create_dataset("text", shape=(1,), dtype=dt)
    g_ddd_data[0] = ss

# Read the stored text back and print it together with its type.
with h5py.File("test_text.h5", "r") as f:
    d = f["ddd/text"]
    text = d[0]
    # h5py >= 3.0 returns variable-length strings as bytes, while older
    # versions return str; decode so the result is a str either way.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    print(text)
    print(type(text))


    {"x":{"0":-30,"1":-29,"2":-28,"3":-27,"4":-26,"5":-25,"6":-24,"7":-23,"8":-22,"9":-21,"10":-20,"11":-19,"12":-18,"13":-17,"14":-16,"15":-15,"16":-14,"17":-13,"18":-12,"19":-11,"20":-10,"21":-9,"22":-8,"23":-7,"24":-6,"25":-5,"26":-4,"27":-3,"28":-2,"29":-1,"30":0,"31":1,"32":2,"33":3,"34":4,"35":5,"36":6,"37":7,"38":8,"39":9,"40":10,"41":11,"42":12,"43":13,"44":14,"45":15,"46":16,"47":17,"48":18,"49":19,"50":20,"51":21,"52":22,"53":23,"54":24,"55":25,"56":26,"57":27,"58":28,"59":29},"y":{"0":2539.8951902478,"1":2364.9167384639,"2":2095.2759824733,"3":2057.4969690043,"4":1969.2801584334,"5":1766.7948663569,"6":1707.9306951415,"7":1584.5129115429,"8":1468.257485856,"9":1351.5239873644,"10":1285.9815713972,"11":1234.1822424287,"12":1010.096059769,"13":1039.696444679,"14":905.7178078075,"15":836.7082567698,"16":663.8038672901,"17":666.1082127069,"18":677.0722201102,"19":457.2117068529,"20":341.3363031605,"21":297.3976622461,"22":363.96420984,"23":441.0980398427,"24":319.4214752046,"25":310.3573327985,"26":177.9489529501,"27":221.8797826256,"28":91.7251207221,"29":118.7922510916,"30":43.9609686796,"31":9.0414591537,"32":35.9624977456,"33":-10.5685208956,"34":90.5083848518,"35":21.700796754,"36":36.0125992951,"37":-2.2579402251,"38":103.3883358253,"39":-1.9181701201,"40":99.6138930736,"41":60.9476865136,"42":51.6339984142,"43":108.6397669869,"44":123.7245314638,"45":197.2381416774,"46":231.6515852362,"47":316.7301961788,"48":381.9091601794,"49":393.1547845364,"50":458.445679791,"51":539.3456522282,"52":555.60725572,"53":700.4896011956,"54":782.533484303,"55":822.2148478396,"56":1035.6822198931,"57":1033.0475362506,"58":1061.2636517284,"59":1140.2691731715}}
    <class 'str'>

# Store a JSON document as a single string dataset.
with h5py.File("test_json.h5", "w") as f:

    # Parse the file first, so what gets stored is the re-serialized form
    # produced by json.dumps rather than the raw file text.
    with open('jsondata2.json') as ff:
        ssj = json.load(ff)

    g_ddd = f.create_group("ddd")
    json_text = json.dumps(ssj)
    g_ddd_data2 = g_ddd.create_dataset("json", data=json_text)

# Read the stored JSON string back, parse it, and list its top-level entries.
with h5py.File("test_json.h5", 'r') as f:
    # [()] reads the whole dataset as a single value.
    metadata = json.loads(f['ddd/json'][()])
    for k in metadata:
        print(f'{k} => {metadata[k]}')


    x => {'0': -30, '1': -29, '2': -28, '3': -27, '4': -26, '5': -25, '6': -24, '7': -23, '8': -22, '9': -21, '10': -20, '11': -19, '12': -18, '13': -17, '14': -16, '15': -15, '16': -14, '17': -13, '18': -12, '19': -11, '20': -10, '21': -9, '22': -8, '23': -7, '24': -6, '25': -5, '26': -4, '27': -3, '28': -2, '29': -1, '30': 0, '31': 1, '32': 2, '33': 3, '34': 4, '35': 5, '36': 6, '37': 7, '38': 8, '39': 9, '40': 10, '41': 11, '42': 12, '43': 13, '44': 14, '45': 15, '46': 16, '47': 17, '48': 18, '49': 19, '50': 20, '51': 21, '52': 22, '53': 23, '54': 24, '55': 25, '56': 26, '57': 27, '58': 28, '59': 29}
    y => {'0': 2539.8951902478, '1': 2364.9167384639, '2': 2095.2759824733, '3': 2057.4969690043, '4': 1969.2801584334, '5': 1766.7948663569, '6': 1707.9306951415, '7': 1584.5129115429, '8': 1468.257485856, '9': 1351.5239873644, '10': 1285.9815713972, '11': 1234.1822424287, '12': 1010.096059769, '13': 1039.696444679, '14': 905.7178078075, '15': 836.7082567698, '16': 663.8038672901, '17': 666.1082127069, '18': 677.0722201102, '19': 457.2117068529, '20': 341.3363031605, '21': 297.3976622461, '22': 363.96420984, '23': 441.0980398427, '24': 319.4214752046, '25': 310.3573327985, '26': 177.9489529501, '27': 221.8797826256, '28': 91.7251207221, '29': 118.7922510916, '30': 43.9609686796, '31': 9.0414591537, '32': 35.9624977456, '33': -10.5685208956, '34': 90.5083848518, '35': 21.700796754, '36': 36.0125992951, '37': -2.2579402251, '38': 103.3883358253, '39': -1.9181701201, '40': 99.6138930736, '41': 60.9476865136, '42': 51.6339984142, '43': 108.6397669869, '44': 123.7245314638, '45': 197.2381416774, '46': 231.6515852362, '47': 316.7301961788, '48': 381.9091601794, '49': 393.1547845364, '50': 458.445679791, '51': 539.3456522282, '52': 555.60725572, '53': 700.4896011956, '54': 782.533484303, '55': 822.2148478396, '56': 1035.6822198931, '57': 1033.0475362506, '58': 1061.2636517284, '59': 1140.2691731715}

excelを格納（圧縮なし）

参考：
pyconjp 発表「知ろう！使おう！HDF5ファイル！」の落ち穂拾い

# Store an Excel workbook byte-for-byte as a variable-length uint8 array.
with h5py.File('test_excel.h5', mode='w') as f:
    # Read the workbook as raw bytes; no Excel parsing is involved.
    with open('excel.xlsx', mode='rb') as ef:
        eb = ef.read()
    g_eee = f.create_group("eee")
    # Variable-length uint8 dtype. The original `dt = dtype = ...` chained
    # assignment also bound a stray `dtype` name; only `dt` is needed.
    dt = h5py.special_dtype(vlen=np.dtype('uint8'))
    # View the bytes as a uint8 array so they can be assigned to the dataset.
    eba = np.frombuffer(eb, dtype='uint8')
    # One-element dataset; its single slot holds the whole file.
    g_eee_data = g_eee.create_dataset("excel", (1,), dtype=dt)
    g_eee_data[0] = eba

# Pull the stored bytes out of the HDF5 file and save them as a workbook again.
with h5py.File('test_excel.h5', mode='r') as f:
    d = f["eee/excel"]

    # Element 0 is the uint8 array holding the file contents; write its raw bytes.
    with open('excelout.xlsx', 'wb') as w:
        w.write(d[0].tobytes())

比較してみる（同じ内容になっている）

今までの内容すべて

# Combine every example above into a single HDF5 file with groups aaa..eee.
# Relies on `xy`, `img` and `img3` created by the earlier snippets.
hdf_file_name = 'test_all.h5'
with h5py.File(hdf_file_name, mode='w') as f:

    # --- aaa: CSV data, stored as the DataFrame's NumPy array ---
    g_aaa = f.create_group('aaa')
    df2 = pd.read_csv('csvdata.csv')
    g_aaa_data = f.create_dataset('aaa/csvdata', data=df2.values)
    # NOTE(review): the attribute key is spelled 'colums' (sic). It is kept
    # unchanged so the written attribute name stays the same; rename it
    # deliberately if 'columns' is what readers should look up.
    g_aaa_data.attrs['colums'] = '{}'.format(df2.columns)
    g_aaa_data.attrs['year'] = '2020'
    g_aaa_data.attrs['features'] = 'Quadratic curve'

    # --- bbb: NumPy array ---
    g_bbb = f.create_group('bbb')
    g_bbb_data = f.create_dataset('bbb/npdata', data=xy)

    # --- ccc: images (matplotlib-loaded and Pillow-loaded) ---
    g_ccc = f.create_group('ccc')
    g_ccc_data = f.create_dataset('ccc/lena', data=img)
    g_ccc_data.attrs['name'] = 'lena'
    g_ccc_data2 = f.create_dataset('ccc/lena2', data=img3)
    g_ccc_data2.attrs['name'] = 'lena2'

    g_ddd = f.create_group('ddd')
    g_eee = f.create_group('eee')

    # --- ddd: plain text and JSON ---
    # The text/JSON handles get their own names: reusing `f` here would
    # rebind the HDF5 file object while its `with` block is still active.
    with open('jsondata.txt') as text_file:
        ss = text_file.read()
    dt = h5py.special_dtype(vlen=str)
    g_ddd_data = g_ddd.create_dataset('text', (1,), dtype=dt)
    g_ddd_data[0] = ss

    with open('jsondata2.json') as json_file:
        ssj = json.load(json_file)
    g_ddd_dataj = g_ddd.create_dataset('json', data=json.dumps(ssj))

    # --- eee: Excel workbook as raw bytes ---
    with open('excel.xlsx', mode='rb') as ef:
        eb = ef.read()

    # Variable-length uint8 dtype. The original `dte = dtype = ...` chained
    # assignment also bound a stray `dtype` name; only `dte` is needed.
    dte = h5py.special_dtype(vlen=np.dtype('uint8'))
    eba = np.frombuffer(eb, dtype='uint8')
    g_eee_data = g_eee.create_dataset("excel", (1,), dtype=dte)
    g_eee_data[0] = eba

まとめ

いろいろなファイルをHDFに格納するテストをしてみました。基本は、numpy形式にして入れる感じでした。numpyのデータタイプはいろいろあるので、numpyのデータタイプとHDFのデータタイプの対応について少し調べてみたいと思います。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up