##はじめに
様々な実験データファイルをまとめる必要があり、階層構造をもつHDF形式でのファイルの格納方法について備忘録として記載します。
####参考情報
・HDF一般
Pythonを使いHDFファイルの階層構造を把握してデータを読み込む
意外と奥が深い、HDFの世界(Python・h5py入門)
・text、jsonファイルの保存の参考
HDF5のpython実装で文字列データを圧縮して保存するだけ
How to use HDF5 files in Python
・excelファイルの保存
pyconjp 発表「知ろう!使おう!HDF5ファイル!」の落ち穂拾い
####環境
Win10 Pro
Anaconda Python3.7
確認のためにHDFView-3.1.0を利用
HDFView
##テスト内容
###HDFのデータ構造(超簡単に)
HDF5 グループ = フォルダと同じ
データセット = 1つのファイル(CSVなど)と同じ
###テストで作るもの
HDFファイルの下に、以下のグループ(フォルダと同じ)を作成する。
aaa、bbb、ccc、ddd、eee
それぞれのフォルダに
aaaにはCSVデータ、bbbにはnp.arrayデータ、 cccにはimageデータ、 dddにはtxt、JSONデータ、eeeにはexcelデータ
を入れる。
HDFView-3.1.0で確認する。
###コード
import json
import numpy as np
import h5py
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
####試しに入れるデータ作成
def base_func(x, a, b, c):
    """Evaluate the quadratic ``c + a * (x - b) ** 2``.

    Args:
        x: Scalar or NumPy array at which to evaluate the curve.
        a: Coefficient multiplying the squared term.
        b: Horizontal offset subtracted from ``x`` before squaring.
        c: Constant offset added to the result.

    Returns:
        The value of the curve at ``x``.
    """
    return c + a * (x - b) ** 2
# Sample points: a noisy quadratic evaluated on the integers -30..29.
x = np.arange(-30, 30, 1)
para = [2.0, 5.0, 10.0]
np.random.seed(seed=10)  # fixed seed so the noise is reproducible
y = base_func(x, *para) + np.random.normal(loc=0, scale=60, size=len(x))

plt.scatter(x, y)
plt.show()

# Stack x and y side by side as the two columns of one array.
xy = np.column_stack((x, y))

# The same data as a DataFrame, used to write the sample files below.
df = pd.DataFrame(dict(x=x, y=y))

# Export as CSV, JSON, text (JSON content) and Excel.
df.to_csv('csvdata.csv', index=False)
df.to_json('jsondata2.json')
df.to_json('jsondata.txt')
df.to_excel('excel.xlsx', index=False)
####groupの作成
フォルダを作るのとほぼ同じ
hdf_file_name = 'test1.h5'
# Create three empty groups at the root of a new HDF5 file.
# mode='w' creates the file, truncating it if it already exists.
with h5py.File(hdf_file_name, mode='w') as f:
    for group_name in ('aaa', 'bbb', 'ccc'):
        f.create_group(group_name)
####csvを格納
CSVを一度Pandasで読み込んでnp.arrayにして書き込む
hdf_file_name = 'test_csv.h5'
with h5py.File(hdf_file_name, mode='w') as f:
    g_aaa = f.create_group('aaa')
    # Load the CSV with pandas and hand its values to h5py as a NumPy array.
    df2 = pd.read_csv('csvdata.csv')
    g_aaa_data = g_aaa.create_dataset('csvdata', data=df2.values)
####attributeをつける
hdf_file_name = 'test_csv.h5'
with h5py.File(hdf_file_name, mode='w') as f:
    g_aaa = f.create_group('aaa')
    df2 = pd.read_csv('csvdata.csv')
    g_aaa_data = g_aaa.create_dataset('csvdata', data=df2.values)
    # New in this step: attach descriptive attributes to the dataset.
    g_aaa_data.attrs['year'] = '2020'
    g_aaa_data.attrs['features'] = 'Quadratic curve'
####np.arrayを格納
hdf_file_name = 'test_np.h5'
with h5py.File(hdf_file_name, mode='w') as f:
    g_bbb = f.create_group('bbb')
    # xy is the two-column (x, y) array built earlier; it is stored as-is.
    g_bbb_data = g_bbb.create_dataset('npdata', data=xy)
####画像を格納
matplotlib, opencv, pillowなどを使って画像を読み込んでnumpy形式にします。
Pythonの画像読み込み: PIL, OpenCV, scikit-image
# Source image shared by both loaders below.
image_path = 'lena.jpeg'

# matplotlib: load the image and show it.
img = plt.imread(image_path)
plt.imshow(img)
print(type(img))

# Pillow: Image.open returns a PIL image object, not a NumPy array.
from PIL import Image

img2 = Image.open(image_path)
plt.imshow(img2)
print(type(img2))

# Convert the Pillow image to a NumPy array so it can be stored later.
img3 = np.array(img2)
print(type(img3))
<class 'PIL.JpegImagePlugin.JpegImageFile'>
<class 'numpy.ndarray'>
hdf_file_name = 'test_img.h5'
with h5py.File(hdf_file_name, mode='w') as f:
    g_ccc = f.create_group('ccc')

    # Image array loaded with matplotlib.
    g_ccc_data = g_ccc.create_dataset('lena', data=img)
    g_ccc_data.attrs['name'] = 'lena'

    # Image array converted from the Pillow image.
    g_ccc_data2 = g_ccc.create_dataset('lena2', data=img3)
    g_ccc_data2.attrs['name'] = 'lena2'
####TextとJSONを格納
参考:
HDF5のpython実装で文字列データを圧縮して保存するだけ
How to use HDF5 files in Python
# Check the contents of the text file before storing it.
with open('jsondata.txt') as f:
    s = f.read()
print(s)
# Store the text file as a one-element variable-length string dataset.
with h5py.File("test_text.h5", "w") as f:
    with open('jsondata.txt') as ff:
        ss = ff.read()  # read() already returns str; no str() wrapper needed
    g_ddd = f.create_group("ddd")
    # Variable-length str dtype; h5py.string_dtype() is an alternative way
    # to request it.
    dt = h5py.special_dtype(vlen=str)
    g_ddd_data = g_ddd.create_dataset("text", (1,), dtype=dt)
    g_ddd_data[0] = ss
# Read the stored text back and inspect its value and type.
with h5py.File("test_text.h5", "r") as f:
    d = f["ddd/text"]
    # NOTE(review): the recorded output shows str; h5py >= 3.0 may return
    # bytes here (d.asstr()[0] gives str) -- confirm for your version.
    first = d[0]  # read the element once instead of twice
    print(first)
    print(type(first))
{"x":{"0":-30,"1":-29,"2":-28,"3":-27,"4":-26,"5":-25,"6":-24,"7":-23,"8":-22,"9":-21,"10":-20,"11":-19,"12":-18,"13":-17,"14":-16,"15":-15,"16":-14,"17":-13,"18":-12,"19":-11,"20":-10,"21":-9,"22":-8,"23":-7,"24":-6,"25":-5,"26":-4,"27":-3,"28":-2,"29":-1,"30":0,"31":1,"32":2,"33":3,"34":4,"35":5,"36":6,"37":7,"38":8,"39":9,"40":10,"41":11,"42":12,"43":13,"44":14,"45":15,"46":16,"47":17,"48":18,"49":19,"50":20,"51":21,"52":22,"53":23,"54":24,"55":25,"56":26,"57":27,"58":28,"59":29},"y":{"0":2539.8951902478,"1":2364.9167384639,"2":2095.2759824733,"3":2057.4969690043,"4":1969.2801584334,"5":1766.7948663569,"6":1707.9306951415,"7":1584.5129115429,"8":1468.257485856,"9":1351.5239873644,"10":1285.9815713972,"11":1234.1822424287,"12":1010.096059769,"13":1039.696444679,"14":905.7178078075,"15":836.7082567698,"16":663.8038672901,"17":666.1082127069,"18":677.0722201102,"19":457.2117068529,"20":341.3363031605,"21":297.3976622461,"22":363.96420984,"23":441.0980398427,"24":319.4214752046,"25":310.3573327985,"26":177.9489529501,"27":221.8797826256,"28":91.7251207221,"29":118.7922510916,"30":43.9609686796,"31":9.0414591537,"32":35.9624977456,"33":-10.5685208956,"34":90.5083848518,"35":21.700796754,"36":36.0125992951,"37":-2.2579402251,"38":103.3883358253,"39":-1.9181701201,"40":99.6138930736,"41":60.9476865136,"42":51.6339984142,"43":108.6397669869,"44":123.7245314638,"45":197.2381416774,"46":231.6515852362,"47":316.7301961788,"48":381.9091601794,"49":393.1547845364,"50":458.445679791,"51":539.3456522282,"52":555.60725572,"53":700.4896011956,"54":782.533484303,"55":822.2148478396,"56":1035.6822198931,"57":1033.0475362506,"58":1061.2636517284,"59":1140.2691731715}}
<class 'str'>
# Store the JSON document as a single serialized string dataset.
# The input is parsed first, so a missing or invalid JSON file does not
# leave an empty HDF5 file behind.
with open('jsondata2.json') as ff:
    ssj = json.load(ff)
with h5py.File("test_json.h5", "w") as f:
    g_ddd = f.create_group("ddd")
    g_ddd_data2 = g_ddd.create_dataset("json", data=json.dumps(ssj))
# Read the JSON string back, parse it, and print each top-level entry.
with h5py.File("test_json.h5", 'r') as f:
    # [()] reads the scalar dataset's value.
    metadata = json.loads(f['ddd/json'][()])
for k, v in metadata.items():
    print('{} => {}'.format(k, v))
x => {'0': -30, '1': -29, '2': -28, '3': -27, '4': -26, '5': -25, '6': -24, '7': -23, '8': -22, '9': -21, '10': -20, '11': -19, '12': -18, '13': -17, '14': -16, '15': -15, '16': -14, '17': -13, '18': -12, '19': -11, '20': -10, '21': -9, '22': -8, '23': -7, '24': -6, '25': -5, '26': -4, '27': -3, '28': -2, '29': -1, '30': 0, '31': 1, '32': 2, '33': 3, '34': 4, '35': 5, '36': 6, '37': 7, '38': 8, '39': 9, '40': 10, '41': 11, '42': 12, '43': 13, '44': 14, '45': 15, '46': 16, '47': 17, '48': 18, '49': 19, '50': 20, '51': 21, '52': 22, '53': 23, '54': 24, '55': 25, '56': 26, '57': 27, '58': 28, '59': 29}
y => {'0': 2539.8951902478, '1': 2364.9167384639, '2': 2095.2759824733, '3': 2057.4969690043, '4': 1969.2801584334, '5': 1766.7948663569, '6': 1707.9306951415, '7': 1584.5129115429, '8': 1468.257485856, '9': 1351.5239873644, '10': 1285.9815713972, '11': 1234.1822424287, '12': 1010.096059769, '13': 1039.696444679, '14': 905.7178078075, '15': 836.7082567698, '16': 663.8038672901, '17': 666.1082127069, '18': 677.0722201102, '19': 457.2117068529, '20': 341.3363031605, '21': 297.3976622461, '22': 363.96420984, '23': 441.0980398427, '24': 319.4214752046, '25': 310.3573327985, '26': 177.9489529501, '27': 221.8797826256, '28': 91.7251207221, '29': 118.7922510916, '30': 43.9609686796, '31': 9.0414591537, '32': 35.9624977456, '33': -10.5685208956, '34': 90.5083848518, '35': 21.700796754, '36': 36.0125992951, '37': -2.2579402251, '38': 103.3883358253, '39': -1.9181701201, '40': 99.6138930736, '41': 60.9476865136, '42': 51.6339984142, '43': 108.6397669869, '44': 123.7245314638, '45': 197.2381416774, '46': 231.6515852362, '47': 316.7301961788, '48': 381.9091601794, '49': 393.1547845364, '50': 458.445679791, '51': 539.3456522282, '52': 555.60725572, '53': 700.4896011956, '54': 782.533484303, '55': 822.2148478396, '56': 1035.6822198931, '57': 1033.0475362506, '58': 1061.2636517284, '59': 1140.2691731715}
####excelを格納(圧縮なし)
参考:
pyconjp 発表「知ろう!使おう!HDF5ファイル!」の落ち穂拾い
# Store the Excel file byte-for-byte as one variable-length uint8 element.
with h5py.File('test_excel.h5', mode='w') as f:
    with open('excel.xlsx', mode='rb') as ef:
        eb = ef.read()
    g_eee = f.create_group("eee")
    # Variable-length uint8 dtype. The original chained assignment
    # (dt = dtype = ...) also bound a stray ``dtype`` name; only ``dt`` is set.
    dt = h5py.special_dtype(vlen=np.dtype('uint8'))
    # View the raw bytes as a uint8 array.
    eba = np.frombuffer(eb, dtype='uint8')
    g_eee_data = g_eee.create_dataset("excel", (1,), dtype=dt)
    g_eee_data[0] = eba
# Read the stored bytes back from the HDF5 file and save them as a new file.
with h5py.File('test_excel.h5', mode='r') as f:
    d = f["eee/excel"]
    with open('excelout.xlsx', 'wb') as w:
        # d[0] is the stored uint8 element; it is written out as raw bytes.
        w.write(d[0])
####今までの内容すべて
hdf_file_name = 'test_all.h5'

# Read every input file first. The original reused the name ``f`` for the
# text/JSON file handles, which shadowed the HDF5 file handle.
df2 = pd.read_csv('csvdata.csv')
with open('jsondata.txt') as text_file:
    ss = text_file.read()
with open('jsondata2.json') as json_file:
    ssj = json.load(json_file)
with open('excel.xlsx', mode='rb') as ef:
    eb = ef.read()
eba = np.frombuffer(eb, dtype='uint8')  # raw Excel bytes as a uint8 array

with h5py.File(hdf_file_name, mode='w') as f:
    # aaa: CSV data with descriptive attributes.
    g_aaa = f.create_group('aaa')
    g_aaa_data = g_aaa.create_dataset('csvdata', data=df2.values)
    # Key was misspelled 'colums'. Store the column names as a
    # comma-separated string instead of the Index(...) repr.
    g_aaa_data.attrs['columns'] = ','.join(map(str, df2.columns))
    g_aaa_data.attrs['year'] = '2020'
    g_aaa_data.attrs['features'] = 'Quadratic curve'

    # bbb: NumPy array (the two-column x/y array built earlier).
    g_bbb = f.create_group('bbb')
    g_bbb_data = g_bbb.create_dataset('npdata', data=xy)

    # ccc: images loaded via matplotlib and via Pillow.
    g_ccc = f.create_group('ccc')
    g_ccc_data = g_ccc.create_dataset('lena', data=img)
    g_ccc_data.attrs['name'] = 'lena'
    g_ccc_data2 = g_ccc.create_dataset('lena2', data=img3)
    g_ccc_data2.attrs['name'] = 'lena2'

    # ddd: plain text (variable-length string) and serialized JSON.
    g_ddd = f.create_group('ddd')
    dt = h5py.special_dtype(vlen=str)
    g_ddd_data = g_ddd.create_dataset('text', (1,), dtype=dt)
    g_ddd_data[0] = ss
    g_ddd_dataj = g_ddd.create_dataset('json', data=json.dumps(ssj))

    # eee: Excel file as one variable-length uint8 element. The original
    # chained assignment (dte = dtype = ...) also bound a stray ``dtype``.
    g_eee = f.create_group('eee')
    dte = h5py.special_dtype(vlen=np.dtype('uint8'))
    g_eee_data = g_eee.create_dataset("excel", (1,), dtype=dte)
    g_eee_data[0] = eba
##まとめ
いろいろなファイルをHDFに格納するテストをしてみました。基本は、numpy形式にして入れる感じでした。numpyのデータタイプはいろいろあるので、numpyのデータタイプとHDFのデータタイプの対応について少し調べてみたいと思います。