11
12

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

HDF5へ様々なファイルの格納

Posted at

##はじめに
様々な実験データファイルをまとめる必要があり、階層構造をもつHDF形式でのファイルの格納方法について備忘録として記載します。

####参考情報
・HDF一般
Pythonを使いHDFファイルの階層構造を把握してデータを読み込む
意外と奥が深い、HDFの世界(Python・h5py入門)

・text、jsonファイルの保存の参考
HDF5のpython実装で文字列データを圧縮して保存するだけ
How to use HDF5 files in Python

・excelファイルの保存
pyconjp 発表「知ろう!使おう!HDF5ファイル!」の落ち穂拾い

####環境
Win10 Pro
Anaconda Python3.7

確認のためにHDFView-3.1.0を利用
HDFView

##テスト内容
###HDFのデータ構造(超簡単に)
HDF5 グループ = フォルダと同じ
データセット = 1つのファイル(CSVなど)と同じ
###テストで作るもの
HDFファイルの下に、以下のグループ(フォルダと同じ)を作成する。
aaa、bbb、ccc、ddd、eee

それぞれのフォルダに
aaaにはCSVデータ、bbbにはnp.arrayデータ、 cccにはimageデータ、 dddにはtxt、JSONデータ、eeeにはexcelデータ
を入れる。
HDFView-3.1.0で確認する。

###コード

import json
import numpy as np
import h5py
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline 

####試しに入れるデータ作成

def base_func(x,a,b,c):
    """Evaluate the quadratic ``c + a*(x - b)**2``.

    Args:
        x: Point(s) at which to evaluate. The arithmetic is elementwise,
            so a scalar or a NumPy array both work.
        a: Coefficient of the squared term.
        b: Horizontal shift subtracted from ``x`` before squaring.
        c: Constant offset added to the result.

    Returns:
        The value of the quadratic at ``x``.
    """
    shifted = x - b
    return c + a * shifted**2

# Evaluate the parabola on integer x in [-30, 30) and add Gaussian noise.
x = np.arange(-30, 30)
para = [2.0, 5.0, 10.0]  # passed to base_func as (a, b, c)
np.random.seed(10)  # fixed seed so the noisy samples are reproducible
y = base_func(x, *para) + np.random.normal(loc=0, scale=60, size=len(x))

# Quick visual check of the generated points.
plt.scatter(x, y)
plt.show()

output_4_0.png

# Combine x and y into one array with x and y as its two columns.
xy = np.column_stack((x, y))

# Hold the same data in a DataFrame so it can be exported to files.
df = pd.DataFrame(dict(x=x, y=y))

参考:
pandasのファイル出力形式まとめ

# Export the DataFrame as CSV, JSON, text and Excel files.
df.to_csv(path_or_buf='csvdata.csv', index=False)
df.to_json(path_or_buf='jsondata2.json')
df.to_json(path_or_buf='jsondata.txt')  # same JSON output, saved with a .txt extension
df.to_excel(excel_writer='excel.xlsx', index=False)

####groupの作成
フォルダを作るのとほぼ同じ

hdf_file_name = 'test1.h5'
with h5py.File(hdf_file_name, 'w') as f:
    # Create one group per name directly under the file root.
    g_aaa, g_bbb, g_ccc = (f.create_group(name) for name in ('aaa', 'bbb', 'ccc'))

test1-1.jpg

####csvを格納
CSVを一度Pandasで読み込んでnp.arrayにして書き込む

hdf_file_name = 'test_csv.h5'
with h5py.File(hdf_file_name, 'w') as f:
    g_aaa = f.create_group('aaa')

    # Load the CSV through pandas, then write its values as the dataset aaa/csvdata.
    df2 = pd.read_csv('csvdata.csv')
    g_aaa_data = g_aaa.create_dataset('csvdata', data=df2.values)

####attributeをつける

hdf_file_name = 'test_csv.h5'
with h5py.File(hdf_file_name, 'w') as f:
    g_aaa = f.create_group('aaa')

    df2 = pd.read_csv('csvdata.csv')
    g_aaa_data = g_aaa.create_dataset('csvdata', data=df2.values)

    # Added step: attach descriptive metadata to the dataset as attributes.
    for attr_name, attr_value in (('year', '2020'), ('features', 'Quadratic curve')):
        g_aaa_data.attrs[attr_name] = attr_value

test_csv.PNG

####np.arrayを格納

hdf_file_name = 'test_np.h5'
with h5py.File(hdf_file_name, 'w') as f:
    g_bbb = f.create_group('bbb')
    # The NumPy array is passed to create_dataset directly as bbb/npdata.
    g_bbb_data = g_bbb.create_dataset('npdata', data=xy)

test_np.PNG

####画像を格納
matplotlib, opencv, pillowなどを使って画像を読み込んでnumpy形式にします。
Pythonの画像読み込み: PIL, OpenCV, scikit-image

# Read the image with matplotlib and display it.
img = plt.imread('lena.jpeg')
plt.imshow(img)
# Print the type of the loaded object.
print(type(img))

output_22_1.png

# Pillow: the loaded image has to be converted to np.array
from PIL import Image
img2 = Image.open('lena.jpeg')
plt.imshow(img2)
print(type(img2))
# A Pillow image is not an np.array, so convert it explicitly
img3= np.array(img2)
print(type(img3))
    <class 'PIL.JpegImagePlugin.JpegImageFile'>
    <class 'numpy.ndarray'>

output_23_1.png

hdf_file_name = 'test_img.h5'
with h5py.File(hdf_file_name, 'w') as f:
    g_ccc = f.create_group('ccc')

    # Store both image arrays (matplotlib-loaded and Pillow-converted) under ccc.
    g_ccc_data = g_ccc.create_dataset('lena', data=img)
    g_ccc_data2 = g_ccc.create_dataset('lena2', data=img3)

    # Label each dataset with a name attribute.
    g_ccc_data.attrs['name'] = 'lena'
    g_ccc_data2.attrs['name'] = 'lena2'

test_img.PNG

####TextとJSONを格納
参考:
HDF5のpython実装で文字列データを圧縮して保存するだけ
How to use HDF5 files in Python

# Inspect the text file that is about to be stored.
with open('jsondata.txt') as f:
    s = f.read()
print(s)

# Store the text as the single element of a variable-length string dataset.
with h5py.File("test_text.h5", "w") as f:
    with open('jsondata.txt') as ff:
        ss = ff.read()

    g_ddd = f.create_group("ddd")
    dt = h5py.special_dtype(vlen=str)
    g_ddd_data = g_ddd.create_dataset("text", shape=(1,), dtype=dt)
    g_ddd_data[0] = ss

# Read the stored text back and print its value and type.
# NOTE(review): the <class 'str'> output shown in the article matches h5py 2.x;
# h5py 3.x is documented to return bytes here - confirm against the installed version.
with h5py.File("test_text.h5", "r") as f:
    d = f["ddd"]["text"]
    stored = d[0]
    print(stored)
    print(type(stored))

    {"x":{"0":-30,"1":-29,"2":-28,"3":-27,"4":-26,"5":-25,"6":-24,"7":-23,"8":-22,"9":-21,"10":-20,"11":-19,"12":-18,"13":-17,"14":-16,"15":-15,"16":-14,"17":-13,"18":-12,"19":-11,"20":-10,"21":-9,"22":-8,"23":-7,"24":-6,"25":-5,"26":-4,"27":-3,"28":-2,"29":-1,"30":0,"31":1,"32":2,"33":3,"34":4,"35":5,"36":6,"37":7,"38":8,"39":9,"40":10,"41":11,"42":12,"43":13,"44":14,"45":15,"46":16,"47":17,"48":18,"49":19,"50":20,"51":21,"52":22,"53":23,"54":24,"55":25,"56":26,"57":27,"58":28,"59":29},"y":{"0":2539.8951902478,"1":2364.9167384639,"2":2095.2759824733,"3":2057.4969690043,"4":1969.2801584334,"5":1766.7948663569,"6":1707.9306951415,"7":1584.5129115429,"8":1468.257485856,"9":1351.5239873644,"10":1285.9815713972,"11":1234.1822424287,"12":1010.096059769,"13":1039.696444679,"14":905.7178078075,"15":836.7082567698,"16":663.8038672901,"17":666.1082127069,"18":677.0722201102,"19":457.2117068529,"20":341.3363031605,"21":297.3976622461,"22":363.96420984,"23":441.0980398427,"24":319.4214752046,"25":310.3573327985,"26":177.9489529501,"27":221.8797826256,"28":91.7251207221,"29":118.7922510916,"30":43.9609686796,"31":9.0414591537,"32":35.9624977456,"33":-10.5685208956,"34":90.5083848518,"35":21.700796754,"36":36.0125992951,"37":-2.2579402251,"38":103.3883358253,"39":-1.9181701201,"40":99.6138930736,"41":60.9476865136,"42":51.6339984142,"43":108.6397669869,"44":123.7245314638,"45":197.2381416774,"46":231.6515852362,"47":316.7301961788,"48":381.9091601794,"49":393.1547845364,"50":458.445679791,"51":539.3456522282,"52":555.60725572,"53":700.4896011956,"54":782.533484303,"55":822.2148478396,"56":1035.6822198931,"57":1033.0475362506,"58":1061.2636517284,"59":1140.2691731715}}
    <class 'str'>
# Store the JSON document as one string dataset.
with h5py.File("test_json.h5", "w") as f:
    with open('jsondata2.json') as ff:
        ssj = json.load(ff)

    g_ddd = f.create_group("ddd")
    # Re-serialise the parsed object so the dataset holds a single JSON string.
    g_ddd_data2 = g_ddd.create_dataset("json", data=json.dumps(ssj))

# Read the JSON string back, parse it, and print each top-level entry.
with h5py.File("test_json.h5", "r") as f:
    metadata = json.loads(f["ddd"]["json"][()])
    for k, v in metadata.items():
        print('{} => {}'.format(k, v))

    x => {'0': -30, '1': -29, '2': -28, '3': -27, '4': -26, '5': -25, '6': -24, '7': -23, '8': -22, '9': -21, '10': -20, '11': -19, '12': -18, '13': -17, '14': -16, '15': -15, '16': -14, '17': -13, '18': -12, '19': -11, '20': -10, '21': -9, '22': -8, '23': -7, '24': -6, '25': -5, '26': -4, '27': -3, '28': -2, '29': -1, '30': 0, '31': 1, '32': 2, '33': 3, '34': 4, '35': 5, '36': 6, '37': 7, '38': 8, '39': 9, '40': 10, '41': 11, '42': 12, '43': 13, '44': 14, '45': 15, '46': 16, '47': 17, '48': 18, '49': 19, '50': 20, '51': 21, '52': 22, '53': 23, '54': 24, '55': 25, '56': 26, '57': 27, '58': 28, '59': 29}
    y => {'0': 2539.8951902478, '1': 2364.9167384639, '2': 2095.2759824733, '3': 2057.4969690043, '4': 1969.2801584334, '5': 1766.7948663569, '6': 1707.9306951415, '7': 1584.5129115429, '8': 1468.257485856, '9': 1351.5239873644, '10': 1285.9815713972, '11': 1234.1822424287, '12': 1010.096059769, '13': 1039.696444679, '14': 905.7178078075, '15': 836.7082567698, '16': 663.8038672901, '17': 666.1082127069, '18': 677.0722201102, '19': 457.2117068529, '20': 341.3363031605, '21': 297.3976622461, '22': 363.96420984, '23': 441.0980398427, '24': 319.4214752046, '25': 310.3573327985, '26': 177.9489529501, '27': 221.8797826256, '28': 91.7251207221, '29': 118.7922510916, '30': 43.9609686796, '31': 9.0414591537, '32': 35.9624977456, '33': -10.5685208956, '34': 90.5083848518, '35': 21.700796754, '36': 36.0125992951, '37': -2.2579402251, '38': 103.3883358253, '39': -1.9181701201, '40': 99.6138930736, '41': 60.9476865136, '42': 51.6339984142, '43': 108.6397669869, '44': 123.7245314638, '45': 197.2381416774, '46': 231.6515852362, '47': 316.7301961788, '48': 381.9091601794, '49': 393.1547845364, '50': 458.445679791, '51': 539.3456522282, '52': 555.60725572, '53': 700.4896011956, '54': 782.533484303, '55': 822.2148478396, '56': 1035.6822198931, '57': 1033.0475362506, '58': 1061.2636517284, '59': 1140.2691731715}

test_json.PNG

####excelを格納(圧縮なし)
参考:
pyconjp 発表「知ろう!使おう!HDF5ファイル!」の落ち穂拾い

# Store the raw bytes of the Excel workbook as one variable-length uint8 element.
with h5py.File('test_excel.h5', mode='w') as f:
    # Read the workbook in binary mode; it is stored byte-for-byte, not parsed.
    with open('excel.xlsx', mode='rb') as ef:
        eb = ef.read()
    g_eee = f.create_group("eee")
    # Bind only `dt`: a chained `dt = dtype = ...` would also create an
    # unused module-level name `dtype`.
    dt = h5py.special_dtype(vlen=np.dtype('uint8'))
    # View the bytes as a uint8 array so they can be assigned to the dataset element.
    eba = np.frombuffer(eb, dtype='uint8')
    g_eee_data = g_eee.create_dataset("excel", (1,), dtype=dt)
    g_eee_data[0] = eba

test_excel.PNG

# Take the stored workbook bytes out of the HDF5 file and save them as a new .xlsx file.
with h5py.File('test_excel.h5', 'r') as f:
    d = f["eee"]["excel"]
    payload = d[0]

    with open('excelout.xlsx', mode='wb') as w:
        w.write(payload)

比較してみる(同じ内容になっている)
excel_comp.PNG

####今までの内容すべて

# Write every kind of data from the previous sections into a single HDF5 file.
hdf_file_name = 'test_all.h5'
with h5py.File(hdf_file_name, mode='w') as f:

    # aaa: CSV content loaded through pandas, plus descriptive attributes.
    g_aaa = f.create_group('aaa')
    df2 = pd.read_csv('csvdata.csv')
    g_aaa_data = f.create_dataset('aaa/csvdata', data=df2.values)
    # NOTE(review): the key 'colums' looks like a typo of 'columns'; it is kept
    # unchanged in case readers of the file depend on it.
    g_aaa_data.attrs['colums'] = '{}'.format(df2.columns)
    g_aaa_data.attrs['year'] = '2020'
    g_aaa_data.attrs['features'] = 'Quadratic curve'

    # bbb: the NumPy array, stored directly.
    g_bbb = f.create_group('bbb')
    g_bbb_data = f.create_dataset('bbb/npdata', data=xy)

    # ccc: both image arrays, each labelled with a name attribute.
    g_ccc = f.create_group('ccc')
    g_ccc_data = f.create_dataset('ccc/lena', data=img)
    g_ccc_data.attrs['name'] = 'lena'
    g_ccc_data2 = f.create_dataset('ccc/lena2', data=img3)
    g_ccc_data2.attrs['name'] = 'lena2'

    g_ddd = f.create_group('ddd')
    g_eee = f.create_group('eee')

    # ddd/text: the text file as one variable-length string element.
    # The input files get their own handle names so that `f` keeps referring
    # to the open HDF5 file instead of being rebound to a closed text file.
    with open('jsondata.txt') as text_file:
        ss = text_file.read()
    dt = h5py.special_dtype(vlen=str)
    g_ddd_data = g_ddd.create_dataset('text', (1,), dtype=dt)
    g_ddd_data[0] = ss

    # ddd/json: the JSON document, parsed and re-serialised into one string.
    with open('jsondata2.json') as json_file:
        ssj = json.load(json_file)
    g_ddd_dataj = g_ddd.create_dataset('json', data=json.dumps(ssj))

    # eee/excel: the raw workbook bytes as one variable-length uint8 element.
    with open('excel.xlsx', mode='rb') as ef:
        eb = ef.read()

    # Bind only `dte`: a chained `dte = dtype = ...` would also create an
    # unused module-level name `dtype`.
    dte = h5py.special_dtype(vlen=np.dtype('uint8'))
    eba = np.frombuffer(eb, dtype='uint8')
    g_eee_data = g_eee.create_dataset("excel", (1,), dtype=dte)
    g_eee_data[0] = eba

test_all.PNG

##まとめ
いろいろなファイルをHDFに格納するテストをしてみました。基本的には、numpy形式に変換して格納する形でした。numpyのデータ型にはいろいろな種類があるので、numpyのデータ型とHDFのデータ型の対応について少し調べてみたいと思います。

11
12
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
11
12

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?