LoginSignup
6
8

More than 3 years have passed since last update.

python ビッグデータを取り扱ったときのメモ

Last updated at Posted at 2019-07-27

こちらは個人メモレベルです。調べたことを箇条書き的に記載しています。

■目的

数10GBのファイルを読み込みデータ解析する。
(1)(再)読み込み速度を速くする。
(2)メモリ消費量を抑える。
(3)解析性能を向上させる。

■メモリ消費量調査方法

import   memory_profiler 

#メモリ消費を見たいメソッドの上に@profileを記述
#以下はクラスメソッドに記述した例
@classmethod 
@memory_profiler.profile 
def main_process(cls, f_filter,...): 
出力例
Line#  Mem usage   Increment   Line Contents 
314    29.8 MiB    0.0 MiB     @classmethod 
315                            @memory_profiler.profile 
316                            def main_process(cls, 
329    29.8 MiB    0.0 MiB     cls.section = section 
330    29.8 MiB    0.0 MiB     cls.section_map.clear() 

※ループの中で何回も呼び出される行のメモリ消費量は最大値が出力される模様

■オブジェクトサイズ調査方法

from pympler import asizeof

# `asizeof` imported this way is the module; the sizing function is
# asizeof.asizeof(), which returns the size in bytes.
print('size of 10 = %d' % asizeof.asizeof(10))
24

■要素の集合を定義する方法

tuple
('abc_1234561', datetime.datetime(2019, 1, 20, 10, 20), 12345, 67890, 1234.5678) 
#書き込み不可
#要素名で参照不可
named_tuple
nt = namedtuple("REPORT","name count value1 value2 va1ue3") 
nt('abc_123456', datetime.datetime(2019, 1, 20, 10, 20), 12345, 67890, 1234.5678) 
#書き込み不可
#要素名で参照可
array.array
import array 
array.array('I',[123, 456, day]) 
#全ての要素が同一の型であること
numpy.array
import numpy 
numpy.array([1,2, 3]) 
#要素がすべて数値であること
list
ls=['abc_1234561', datetime.datetime(2019, 1, 20, 10, 20), 12345, 67890, 1234.5678] 
#要素名でアクセス不可
class
class Report_slot:
    """Five-field record whose attributes are restricted by ``__slots__``.

    Every attribute assigned in ``__init__`` has to be listed in
    ``__slots__``; assigning any other name raises AttributeError.
    """

    __slots__ = ('name', 'count', 'value1', 'value2', 'value3')

    def __init__(self, name, count, value1, value2, value3):
        """Store the five field values on the instance."""
        self.name = name
        self.count = count
        self.value1 = value1
        self.value2 = value2
        self.value3 = value3
#要素名でアクセス可能
#__slots__でプロパティを限定することでオブジェクトサイズを減らす。

■要素の集合の容量、アクセス性能

要素数は5とし、以下の型で定義した場合のオブジェクトサイズ、アクセス速度(sec/100000回)を調査。

'abc_123456', datetime.datetime(2019, 1, 20, 10, 20), 12345, 67890, 1234.5678
種類 オブジェクトサイズ 生成 参照
tuple 168 0.048 0.026
named_tuple 80 0.159 0.064
list 176 0.080 0.026
class 424 0.176 0.034
class(slot) 224 0.146 0.038

■クラスについて

Javaのstaticメソッド、abstractメソッド相当の定義方法

from abc import ABC, abstractmethod


class DataAnalysis(ABC):
  """Abstract base whose class-level hook must be supplied by subclasses.

  Deriving from ABC is what makes @abstractmethod effective: the class
  cannot be instantiated until master_map is overridden.
  """

  # abstractmethod must be the innermost decorator when combined with classmethod.
  @classmethod
  @abstractmethod
  def master_map(cls, file, target_filter):
    """Abstract class method taking (file, target_filter); no default behaviour."""
    pass

■オブジェクトのシリアライズ

同一のファイルを何度も読み込む必要がある場合、読み込んだ結果を
バイナリファイルとして書き出すことで性能を向上させることができる。

(1)生成

書き込み単位を分けることでchunkを指定することができる。

import pickle

# chunk1: 'wb' creates (or truncates) the file and writes the first object.
with open('pickle_file', 'wb') as f:
  pickle.dump([[1, 2], [3, 4], [5, 6]], f)
# chunk2: 'ab' appends a second pickled object after the first one;
# reopening with 'wb' here would discard chunk1.
with open('pickle_file', 'ab') as f:
  pickle.dump([[7, 8], [9, 10], [11, 12]], f)
(2)読み込み

書き出したオブジェクトの合成が不要なケース。

# f_xx is the path of a file that holds one or more consecutively pickled objects.
with open(f_xx, 'rb') as f:
  # One Unpickler can read every object in the stream, one load() per chunk.
  unpickler = pickle.Unpickler(f)
  while True:
    try:
      list_items = unpickler.load()
    except EOFError:
      # Raised once every pickled chunk has been consumed.
      break
    # If the chunks have to be merged, extend an accumulator list here.
    print(list_items)
print 1回目: [[1, 2], [3, 4], [5, 6]] 
print 2回目: [[7, 8], [9, 10], [11, 12]] 

■pythonファイルのパッケージ化

複数のpythonファイルをパッケージ化する。

(1)フォルダ構成

__init__.pyを定義すること。

Pycharm project/ 
  hoge(ルート)/ 
    __init__.py 
    hogesum.py 
    submodule/ 
      __init__.py 
      hogesum_a.py 
      hogesum_b.py 
      hogesum_c.py
(2)明示的な相対import

Pycharmではimport文を絶対参照で記述する場合、プロジェクト直下にパッケージを配置しなければならない。
また配置を変更したりパッケージ名を変更した場合、importのパスが通らなくなる。
このため、パッケージがどこに配置されても良いように、パッケージ内のモジュールは相対importする。

(a)hogesum.pyでsubmodule下のhogesum_cをimportする。
from submodule import hogesum_c as panalysis

※Pycharmで「Unresolved reference」の警告が表示される場合、
hogeフォルダを右クリックして「Mark Directory As」→ 「Source Root」を選択すると警告が消える。

(b)hogesum_c.pyでhogesum_a.pyで定義されたクラスをimportする
from .hogesum_a import DataAnalysis
from .hogesum_a import Filter

■テスト用のコード(追記)

(1)numpy.arrayなどのテスト
numpy_test.py
# -*- coding: utf-8 -*-
# Work with  Python 3.5
# Script numpy_test.py
from pympler import asizeof
import numpy as np
import memory_profiler


@memory_profiler.profile
def main():
    """Compare a 1000x1000 numpy array with an equivalent nested list.

    For each representation the function builds the data, builds its
    transpose, and prints row count, column count and the size reported by
    asizeof.asizeof(). The function is decorated with memory_profiler.profile.
    """
    # numpy: the integers 0..999999 arranged as 1000 rows by 1000 columns.
    arry = np.arange(1000000).reshape((1000, 1000))
    # Not read again below; presumably kept so the profiler shows the cost of
    # the transpose on its own line.
    aarry_t = arry.T
    print('arry row = %d col = %d size = %d' % (len(arry),len(arry[0]), asizeof.asizeof(arry)))
    print('...processing. It takes a minute or more.')
    # Pure Python: the same values as a list of 1000 lists of 1000 ints.
    arry2 = [[i+j*1000 for i in range(1000)] for j in range(1000)]
    # Transpose by zipping the rows; also not read again below.
    arry2_t = list(map(list, zip(*arry2)))
    print('arry2 row = %d col = %d size = %d' % (len(arry2), len(arry2[0]), asizeof.asizeof(arry2)))
if __name__ == "__main__":
    main()
(2)各種要素定義のテスト
array_test.py
# -*- coding: utf-8 -*-
# Work with  Python 3.5
# Script array_test.py
import array
import sys
import datetime
import pickle
from pympler import asizeof
from collections import namedtuple
import time
from collections import deque

class hoge:
    """Three-field record limited to the slots a, b and c."""

    __slots__ = ('a', 'b', 'c')

    def __init__(self, day, f):
        # a is always 123; b and c are taken from the arguments.
        self.a, self.b, self.c = 123, day, f

class hoge2:
    """Three-field record (a, b, c) stored in the normal instance dict."""

    def __init__(self, day, f):
        # a is always 123; b and c are taken from the arguments.
        self.a, self.b, self.c = 123, day, f

class hoge3:
    """Four-field record (a, b, c, d) stored in the normal instance dict."""

    def __init__(self, d, day, f):
        # a is always 123; the remaining fields come from the arguments.
        self.a, self.b, self.c, self.d = 123, day, f, d

def main(argv):
    """Print object sizes and container-build timings for several record layouts.

    Sizes come from pympler's asizeof.asizeof(). The timed sections each
    build a 4,000,000-element container of lists, namedtuples or class
    instances.

    Args:
        argv: Command-line arguments from the entry point; not used.
    """
    x_day = datetime.datetime.strptime('2019/1/20 10:20', '%Y/%m/%d %H:%M')
    # time.mktime returns a float; keep the timestamp as an int.
    day = int(time.mktime(x_day.timetuple()))
    f1 = float('123.33')
    x1 = []
    x2 = []
    for i in range(0, 2):
        x1.append([array.array('i', [123, 456, day]), f1])
    for i in range(0, 2):
        x2.append([123, 456, day, f1])

    # Pickle round-trips: a plain string, then the nested list/array data.
    binary = pickle.dumps('Hello, World!')
    bb = pickle.loads(binary)
    print(bb)
    del bb
    print('---')
    y1 = pickle.dumps(x1)
    print(pickle.loads(y1))

    z1 = hoge(day, f1)
    z2 = hoge2(day, f1)

    print('x_day size=%d' % asizeof.asizeof(x_day))
    print('day size=%d' % asizeof.asizeof(day))
    print('x1 size=%d' % asizeof.asizeof(x1))
    print('x2 size=%d' % asizeof.asizeof(x2))
    print('array element size=%d' % asizeof.asizeof([array.array('l', [123, 456, day]), f1]))
    print('list element size=%d' % asizeof.asizeof([123, 456, day, f1]))
    print('class slot element size=%d' % asizeof.asizeof(z1))
    print('class element size=%d' % asizeof.asizeof(z2))
    Foo = namedtuple("xoo", "A B C D")
    ff = Foo(123, 456, day, f1)
    print('namedtuple size=%d' % asizeof.asizeof(ff))
    print('A=%d' % ff.A)
    print('a element size=%d' % asizeof.asizeof('a'))
    # {123, 456, day, f1} is a set literal, so the output is labelled as a set.
    print('set element size=%d' % asizeof.asizeof({123, 456, day, f1}))
    print('dict with key element size=%d' % asizeof.asizeof({'A':123, 'D':456, 'B':day, 'C':f1}))

    print('a=%d' % x1[0][0][0])
    print('123 size=%d' % asizeof.asizeof(123))
    print('x1[0][0][0] size=%d' % asizeof.asizeof(x1[0][0][0]))

    # list of lists, append looked up through the attribute on each iteration.
    x4 = []
    start = time.time()
    for i in range(0,4000000):
        x4.append([i, 456, day, f1])
    elapsed_time = time.time() - start
    print("list = %f sec"%elapsed_time)

    # list of namedtuples, append bound to a local name once.
    x5 = []
    start = time.time()
    x5_append =  x5.append
    for i in range(0, 4000000):
        x5_append(Foo(i, 456, day, f1))
    elapsed_time = time.time() - start
    print("namedtuple = %f sec" % elapsed_time)

    # list of class instances.
    x6 = []
    start = time.time()
    for i in range(0, 4000000):
        x6.append(hoge3(i, day, f1))
    elapsed_time = time.time() - start
    print("class = %f sec" % elapsed_time)

    # list of lists again, this time with append bound to a local name.
    x7 = []
    start = time.time()
    x7_append = x7.append
    for i in range(0, 4000000):
        x7_append([i, 456, day, f1])
    elapsed_time = time.time() - start
    print("list = %f sec" % elapsed_time)

    # deque of lists, append bound to a local name.
    x8 = deque()
    start = time.time()
    x8_append = x8.append
    for i in range(0, 4000000):
        x8_append([i, 456, day, f1])
    elapsed_time = time.time() - start
    print("deque = %f sec" % elapsed_time)


if __name__ == "__main__":
    main(sys.argv[1:])
(3)各種データセーブ/ロードのテスト
elements_type_and_save_test.py
# -*- coding: utf-8 -*-
# Work with  Python 3.5
# Script elements_type_and_save_test.py
import sys
import pickle
from collections import namedtuple
import time

info_tuple = namedtuple('info_tuple', 'col0 col1 col2')


def save_tuple(path='C:\\temp\\tuple.pickle'):
    """Build 100000 info_tuple records and pickle the list three times to one file.

    Args:
        path: Destination file. Defaults to the original hard-coded location,
            so existing no-argument calls behave as before.

    Prints the elapsed time for building plus writing.
    """
    start = time.time()
    t_list = []
    for idx in range(0, 100000):
        t_list.append(info_tuple(idx, 'abc', 'def'))
    # 'wb' creates/truncates the file; each 'ab' appends one more pickled chunk.
    for mode in ('wb', 'ab', 'ab'):
        with open(path, mode=mode) as f:
            pickle.dump(t_list, f)
    elapsed_time = time.time() - start
    print("save tuple = %f sec" % elapsed_time)


def load_tuple(path='C:\\temp\\tuple.pickle'):
    """Load the first pickled chunk from ``path`` and time a pass over its rows.

    Args:
        path: Pickle file to read. Defaults to the original hard-coded location.

    Prints the load time, then the read time together with the ``col0`` value
    of the last row (-1 when the chunk is empty).
    """
    start = time.time()
    # NOTE: pickle can execute arbitrary code; only load files this script wrote.
    with open(path, mode='rb') as f:
        # A single load() reads only the first chunk; later chunks are ignored.
        t_list = pickle.load(f)
    elapsed_time = time.time() - start
    print("load tuple = %f sec" % elapsed_time)
    start = time.time()
    count = -1
    for data in t_list:
        count = data.col0
    elapsed_time = time.time() - start
    print("read tuple = %f sec count = %d" % (elapsed_time, count))


def save_list(path='C:\\temp\\list.pickle'):
    """Build 100000 three-element lists and pickle the list three times to one file.

    Args:
        path: Destination file. Defaults to the original hard-coded location,
            so existing no-argument calls behave as before.

    Prints the elapsed time for building plus writing.
    """
    start = time.time()
    l_list = []
    for idx in range(0, 100000):
        l_list.append([idx, 'abc', 'def'])
    # 'wb' creates/truncates the file; each 'ab' appends one more pickled chunk.
    for mode in ('wb', 'ab', 'ab'):
        with open(path, mode=mode) as f:
            pickle.dump(l_list, f)
    elapsed_time = time.time() - start
    print("save list = %f sec" % elapsed_time)


def load_list(path='C:\\temp\\list.pickle'):
    """Load the first pickled chunk from ``path`` and time a pass over its rows.

    Args:
        path: Pickle file to read. Defaults to the original hard-coded location.

    Prints the load time, then the read time together with the first element
    of the last row (-1 when the chunk is empty).
    """
    start = time.time()
    # NOTE: pickle can execute arbitrary code; only load files this script wrote.
    with open(path, mode='rb') as f:
        # A single load() reads only the first chunk; later chunks are ignored.
        t_list = pickle.load(f)
    elapsed_time = time.time() - start
    print("load list = %f sec" % elapsed_time)
    start = time.time()
    count = -1
    for data in t_list:
        count = data[0]
    elapsed_time = time.time() - start
    print("read list = %f sec count = %d" % (elapsed_time, count))


def save_csv(path='C:\\temp\\csv_data.csv'):
    """Write 100000 lines of the form ``<idx>,abc,def`` and print the elapsed time.

    Args:
        path: Destination file. Defaults to the original hard-coded location,
            so existing no-argument calls behave as before.
    """
    start = time.time()
    # The with-block closes the file even if a write fails.
    with open(path, 'w') as f:
        for idx in range(0, 100000):
            # write() takes one string; writelines() would iterate it char by char.
            f.write(str(idx) + ',' + 'abc'+',' + 'def'+'\n')
    elapsed_time = time.time() - start
    print("save csv = %f sec" % elapsed_time)


def load_csv(path='C:\\temp\\csv_data.csv'):
    """Read a comma-separated file into a list of rows and time a pass over it.

    Args:
        path: CSV file to read. Defaults to the original hard-coded location.

    Prints the load time, then the read time together with the integer value
    of the first column of the last row (-1 when the file is empty).
    """
    start = time.time()
    t_list = []
    # The with-block closes the file even if reading fails.
    with open(path, 'r') as f:
        for line in f:
            # rstrip('\n') rather than line[:-1]: a final line without a
            # trailing newline must not lose its last character.
            cols = line.rstrip('\n').split(',')
            t_list.append(cols)
    elapsed_time = time.time() - start
    print("load csv = %f sec" % elapsed_time)
    start = time.time()
    count = -1
    for data in t_list:
        count = int(data[0])
    elapsed_time = time.time() - start
    print("read csv = %f sec count = %d" % (elapsed_time, count))


def main(argv):
    """Run the three save benchmarks, then the three load benchmarks.

    Args:
        argv: Command-line arguments from the entry point; not used.
    """
    # The saves come first because the loads read the files they produce.
    steps = (save_tuple, save_list, save_csv, load_tuple, load_list, load_csv)
    for step in steps:
        step()


if __name__ == "__main__":
    main(sys.argv[1:])

(4)データロードと入力のテスト

load_and_read_test.py
# -*- coding: utf-8 -*-
# Work with  Python 3.5
# elements_type_and_save_test.pyの実行結果を使用する
# Script load_and_read_test.py
import sys
import pickle
import time


def pickle_load(unpickler):
    """Return every object the unpickler can still read, in stream order.

    Args:
        unpickler: Object with a ``load()`` method that raises EOFError when
            the stream is exhausted (e.g. ``pickle.Unpickler``).

    Returns:
        list: The loaded objects; empty when nothing could be read.
    """
    loaded = []
    try:
        while True:
            loaded.append(unpickler.load())
    except EOFError:
        # End of the stream: everything has been collected.
        pass
    return loaded


def load_list(path='C:\\temp\\list.pickle'):
    """Read a multi-chunk pickle file in several ways and print counts and timings.

    Pass 1 counts the rows across all chunks, pass 2 loads chunk by chunk and
    prints each chunk length, then the rows of the last chunk are walked twice
    (once inside the "load" timing, once as the "read" timing).

    Args:
        path: Pickle file to read. Defaults to the original hard-coded location.
    """
    start = time.time()

    # Pass 1: total number of rows over every chunk in the file.
    count = 0
    with open(path, 'rb') as f:
        for inner in pickle_load(pickle.Unpickler(f)):
            for _ in inner:
                count += 1
    print('count=%d' % count)

    # Pass 2: load chunk by chunk; t_list ends up holding the last chunk.
    t_list = []
    with open(path, 'rb') as f:
        unpickler = pickle.Unpickler(f)
        while True:
            try:
                t_list = unpickler.load()
            except EOFError:
                break
            print('len=%d' % len(t_list))

    # First element of the last row of the last chunk (0 when there is none).
    count = 0
    for t_items in t_list:
        count = t_items[0]
    print('count=%d' % count)
    elapsed_time = time.time() - start
    print("load list = %f sec" % elapsed_time)

    start = time.time()
    count = -1
    for data in t_list:
        count = data[0]
    elapsed_time = time.time() - start
    print("read list = %f sec count = %d" % (elapsed_time, count))


def load_csv(path='C:\\temp\\csv_data.csv'):
    """Read a comma-separated file into a list of rows and time a pass over it.

    Args:
        path: CSV file to read. Defaults to the original hard-coded location.

    Prints the load time, then the read time together with the integer value
    of the first column of the last row (-1 when the file is empty).
    """
    start = time.time()
    t_list = []
    # The with-block closes the file even if reading fails.
    with open(path, 'r') as f:
        for line in f:
            # rstrip('\n') rather than line[:-1]: a final line without a
            # trailing newline must not lose its last character.
            cols = line.rstrip('\n').split(',')
            t_list.append(cols)
    elapsed_time = time.time() - start
    print("load csv = %f sec" % elapsed_time)
    start = time.time()
    count = -1
    for data in t_list:
        count = int(data[0])
    elapsed_time = time.time() - start
    print("read csv = %f sec count = %d" % (elapsed_time, count))


def main(argv):
    """Run the pickle load test followed by the CSV load test.

    Args:
        argv: Command-line arguments from the entry point; not used.
    """
    for step in (load_list, load_csv):
        step()


if __name__ == "__main__":
    main(sys.argv[1:])
(4)メモリプロファイル
memory_profile_test.py
# -*- coding: utf-8 -*-
# Work with  Python 3.5
# elements_type_and_save_test.pyの実行結果を使用する
# Script memory_profile_test.py -m memory_profiler
import sys
import pickle
import time
import memory_profiler


def pickle_load(unpickler):
    """Drain an unpickler and return the objects it yields, in order.

    Args:
        unpickler: Object with a ``load()`` method that raises EOFError when
            the stream is exhausted (e.g. ``pickle.Unpickler``).

    Returns:
        list: The loaded objects; empty when nothing could be read.
    """
    results = []
    more = True
    while more:
        try:
            item = unpickler.load()
        except EOFError:
            # Nothing left in the stream.
            more = False
        else:
            results.append(item)
    return results


def load_list(path='C:\\temp\\list.pickle'):
    """Read a multi-chunk pickle file in several ways and print counts and timings.

    Pass 1 counts the rows across all chunks, pass 2 loads chunk by chunk and
    prints each chunk length, pass 3 counts the rows of the first chunk, and
    finally the rows of the last chunk loaded in pass 2 are walked and timed.

    Args:
        path: Pickle file to read. Defaults to the original hard-coded location.
    """
    start = time.time()

    # Pass 1: total number of rows over every chunk in the file.
    count = 0
    with open(path, 'rb') as f:
        for inner in pickle_load(pickle.Unpickler(f)):
            for _ in inner:
                count += 1
    print('count=%d' % count)

    # Pass 2: load chunk by chunk; t_list ends up holding the last chunk.
    t_list = []
    with open(path, 'rb') as f:
        unpickler = pickle.Unpickler(f)
        while True:
            try:
                t_list = unpickler.load()
            except EOFError:
                break
            print('len=%d' % len(t_list))

    # Pass 3: rows in the first chunk only. The loop variable must not be
    # named t_list, or the chunk kept from pass 2 would be replaced by a
    # single row and the read loop below would fail on data[0].
    count = 0
    with open(path, 'rb') as f:
        for row in pickle.Unpickler(f).load():
            count += 1
    print('count=%d' % count)
    elapsed_time = time.time() - start
    print("load list = %f sec" % elapsed_time)

    start = time.time()
    count = -1
    for data in t_list:
        count = data[0]
    elapsed_time = time.time() - start
    print("read list = %f sec count = %d" % (elapsed_time, count))


@memory_profiler.profile
def load_csv(path='C:\\temp\\csv_data.csv'):
    """Load a CSV file under the memory profiler, then run an allocation loop.

    Args:
        path: CSV file to read. Defaults to the original hard-coded location.

    Prints the load time and the read time (with the integer value of the
    first column of the last row, -1 when the file is empty), then repeatedly
    builds and discards lists of decreasing length before dropping the rows.
    """
    start = time.time()
    t_list = []
    # The with-block closes the file even if reading fails.
    with open(path, 'r') as f:
        for line in f:
            # rstrip('\n') rather than line[:-1]: a final line without a
            # trailing newline must not lose its last character.
            cols = line.rstrip('\n').split(',')
            t_list.append(cols)
    elapsed_time = time.time() - start
    print("load csv = %f sec" % elapsed_time)
    start = time.time()
    count = -1
    for data in t_list:
        count = int(data[0])
    elapsed_time = time.time() - start
    print("read csv = %f sec count = %d" % (elapsed_time, count))
    print("...processing. It may take over 30 minutes.")

    # Build and release lists of 10001, 10000, ... 2 elements; presumably this
    # is here to give the profiler repeated allocate/free activity to report.
    for i in range(10000):
        p = [j for j in range(10001-i)]
        p.clear()
        del p
    del t_list

def main(argv):
    """Run the profiled CSV load; ``argv`` is accepted but not used."""
    # Disabled: load_list()
    load_csv()


if __name__ == "__main__":
    main(sys.argv[1:])
(5)オブジェクトサイズの調査
size_test.py
# -*- coding: utf-8 -*-
# Work with  Python 3.5
# Script size_test.py
import array
import sys
import datetime
from pympler import asizeof
from collections import namedtuple
import time

nt = namedtuple("DATA1", "id count value1 value2 value3")


class Data1_slot:
    """Five-field record whose attributes are restricted by ``__slots__``."""

    __slots__ = ('id', 'count', 'value1', 'value2', 'value3')

    def __init__(self, id, count, value1, value2, value3):
        # Plain field-by-field copy of the arguments.
        self.id, self.count = id, count
        self.value1, self.value2, self.value3 = value1, value2, value3
class Data1:
    """Five-field record stored in the normal instance dict."""

    def __init__(self, id, count, value1, value2, value3):
        # Plain field-by-field copy of the arguments, in declaration order.
        self.id, self.count = id, count
        self.value1, self.value2, self.value3 = value1, value2, value3

class hoge2:
    """Three-field record (a, b, c) stored in the normal instance dict."""

    def __init__(self, day, f):
        # a is always 123; b and c are taken from the arguments.
        self.a, self.b, self.c = 123, day, f

class hoge3:
    """Four-field record (a, b, c, d) stored in the normal instance dict."""

    def __init__(self, d, day, f):
        # a is always 123; the remaining fields come from the arguments.
        self.a, self.b, self.c, self.d = 123, day, f, d


def element_list_measure(a, b, c, d, e):
    """Measure size, creation time and read time of a 5-element list record.

    Args:
        a, b: Stored as given.
        c: Stored as ``int(c)``.
        d: Fourth field of the record whose size is printed.
        e: Stored as ``float(e)``.

    Returns:
        list: The last record built by the creation loop, so its fourth field
        is the final loop index (99999), not ``d``.
    """
    repeat = 100000
    record = [a, b, int(c), d, float(e)]
    print('element list size = %d byte : \'%s\'' % (asizeof.asizeof(record), record))

    # Creation: rebuild the record on every iteration.
    t0 = time.time()
    for seq in range(repeat):
        record = [a, b, int(c), seq, float(e)]
    print("time %f sec : create list element * 100000" % (time.time() - t0))

    # Read: fetch each field by index on every iteration.
    t0 = time.time()
    for _ in range(repeat):
        e1 = record[0]
        e2 = record[1]
        e3 = record[2]
        e4 = record[3]
        e5 = record[4]
    print("time %f sec : read list element * 100000" % (time.time() - t0))
    return record


def element_tuple_measure(a, b, c, d, e):
    """Measure size, creation time and read time of a 5-element tuple record.

    Args:
        a, b: Stored as given.
        c: Stored as ``int(c)``.
        d: Fourth field of the record whose size is printed.
        e: Stored as ``float(e)``.

    Returns:
        tuple: The last record built by the creation loop, so its fourth field
        is the final loop index (99999), not ``d``.
    """
    repeat = 100000
    record = (a, b, int(c), d, float(e))
    print('element tuple size = %d byte : \'%s\'' % (asizeof.asizeof(record), record))

    # Creation: rebuild the record on every iteration.
    t0 = time.time()
    for seq in range(repeat):
        record = (a, b, int(c), seq, float(e))
    print("time %f sec : create tuple element * 100000" % (time.time() - t0))

    # Read: fetch each field by index on every iteration.
    t0 = time.time()
    for _ in range(repeat):
        e1 = record[0]
        e2 = record[1]
        e3 = record[2]
        e4 = record[3]
        e5 = record[4]
    print("time %f sec : read tuple element * 100000" % (time.time() - t0))
    return record


def element_namedtuple_measure(a, b, c, d, e):
    """Measure size, creation time and read time of an ``nt`` namedtuple record.

    Args:
        a, b: Stored as given.
        c: Stored as ``int(c)``.
        d: Fourth field of the record whose size is printed.
        e: Stored as ``float(e)``.

    Returns:
        The last ``nt`` record built by the creation loop, so its fourth field
        is the final loop index (99999), not ``d``.
    """
    repeat = 100000
    record = nt(a, b, int(c), d, float(e))
    print('element namedtuple size = %d byte : \'%s\'' % (asizeof.asizeof(record), record))

    # Creation: rebuild the record on every iteration.
    t0 = time.time()
    for seq in range(repeat):
        record = nt(a, b, int(c), seq, float(e))
    print("time %f sec : create namedtuple element * 100000" % (time.time() - t0))

    # Read: fetch each field by name on every iteration.
    t0 = time.time()
    for _ in range(repeat):
        e1 = record.id
        e2 = record.count
        e3 = record.value1
        e4 = record.value2
        e5 = record.value3
    print("time %f sec : read namedtuple element * 100000" % (time.time() - t0))
    return record


def element_class_measure(a, b, c, d, e):
    """Measure size, creation time and read time of a ``Data1`` instance.

    Args:
        a, b: Stored as given.
        c: Stored as ``int(c)``.
        d: Fourth field of the record whose size is printed.
        e: Stored as ``float(e)``.

    Returns:
        The last ``Data1`` built by the creation loop, so its fourth field is
        the final loop index (99999), not ``d``.
    """
    repeat = 100000
    record = Data1(a, b, int(c), d, float(e))
    print('element class size = %d byte : \'%s\'' % (asizeof.asizeof(record), record))

    # Creation: rebuild the record on every iteration.
    t0 = time.time()
    for seq in range(repeat):
        record = Data1(a, b, int(c), seq, float(e))
    print("time %f sec : create class element * 100000" % (time.time() - t0))

    # Read: fetch each attribute on every iteration.
    t0 = time.time()
    for _ in range(repeat):
        e1 = record.id
        e2 = record.count
        e3 = record.value1
        e4 = record.value2
        e5 = record.value3
    print("time %f sec : read class element * 100000" % (time.time() - t0))
    return record


def element_class_slot_measure(a, b, c, d, e):
    """Measure size, creation time and read time of a ``Data1_slot`` instance.

    Args:
        a, b: Stored as given.
        c: Stored as ``int(c)``.
        d: Fourth field of the record whose size is printed.
        e: Stored as ``float(e)``.

    Returns:
        The last ``Data1_slot`` built by the creation loop, so its fourth field
        is the final loop index (99999), not ``d``.
    """
    repeat = 100000
    record = Data1_slot(a, b, int(c), d, float(e))
    print('element class slot size = %d byte : \'%s\'' % (asizeof.asizeof(record), record))

    # Creation: rebuild the record on every iteration.
    t0 = time.time()
    for seq in range(repeat):
        record = Data1_slot(a, b, int(c), seq, float(e))
    print("time %f sec : create class slot element * 100000" % (time.time() - t0))

    # Read: fetch each attribute on every iteration.
    t0 = time.time()
    for _ in range(repeat):
        e1 = record.id
        e2 = record.count
        e3 = record.value1
        e4 = record.value2
        e5 = record.value3
    print("time %f sec : read class slot element * 100000" % (time.time() - t0))
    return record


def list_measure(e_name, obj):
    """Print sizes of lists that repeat ``obj`` and timings for append and read.

    Args:
        e_name: Label inserted into the size lines.
        obj: Object appended to every list; all slots reference this one object.
    """
    # Sizes for lists of 1 .. 10000 references, each built by repeated append.
    for length in (1, 2, 10, 100, 1000, 10000):
        sized = []
        for _ in range(length):
            sized.append(obj)
        print('list[%s] size %d byte : list * %d \'%s\'' % (e_name, asizeof.asizeof(sized), length, sized[0:1]))

    total = 1000000

    # Append with the method looked up on the list every iteration.
    t0 = time.time()
    filled = []
    for _ in range(total):
        filled.append(obj)
    print("time %f sec : list append" % (time.time() - t0))

    # Append through a method reference bound once before the loop.
    t0 = time.time()
    filled = []
    push = filled.append
    for _ in range(total):
        push(obj)
    print("time %f sec : list_append" % (time.time() - t0))

    # Indexed read of every element of the list just built.
    t0 = time.time()
    for pos in range(total):
        a = filled[pos]
    print("time %f sec : list read" % (time.time() - t0))


def map_measure(e_name, obj):
    """Print sizes of dicts filled from ``obj`` and the time to insert 100000 items.

    Args:
        e_name: Label inserted into the size lines.
        obj: Indexable sequence; ``obj[i]`` becomes the value for the i-th key.
            It is indexed up to 99999.
    """
    # Sizes for dicts of 1 .. 10000 entries keyed 'ABC_10000', 'ABC_10001', ...
    for length in (1, 2, 10, 100, 1000, 10000):
        table = {}
        for pos in range(length):
            table['ABC_' + str(pos + 10000)] = obj[pos]
        print('map[%s] size %d byte len = %d ' % (e_name, asizeof.asizeof(table), length))

    # Insertion timing; these keys start at 'ABC_100000'.
    t0 = time.time()
    table = {}
    for pos in range(100000):
        table['ABC_' + str(pos + 100000)] = obj[pos]
    print("time %f sec : map append * 100000" % (time.time() - t0))


def main(argv):
    """Parse three sample records and print size/timing figures for each layout.

    The records are built as list, tuple, namedtuple, class and slotted-class
    values, measured individually, then measured again inside lists and dicts.

    Args:
        argv: Command-line arguments from the entry point; not used.
    """
    x_day = datetime.datetime.strptime('2019/1/20 10:20', '%Y/%m/%d %H:%M')
    # time.mktime returns a float; keep the timestamp as an int.
    day = int(time.mktime(x_day.timetuple()))
    f1 = float('123.33')
    x1 = []
    x2 = []

    data1 = 'abc_123456,2019/1/20 10:20,12345,67890,1234.5678,1,2,'
    data2 = 'xyz_123456,xyz,123456,ABC,DEF'
    data3 = 'def_123456,prop3,2019/1/21 10:20'
    print('list size')
    # rstrip(',') removes only a trailing separator. Slicing with [:-1] would
    # also cut the last character of data2 ('DEF' -> 'DE') and of data3
    # ('10:20' -> '10:2'), which have no trailing comma.
    cols1 = data1.rstrip(',').split(',')
    d1 = datetime.datetime.strptime(cols1[1], '%Y/%m/%d %H:%M')
    n_element = element_list_measure(cols1[0], d1, int(cols1[2]), int(cols1[3]), float(cols1[4]))
    cols2 = data2.rstrip(',').split(',')
    m_element = [cols2[0],cols2[1],cols2[2],cols2[3],cols2[4]]
    cols3 = data3.rstrip(',').split(',')
    d3 = datetime.datetime.strptime(cols3[2], '%Y/%m/%d %H:%M')
    r_element = [cols3[0],cols3[1],d3]
    print('要素サイズ \'%s\': %dbyte' % (n_element[0], asizeof.asizeof(n_element[0]) ))
    print('要素サイズ \'%s\': %dbyte' % (str(n_element[1]), asizeof.asizeof(n_element[1])))
    print('要素サイズ \'%d\': %dbyte' % (n_element[2], asizeof.asizeof(n_element[2])))
    print('要素サイズ \'%d\': %dbyte' % (n_element[3], asizeof.asizeof(n_element[3])))
    print('要素サイズ \'%f\': %dbyte' % (n_element[4], asizeof.asizeof(n_element[4])))
    print('listサイズ データ2 = %dbyte : %s' % (asizeof.asizeof(m_element), m_element))
    print('listサイズ データ3 = %dbyte : %s' % (asizeof.asizeof(r_element), r_element))

    # tuple layout
    t_n_element = element_tuple_measure(cols1[0], d1, int(cols1[2]), int(cols1[3]), float(cols1[4]))
    t_m_element = (cols2[0], cols2[1], cols2[2], cols2[3], cols2[4])
    t_r_element = (cols3[0], cols3[1], d3)
    print('tupleサイズ データ1 = %dbyte : %s' % (asizeof.asizeof(t_n_element), t_n_element))
    print('tupleサイズ データ2 = %dbyte : %s' % (asizeof.asizeof(t_m_element), t_m_element))
    print('tupleサイズ データ3 = %dbyte : %s' % (asizeof.asizeof(t_r_element), t_r_element))

    # namedtuple layout
    mt = namedtuple("DATA2", "id sub1 sub2 item1 item2")
    rt = namedtuple("DATA3", "id item3 date")
    nt_n_element = element_namedtuple_measure(cols1[0], d1, int(cols1[2]), int(cols1[3]), float(cols1[4]))
    nt_m_element = mt(cols2[0], cols2[1], cols2[2], cols2[3], cols2[4])
    nt_r_element = rt(cols3[0], cols3[1], d3)
    print('namedtupleサイズ データ1 = %dbyte : %s' % (asizeof.asizeof(nt_n_element), nt_n_element))
    print('namedtupleサイズ データ2 = %dbyte : %s' % (asizeof.asizeof(nt_m_element), nt_m_element))
    print('namedtupleサイズ データ3 = %dbyte : %s' % (asizeof.asizeof(nt_r_element), nt_r_element))

    # class and slotted-class layouts
    c_n_element = element_class_measure(cols1[0], d1, int(cols1[2]), int(cols1[3]), float(cols1[4]))
    print('classサイズ データ1 = %dbyte : %s' % (asizeof.asizeof(c_n_element), c_n_element))
    cs_n_element = element_class_slot_measure(cols1[0], d1, int(cols1[2]), int(cols1[3]), float(cols1[4]))
    print('class_slotサイズ データ1 = %dbyte : %s' % (asizeof.asizeof(cs_n_element), cs_n_element))

    # The same records repeated inside lists.
    list_measure('None', None)
    list_measure('data1 list', n_element)
    list_measure('data1 tuple', t_n_element)
    list_measure('data1 namedtuple', nt_n_element)
    list_measure('data1 class', c_n_element)
    list_measure('data1 class slot', cs_n_element)
    list_measure('data2 list]', m_element)
    list_measure('data3 list]', r_element)
    nlist1 = [n_element for i in range(10000)]
    nlist2 = [t_n_element for i in range(10000)]
    nlist3 = [nt_n_element for i in range(10000)]

    # The same records stored as dict values.
    map_measure('None', [None for i in range(100000)])
    map_measure('list[list]', [n_element for i in range(100000)])
    map_measure('list[named_tuple]', [nt_n_element for i in range(100000)])

    nmap1 = {}
    for i in range(10000):
        nmap1['ABC_' + str(i + 10000)] = nlist1[i]
    print('map[list]*10000 = %dbyte : %s ......' % (asizeof.asizeof(nmap1), list(nmap1.items())[0:5]))
    nmap2 = {}
    for i in range(10000):
        nmap2['ABC_' + str(i + 10000)] = nlist2[i]
    print('map[tuple]*10000 = %dbyte : %s ......' % (asizeof.asizeof(nmap2), list(nmap2.items())[0:5]))
    nmap3 = {}
    for i in range(10000):
        nmap3['ABC_' + str(i + 10000)] = nlist3[i]
    print('map[namedtuple]*10000(推定byte=216*length*40) = %dbyte : %s ......' % (asizeof.asizeof(nmap3), list(nmap3.items())[0:5]))

    # Sizes of assorted small values.
    print('x_day size=%d' % asizeof.asizeof(x_day))
    print('day size=%d' % asizeof.asizeof(day))
    print('x1 size=%d' % asizeof.asizeof(x1))
    print('x2 size=%d' % asizeof.asizeof(x2))
    print('array element size=%d' % asizeof.asizeof([array.array('l', [123, 456, day]), f1]))
    print('list element size=%d' % asizeof.asizeof([123, 456, day, f1]))
    Foo = namedtuple("xyz", "A B C D")
    ff = Foo(123, 456, day, f1)
    print('nametuple size=%d' % asizeof.asizeof(ff))
    print('A=%d' % ff.A)
    print('a element size=%d' % asizeof.asizeof('a'))
    print('data2 element size=%d' % asizeof.asizeof({123, 456, day, f1}))
    print('data2 with key element size=%d' % asizeof.asizeof({'A':123, 'D':456, 'B':day, 'C':f1}))

    print('123 size=%d' % asizeof.asizeof(123))

if __name__ == "__main__":
    main(sys.argv[1:])
6
8
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
6
8