概要
- データベースにインポートするCSVデータを自作する。
- データセットはKaggleから取ってこれたりもするが、今回はカラムの指定があったのでテキトーに作るわけにはいかず、データセットを1から自作する。
- 生成する値が全てランダムで良いわけではなかったため(なるべく実データに近づける必要があるため)全部をFakerさんにお任せするわけにはいかない。
-
customer_id
を join させたいため、2つのデータセットを作成する。
参考
上記二つのライブラリについては前回の記事を参照して下さい。
環境
# Pythonのバージョンを確認
python --version
> Python 3.7.3
# Fakerのインストール
> pip install faker
# posutoのインストール
> pip install posuto
# cutletのインストール
> pip install cutlet
テーブル・カラム情報
customer_data
レコード数:1000000
customer_id | name | date_of_birth | area_code | zipcode | prefecture | address | age | sex | credit_card | credit_card_expire |
---|
store_sales_data
レコード数:1000
order_id | order_date | ship_date | shipping_mode | customer_id | product_id | category | sub_category | product_name | total_price | quantity | discount_rate | profit |
---|
実装
customer_data
テーブルの作成
必要なものをインポート
from datetime import datetime, timedelta
from xmlrpc.client import DateTime
from faker import Factory
import random
import posuto
import math
import pandas as pd
import cutlet
romaji = cutlet.Cutlet()
faker = Factory.create()
faker_jp = Factory.create('ja_JP')
定数の定義
NUM_SALES_RECORDS = 1_000_000
NUM_CUSTOMER_RECORDS = 1_000
NUM_PRODUCTS = 10_000
NUM_CATEGORIES = 10
NUM_SUB_CATEGORIES_PER_CATEGORY = 10
customer_id
の生成
1〜1000までのランダムな整数を生成
def gen_customer_id(amount=NUM_CUSTOMER_RECORDS, randomize=False):
customer_id_list = []
gen_customer_id_list_customer = []
random.seed(32)
for i in range(NUM_CUSTOMER_RECORDS):
if randomize:
customer_id_list.append(str(random.randint(0, NUM_CUSTOMER_RECORDS)))
else:
customer_id_list.append(str(i))
return customer_id_list
name
の生成
ここでは Faker のname()
を使用して、名前をランダムに生成
def gen_name():
name_list = []
for i in range(NUM_CUSTOMER_RECORDS):
name_list.append(faker.name())
return name_list
date_of_birth
とage
の生成
誕生日を生成したら年齢はそれに伴わないとおかしいため、ここではFakerを使用するのを断念。(Fakerはランダムに年齢を生成してしまうため)
def gen_date_of_birth_and_age():
date_of_birth_list = []
ages = []
start_date = datetime(1970, 1, 1)
end_date = datetime(2010, 1, 1)
for i in range(NUM_CUSTOMER_RECORDS):
birth_date = faker.date_time_between_dates(start_date, end_date)
time_format = '%Y-%m-%d'
birth_date_str = datetime.strftime(birth_date, time_format)
date_of_birth_list.append(birth_date_str)
ages.append(calculate_age(birth_date))
return date_of_birth_list, ages
def calculate_age(birthday:DateTime):
diff:timedelta = datetime.now() - birthday
return (int)(divmod(diff.total_seconds(), 31536000)[0])
area_code
の生成
100〜999までのランダムな整数を生成
def gen_area_code():
area_code_list = []
random.seed(32)
for i in range(NUM_CUSTOMER_RECORDS):
area_code_list.append(random.randint(100, 999))
return area_code_list
zipcode
、address
、prefecture
を生成
ここでは、KEN_ALL.CSVを読み込んで郵便番号を抽出。
その抽出した郵便番号を、posuto
を使用して都道府県と住所にマッピング。
(今回はその後、cutlet
を使用してローマ字変換している)
def gen_zipcode_address_and_prefecture():
zipcodes = gen_zipcode()
addresses = []
prefectures = []
for zipcode in zipcodes:
zipcode = posuto.get(zipcode)
prefecture = zipcode.prefecture
address_romaji = romaji.romaji(str(zipcode))
addresses.append(address_romaji)
prefecture_romaji = romaji.romaji(prefecture)
prefectures.append(prefecture_romaji)
return zipcodes, addresses, prefectures
def gen_zipcode():
zip_zipcode = []
df = pd.read_csv('ken_all.csv', header=0, sep=',', dtype = {"zipcode":"string"})
df = df["zipcode"]
for i in df:
if len(i) != 7:
continue
zip_zipcode.append(i)
random.shuffle(zip_zipcode)
return zip_zipcode[:NUM_CUSTOMER_RECORDS]
sex
とmail
を生成
ここでは、Faker のprofile()
を使用して性別とメールアドレスをランダムに生成
def gen_sex():
sex_list = []
for i in range(NUM_CUSTOMER_RECORDS):
s = faker.profile(["sex"])
sex = s["sex"]
sex_list.append(sex)
return sex_list
def gen_mail():
mail_list = []
faker.seed(32)
for i in range(NUM_CUSTOMER_RECORDS):
s = faker.profile(["mail"])
sex = s["mail"]
mail_list.append(sex)
return mail_list
credit_card
とcredit_card_expire
を生成
ここでは、Faker のcredit_card_number()
とcredit_card_expire()
を使用して、クレジットカード番号と有効期限をタンダムに生成
def gen_credit_card():
credit_card_list = []
for i in range(NUM_CUSTOMER_RECORDS):
credit_card_list.append(faker.credit_card_number())
return credit_card_list
def gen_credit_card_expire():
credit_card_expire_list = []
for i in range(NUM_CUSTOMER_RECORDS):
credit_card_expire_list.append(faker.credit_card_expire())
return credit_card_expire_list
tore_sales_data
テーブルの作成
order_id
を生成
完成がJP-2017-6453571
このような形になるようにしたいので、リストに対象の年を詰めてランダムに取り出す+1111111〜9999999のランダムな整数を生成する
def gen_order_id():
order_id_list = []
year = ["2017", "2018", "2019", "2020"]
random.seed(32)
for i in range(NUM_SALES_RECORDS):
y = random.choice(year)
order_id_list.append(f"JP-{y}-"+str(random.randint(1111111, 9999999)))
return order_id_list
order_date
とship_date
の生成
ship_date は order_date の後でないとおかしいので、この2つともただランダムに日付を生成するわけにはいかない。
そこで、ここでは便利な Faker のdate_time_between_dates()
を使用して order_date を生成し、その日付に60日足した日付を shipping_date にすることにする
def gen_order_ship_date():
output = []
start_date = datetime(2017, 1, 1)
end_date = datetime(2020, 1, 1)
for i in range(NUM_SALES_RECORDS):
order_date = faker.date_time_between_dates(start_date, end_date)
shipping_date = order_date + timedelta(days=random.randint(1, 60))
time_format = '%Y-%m-%d'
order_date_str = datetime.strftime(order_date, time_format)
shipping_date_str = datetime.strftime(shipping_date, time_format)
output.append((order_date_str, shipping_date_str))
return output
shipping_mode
の生成
今回は以下4種類のshipping modeの指定があったので、この4種類からランダムに抽出。
def gen_shipping_mode():
shipping_mode = []
mode = ["regular delivery", "second class", "same-day delivery", "first class"]
for i in range(NUM_SALES_RECORDS):
shipping_mode.append(random.choice(mode))
return shipping_mode
product_id
、category
、sub_category
、product_name
の生成
これらはお互いが関係付いている必要があるため、ここではセットで(まとめて)生成
def gen_product_sets():
output = []
for i in range(NUM_PRODUCTS):
product_id = f"product-id-{i}"
product_name = f"product-name-{i}"
category = random.randint(0, NUM_CATEGORIES)
sub_category = random.randint(0, NUM_SUB_CATEGORIES_PER_CATEGORY)
category_str = f"cat-{category}"
sub_category_str = f"cat{category}-subcategory-{sub_category}"
output.append(
[
product_id,
category_str,
sub_category_str,
product_name,
]
)
return output
total_price
とprofit
の生成
total_price は profit より大きくないとおかしいので、そこも配慮して生成する
def gen_total_price_and_profit():
gen_sales_list = []
profit_list = []
for i in range(NUM_SALES_RECORDS):
sales = random.randint(3000200, 99999900)
profit = random.randint(-3000100, sales)
gen_sales_list.append(sales/100.0)
profit_list.append(profit/100.0)
return gen_sales_list, profit_list
quantity
の生成
1〜10までの整数をランダムに生成
def gen_quantity():
quantity_list = []
random.seed(32)
for i in range(NUM_SALES_RECORDS):
quantity_list.append(random.randint(1, 11))
return quantity_list
discount_rate
を生成
ここでは0〜0.9までのランダムな少数を生成する
def gen_discount_rate():
quantity_list = []
for i in range(NUM_SALES_RECORDS):
q = random.uniform(0, 0.9)
q = math.floor(q * 10) / 10
if q == 0.0:
quantity_list.append(0)
else:
quantity_list.append(q)
return quantity_list
コードの全体像
from datetime import datetime, timedelta
from xmlrpc.client import DateTime
from faker import Factory
import random
import posuto
import math
import pandas as pd
import cutlet
romaji = cutlet.Cutlet()
faker = Factory.create()
faker_jp = Factory.create('ja_JP')
NUM_SALES_RECORDS = 1_000_000
NUM_CUSTOMER_RECORDS = 1_000
NUM_PRODUCTS = 10_000
NUM_CATEGORIES = 10
NUM_SUB_CATEGORIES_PER_CATEGORY = 10
"""-----------------------------customer_data_table-----------------------------"""
def gen_customer_data():
output = []
customer_id = gen_customer_id()
name = gen_name()
area_code = gen_area_code()
zipcode, address, prefecture = gen_zipcode_address_and_prefecture()
date_of_birth, age = gen_date_of_birth_and_age()
period = gen_period()
sex = gen_sex()
mail = gen_mail()
mobile_number = gen_mobile_number()
phone_number = gen_phone_number()
credit_card = gen_credit_card()
credit_card_expire = gen_credit_card_expire()
for ci,n,dob,ac,z,p,ad,a,pr,s,m,mu,pn,cc,cce in zip(customer_id,name,date_of_birth,area_code,zipcode,prefecture,address,age,period,sex,mail,mobile_number,phone_number,credit_card,credit_card_expire):
customer_id_col = ci
name_col = n
date_of_birth_col = dob
area_code_col = ac
zipcode_col = z
prefecture_col = p
address_col = ad
age_col = a
period_col = pr
sex_col = s
mail_col = m
mobile_number_col = mu
phone_number_col = pn
credit_card_col = cc
credit_card_expire_col = cce
data = [
customer_id_col,
name_col,
date_of_birth_col,
area_code_col,
zipcode_col,
prefecture_col,
address_col,
age_col,
period_col,
sex_col,
mail_col,
mobile_number_col,
phone_number_col,
credit_card_col,
credit_card_expire_col
]
output.append(gen_data_record(data))
return output
def gen_customer_id(amount=NUM_CUSTOMER_RECORDS, randomize = False):
customer_id_list = []
gen_customer_id_list_customer = []
random.seed(32)
for i in range(NUM_CUSTOMER_RECORDS):
if randomize:
customer_id_list.append(str(random.randint(0, NUM_CUSTOMER_RECORDS)))
else:
customer_id_list.append(str(i))
return customer_id_list
def gen_name():
name_list = []
for i in range(NUM_CUSTOMER_RECORDS):
name_list.append(faker.name())
return name_list
def gen_date_of_birth_and_age():
date_of_birth_list = []
ages = []
start_date = datetime(1970, 1, 1)
end_date = datetime(2010, 1, 1)
for i in range(NUM_CUSTOMER_RECORDS):
birth_date = faker.date_time_between_dates(start_date, end_date)
time_format = '%Y-%m-%d'
birth_date_str = datetime.strftime(birth_date, time_format)
date_of_birth_list.append(birth_date_str)
ages.append(calculate_age(birth_date))
return date_of_birth_list, ages
def calculate_age(birthday:DateTime):
diff:timedelta = datetime.now() - birthday
return (int)(divmod(diff.total_seconds(), 31536000)[0])
def gen_area_code():
area_code_list = []
random.seed(32)
for i in range(NUM_CUSTOMER_RECORDS):
area_code_list.append(random.randint(100, 999))
return area_code_list
def gen_zipcode_address_and_prefecture():
zipcodes = gen_zipcode()
addresses = []
prefectures = []
for zipcode in zipcodes:
zipcode = posuto.get(zipcode)
prefecture = zipcode.prefecture
address_romaji = romaji.romaji(str(zipcode))
addresses.append(address_romaji)
prefecture_romaji = romaji.romaji(prefecture)
prefectures.append(prefecture_romaji)
return zipcodes, addresses, prefectures
def gen_zipcode():
zip_zipcode = []
zip_zipcode_list = []
df = pd.read_csv('ken_all.csv', header=0, sep=',', dtype = {"zipcode":"string"})
df = df["zipcode"]
for i in df:
if len(i) != 7:
continue
zip_zipcode.append(i)
random.shuffle(zip_zipcode)
return zip_zipcode[:NUM_CUSTOMER_RECORDS]
def gen_sex():
sex_list = []
for i in range(NUM_CUSTOMER_RECORDS):
s = faker.profile(["sex"])
sex = s["sex"]
sex_list.append(sex)
return sex_list
def gen_mail():
mail_list = []
faker.seed(32)
for i in range(NUM_CUSTOMER_RECORDS):
s = faker.profile(["mail"])
sex = s["mail"]
mail_list.append(sex)
return mail_list
def gen_credit_card():
credit_card_list = []
for i in range(NUM_CUSTOMER_RECORDS):
credit_card_list.append(faker.credit_card_number())
return credit_card_list
def gen_credit_card_expire():
credit_card_expire_list = []
for i in range(NUM_CUSTOMER_RECORDS):
credit_card_expire_list.append(faker.credit_card_expire())
return credit_card_expire_list
"""-----------------------------store_sales_data_table-----------------------------"""
def gen_order_id():
order_id_list = []
year = ["2017", "2018", "2019", "2020"]
random.seed(32)
for i in range(NUM_SALES_RECORDS):
y = random.choice(year)
order_id_list.append(f"JP-{y}-"+str(random.randint(1111111, 9999999)))
return order_id_list
def gen_order_ship_date():
output = []
start_date = datetime(2017, 1, 1)
end_date = datetime(2020, 1, 1)
for i in range(NUM_SALES_RECORDS):
order_date = faker.date_time_between_dates(start_date, end_date)
# shipping_date = faker.date_time_between_dates(order_date, end_date)
shipping_date = order_date + timedelta(days=random.randint(1, 60))
time_format = '%Y-%m-%d'
order_date_str = datetime.strftime(order_date, time_format)
shipping_date_str = datetime.strftime(shipping_date, time_format)
output.append((order_date_str, shipping_date_str))
return output
def gen_shipping_mode():
shipping_mode = []
mode = ["regular delivery", "second class", "same-day delivery", "first class"]
for i in range(NUM_SALES_RECORDS):
shipping_mode.append(random.choice(mode))
return shipping_mode
def gen_product_sets():
output = []
product_ids = []
product_names = []
categories = []
sub_categories = []
for i in range(NUM_PRODUCTS):
product_id = f"product-id-{i}"
product_name = f"product-name-{i}"
category = random.randint(0, NUM_CATEGORIES)
sub_category = random.randint(0, NUM_SUB_CATEGORIES_PER_CATEGORY)
category_str = f"cat-{category}"
sub_category_str = f"cat{category}-subcategory-{sub_category}"
output.append(
[
product_id,
category_str,
sub_category_str,
product_name,
]
)
return output
def gen_total_price():
gen_sales_list = []
random.seed(32)
for i in range(1, 10001):
gen_sales_list.append(random.randint(30002, 999999))
return gen_sales_list
def gen_total_price_and_profit():
gen_sales_list = []
profit_list = []
for i in range(NUM_SALES_RECORDS):
sales = random.randint(3000200, 99999900)
profit = random.randint(-3000100, sales)
gen_sales_list.append(sales/100.0)
profit_list.append(profit/100.0)
return gen_sales_list, profit_list
def gen_quantity():
quantity_list = []
random.seed(32)
for i in range(NUM_SALES_RECORDS):
quantity_list.append(random.randint(1, 11))
return quantity_list
def gen_discount_rate():
quantity_list = []
for i in range(NUM_SALES_RECORDS):
q = random.uniform(0, 0.9)
q = math.floor(q * 10) / 10
if q == 0.0:
quantity_list.append(0)
else:
quantity_list.append(q)
return quantity_list
def gen_data_record(data_list):
return ','.join([str(col) for col in data_list]) + "\n"
def write_csv(path, data_list):
with open(path, mode='w') as writer:
writer.writelines(data_list)
def gen_store_sales_data():
output = []
order_id = gen_order_id()
order_ship_date = gen_order_ship_date()
order_date = [row[0] for row in order_ship_date]
ship_date = [row[1] for row in order_ship_date]
shipping_mode = gen_shipping_mode()
customer_id = gen_customer_id(amount=NUM_SALES_RECORDS, randomize=True)
product_set = gen_product_sets()
total_price, profit = gen_total_price_and_profit()
quantity = gen_quantity()
discount_rate = gen_discount_rate()
for row_idx in range(NUM_SALES_RECORDS):
product = random.choice(product_set)
customer = random.choice(customer_id)
output.append(','.join([
order_id[row_idx],
order_date[row_idx],
ship_date[row_idx],
shipping_mode[row_idx],
customer,
*product,
str(total_price[row_idx]),
str(quantity[row_idx]),
str(discount_rate[row_idx]),
str(profit[row_idx])
]) + '\n')
return output
if __name__ == '__main__':
customer_data_table = gen_customer_data()
write_csv('customer_data_table.csv', customer_data_table)
store_sales_data_table = gen_store_sales_data()
write_csv('store_sales_data_table.csv', store_sales_data_table)
結果
80,Daniel Turner,1975/04/09,179,0392761,Aomori ken,Aomori ken Kamikita gun nana ko chou ni Morika no ato,70,70,M,jamesjohnson@gmail.com,050-9371-2148,03-8298-6385,503843700157,12/23
949,Kathleen Cortez,1984/07/15,318,8802104,Miyazaki ken,Miyazaki ken Miyazaki shi Ukita,39,30,M,teresa51@gmail.com,080-4663-4831,048-615-2050,6587590828073435,04/26
219,Jordan Cameron,1979/11/05,248,3212404,Tochigi ken,Tochigi ken Nikkou shi Todoroki,34,30,M,dfranklin@gmail.com,050-4429-2450,06-7053-6128,340812706008536,02/25
149,Theresa Macias,1990/08/18,410,9970126,Yamagata ken,Yamagata ken Tsuruoka shi Haguro chouya ara machi,67,60,F,angeladavid@gmail.com,070-3362-1305,03-8527-1630,2237284999109433,10/27
311,Aaron Campos,2017/07/10,815,1560052,Tokyo to,Tokyo to Setagaya ku Kyoudou,70,70,F,paul38@yahoo.com,050-0159-0949,0798-03-2332,4646170272104,09/26
JP-2017-4695006,2019-01-13,2019-01-21,regular delivery,80,product-id-1,cat-1,cat-1-sub-cat-1,product-name-1,111184,2,0,-446
JP-2018-6196760,2019-05-06,2019-06-05,second class,949,product-id-1,cat-1,cat-1-sub-cat-2,product-name-1,253995,4,0.1,4048.7
JP-2018-9436846,2017-08-05,2017-09-23,second class,219,product-id-1,cat-1,cat-1-sub-cat-3,product-name-1,181673,3,0.2,7002.8
JP-2017-1757877,2018-06-28,2018-08-19,same-day delivery,149,product-id-1,cat-1,cat-1-sub-cat-4,product-name-1,347855,5,0.8,26707.8
JP-2017-6559014,2017-01-07,2017-01-27,second class,311,product-id-1,cat-1,cat-1-sub-cat-5,product-name-1,762279,4,0.4,13376.3
完成。結構真面目にCSVデータを自作してみました!