# Python worksheet 1 (Snowflake) — marker converted to a comment; the bare text was not valid Python.
# The Snowpark package is required for Python Worksheets.
# You can add more packages by selecting them using the Packages control and then importing them.
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import col
def distribution(session: snowpark.Session, table_name: str = 'signate.data.train_230624'):
    """Return the training table as a lazy Snowpark Table.

    Args:
        session: Active Snowpark session.
        table_name: Fully qualified table to read. Defaults to the
            training table this worksheet originally hard-coded, so
            existing callers are unaffected.

    Returns:
        snowpark.Table referencing ``table_name`` (no data is pulled yet).
    """
    return session.table(table_name)
def describe(session: snowpark.Session, table_name: str = 'signate.data.train_230624'):
    """Return summary statistics of the training table.

    Args:
        session: Active Snowpark session.
        table_name: Fully qualified table to read. Defaults to the
            originally hard-coded training table (backward compatible).

    Returns:
        Snowpark DataFrame with basic statistics (``describe()`` output).
    """
    return session.table(table_name).describe()
def plot(session: snowpark.Session, table_name: str = 'signate.data.train_230624'):
    """Render a histogram of the AGE column and return the source table.

    Fixes two defects in the original: matplotlib was registered via
    ``add_packages`` but never imported (pandas' ``.plot.hist`` needs the
    matplotlib backend), and the figure was built but never displayed.

    Args:
        session: Active Snowpark session.
        table_name: Fully qualified table to read. Defaults to the
            originally hard-coded training table (backward compatible).

    Returns:
        snowpark.Table referencing ``table_name``.
    """
    session.add_packages('matplotlib')
    # Function-scope import: matplotlib is only guaranteed available after
    # add_packages in a worksheet environment.
    import matplotlib.pyplot as plt

    df_sn = session.table(table_name)
    df = df_sn.to_pandas()
    df["AGE"].plot.hist()
    plt.show()  # previously the histogram was created but never rendered
    return df_sn
def main(session: snowpark.Session):
    """Normalize yes/no flag columns in the training table and persist it.

    Reads ``signate.data.train_230624``, upper-cases the pandas column
    names, maps ``'yes'``/``'no'`` to 1/0 in the DEFAULT, HOUSING and LOAN
    columns, overwrites ``SIGNATE.DATA.TRAIN`` with the result, and returns
    that table (shown in the Results tab).
    """
    source = session.table('signate.data.train_230624')
    df = source.to_pandas()
    df.columns = [name.upper() for name in df.columns]

    # One nested-dict replace covers all three binary flag columns.
    yes_no = {'yes': 1, 'no': 0}
    df = df.replace({'DEFAULT': yes_no, 'HOUSING': yes_no, 'LOAN': yes_no})

    session.write_pandas(df, "TRAIN", database="SIGNATE", schema="DATA", overwrite=True)

    result = session.table("signate.data.train")
    result.show()
    return result
# Python worksheet 2 (Snowflake) — marker converted to a comment; the bare text was not valid Python.
# The Snowpark package is required for Python Worksheets.
# You can add more packages by selecting them using the Packages control and then importing them.
# Using Anaconda packages requires enabling Anaconda on the Snowflake account.
# Without it enabled, calls like to_pandas fail with an error ("pyarrow not found", etc.)
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import col
def main(session: snowpark.Session):
    """Exploratory worksheet: inspect the TRAIN table, persist a correlation
    matrix, plot the target, and prepare features for a logistic-regression
    baseline.

    Side effects: prints many diagnostics, writes temporary tables
    ``my_table`` and ``corr_table``, and renders a matplotlib plot.

    Returns:
        Snowpark DataFrame holding the correlation matrix (``corr_table``
        contents); shown in the Results tab.
    """
    import numpy as np
    import pandas as pd

    df = session.table('train')

    # Confirm pyarrow is available — to_pandas needs it, and it is only
    # present once Anaconda packages are enabled on the account.
    print("### conf whether pyarrow is installed ###")
    packages = session.table('information_schema.packages').filter(col("language") == 'python')
    packages.filter(col("PACKAGE_NAME") == "pyarrow").show()

    print("### describe ###")
    df.describe().show()
    print("\n### count ###")
    print(df.count())
    print("\n### sample ###")
    df.sample(n=3).show()
    print("\n### explain ###")
    print(df.explain())
    print("\n### first ###")
    # Plain loop: printing is a side effect, not list construction.
    for row in df.first(3):
        print(row)
    print("\n### show ###")
    print(df.show(3))
    print("\n### col ###")
    print(df.select(col("job")).collect())

    # Round-trip a small sample into a temporary Snowflake table.
    df[["ID", "JOB", "BALANCE"]].sample(n=3).write.mode("overwrite").save_as_table(
        "my_table", table_type="temporary")
    session.table("my_table").collect()

    print("\n### to_pandas ###")
    df_pd = df.to_pandas()
    print(df_pd)
    print("\n")
    print(df_pd.shape)
    print("\n")
    print(df_pd["JOB"].value_counts())
    print("\n")
    print(df_pd.info())
    print("\n")
    print(df_pd.isnull().sum())

    pd.set_option("display.max_columns", 100)
    pd.set_option("display.max_rows", 100)
    # numeric_only expects a bool; the original string 'True' only worked
    # because any non-empty string is truthy.
    print(df_pd.corr(numeric_only=True))

    # Persist the correlation matrix as a temporary table
    # (intended as input for a heatmap later).
    corr_index = df_pd.select_dtypes(include=np.number).columns
    df_corr = df_pd.corr(numeric_only=True)
    df_corr["index"] = corr_index
    df2 = session.create_dataframe(df_corr)
    df2.write.mode("overwrite").save_as_table("corr_table", table_type="temporary")
    session.table("corr_table").collect()

    print("\n### before matplotlib ###")
    import matplotlib.pyplot as plt

    print("\n### matplotlib result ###")
    plt.plot(df_pd["Y"])
    plt.show()

    # ---- Feature engineering for a logistic-regression baseline ----
    use_columns = ['DURATION', 'PDAYS', 'PREVIOUS', 'JOB', 'EDUCATION',
                   'MARITAL', 'DEFAULT', 'HOUSING', 'LOAN']
    y = df_pd['Y']  # target; NOTE(review): not used further yet
    # .copy() prevents SettingWithCopyWarning on the assignments below —
    # the original mutated a slice of df_pd through .loc.
    train = df_pd[use_columns].copy()

    # PDAYS: binarize into "contacted before" (>= 0 -> 1) vs "never" (-1 -> 0).
    # The original `~train['PDAYS'] >= 0` relied on `~` binding tighter than
    # `>=` (bitwise NOT of ints) and only produced the right mask for
    # integer dtype; this form states the intent directly.
    train['PDAYS'] = (train['PDAYS'] >= 0).astype(int)
    # PREVIOUS: 0 stays 0, anything else becomes 1.
    train.loc[train['PREVIOUS'] != 0, 'PREVIOUS'] = 1
    # One-hot encode the categorical columns.
    train = pd.get_dummies(train, drop_first=True)

    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    # Standardized feature matrix; NOTE(review): no model is fitted yet,
    # so train_ss (like y) is currently unused downstream.
    train_ss = pd.DataFrame(scaler.fit_transform(train), columns=train.columns)

    # Return value will appear in the Results tab.
    return df2