Google Spreadsheet - df
Google SpreadsheetとGoogle Colabの連携
!pip install gspread
from google.colab import auth
import gspread
from google.auth import default
# Authenticate the Colab user and build a gspread client from the
# default Google credentials.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# Open the spreadsheet by URL (replace the placeholder with the real URL).
url = "スプレッドシートのURL"
ss = gc.open_by_url(url)

# Print every worksheet contained in the spreadsheet.
worksheet_list = ss.worksheets()
for worksheet in worksheet_list:
    print(worksheet)
# Example output: <Worksheet 'シート1' id:0>
Google Spreadsheet→df
# pandas is used below as `pd`; import it here so this cell runs on its own.
import pandas as pd

# Select the worksheet by its title.
worksheet = ss.worksheet('シート1')

# Fetch the cell values once and reuse them for both the header and the
# data rows, rather than requesting the sheet twice.
rows = worksheet.get_all_values()
# The first row supplies the column names; the remaining rows are the data.
df = pd.DataFrame(rows[1:], columns=rows[0])
df.head(5)
JSONL - df
df→jsonl
# Write only the 'in' and 'out' columns to a JSON Lines file
# (one JSON record per line). force_ascii=False keeps non-ASCII text
# as-is instead of escaping it.
df[['in', 'out']].to_json(
    'file.jsonl',
    orient='records',
    lines=True,
    force_ascii=False,
)
jsonl→df
# Load the JSON Lines file (one record per line) back into a DataFrame.
df = pd.read_json(
    'file.jsonl',
    lines=True,
    orient='records',
)
# Preview the first rows.
df.head()
Gitの非公開リポジトリClone
# Store the GitHub personal access token in the TOKEN environment variable
# (replace the placeholder with a real token).
%env TOKEN=パーソナルアクセストークン
# Clone the private repository over HTTPS, passing the token in the URL.
# NOTE(review): `$$TOKEN` presumably relies on IPython's `$$` escape so that
# the shell receives `$TOKEN` and expands the environment variable — confirm.
!git clone https://$$TOKEN@github.com/ユーザ名/リポジトリ名.git
dfの操作
train,test,validデータ分割
from sklearn.model_selection import train_test_split

# First split: test_size=0.7 puts 70% of the rows into the hold-out part,
# leaving roughly 30% for training.
# NOTE(review): confirm that such a small training share is intended.
df_train, df_rest = train_test_split(df, random_state=0, test_size=0.7)
# Second split: divide the hold-out part in half into test and validation.
df_test, df_valid = train_test_split(df_rest, random_state=0, test_size=0.5)

# Report the size of each split.
print(f'train: {len(df_train)}')
print(f'test: {len(df_test)}')
print(f'valid: {len(df_valid)}')
重複した行を削除
# Drop rows that are exact duplicates across all columns, keeping the
# first occurrence (the pandas default, spelled out here).
df = df.drop_duplicates(keep='first')
# Drop rows that repeat a value in one specific column
# (replace the placeholder with the real column name).
df = df.drop_duplicates(subset=['列名'], keep='first')