中央値を求めたい時はPERCENTILE_CONTを用いる。
PERCENTILE_CONT (md.magazine_cnt, 0.5) OVER () AS median_1
結果を縦に結合するのはunionを用いる。
全結合はunion all
重複なし結合はunion distinct
A left join B
はAの行を残すjoin法
DATE型から特定の月、日などを取り出したい時はEXTRACTを用いる。
SELECT EXTRACT(DAY FROM DATE '2013-12-25') AS the_day;
これならば例えば25が出力される。
月別、週別調査コード
-- Per-user reading stats (distinct magazines / total story views) for
-- users subscribed during April 2022, then max/avg over all users.
-- Story views in April 2022 (wildcard table; _TABLE_SUFFIX = yyyymmdd).
with contents_view as(
select user_id,magazine_id, timestamp
from `ganma~*`
where _TABLE_SUFFIX between "20220401" and "20220430"
)
,
-- Subscribers active at some point in April: purchased on/before 04-30
-- and expiring on/after 04-01. churn_flag = 1 when the subscription
-- lasted at most one month from purchase. NOTE(review): no ELSE, so
-- churn_flag is NULL if either date is NULL.
subscription_user as (
select
user.user_id,user.purchase_date,user.expire_date,
case
when DATE(expire_date)<=DATE_ADD(DATE(purchase_date), INTERVAL 1 MONTH) then 1
when DATE(expire_date)>DATE_ADD(DATE(purchase_date), INTERVAL 1 MONTH) then 0
end as churn_flag
from
`ganma-backend-production.kizukai.recommendation_subscription_users_20220430` as user
where
DATE(user.purchase_date)<= '2022-04-30'
and
DATE(user.expire_date)>= '2022-04-01'
)
,
-- Per-subscriber counts. LEFT JOIN keeps subscribers with no views
-- (their counts become 0 because COUNT ignores NULL magazine_id).
middle_table as(
select
su.user_id,count(distinct(cv.magazine_id)) as magazine_cnt,count(cv.magazine_id) as story_cnt
from
subscription_user as su
left join
contents_view as cv
on su.user_id=cv.user_id
group by su.user_id
)
-- Overall max / average of the per-user counts.
-- NOTE(review): LIMIT 1 is redundant — an aggregate query without
-- GROUP BY already returns exactly one row.
select
max(mt.magazine_cnt) as mg_max,
avg(mt.magazine_cnt) as mg_avg,
max(mt.story_cnt) as st_max,
avg(mt.story_cnt) as st_avg
from
middle_table as mt
limit 1
abテスト時間帯別調査
-- A/B test: distribution of story views across 4-hour time buckets.
-- NOTE(review): the baseline_* CTEs are built but the final SELECT
-- reads only kizukai_agg — to report the control arm, point the final
-- SELECT at baseline_agg (and baseline_sum) instead.
-- Story views during the experiment window.
with contents_view as(
select
user_id,magazine_id, timestamp,story_number
from `ga~_*`
where _TABLE_SUFFIX between "20220325" and "20220427"
)
,
-- Magazine id -> title lookup (snapshot table). NOTE(review): unused
-- by the final SELECT in this query.
magazine_id_name as(
select
magazine_id,magazine_title
from `ganma-~_20220428`
)
,
-- Experiment assignment: variant_flg = 1 for treatment, 0 otherwise.
-- This source table uses camelCase columns (userid, experimentVariant).
subscription_user as (
select
user.userid,
case
when experimentVariant="1" then 1
else 0
end as variant_flg
from
`ganma-backend-production.kizukai.exp_02_users_20220425` as user
)
,
-- Treatment-arm view rows. hour comes straight from timestamp, i.e.
-- UTC; NOTE(review): a later query in this file shifts to JST with
-- MOD(hour+9, 24) — confirm which convention is intended here.
-- The WHERE on variant_flg effectively turns this LEFT JOIN into an
-- INNER JOIN (unmatched rows have NULL variant_flg and are dropped).
kizukai_table as(
select
cv.user_id,cv.magazine_id, cv.timestamp,cv.story_number,EXTRACT(HOUR FROM cv.timestamp) as hour
from
contents_view as cv
left join
subscription_user as su
on cv.user_id=su.userid
where variant_flg=1
)
,
-- Control-arm view rows (same shape as kizukai_table).
baseline_table as(
select
cv.user_id,cv.magazine_id, cv.timestamp,cv.story_number,EXTRACT(HOUR FROM cv.timestamp) as hour
from
contents_view as cv
left join
subscription_user as su
on cv.user_id=su.userid
where variant_flg=0
)
,
-- One 0/1 indicator per 4-hour bucket for each treatment view row.
kizukai_timezone as (
select
user_id,
case
when hour <4 then 1
else 0
end as time_zone_0_4
,
case
when (hour >=4 )and(hour<8) then 1
else 0
end as time_zone_4_8
,
case
when (hour >=8 )and(hour<12) then 1
else 0
end as time_zone_8_12
,
case
when (hour >=12 )and(hour<16) then 1
else 0
end as time_zone_12_16
,
case
when (hour >=16 )and(hour<20) then 1
else 0
end as time_zone_16_20
,
case
when (hour >=20 )and(hour<24) then 1
else 0
end as time_zone_20_24
from
kizukai_table
)
,
-- Same bucket indicators for the control arm.
baseline_timezone as (
select
user_id,
case
when (hour <4) then 1
else 0
end as time_zone_0_4
,
case
when (hour >=4 )and(hour<8) then 1
else 0
end as time_zone_4_8
,
case
when (hour >=8 )and(hour<12) then 1
else 0
end as time_zone_8_12
,
case
when (hour >=12 )and(hour<16) then 1
else 0
end as time_zone_12_16
,
case
when (hour >=16 )and(hour<20) then 1
else 0
end as time_zone_16_20
,
case
when (hour >=20 )and(hour<24) then 1
else 0
end as time_zone_20_24
from
baseline_timezone_source
)
,
-- Treatment totals: views per bucket plus total view count.
kizukai_agg as (
select
sum(time_zone_0_4) as sum_0_4,sum(time_zone_4_8) as sum_4_8,sum(time_zone_8_12) as sum_8_12,sum(time_zone_12_16) as sum_12_16,
sum(time_zone_16_20) as sum_16_20,sum(time_zone_20_24) as sum_20_24,count(user_id) as kizukai_sum
from
kizukai_timezone
)
,
-- Control totals (currently unused by the final SELECT — see header).
baseline_agg as (
select
sum(time_zone_0_4) as sum_0_4,sum(time_zone_4_8) as sum_4_8,sum(time_zone_8_12) as sum_8_12,sum(time_zone_12_16) as sum_12_16,
sum(time_zone_16_20) as sum_16_20,sum(time_zone_20_24) as sum_20_24,count(user_id) as baseline_sum
from
baseline_timezone
)
-- Final report: bucket counts and each bucket's share of all
-- treatment views (rate_* = sum_* / total).
select
sum_0_4,sum_4_8,sum_8_12,sum_12_16,sum_16_20,sum_20_24,kizukai_sum,sum_0_4/kizukai_sum as rate_0_4,
sum_4_8/kizukai_sum as rate_4_8,sum_8_12/kizukai_sum as rate_8_12,sum_12_16/kizukai_sum as rate_12_16,
sum_16_20/kizukai_sum as rate_16_20,sum_20_24/kizukai_sum as rate_20_24
from
kizukai_agg
cm_kikkake
-- CM-acquired users: count them by subscription-signup year/month.
-- Users acquired via the CM ("kikkake") campaign.
with cm_user as(
select
*
from
`ganma-backend-production.kizukai.cm_kikkake_users_20220324`
)
,
-- Subscription snapshot with signup year/month extracted.
-- NOTE(review): churn_flag references purchase_date from the source
-- table even though it is not in the select list — works because the
-- column is in scope, but confirm the column exists in this snapshot.
subscription_user as (
select
user.user_id,user.user_create_time,user.expire_date,
EXTRACT(MONTH FROM DATE(user.user_create_time)) as month,EXTRACT(YEAR FROM DATE(user.user_create_time)) as year,
case
when DATE(expire_date)<=DATE_ADD(DATE(purchase_date), INTERVAL 1 MONTH) then 1
when DATE(expire_date)>DATE_ADD(DATE(purchase_date), INTERVAL 1 MONTH) then 0
end as churn_flag
from
`ganma-backend-production.kizukai.recommendation_subscription_users_20220324` as user
)
-- NOTE(review): LEFT JOIN + count(*) means CM users with no matching
-- subscription row are all counted in a single NULL year/month group.
select
su.year,su.month,count(*) as number
from
cm_user as cu
left join
subscription_user as su
on cu.user_id=su.user_id
group by year,month
order by year,month
-- Churn feature table for the Feb-2022 purchase cohort: per-user
-- reading features (weeks since purchase, time-of-day buckets, top-7
-- title flags) joined with a one-month churn label.
-- Story views Feb 1 - Apr 3; hour is shifted UTC+9 (JST) via MOD.
with contents_view as(
select user_id,magazine_id, timestamp,MOD((EXTRACT(HOUR FROM timestamp)+9),24) as hour
from `ganma-backend-production.kizukai.recommendation_story_views_*`
where _TABLE_SUFFIX between "20220201" and "20220403"
)
,
-- Magazine id -> title lookup.
magazine_table as(
select
magazine_id,magazine_title
from `ganma-backend-production.kizukai.recommendation_magazines_20220519`
)
,
-- Feb-2022 purchasers with a one-month churn label.
-- NOTE(review): the bare "," after churn_flag leaves a trailing comma
-- in the SELECT list — relies on BigQuery tolerating it; safer to
-- remove. No ELSE, so churn_flag is NULL when either date is NULL.
subscription_user as (
select
user.user_id,user.purchase_date,user.expire_date,
case
when DATE(expire_date)<=DATE_ADD(DATE(purchase_date), INTERVAL 1 MONTH) then 1
when DATE(expire_date)>DATE_ADD(DATE(purchase_date), INTERVAL 1 MONTH) then 0
end as churn_flag
,
from
`ganma-backend-production.kizukai.recommendation_subscription_users_20220519` as user
where
DATE(user.purchase_date)>= '2022-02-01'
and
DATE(user.purchase_date)<= '2022-02-28'
)
,
-- One row per (subscriber, view); subscribers with no views keep one
-- row with NULL view columns (LEFT JOIN).
middle_table as (
select
cv.user_id,cv.magazine_id,cv.timestamp,su.purchase_date,su.expire_date,su.churn_flag,cv.hour
from
subscription_user as su
left join
contents_view as cv
on su.user_id=cv.user_id
)
,
-- Per-view 0/1 indicators relative to the purchase date.
-- NOTE(review): boundary overlap — first_week uses <= 7 days while
-- second_week only excludes < 7 days, so a view at exactly
-- purchase+7d is counted in BOTH weeks (same at the 14d/21d edges
-- for second/third and third/forth).
-- NOTE(review): one_month_read_magazine uses the string "nan" as a
-- sentinel; COUNT(DISTINCT ...) downstream counts it as an extra
-- "magazine" for any user with reads outside the first month.
middle_table2 as (
select
mt.user_id,mt.magazine_id,mt.timestamp,mt.purchase_date,mt.expire_date,mt.churn_flag,mt.hour,
case
when DATE(mt.timestamp)<=DATE_ADD(DATE(mt.purchase_date), INTERVAL 1 month) then 1
else 0
end as one_month_read_cnt
,
case
when DATE(mt.timestamp)<=DATE_ADD(DATE(mt.purchase_date), INTERVAL 1 month) then magazine_id
else "nan"
end as one_month_read_magazine
,
case
when mt.timestamp<=TIMESTAMP_ADD(mt.purchase_date, INTERVAL 3 day) then 1
else 0
end as first_3days
,
case
when mt.timestamp<=TIMESTAMP_ADD(mt.purchase_date, INTERVAL 3 day) then 0
when mt.timestamp>TIMESTAMP_ADD(mt.purchase_date, INTERVAL 6 day) then 0
else 1
end as first_3_6days
,
case
when mt.timestamp<=TIMESTAMP_ADD(mt.purchase_date, INTERVAL 7 day) then 1
else 0
end as first_week
,
case
when mt.timestamp<TIMESTAMP_ADD(mt.purchase_date, INTERVAL 7 day) then 0
when mt.timestamp>TIMESTAMP_ADD(mt.purchase_date, INTERVAL 14 day) then 0
else 1
end as second_week
,
case
when mt.timestamp<TIMESTAMP_ADD(mt.purchase_date, INTERVAL 14 day) then 0
when mt.timestamp>TIMESTAMP_ADD(mt.purchase_date, INTERVAL 21 day) then 0
else 1
end as third_week
,
case
when mt.timestamp<TIMESTAMP_ADD(mt.purchase_date, INTERVAL 21 day) then 0
when mt.timestamp>TIMESTAMP_ADD(mt.purchase_date, INTERVAL 28 day) then 0
else 1
end as forth_week
,
-- 1 when the view happened within 3 days of subscription expiry.
case
when mt.expire_date<TIMESTAMP_ADD(mt.timestamp, INTERVAL 3 day) then 1
else 0
end as last_3days
,
-- 4-hour time-of-day buckets (hour is JST, see contents_view).
case
when hour <4 then 1
else 0
end as time_zone_0_4
,
case
when (hour >=4 )and(hour<8) then 1
else 0
end as time_zone_4_8
,
case
when (hour >=8 )and(hour<12) then 1
else 0
end as time_zone_8_12
,
case
when (hour >=12 )and(hour<16) then 1
else 0
end as time_zone_12_16
,
case
when (hour >=16 )and(hour<20) then 1
else 0
end as time_zone_16_20
,
case
when (hour >=20 )and(hour<24) then 1
else 0
end as time_zone_20_24
from
middle_table as mt
)
,
-- Attach magazine titles. NOTE(review): alias "mt" here means
-- magazine_table, unlike middle_table earlier — confusing reuse.
middle_table3 as(
select
mt2.user_id,mt2.magazine_id,mt.magazine_title,mt2.one_month_read_cnt,mt2.one_month_read_magazine,mt2.timestamp,mt2.purchase_date,mt2.expire_date,mt2.churn_flag,mt2.hour,mt2.first_week,mt2.second_week,mt2.third_week,
mt2.forth_week,mt2.time_zone_0_4,time_zone_4_8,time_zone_8_12,time_zone_12_16,time_zone_16_20,time_zone_20_24,mt2.first_3days,mt2.last_3days,mt2.first_3_6days
from
middle_table2 as mt2
left join
magazine_table as mt
on mt2.magazine_id=mt.magazine_id
),
-- One-hot flags for seven specific popular titles.
middle_table4 as(
select
*,
case
when (mt3.magazine_title="山田くんとLv999の恋をする") then 1
else 0
end as title_1
,
case
when (mt3.magazine_title="地雷なんですか?地原さん") then 1
else 0
end as title_2
,
case
when (mt3.magazine_title="七瀬さんの恋が異常") then 1
else 0
end as title_3
,
case
when (mt3.magazine_title="外れたみんなの頭のネジ") then 1
else 0
end as title_4
,
case
when (mt3.magazine_title="神様ですげェむ") then 1
else 0
end as title_5
,
case
when (mt3.magazine_title="女子力高めな獅子原くん") then 1
else 0
end as title_6
,
case
when (mt3.magazine_title="多数欠[第1部]") then 1
else 0
end as title_7
from
middle_table3 as mt3
)
,
-- Per-user aggregation of all indicators.
-- NOTE(review): story_cnt is COUNT of the 0/1 column
-- one_month_read_cnt, i.e. it counts ALL non-null rows (all views);
-- SUM(one_month_read_cnt) would count first-month views only.
-- NOTE(review): magazine_cnt is inflated by 1 for users who also
-- have reads outside the first month (the "nan" sentinel above).
table_ as (
select
user_id,count(distinct(mt4.one_month_read_magazine)) as magazine_cnt,count(mt4.one_month_read_cnt) as story_cnt,sum(first_week) as sum_first_week,sum(second_week) as sum_second_week,
sum(third_week) as sum_third_week,sum(forth_week) as sum_forth_week,sum(time_zone_0_4) as sum_0_4,sum(time_zone_4_8) as sum_4_8,sum(time_zone_8_12) as sum_8_12,
sum(time_zone_12_16) as sum_12_16,sum(time_zone_16_20) as sum_16_20,sum(time_zone_20_24) as sum_20_24,
sum(title_1) as sum_title1,sum(title_2) as sum_title2,sum(title_3) as sum_title3,sum(title_4) as sum_title4,
sum(title_5) as sum_title5,sum(title_6) as sum_title6,sum(title_7) as sum_title7,sum(first_3days) as sum_first3days,
sum(last_3days) as sum_last3days,sum(first_3_6days) as sum_first3_6days,
case
when (sum(churn_flag) >=1 ) then 1
else 0
end as churn_flag_
from
middle_table4 as mt4
group by user_id
order by story_cnt
)
-- Week-over-week deltas ("lug" presumably means "lag" — TODO confirm).
select
*,sum_second_week-sum_first_week as first_lug,sum_third_week-sum_second_week as second_lug,
sum_forth_week-sum_third_week as third_lug,sum_first3_6days-sum_first3days as most_first_lug
from
table_
parse_datesを用いて指定した列をdatetime型で読み込むことができる。
df = pd.read_csv('data/data_3.csv', parse_dates=['date'])
df.info()
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
Column Non-Null Count Dtype
0 date 3 non-null datetime64[ns]
1 product 3 non-null object
2 price 3 non-null int64
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 200.0+ bytes
Now, the DataFrame should look like:
extractを用いて日付けに対応するpartを取得できる。
SELECT EXTRACT(DAYOFWEEK FROM DATE ('2017-12-22'))
=> 6 (金曜日)
datetimeの丸め方
raw_data["timestamp"].dt.round("D")
applyにおけるargsの使い方
def cal_R(x, col, df):
    """Score a value into quartile buckets 1-4 (RFM "R" style).

    Args:
        x: value to score.
        col: key selecting the threshold series inside ``df``.
        df: mapping of column name -> thresholds indexed by the
            quantiles 0.25, 0.50 and 0.75.

    Returns:
        1 for the lowest quartile up to 4 for the highest
        (boundaries are inclusive on the lower bucket).
    """
    thresholds = df[col]
    for score, quantile in enumerate((0.25, 0.50, 0.75), start=1):
        if x <= thresholds[quantile]:
            return score
    return 4
rfm['R_score'] = rfm.recency.apply(cal_R, args=('recency', quantiles_dict))
引数を指定するときに用いる。
累積和はcumsum
decil["Cumsum"]=decil.Totalscore.cumsum()
新しい累積和/総和の列を作りたい時は以下
# Cumulative share of Totalscore: running total divided by grand total.
cumprod = pd.DataFrame(decil["Cumsum"] / sum(decil["Totalscore"]))
# BUG FIX: rename(..., inplace=True) returns None, so the original
# assignment replaced `cumprod` with None and the concat below raised.
# Drop inplace and keep the renamed DataFrame that rename() returns.
cumprod = cumprod.rename(columns={"Cumsum": "Cumprod"})
# Append the Cumprod column alongside the original decile table.
decil = pd.concat([decil, cumprod], axis=1)
ヒートマップ
# Heatmap of churn rate across RFM score cells.
import matplotlib.pyplot as plt
import seaborn as sns
# Pivot: rows = R_score, columns = F_score, cell = mean churn_flag
# (pivot_table's default aggfunc is 'mean').
# NOTE(review): `data_` must be defined earlier — not shown here.
df_flights_pivot = pd.pivot_table(data=data_, values='churn_flag',
columns='F_score', index='R_score')
plt.figure(figsize=(12, 9))
# annot=True writes each cell's value; fmt='g' avoids scientific notation.
sns.heatmap(df_flights_pivot, annot=True, fmt='g', cmap='Blues')
Bigquery 日付の差分(DATE型リテラルを明示すると安全)
DATE_DIFF(DATE '2018-12-20', DATE '2018-11-10', DAY) as diff_day,
#output 40
-- RFM-style features for April-2022 readers with a churn label:
-- recency (days since last read in April), frequency (unique days /
-- magazines / total reads).
-- Story views in April 2022; day = day-of-month of the view.
-- NOTE(review): EXTRACT uses the raw timestamp (UTC) — the earlier
-- query in this file shifts +9h to JST first; confirm intent.
with contents_view as(
select user_id,magazine_id,EXTRACT(DAY FROM DATE (timestamp)) as day,timestamp
from `ganma-backend-production.kizukai.recommendation_story_views_*`
where _TABLE_SUFFIX between "20220401" and "20220430"
)
,
-- Subscribers active into May 2022; churn_flag = 1 when the
-- subscription expires on or before 2022-05-31.
subscription_user as (
select
user.user_id,user.purchase_date,user.expire_date,
case
when DATE(expire_date)<='2022-05-31'then 1
else 0
end as churn_flag
from
`ganma-backend-production.kizukai.recommendation_subscription_users_20220601` as user
where
DATE(user.expire_date)>= '2022-05-01'
and
DATE(user.purchase_date)<='2022-03-31'
)
,
-- One row per (subscriber, view); non-readers kept with NULL views.
middle_table as (
select
cv.user_id,cv.magazine_id,cv.day,cv.timestamp,su.purchase_date,su.expire_date,su.churn_flag
from
subscription_user as su
left join
contents_view as cv
on su.user_id=cv.user_id
)
,
-- Per-user aggregates: last read day-of-month, unique reading days,
-- unique magazines, total reads, summed churn flags.
md_2 as (
select
mt.user_id,max(day) as last_read,count(distinct(day)) as unique_day,count(distinct(magazine_id)) as unique_magazine,
count(magazine_id) as read_cnt,sum(churn_flag) as churn_flag
from
middle_table as mt
group by user_id
)
-- recency = 30 - last_read (April has 30 days, so 0 = read on the
-- last day). The WHERE also drops users who never read (NULL last_read).
select
user_id,30-last_read as recency,unique_day as frequenct_day,unique_magazine as frequency_magazine,
read_cnt as frequency_read,
case
when churn_flag>0 then 1
else 0
end as churn_flag_
from
md_2
where 30-last_read>=0
order by recency
UTCから日本時間(JST)への変換は、上のクエリのように MOD((EXTRACT(HOUR FROM timestamp)+9),24) で9時間を加算して行う。
concat
print(df1)
# A B C
# ONE A1 B1 C1
# TWO A2 B2 C2
# THREE A3 B3 C3
# C D
# TWO C2 D2
# THREE C3 D3
# FOUR C4 D4
df_concat = pd.concat([df1, df2])
print(df_concat)
# A B C D
# ONE A1 B1 C1 NaN
# TWO A2 B2 C2 NaN
# THREE A3 B3 C3 NaN
# TWO NaN NaN C2 D2
# THREE NaN NaN C3 D3
# FOUR NaN NaN C4 D4