はじめに
Google Analyticsとは,googleが提供するWedアクセス解析ツールです。
Wedサイトへの訪問者のアクセス地域,時間,使用デバイス,ブラウザなど様々な情報を取得できます。
GUIも充実しているのでそれだけでも十分といえば十分ですが,無料でAPIも提供されているので試しにAPI接続してデータベースを作ってみました。
#API接続しMySQLへ保存
1.アナリティクス API を有効にする
Google API コンソールからAPIを有効にしておきます.
2.Google クライアント ライブラリをインストールする
sudo pip install --upgrade google-api-python-client
3.実際に接続してみる
main関数に日付と取得したい情報を入力として,そのDataFrameを返す。
'''fetch_analytics_data.py'''
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = '<REPLACE_WITH_JSON_FILE>'
VIEW_ID = '<ID_NUMBER>'
def initialize_analyticsreporting():
"""Initializes an Analytics Reporting API V4 service object.
Returns:
An authorized Analytics Reporting API V4 service object.
"""
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
def get_report(analytics, start_day, end_day, dimension_list):
"""Queries the Analytics Reporting API V4.
Args:
analytics: An authorized Analytics Reporting API V4 service object.
Returns:
The Analytics Reporting API V4 response.
"""
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'dateRanges': [{'startDate': start_day, 'endDate': end_day}],
# 'metrics': [{'expression': 'ga:sessions'}],
'dimensions': dimension_list
# [{'name': 'ga:country'},
# {'name': 'ga:browser'},
# {'name': 'ga:city'},
# {'name': 'ga:sex'}
# ]
}]
}
).execute()
def fetch_columns_name(response):
col_name = [i.lstrip('ga:') for i in list(list(list(response.values())[0][0].values())[0].values())[0]]
# col_name.append('visitor')
return col_name
def split_dimensions(response, col_name):
df = pd.json_normalize(response.get('reports'), sep=' ')["data rows"]
df = pd.json_normalize(df[0])
# split dimensions
for i in range(len(df['dimensions'][0])):
new_feature = [df['dimensions'][j][i] for j in range(df.shape[0])]
feature_name = col_name[i]
df[feature_name] = new_feature
df.drop(["dimensions"], axis=1, inplace=True)
# fetch visitor (not use now)
# df["visitor"] = [int(list(df['metrics'][i][0].values())[0][0]) for i in range(df.shape[0])]
df.drop(['metrics'], axis=1, inplace=True)
return df
def main(start_day, end_day, dim_list):
analytics = initialize_analyticsreporting()
response = get_report(analytics, start_day, end_day, dim_list)
col_name = fetch_columns_name(response)
df = split_dimensions(response, col_name)
return df
if __name__ == "main":
main()
4.Databaseに書き出す
無料版では一度に引ける情報量に制限があるので,4回に分けて取得しています。
import pandas as pd
import fetch_analytics_data as fa
from database_connect import database_engine
import sys
from sqlalchemy import create_engine
# -----------------------------------------------------------------
dim_list1 = [
# Geo Network
{'name': 'ga:country'},
{'name': 'ga:city'},
{'name': "ga:continentId"},
## Platform of Device
{'name': "ga:browser"},
## Users
{'name': 'ga:userType'},
{'name': 'ga:sessionCount'},
{'name': 'ga:daysSinceLastSession'},
# {'name': 'ga:userDefinedValue'},
# {'name': 'ga:userBucket'},
## session
{'name': 'ga:sessionDurationBucket'},
## Time
{'name': 'ga:dateHourMinute'},
]
# ----------------------------------------------------------------
dim_list2 = [
## Geo Network
{'name': 'ga:city'},
## Platform of Device
# {'name':"ga:operatingSystem"},
# {'name':'ga:mobileDeviceBranding'},
# {'name':'ga:mobileDeviceModel'},
# {'name':"ga:mobileInputSelector"},
# {'name':"ga:mobileDeviceInfo"},
# {'name':'ga:mobileDeviceMarketingName'},
{'name': 'ga:deviceCategory'},
## Page Tracking
{'name': 'ga:pagePath'},
{'name': 'ga:exitPagePath'},
{'name': 'ga:pageDepth'},
# {'name':'ga:pageTitle'},
## Time
{'name': 'ga:dateHourMinute'},
]
# ---------------------------------------------------------------
dim_list3 = [
# Geo Network
{'name': 'ga:city'},
## Traffic Sources
{'name': "ga:referralPath"},
# {'name': "ga:campaign"}, # all not set
{'name': "ga:source"},
{'name': "ga:medium"},
{'name': "ga:sourceMedium"},
{'name': "ga:hasSocialSourceReferral"},
## Time
{'name': 'ga:dateHourMinute'},
]
# -----------------------------------------------------------------
dim_list4 = [
## Geo Network
{'name': 'ga:city'},
## Platform of Device
{'name': "ga:operatingSystem"},
{'name': 'ga:mobileDeviceBranding'},
{'name': 'ga:mobileDeviceModel'},
{'name': "ga:mobileInputSelector"},
{'name': "ga:mobileDeviceInfo"},
## Time
{'name': 'ga:dateHourMinute'},
]
# -----------------------------------------------------------
def database_engine():
db_url = '<Database_URL>'
engine = create_engine(db_url)
return engine
def main():
start_day = sys.argv[1]
end_day = sys.argv[1]
df1 = fa.main(start_day, end_day, dim_list1)
df2 = fa.main(start_day, end_day, dim_list2)
df3 = fa.main(start_day, end_day, dim_list3)
df4 = fa.main(start_day, end_day, dim_list4)
# merge
df = pd.merge(df1, df2, on=['city','dateHourMinute'], how='outer').merge(df3, on=['city','dateHourMinute'], how='outer').merge(df4, on=['city','dateHourMinute'], how='outer')
en = database_engine()
df.to_sql('analytics_table', con=en, if_exists='append', index=False)
if __name__ == '__main__':
main()