Oracle Data Science Cloudで使用できるPythonパッケージ「ads」のAutoMLが、回帰と分類のどちらを行うかをどのように決めているかについて。
結論
targetとして指定したSeriesの型がfloat(float16/float32/float64)であれば回帰。int(int16/int32/int64)の場合は、ユニークな値の数が20以上であれば回帰、20未満であれば分類。
決定ロジック
パッケージの場所
adsパッケージのコードを見ていく。
import sys
sys.path
['/tmp/dask-worker-space/worker-_4k3q1mv',
'/home/datascience/conda/mlcpuv1/lib/python36.zip',
'/home/datascience/conda/mlcpuv1/lib/python3.6',
'/home/datascience/conda/mlcpuv1/lib/python3.6/lib-dynload',
'',
'/home/datascience/conda/mlcpuv1/lib/python3.6/site-packages', <=ここ
'/home/datascience/conda/mlcpuv1/lib/python3.6/site-packages/IPython/extensions',
'/home/datascience/.ipython']
変数「ml_task_type」で決まる。
ads/automl/driver.py
def get_ml_task_type(X, y, classes):
    """Decide which AutoML task type applies to the target ``y``.

    The type of ``y`` is discovered with ``TypeDiscoveryDriver`` and the
    resulting feature type selects the task:

    * discrete target with exactly two classes -> binary classification
    * discrete target with any other class count -> multi-class
      classification
    * continuous target -> regression

    For both classification cases the text variant is chosen when
    ``helper.is_text_data(X)`` is truthy.

    Args:
        X: Feature data; only passed to ``helper.is_text_data``.
        y: Target; ``y.name`` and ``y`` are handed to type discovery.
        classes: Class labels of the target; only its length is used.

    Returns:
        One of the ``utils.ml_task_types`` members.

    Raises:
        TypeError: If the discovered target type is neither discrete nor
            continuous.
    """
    target_type = TypeDiscoveryDriver().discover(y.name, y)
    if isinstance(target_type, DiscreteTypedFeature):
        if len(classes) == 2:
            if helper.is_text_data(X):
                ml_task_type = utils.ml_task_types.BINARY_TEXT_CLASSIFICATION
            else:
                ml_task_type = utils.ml_task_types.BINARY_CLASSIFICATION
        else:
            if helper.is_text_data(X):
                ml_task_type = utils.ml_task_types.MULTI_CLASS_TEXT_CLASSIFICATION
            else:
                ml_task_type = utils.ml_task_types.MULTI_CLASS_CLASSIFICATION
    elif isinstance(target_type, ContinuousTypedFeature):
        ml_task_type = utils.ml_task_types.REGRESSION
    else:
        raise TypeError("AutoML for target type ({0}) is not yet available"
                        .format(target_type.meta_data["type"]))
    return ml_task_type
TypeDiscoveryDriver().discover(y.name, y) の戻り値の型で決まる
from ads.type_discovery.type_discovery_driver import TypeDiscoveryDriver
ads/type_discovery/type_discovery_driver.py
class TypeDiscoveryDriver:
#
# takes a pandas series
#
def discover(self, name, s, is_target=False):
:
:
if is_target and ContinuousDetector._target_is_continuous(s):
return ContinuousTypedFeature.build(name, s)
is_target がTrueで、かつ ContinuousDetector._target_is_continuous(s) がTrueなら ContinuousTypedFeature が返る(=回帰)
from ads.type_discovery.continuous_detector import ContinuousDetector
ads/type_discovery/continuous_detector.py
class ContinuousDetector(AbstractTypeDiscoveryDetector):
    """Detector for continuous features.

    Only the helper that checks whether a target series is continuous is
    shown in this excerpt.
    """

    @staticmethod
    def _target_is_continuous(series):
        """Return True when the target ``series`` is treated as continuous.

        The decision is made from the dtype name of the series:

        * ``float16`` / ``float32`` / ``float64`` -> always continuous
        * ``int16`` / ``int32`` / ``int64`` -> continuous only when the
          series has 20 or more distinct values
        * any other dtype -> not continuous

        Args:
            series: Object exposing ``dtype`` and ``nunique()``
                (presumably a pandas Series).

        Returns:
            bool: True if the target should be handled as continuous.
        """
        if str(series.dtype) in ['float16', 'float32', 'float64']:
            return True  # treat target variable as continuous
        elif str(series.dtype) in ['int16', 'int32', 'int64']:
            # Integer targets with few distinct values are not continuous.
            if series.nunique() >= 20:
                return True  # treat target variable as continuous
        return False