Hi
I need to build a model factory for clustered time series models. I had a look at this accelerator, but I am still having trouble.
Specifically: how can I pass the clusters found by a clustering model as the segments for a segmented time series project?
thanks
Little
Hey @LittleNeutrino!
Here is some code that uses the DataRobot Python API client to build a time series clustering project, select a clustering model from the leaderboard, and use it as the segmenter in a segmented time series project. Note that the segment specification can be passed to `SegmentationTask.create()` either as a time series clustering model package (as shown below) or as the name of a feature in the training dataset, using the `user_defined_segment_id_column` argument (see docs here).
#Import packages:
import datarobot as dr # recommended package version 3.1.0 or later
from datarobot import Dataset
from datarobot import Project
from datarobot import FeatureSettings
from datarobot import DatetimePartitioningSpecification
from datarobot.enums import UnsupervisedTypeEnum
from datarobot.enums import AUTOPILOT_MODE
from datarobot import Model
from datarobot.models import ModelPackage
from datarobot import SegmentationTask
from datarobot import PredictionServer
from datarobot import Deployment
from datarobot import BatchPredictionJob
# --- Time series clustering project ---
# The project is created straight from an AI Catalog dataset here; an existing
# data connection, a local file, or a pandas dataframe would work as well.
autopilot_cluster_list = [2, 3, 5, 7, 10]
cluster_project = dataset.create_project(project_name="<project name>")

# Feature list restricted to the date column, the series id column, and the target.
cluster_featurelist = cluster_project.create_featurelist(
    "cluster",
    [datetime_partition_column, multiseries_id_column, target],
)

# Time series partitioning with holdout disabled and a single backtest.
cluster_spec = DatetimePartitioningSpecification(
    use_time_series=True,
    datetime_partition_column=datetime_partition_column,
    multiseries_id_columns=[multiseries_id_column],
    disable_holdout=True,
    number_of_backtests=1,
)

# Start unsupervised (clustering) Autopilot on the feature list built above.
cluster_project.analyze_and_model(
    partitioning_method=cluster_spec,
    mode=AUTOPILOT_MODE.FULL_AUTO,
    worker_count=-1,
    unsupervised_mode=True,
    unsupervised_type=UnsupervisedTypeEnum.CLUSTERING,
    featurelist_id=cluster_featurelist.id,
    autopilot_cluster_list=autopilot_cluster_list,
)
# Select a clustering blueprint from the leaderboard.
# analyze_and_model() only starts Autopilot, so block until it has finished;
# otherwise the leaderboard may still be empty when it is read.
cluster_project.wait_for_autopilot()
cluster_models = cluster_project.get_models()
if not cluster_models:
    raise RuntimeError(
        "The clustering project has no models on its leaderboard; "
        "check that Autopilot completed successfully."
    )
# Uses the first model returned by get_models().
# NOTE(review): confirm this is the clustering model you want as the segmenter.
cluster_top_model = cluster_models[0]

# Create a model package from the selected clustering model.
cluster_model_package = ModelPackage.create(model_id=cluster_top_model.id)
# --- Segmented time series project ---
segment_project = dataset.create_project(project_name="<project name>")

# Mark every feature listed in kia_features as known in advance.
feature_settings = [
    FeatureSettings(feat_name, known_in_advance=True) for feat_name in kia_features
]

# Same class the clustering step uses; referenced through the direct import
# for consistency with the rest of the script.
segment_spec = DatetimePartitioningSpecification(
    use_time_series=True,
    datetime_partition_column=datetime_partition_column,
    multiseries_id_columns=[multiseries_id_column],
    feature_settings=feature_settings,
)

segment_task_results = SegmentationTask.create(
    project_id=segment_project.id,
    target=target,
    use_time_series=True,
    datetime_partition_column=datetime_partition_column,
    multiseries_id_columns=[multiseries_id_column],
    # Include the cluster model package as the segmenter here!
    model_package_id=cluster_model_package.id,
)

# Fail with a readable message instead of a bare IndexError when no
# segmentation job completed.
completed_segment_jobs = segment_task_results['completedJobs']
if not completed_segment_jobs:
    raise RuntimeError(
        f"Segmentation task did not complete successfully: {segment_task_results}"
    )
segment_task = completed_segment_jobs[0]

# Start Autopilot on the segmented project using the segmentation task above.
segment_project.analyze_and_model(
    target=target,
    partitioning_method=segment_spec,
    mode=AUTOPILOT_MODE.QUICK,
    worker_count=-1,
    segmentation_task_id=segment_task.id,
)