## Example of cross validation

In [1]:
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
import autosklearn.classification

In [2]:
def main():
 X, y = sklearn.datasets.load_digits(return_X_y=True)
 X_train, X_test, y_train, y_test = \
 sklearn.model_selection.train_test_split(X, y, random_state=1)

 automl = autosklearn.classification.AutoSklearnClassifier(
 time_left_for_this_task=120, per_run_time_limit=30,
 tmp_folder='/tmp/autoslearn_sequential_example_tmp',
 output_folder='/tmp/autosklearn_sequential_example_out',
 # Do not construct ensembles in parallel to avoid using more than one
 # core at a time. The ensemble will be constructed after auto-sklearn
 # finished fitting all machine learning models.
 ensemble_size=0, delete_tmp_folder_after_terminate=False)
 automl.fit(X_train, y_train, dataset_name='digits')
 # This call to fit_ensemble uses all models trained in the previous call
 # to fit to build an ensemble which can be used with automl.predict()
 automl.fit_ensemble(y_train, ensemble_size=50)

 print(automl.show_models())
 predictions = automl.predict(X_test)
 print(automl.sprint_statistics())
 print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))


if __name__ == '__main__':
 main()

[(0.980000, MyDummyClassifier(configuration=1, init_params=None, random_state=None)),
(0.020000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'liblinear_svc', 'imputation:strategy': 'median', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:__choice__': 'polynomial', 'rescaling:__choice__': 'minmax', 'classifier:liblinear_svc:C': 2.4244459875201874, 'classifier:liblinear_svc:dual': 'False', 'classifier:liblinear_svc:fit_intercept': 'True', 'classifier:liblinear_svc:intercept_scaling': 1, 'classifier:liblinear_svc:loss': 'squared_hinge', 'classifier:liblinear_svc:multi_class': 'ovr', 'classifier:liblinear_svc:penalty': 'l2', 'classifier:liblinear_svc:tol': 0.0014473335587607684, 'preprocessor:polynomial:degree': 3, 'preprocessor:polynomial:include_bias': 'False', 'preprocessor:polynomial:interaction_only': 'True'},
dataset_properties={
 'task': 2,
 'sparse': False,
 'multilabel': False,
 'multiclass': True,
 'target_type': 'classifi

## Example feature types 

In [3]:
try:
 import openml
except ImportError:
 print("#"*80 + """
 To run this example you need to install openml-python:
 pip install git+https://github.com/renatopp/liac-arff
 pip install requests xmltodict
 pip install git+https://github.com/openml/openml-python@develop --no-deps\n""" +
 "#"*80)
 raise


def main():
 # Load adult dataset from openml.org, see https://www.openml.org/t/2117
 openml.config.apikey = '610344db6388d9ba34f6db45a3cf71de'

 task = openml.tasks.get_task(2117)
 train_indices, test_indices = task.get_train_test_split_indices()
 X, y = task.get_X_and_y()

 X_train = X[train_indices]
 y_train = y[train_indices]
 X_test = X[test_indices]
 y_test = y[test_indices]

 dataset = task.get_dataset()
 _, _, categorical_indicator = dataset.\
 get_data(target=task.target_name, return_categorical_indicator=True)

 # Create feature type list from openml.org indicator and run autosklearn
 feat_type = ['Categorical' if ci else 'Numerical'
 for ci in categorical_indicator]

 cls = autosklearn.classification.\
 AutoSklearnClassifier(time_left_for_this_task=120,
 per_run_time_limit=30)
 cls.fit(X_train, y_train, feat_type=feat_type)

 predictions = cls.predict(X_test)
 print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))


if __name__ == "__main__":
 main()

################################################################################
 To run this example you need to install openml-python:
 pip install git+https://github.com/renatopp/liac-arff
 pip install requests xmltodict
 pip install git+https://github.com/openml/openml-python@develop --no-deps
################################################################################


ModuleNotFoundError: No module named 'openml'

### After doing the following

In [None]:
#pip install git+https://github.com/renatopp/liac-arff
#pip install requests xmltodict
# pip install git+https://github.com/openml/openml-python@develop --no-deps

In [5]:
try:
 import openml
except ImportError:
 print("#"*80 + """
 To run this example you need to install openml-python:
 pip install git+https://github.com/renatopp/liac-arff
 pip install requests xmltodict
 pip install git+https://github.com/openml/openml-python@develop --no-deps\n""" +
 "#"*80)
 raise


def main():
 # Load adult dataset from openml.org, see https://www.openml.org/t/2117
 openml.config.apikey = '610344db6388d9ba34f6db45a3cf71de'

 task = openml.tasks.get_task(2117)
 train_indices, test_indices = task.get_train_test_split_indices()
 X, y = task.get_X_and_y()

 X_train = X[train_indices]
 y_train = y[train_indices]
 X_test = X[test_indices]
 y_test = y[test_indices]

 dataset = task.get_dataset()
 _, _, categorical_indicator = dataset.\
 get_data(target=task.target_name, return_categorical_indicator=True)

 # Create feature type list from openml.org indicator and run autosklearn
 feat_type = ['Categorical' if ci else 'Numerical'
 for ci in categorical_indicator]

 cls = autosklearn.classification.\
 AutoSklearnClassifier(time_left_for_this_task=120,
 per_run_time_limit=30)
 cls.fit(X_train, y_train, feat_type=feat_type)

 predictions = cls.predict(X_test)
 print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))


if __name__ == "__main__":
 main()

Accuracy score 0.851523236334


## Example of holdout 

In [6]:
def main():
 X, y = sklearn.datasets.load_digits(return_X_y=True)
 X_train, X_test, y_train, y_test = \
 sklearn.model_selection.train_test_split(X, y, random_state=1)

 automl = autosklearn.classification.AutoSklearnClassifier(
 time_left_for_this_task=120, per_run_time_limit=30,
 tmp_folder='/tmp/autoslearn_holdout_example_tmp',
 output_folder='/tmp/autosklearn_holdout_example_out',
 disable_evaluator_output=False)
 automl.fit(X_train, y_train, dataset_name='digits')

 # Print the final ensemble constructed by auto-sklearn.
 print(automl.show_models())
 predictions = automl.predict(X_test)
 # Print statistics about the auto-sklearn run such as number of
 # iterations, number of models failed with a time out.
 print(automl.sprint_statistics())
 print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))


if __name__ == '__main__':
 main()

You are already timing task: index_run5


[(0.960000, MyDummyClassifier(configuration=1, init_params=None, random_state=None)),
(0.020000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'gradient_boosting', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'standardize', 'classifier:gradient_boosting:learning_rate': 0.03627152792976942, 'classifier:gradient_boosting:loss': 'deviance', 'classifier:gradient_boosting:max_depth': 10, 'classifier:gradient_boosting:max_features': 4.211238636565405, 'classifier:gradient_boosting:max_leaf_nodes': 'None', 'classifier:gradient_boosting:min_samples_leaf': 15, 'classifier:gradient_boosting:min_samples_split': 16, 'classifier:gradient_boosting:min_weight_fraction_leaf': 0.0, 'classifier:gradient_boosting:n_estimators': 340, 'classifier:gradient_boosting:subsample': 0.6289005711340923, 'one_hot_encoding:minimum_fraction': 0.0002148748655476835},
dat

## Example metrics 

In [8]:
import numpy as np
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
import autosklearn.classification
import autosklearn.metrics



def accuracy(solution, prediction):
 # function defining accuracy
 return np.mean(solution == prediction)


def accuracy_wk(solution, prediction, dummy):
 # function defining accuracy and accepting an additional argument
 assert dummy is None
 return np.mean(solution == prediction)


def main():

 X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
 X_train, X_test, y_train, y_test = \
 sklearn.model_selection.train_test_split(X, y, random_state=1)

 # Print a list of available metrics
 print("Available CLASSIFICATION metrics autosklearn.metrics.*:")
 print("\t*" + "\n\t*".join(autosklearn.metrics.CLASSIFICATION_METRICS))

 print("Available REGRESSION autosklearn.metrics.*:")
 print("\t*" + "\n\t*".join(autosklearn.metrics.REGRESSION_METRICS))

 # First example: Use predefined accuracy metric
 print("#"*80)
 print("Use predefined accuracy metric")
 cls = autosklearn.classification.\
 AutoSklearnClassifier(time_left_for_this_task=60,
 per_run_time_limit=30, seed=1)
 cls.fit(X_train, y_train, metric=autosklearn.metrics.accuracy)

 predictions = cls.predict(X_test)
 print("Accuracy score {:g} using {:s}".
 format(sklearn.metrics.accuracy_score(y_test, predictions),
 cls._automl._automl._metric.name))

 # Second example: Use own accuracy metric
 print("#"*80)
 print("Use self defined accuracy accuracy metric")
 accuracy_scorer = autosklearn.metrics.make_scorer(name="accu",
 score_func=accuracy,
 greater_is_better=True,
 needs_proba=False,
 needs_threshold=False)
 cls = autosklearn.classification.\
 AutoSklearnClassifier(time_left_for_this_task=60,
 per_run_time_limit=30, seed=1)
 cls.fit(X_train, y_train, metric=accuracy_scorer)

 predictions = cls.predict(X_test)
 print("Accuracy score {:g} using {:s}".
 format(sklearn.metrics.accuracy_score(y_test, predictions),
 cls._automl._automl._metric.name))

 # Third example: Use own accuracy metric with additional argument
 print("#"*80)
 print("Use self defined accuracy with additional argument")
 accuracy_scorer = autosklearn.metrics.make_scorer(name="accu_add",
 score_func=accuracy_wk,
 greater_is_better=True,
 needs_proba=False,
 needs_threshold=False,
 dummy=None)
 cls = autosklearn.classification.\
 AutoSklearnClassifier(time_left_for_this_task=60,
 per_run_time_limit=30, seed=1)
 cls.fit(X_train, y_train, metric=accuracy_scorer)

 predictions = cls.predict(X_test)
 print("Accuracy score {:g} using {:s}".
 format(sklearn.metrics.accuracy_score(y_test, predictions),
 cls._automl._automl._metric.name))


if __name__ == "__main__":
 main()

Available CLASSIFICATION metrics autosklearn.metrics.*:
	*accuracy
	*balanced_accuracy
	*roc_auc
	*average_precision
	*log_loss
	*pac_score
	*precision
	*precision_macro
	*precision_micro
	*precision_samples
	*precision_weighted
	*recall
	*recall_macro
	*recall_micro
	*recall_samples
	*recall_weighted
	*f1
	*f1_macro
	*f1_micro
	*f1_samples
	*f1_weighted
Available REGRESSION autosklearn.metrics.*:
	*r2
	*mean_squared_error
	*mean_absolute_error
	*median_absolute_error
################################################################################
Use predefined accuracy metric


You are already timing task: index_run2
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3


Accuracy score 0.965035 using accuracy
################################################################################
Use self defined accuracy accuracy metric


You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2


Accuracy score 0.951049 using accu
################################################################################
Use self defined accuracy with additional argument


You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2


Accuracy score 0.951049 using accu_add


## Example in parallel 

In [9]:
import multiprocessing
import shutil
from autosklearn.metrics import accuracy
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.constants import *

tmp_folder = '/tmp/autosklearn_parallel_example_tmp'
output_folder = '/tmp/autosklearn_parallel_example_out'


for dir in [tmp_folder, output_folder]:
 try:
 shutil.rmtree(dir)
 except OSError as e:
 pass


def spawn_classifier(seed, dataset_name):
 """Spawn a subprocess.
 auto-sklearn does not take care of spawning worker processes. This
 function, which is called several times in the main block is a new
 process which runs one instance of auto-sklearn.
 """

 # Use the initial configurations from meta-learning only in one out of
 # the four processes spawned. This prevents auto-sklearn from evaluating
 # the same configurations in four processes.
 if seed == 0:
 initial_configurations_via_metalearning = 25
 else:
 initial_configurations_via_metalearning = 0

 # Arguments which are different to other runs of auto-sklearn:
 # 1. all classifiers write to the same output directory
 # 2. shared_mode is set to True, this enables sharing of data between
 # models.
 # 3. all instances of the AutoSklearnClassifier must have a different seed!
 automl = AutoSklearnClassifier(
 time_left_for_this_task=60, # sec., how long should this seed fit
 # process run
 per_run_time_limit=15, # sec., each model may only take this long before it's killed
 ml_memory_limit=1024, # MB, memory limit imposed on each call to a ML algorithm
 shared_mode=True, # tmp folder will be shared between seeds
 tmp_folder=tmp_folder,
 output_folder=output_folder,
 delete_tmp_folder_after_terminate=False,
 ensemble_size=0, # ensembles will be built when all optimization runs are finished
 initial_configurations_via_metalearning=initial_configurations_via_metalearning,
 seed=seed)
 automl.fit(X_train, y_train, dataset_name=dataset_name)

if __name__ == '__main__':
 
 X, y = sklearn.datasets.load_digits(return_X_y=True)
 X_train, X_test, y_train, y_test = \
 sklearn.model_selection.train_test_split(X, y, random_state=1)

 processes = []
 for i in range(4): # set this at roughly half of your cores
 p = multiprocessing.Process(target=spawn_classifier, args=(i, 'digits'))
 p.start()
 processes.append(p)
 for p in processes:
 p.join()

 print('Starting to build an ensemble!')
 automl = AutoSklearnClassifier(time_left_for_this_task=15,
 per_run_time_limit=15,
 ml_memory_limit=1024,
 shared_mode=True,
 ensemble_size=50,
 ensemble_nbest=200,
 tmp_folder=tmp_folder,
 output_folder=output_folder,
 initial_configurations_via_metalearning=0,
 seed=1)

 # Both the ensemble_size and ensemble_nbest parameters can be changed now if
 # necessary
 automl.fit_ensemble(y_train,
 task=MULTICLASS_CLASSIFICATION,
 metric=accuracy,
 precision='32',
 dataset_name='digits',
 ensemble_size=20,
 ensemble_nbest=50)

 predictions = automl.predict(X_test)
 print(automl.show_models())
 print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

Starting to build an ensemble!
[(0.900000, MyDummyClassifier(configuration=1, init_params=None, random_state=None)),
(0.050000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'k_nearest_neighbors', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none', 'classifier:k_nearest_neighbors:n_neighbors': 2, 'classifier:k_nearest_neighbors:p': 1, 'classifier:k_nearest_neighbors:weights': 'distance', 'one_hot_encoding:minimum_fraction': 0.3530578080502024},
dataset_properties={
 'task': 2,
 'sparse': False,
 'multilabel': False,
 'multiclass': True,
 'target_type': 'classification',
 'signed': False})),
(0.050000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'passive_aggressive', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:__choice__': 'random_trees_embedd

## Regression example 

In [10]:
import autosklearn.regression


def main():
 X, y = sklearn.datasets.load_boston(return_X_y=True)
 feature_types = (['numerical'] * 3) + ['categorical'] + (['numerical'] * 9)
 X_train, X_test, y_train, y_test = \
 sklearn.model_selection.train_test_split(X, y, random_state=1)

 automl = autosklearn.regression.AutoSklearnRegressor(
 time_left_for_this_task=120, per_run_time_limit=30,
 tmp_folder='/tmp/autoslearn_regression_example_tmp',
 output_folder='/tmp/autosklearn_regression_example_out')
 automl.fit(X_train, y_train, dataset_name='boston',
 feat_type=feature_types)

 print(automl.show_models())
 predictions = automl.predict(X_test)
 print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))


if __name__ == '__main__':
 main()



You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run2
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run4
You are already timing task: index_run5
You are already timing task: index_run5
You are already timing task: index_run5
You are already timing task: index_run5
You are already timing task: index_run6
You are already timing task: index_run6
You are already timing task: index_run6
You are already timing task: index_run7
You are already timing task: index_run7


[(0.660000, SimpleRegressionPipeline({'imputation:strategy': 'median', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'feature_agglomeration', 'regressor:__choice__': 'random_forest', 'rescaling:__choice__': 'standardize', 'one_hot_encoding:minimum_fraction': 0.010836306032657955, 'preprocessor:feature_agglomeration:affinity': 'euclidean', 'preprocessor:feature_agglomeration:linkage': 'ward', 'preprocessor:feature_agglomeration:n_clusters': 25, 'preprocessor:feature_agglomeration:pooling_func': 'mean', 'regressor:random_forest:bootstrap': 'False', 'regressor:random_forest:criterion': 'mse', 'regressor:random_forest:max_depth': 'None', 'regressor:random_forest:max_features': 4.418965161789183, 'regressor:random_forest:max_leaf_nodes': 'None', 'regressor:random_forest:min_samples_leaf': 2, 'regressor:random_forest:min_samples_split': 14, 'regressor:random_forest:min_weight_fraction_leaf': 0.0, 'regressor:random_forest:n_estimators': 100},
dataset_properties={

## ... and Finally Sequential 

In [11]:
def main():
 X, y = sklearn.datasets.load_digits(return_X_y=True)
 X_train, X_test, y_train, y_test = \
 sklearn.model_selection.train_test_split(X, y, random_state=1)

 automl = autosklearn.classification.AutoSklearnClassifier(
 time_left_for_this_task=120, per_run_time_limit=30,
 tmp_folder='/tmp/autoslearn_sequential_example_tmp',
 output_folder='/tmp/autosklearn_sequential_example_out',
 # Do not construct ensembles in parallel to avoid using more than one
 # core at a time. The ensemble will be constructed after auto-sklearn
 # finished fitting all machine learning models.
 ensemble_size=0, delete_tmp_folder_after_terminate=False)
 automl.fit(X_train, y_train, dataset_name='digits')
 # This call to fit_ensemble uses all models trained in the previous call
 # to fit to build an ensemble which can be used with automl.predict()
 automl.fit_ensemble(y_train, ensemble_size=50)

 print(automl.show_models())
 predictions = automl.predict(X_test)
 print(automl.sprint_statistics())
 print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))


if __name__ == '__main__':
 main()

[(0.960000, MyDummyClassifier(configuration=1, init_params=None, random_state=None)),
(0.020000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'gradient_boosting', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'standardize', 'classifier:gradient_boosting:learning_rate': 0.03627152792976942, 'classifier:gradient_boosting:loss': 'deviance', 'classifier:gradient_boosting:max_depth': 10, 'classifier:gradient_boosting:max_features': 4.211238636565405, 'classifier:gradient_boosting:max_leaf_nodes': 'None', 'classifier:gradient_boosting:min_samples_leaf': 15, 'classifier:gradient_boosting:min_samples_split': 16, 'classifier:gradient_boosting:min_weight_fraction_leaf': 0.0, 'classifier:gradient_boosting:n_estimators': 340, 'classifier:gradient_boosting:subsample': 0.6289005711340923, 'one_hot_encoding:minimum_fraction': 0.0002148748655476835},
dat