Example recipes
We provide several "recipes" for downloading data from various sources and running a range of machine learning methods on that data.
A "recipe" is a file in YAML format.
Model comparison use case
# SPDX-FileCopyrightText: 2023 Springtime authors
#
# SPDX-License-Identifier: Apache-2.0
# Data sources used by this recipe, keyed by a recipe-local name.
# NOTE(review): nesting was reconstructed. The listing had lost its
# indentation, which left duplicate keys (`dataset`, `points`, `years`,
# `resample`, ...) in a single flat mapping. Confirm against the recipe schema.
datasets:
  # Observations selected by species functional type and phenophase.
  # The `daymet` and `modis` entries reference this entry via `points.source`.
  npn_obs:
    dataset: RNPN
    species_ids:
      functional_type: "Deciduous broadleaf" # multiple species
    phenophase_ids:
      name: breaking leaf buds
    years: [2015, 2020]
    area:
      # 500km bounding box centered at
      # latitude: 47.751076 and longitude: -120.740135
      name: Washington
      bbox:
        [
          -124.08406940413612,
          45.50277198520317,
          -117.39620059586387,
          49.99938001479683,
        ]

  # Daymet variables at the points taken from `npn_obs`.
  daymet:
    dataset: daymet_multiple_points
    points:
      source: npn_obs
    years: [2015, 2020] # TODO don't duplicate
    variables:
      - tmin
      - tmax
    resample:
      frequency: month
      operator: median

  # MODIS bands at the points taken from `npn_obs`.
  modis:
    dataset: modis_multiple_points
    points:
      source: npn_obs
    years: [2015, 2020] # TODO don't duplicate
    product: MOD15A2H # https://lpdaac.usgs.gov/products/mod15a2hv006/
    bands: [Fpar_500m, Lai_500m] # 8 days
    resample:
      frequency: month
      operator: mean
    # TODO if extent is not zero we need an aggregation
# Data preparation options.
# NOTE(review): the listing had lost its indentation. `dropna` is nested under
# `preparation` and `latitude` under `derived` because those parents carry no
# value of their own. Placing `derived` inside `preparation` (rather than at
# top level) is an assumption; confirm against the recipe schema.
preparation:
  dropna: true
  derived:
    latitude: true
# Machine-learning experiment definition.
# NOTE(review): nesting was reconstructed. The listing had lost its
# indentation. Keys were grouped under `setup` and `compare_models` to match
# the PyCaret parameters of the same names; `init_kwargs`, `compare_models`
# and `plots` are assumed to be direct children of `experiment`. Confirm
# against the recipe schema.
experiment:
  experiment_type: regression # --> pycaret.regression.RegressionExperiment

  # Experiment setup options.
  setup:
    target: "breaking leaf buds_doy"
    train_size: 0.75
    preprocess: false
    normalize: true
    normalize_method: zscore # i.e. default
    fold_strategy: kfold # i.e. default
    fold: 10
    fold_shuffle: true
    session_id: 123 # control randomness for reproducibility
    index: false # Exclude year and geometry columns
    # categorical_features:
    #   - site_id
    # max_encoding_ohe: 250

  # create_model:
  #   estimator: lr
  #   estimator: 'sklearn.svm.SVR'
  #   cross_validation: true

  # Constructor arguments, keyed by the estimator's import path.
  init_kwargs:
    sklearn.svm.SVR:
      kernel: 'rbf'
      gamma: 0.1
      C: 100.0
      epsilon: 0.1
    interpret.glassbox.ExplainableBoostingRegressor:
      interactions: 0
      validation_size: 0
      outer_bags: 1
      min_samples_leaf: 1

  # Models to compare against each other.
  compare_models:
    include:
      - 'lr' # linear regression
      - 'rf' # random forest regressor
      - 'sklearn.svm.SVR'
      - 'interpret.glassbox.ExplainableBoostingRegressor'
      - merf.MERF # Must be instantiated before passing to pycaret; how to specify args?
    # fit_kwargs are given to each estimator.fit method
    # so if estimator does not understand it crashes and is skipped.
    fit_kwargs:
      MERF:
        fixed_effects: [] # "the rest" after removing cluster, random effects, and target columns
        random_effects: []
        cluster_column: "latitude"
    cross_validation: true
    n_select: 5
    # errors: 'ignore'
    # errors: 'raise'

  # create_model:
  #   estimator: 'merf.MERF'
  #   fit_kwargs:
  #     fixed_effects: ['Lai_500m_12'] # "the rest" after removing cluster, random effects, and target columns
  #     random_effects: [ "tmin_2", "tmin_3", "tmin_3"]
  #     cluster_column: "latitude"

  # Plots to produce for the experiment.
  plots:
    - error
    - residuals