Example recipes

We provide several "recipes" for downloading data from various sources and running a range of machine learning methods on that data.

Note that a "recipe" is a file in YAML format.

Model comparison use case
# SPDX-FileCopyrightText: 2023 Springtime authors
#
# SPDX-License-Identifier: Apache-2.0

datasets:
  # Observations selected by species, phenophase, years and area. The other
  # two datasets take their points from this entry (`points.source`).
  npn_obs:
    dataset: RNPN
    species_ids:
      functional_type: 'Deciduous broadleaf'  # multiple species
    phenophase_ids:
      name: 'breaking leaf buds'
    years: [2015, 2020]
    area:
      # 500km bounding box centered at latitude 47.751076 and
      # longitude -120.740135
      name: Washington
      bbox:
        - -124.08406940413612  # west
        - 45.50277198520317  # south
        - -117.39620059586387  # east
        - 49.99938001479683  # north

  # Minimum/maximum temperature at the npn_obs points, resampled to the
  # monthly median.
  daymet:
    dataset: daymet_multiple_points
    points:
      source: npn_obs
    years: [2015, 2020]  # TODO don't duplicate
    variables: [tmin, tmax]
    resample:
      frequency: month
      operator: median

  # MODIS bands at the npn_obs points, resampled to the monthly mean.
  modis:
    dataset: modis_multiple_points
    points:
      source: npn_obs
    years: [2015, 2020]  # TODO don't duplicate
    # Product page: https://lpdaac.usgs.gov/products/mod15a2hv006/
    product: MOD15A2H
    bands:  # 8 days
      - Fpar_500m
      - Lai_500m
    resample:
      frequency: month
      operator: mean
# TODO if extent is not zero we need an aggregation
preparation:
  # Presumably drops rows with missing values before modelling -- confirm
  # against the recipe consumer.
  dropna: true
  derived:
    # Presumably adds a `latitude` column; `latitude` is what the experiment
    # section names as MERF's cluster_column.
    latitude: true
experiment:
  experiment_type: regression  # --> pycaret.regression.RegressionExperiment

  # Presumably forwarded as keyword arguments to the experiment's setup step.
  setup:
    target: 'breaking leaf buds_doy'
    train_size: 0.75
    # NOTE(review): PyCaret documents that `preprocess: false` turns off its
    # transformations, so `normalize: true` below may have no effect -- confirm.
    preprocess: false
    normalize: true
    normalize_method: zscore  # i.e. default
    fold_strategy: kfold  # i.e. default
    fold: 10
    fold_shuffle: true
    session_id: 123  # control randomness for reproducibility
    index: false  # Exclude year and geometry columns
    # categorical_features:
    #   - site_id
    # max_encoding_ohe: 250

  # create_model:
  #   estimator: lr
  #   # estimator: 'sklearn.svm.SVR'
  #   cross_validation: true

  # Presumably constructor arguments, keyed by the estimator's import path.
  init_kwargs:
    sklearn.svm.SVR:
      kernel: 'rbf'
      gamma: 0.1
      C: 100.0
      epsilon: 0.1
    interpret.glassbox.ExplainableBoostingRegressor:
      interactions: 0
      validation_size: 0
      outer_bags: 1
      min_samples_leaf: 1

  compare_models:
    include:
      - 'lr'  # linear regression
      - 'rf'  # random forest regressor
      - 'sklearn.svm.SVR'
      - 'interpret.glassbox.ExplainableBoostingRegressor'
      # Must be instantiated before passing to pycaret; how to specify args?
      - 'merf.MERF'
    # fit_kwargs are given to each estimator.fit method
    # so if estimator does not understand it crashes and is skipped.
    fit_kwargs:
      MERF:
        # "the rest" after removing cluster, random effects, and target columns
        fixed_effects: []
        random_effects: []
        cluster_column: 'latitude'
    cross_validation: true
    n_select: 5
    # errors: 'ignore'
    # errors: 'raise'

  # create_model:
  #   estimator: 'merf.MERF'
  #   fit_kwargs:
  #       fixed_effects: ['Lai_500m_12']  # "the rest" after removing cluster, random effects, and target columns
  #       random_effects: [ "tmin_2", "tmin_3", "tmin_3"]
  #       cluster_column: "latitude"

  plots:
    - error
    - residuals