"""Command-line script for hyperparameter search on MAIS epidemic models.
Loads a model configuration and a hyperparameter search configuration (JSON),
then runs the selected search method (grid search or CMA-ES) to minimise a
specified loss function (RMSE, MAE, or R²) against observed gold data.
Typical usage::
python run_search.py config.ini gridsearch.json \\
--fit_data ../data/fit_data/fit_me.csv \\
--return_func rmse --n_jobs 4
Results are saved as a CSV file in the directory specified by ``--out_dir``.
An optional evolution log can be written with ``-l``.
"""
import timeit
import random
import pandas as pd
import click
import datetime
import numpy as np
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../src'))
from hyperparam_search.hyperparam_utils import run_hyperparam_search
from utils.config_utils import ConfigFile
[docs]
def load_gold_data(csv_path, first_n_zeros=0, data_column=2, from_day=0, until_day=None, use_dates=False):
    """Load and pre-process observed (gold) data for model fitting.

    Reads a CSV file, numbers its rows with consecutive day indices, forward-fills
    missing observations, optionally prepends zero-valued padding days, and
    returns the requested row range with ``"day"`` and ``"infected"`` columns.

    Args:
        csv_path (str): Path (or anything ``pandas.read_csv`` accepts) of the
            CSV file containing observed data. The file must have a
            ``"datum"`` column when ``use_dates=True`` and a ``"T"`` column
            otherwise.
        first_n_zeros (int): Number of zero-valued days to prepend. The
            day indices of the observed rows are shifted by this value and the
            padding rows take days ``0 .. first_n_zeros - 1``. Values ``<= 0``
            add no padding. Defaults to ``0``.
        data_column (int or str): Either the integer column position (0-based)
            or the string column name of the observed values. Defaults to
            ``2``.
        from_day (int): First row position (0-based) to include after all
            pre-processing. Defaults to ``0``.
        until_day (int or None): Exclusive upper row position. ``None`` means
            include all remaining rows. Defaults to ``None``.
        use_dates (bool): If ``True``, the ``"datum"`` column is parsed as
            dates; otherwise the ``"T"`` column is read. In both cases only
            the number of rows is taken from that column: day indices are
            positional (``0, 1, 2, ...``). Defaults to ``False``.

    Returns:
        pandas.DataFrame: DataFrame with columns ``"day"`` and ``"infected"``,
        sliced positionally to ``[from_day, until_day)``.

    Raises:
        KeyError: If the required ``"datum"``/``"T"`` column or a named
            ``data_column`` is missing from the CSV file.
    """
    df = pd.read_csv(csv_path)

    # NOTE(review): the time column only determines the row count; the actual
    # dates / T values never reach the "day" column. Parsing is kept so that
    # malformed dates still fail loudly here.
    if use_dates:
        time_index = pd.to_datetime(df["datum"])
    else:
        time_index = df["T"]

    if isinstance(data_column, int):
        observed = df.iloc[:, data_column]
    else:
        observed = df[data_column]

    result = pd.DataFrame({"day": range(len(time_index)), "infected": observed.to_list()})

    # Forward-fill gaps in the observations; leading missing values have no
    # predecessor and therefore stay missing.
    result = result.ffill()

    if first_n_zeros > 0:
        # Shift the observed days and pad days 0 .. first_n_zeros - 1 in front,
        # so the day index stays contiguous and free of duplicates.
        result["day"] += first_n_zeros
        padding = pd.DataFrame({"day": range(first_n_zeros), "infected": 0})
        result = pd.concat([padding, result], ignore_index=True)

    return result.iloc[from_day:until_day]
@click.command()
@click.argument('filename', default="town0.ini")
@click.argument('hyperparam_filename', default="example_gridsearch.json")
@click.option('--set-random-seed/--no-random-seed', ' /-r', default=True,
              help="Random seed to set for the experiment. If `run_n_times` > 1, the seed is incremented by i for "
                   "the i-th run.")
@click.option('--n_jobs', default=0)
@click.option('--from_day', default=1, help="Lower bound for days in the gold data DataFrame (indexed from 0).")
@click.option('--until_day', default=None, type=int, help="Upper bound for days in the gold data DataFrame "
                                                          "(python-like indexing - elements up to 'until_day - 1' are "
                                                          "selected).")
@click.option('--use_dates/--use_T', default=False, help="If True, use column 'datum' for gold data indexing, else use "
                                                         "column 'T' (default).")
@click.option('--use_config_days/--use_args_days', default=False, help="If True, use 'start_day' and 'n_days' from "
                                                                       "the config file, otherwise using 'from_day' "
                                                                       "and 'until_day' command line arguments.")
@click.option('--run_n_times', default=1, help="Number of times to run th experiment with specific hyperparameter "
                                               "settings.")
@click.option('--first_n_zeros', default=0, help="Shifts gold data by this value - the first day is incremented"
                                                 " by `first_n_zeros`, and the data is padded with `first_n_zeros` days"
                                                 " with the gold values set to zero.")
@click.option('--fit_column', default='I_d', help="Data column to use for fit.")
@click.option('--fit_data', default='../data/fit_data/fit_me.csv',
              help="A DataFrame that has a column named 'datum' and contains the gold data in the column `data_column`.")
@click.option('--return_func', default='rmse', help="Loss function.")
@click.option('--log_csv_file/--no_log_csv', '-l/ ', default=False)
@click.option('--out_dir', default=f'./search_{datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")}')
def run(filename, hyperparam_filename, set_random_seed, n_jobs, from_day, until_day, use_dates, use_config_days,
        run_n_times, first_n_zeros, fit_column, fit_data, return_func, log_csv_file, out_dir):
    """Run a hyperparameter search against gold data and save the results.

    FILENAME is the model configuration file and HYPERPARAM_FILENAME the
    hyperparameter search configuration. The mean loss per hyperparameter
    setting is written as a CSV file into the output directory, and the
    elapsed time in seconds is printed at the end.
    """
    # --set-random-seed (the default) pins the seed to 42; -r / --no-random-seed
    # draws a seed from [0, 10000] instead.
    random_seed = 42 if set_random_seed else random.randint(0, 10000)

    # The loaded config object is not used any further in this function; the
    # load is kept so that a config file that cannot be loaded still fails here.
    ConfigFile().load(filename)

    print(f"Output directory for results: {out_dir}")
    print(f"Running with n_jobs == {n_jobs}.")

    # Measure wall time with a plain timer: timeit.timeit() would switch off
    # garbage collection for the whole duration of the search.
    started = timeit.default_timer()

    # A numeric --fit_column selects the column by position, anything else by name.
    try:
        data_col = int(fit_column)
    except ValueError:
        data_col = fit_column

    gold_data = load_gold_data(fit_data, first_n_zeros=first_n_zeros, data_column=data_col, use_dates=use_dates,
                               from_day=from_day, until_day=until_day)

    # Infer start day and experiment length from the gold data unless the
    # values from the config file are requested (signalled by passing None).
    start_day = None if use_config_days else int(gold_data.iloc[0]["day"])
    n_days = None if use_config_days else len(gold_data)
    y_true = gold_data["infected"].to_numpy()

    # Creates missing parent directories too and tolerates an existing directory.
    os.makedirs(out_dir, exist_ok=True)

    results = run_hyperparam_search(
        filename,
        hyperparam_filename,
        model_random_seed=random_seed,
        n_jobs=n_jobs,
        start_day=start_day,
        n_days=n_days,
        return_func=return_func,
        return_func_kwargs={"y_true": y_true, "fit_column": fit_column},
        run_n_times=run_n_times,
        output_file=os.path.join(out_dir, 'evo_log.csv') if log_csv_file else None
    )
    # The search may hand back a single result instead of a list of results.
    if not isinstance(results, list):
        results = [results]

    # One row per evaluated setting: its hyperparameters plus the mean loss.
    res_list = [
        {**res["hyperparams"], f"mean_{return_func}": np.mean(res['result'])}
        for res in results
    ]

    # File stems (text before the first dot) identify the data and the search method.
    fit_name = os.path.basename(fit_data).split('.')[0]
    search_method = os.path.basename(hyperparam_filename).split('.')[0]

    df = pd.DataFrame(data=res_list)
    df.to_csv(os.path.join(out_dir, f'{search_method}_{return_func}_{fit_name}_seed={random_seed}.csv'))

    print(timeit.default_timer() - started)
# Script entry point: invoke the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run()