Base Meta-Configuration
This page documents utopya's base configuration, which is used as the starting point when constructing the meta-configuration.
# This file provides the basic configuration for the utopya Multiverse
#
# It is read in by the Multiverse during initialization and is subsequently
# updated by other configuration files to generate the meta configuration of
# the Multiverse, which determines all details of how a run is performed.
#
# The top-level keys here are used to configure different parts of the
# Multiverse:
# - properties of the Multiverse itself: `paths`, `perform_sweep`, ...
# - properties of its attributes: `worker_manager`, `run_kwargs`, ...
# - and the parameter space that is passed on to the model instance
#
# NOTE that the comments in this configuration file document some features,
# but cannot be exhaustive. Check the docstrings of the corresponding
# functions for further information.
---
# Multiverse configuration ....................................................
# Output paths
paths:
# Base output directory
out_dir: ~/utopya_output
# A note that is added to the output directory path
model_note: ~
# From the two above, the run directory will be created at:
# <out_dir>/<model_name>/<timestamp>_<model_note>/
# Subfolders will be: config, eval, data
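# For example, with model name `MyModel`, `model_note: test`, and an
# illustrative timestamp, the run directory would be located at:
#   ~/utopya_output/MyModel/250101-120000_test/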
# Change file permissions for certain folders.
# Keys are managed multiverse directories (run, config, data, eval) and
# values are strings representing octal permission levels for unix systems.
# If None, will keep the default permissions (typically 755).
# Be aware that subdirectories that are created later may not inherit these
# permission levels and that the permissions are set _before_ a simulation
# begins, not after it has finished, thus always requiring write permissions
# for the user level.
# Also note that this may not have an effect on Windows systems at all.
dir_permissions:
run: ~
# Make eval subdirectory group-writeable to allow evaluation of shared
# simulation output within the user's group.
eval: "775"
# Control of the backup of files that belong to a simulation
backups:
# Whether to save all involved config files granularly, i.e. one by one.
# If false, only the resulting meta_cfg is saved to the config subdirectory.
backup_cfg_files: true
# Whether to save the executable
backup_executable: false
# Whether to store git information of the project (and framework)
include_git_info: true
# Control of the model executable
executable_control:
# Whether to copy the executable to a temporary directory at the
# initialization of the Multiverse and execute it from there. This way,
# accidental changes to the executable _during_ a simulation are prevented.
#
# Note that for interpreted languages, copying the executable file alone may
# not suffice to isolate the whole model from changes.
run_from_tmpdir: true
# Whether to perform a parameter sweep (if configured).
# This is only evaluated when calling the Multiverse.run method.
perform_sweep: false
# Whether to perform parameter validation
# For large sweeps, validation can take quite some time. For such scenarios, it
# might make sense to disable parameter validation by setting this to false.
perform_validation: true
# Parameters that are to be validated
# This is a mapping of key sequence -> Parameter object
parameters_to_validate: {}
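# A minimal sketch of such an entry (illustrative model and parameter names;
# the Parameter object is constructed here via the `!param` YAML tag):
#
# parameters_to_validate:
#   parameter_space.MyModel.p: !param
#     default: 0.5
#     limits: [0, 1]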
# Skipping universes ..........................................................
# Controls whether and when universe tasks will be skipped.
#
# Example use cases and corresponding configuration options:
#
# A - Work should be done on a single machine and in one session;
# in this case, skipping should be disabled completely.
# B - Work may be joined from other machines to speed up completion;
# for this, enable skipping so that universes that are already set up or
# worked on by another instance are skipped.
# C - Work should be done in two stages: first create all the output folders
# and configurations, then work on them independently. For this case,
# enable skipping and activate `skip_after_setup`; see the sketch at the
# end of this block.
#
skipping:
# Whether skipping is enabled at all.
#
# This should be set to false if you want to ensure that only a single
# Multiverse instance is allowed to generate simulation output.
# In that case, any skip event will instead result in an error.
enabled: false
# If true, all tasks will be skipped after their setup function ran, meaning
# that the universe output directories and config files will be created and
# can be worked on later on.
skip_after_setup: false
# How to proceed in certain situations.
# In all cases, the options are: skip, raise, continue
#
# ... if a universe directory already exists
on_existing_uni_dir: skip
#
# ... if a universe config file already exists
on_existing_uni_cfg: skip
# ... if there already is universe output.
# Note that this is only checked for DistributedMultiverse, i.e. when running
# from an already existing run directory. Otherwise this is ignored.
# Additional option here: `clear` to remove existing output
on_existing_uni_output: skip
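# A minimal sketch for use case C above (illustrative): the first invocation
# only sets up the universe directories and config files, which can then be
# worked on later, e.g. from the already existing run directory:
#
# skipping:
#   enabled: true
#   skip_after_setup: true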
# Reporter ....................................................................
# The Multiverse owns a Reporter object to report on the progress of the
# WorkerManager. Part of its configuration happens using its init kwargs, which
# are defined in the following.
# The rest of the configuration happens on the WorkerManager-side (see there).
reporter:
# Define report formats, which are accessible, e.g. from the WorkerManager
report_formats:
progress_bar: # Name of the report format specification
parser: progress_bar # The parser to use
write_to: stdout_noreturn # The writer to use
min_report_intv: 0.5 # Required time (in s) between writes
# -- All further kwargs on this level are passed to the parser
# Terminal width for the progress bar
# Can also be `adaptive` (poll each time), or `fixed` (poll once)
num_cols: adaptive
# The format string to use for progress information
# Available format string elements:
# - `prg` dict with various progress measures in percent:
# `total`, `active`, `skipped`, `failed`, `success`, ...
# - `cnt` dict of counters (same keys)
info_fstr: "{prg[total]:>5.1f}% "
# Example of how to access counters in format string:
# info_fstr: "finished {cnt[finished]}/{cnt[total]} "
# Whether to show time information alongside the progress bar
show_times: true
# How to display time information.
# Available keys: `elapsed`, `est_left`, `est_end`, `start`, `now`
# (see `times` parser for more information)
times_fstr: "| {elapsed:>7s} elapsed | ~{est_left:>7s} left "
times_fstr_final: "| finished in {elapsed:} "
times_kwargs:
# How to compute the estimated time left to finish the work session
# Available modes:
# - `from_start`: extrapolates from progress made since start
# - `from_buffer`: uses a buffer to store recent progress
# information and use the oldest value for
# making the estimate; see `progress_buffer_size`
mode: from_buffer
# Number of records kept for computing the ETA in `from_buffer` mode.
# This is in units of parser invocations, so it goes back *at least* a
# time interval of `min_report_intv * progress_buffer_size`.
# If the reporter is called less frequently (e.g. because of a larger
# model-side `monitor_emit_interval`), this interval will be longer.
progress_buffer_size: 90
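# With the values above (min_report_intv: 0.5, progress_buffer_size: 90),
# the estimate thus covers at least the last 0.5 s * 90 = 45 seconds.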
# Creates a report file containing runtime statistics
report_file:
parser: report
write_to:
file:
path: _report.txt
skip_if_dmv: false # set true for many distributed runs
min_report_intv: 10 # don't update this one too often
show_host_info: true # basic information about host machine
show_exit_codes: true # exit code overview
min_num: 2 # min. number of universes for statistics
show_distributed_run_info: true # extra info for distributed runs
distributed_status_fstr: # how to format the status information
" {progress_here:>5s} @ {host_name_short:12s} - {pid:7d}: {status:10s} ({tags})"
show_individual_runtimes: true # disable for large numbers of universes
max_num_to_show: 2048 # max. number of individual times to show
task_label_singular: universe
task_label_plural: universes
# Creates a parameter sweep information file
sweep_info:
parser: pspace_info
write_to:
file:
path: _sweep_info.txt
skip_if_empty: true
skip_if_dmv: true # already have an identical sweep info file
log:
lvl: 18
skip_if_empty: true
fstr: "Sweeping over the following parameter space:\n\n{sweep_info:}"
only_for_sweep: true
# Creates a work status file, for internal use only
work_status:
parser: work_status
write_to:
file:
path: .status.yml
skip_if_dmv: false
min_report_intv: 5.
# Can define a default format to use
# default_format: ~
# Worker Manager ..............................................................
# Initialization arguments for the WorkerManager
worker_manager:
# Specify how many processes work in parallel
num_workers: auto
# can be: an int or 'auto' (== #CPUs); negative values count back from the
# number of CPUs, i.e. resolve to #CPUs + num_workers
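# Example (assuming the resolution described above): on a machine with
# 8 CPUs, `num_workers: -2` resolves to 6 parallel workers.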
# Delay between polls [seconds]
poll_delay: 0.05
# NOTE: If this value is too low, the main thread becomes very busy.
# If this value is too high, the log output from simulations is not
# read from the line buffer frequently enough.
# How many workers to spawn in each working loop iteration.
# For -1, will assign tasks to all free workers.
spawn_rate: -1
# NOTE: If simulations are short, it makes sense to set this to a large value
# or to -1, otherwise CPU utilization will be low because tasks are
# finished faster than new ones are spawned, leading to idle workers.
# If you would like to keep main thread utilization low, set to 1.
# Maximum number of lines to read from each task's stream per poll cycle.
# Choosing a value that is too large may affect poll performance in cases
# where the task generates many lines of output.
# Set to -1 to read *all* available lines from the stream upon each poll.
lines_per_poll: 50
# NOTE: If there is a lot of log output, choose a large value here. This will
# lead to higher main thread utilization, but otherwise the tasks will
# live on for a longer time, blocking the spawning of new tasks.
# Periodic task callback (in units of poll events). Set None to deactivate.
periodic_task_callback: 100
# How to react upon a simulation exiting with non-zero exit code
nonzero_exit_handling: raise
# can be: ignore, warn, warn_all, raise
# warn_all will also warn if the simulation was terminated by the frontend
# raise will lead to a SystemExit with the error code of the simulation
# How to handle keyboard interrupts
interrupt_params:
# Which signal to send to the workers
send_signal: SIGINT # can be any valid signal name
# NOTE that only SIGINT and SIGTERM lead to a graceful shutdown on the C++ side
# How long to wait for workers to shut down before calling SIGKILL on them
grace_period: 5.
# WARNING Choosing a grace period that is shorter than the duration of one
# iteration step of your model might lead to corrupted HDF5 data!
# Whether to exit after working; exit code will be 128 + abs(signum)
exit: false
# Upon which events to save streams *during* the work session
# May be: `monitor_updated`, `periodic_callback`
save_streams_on: [monitor_updated]
# Reporters to invoke at different points of the WorkerManager's operation.
# Keys refer to events, values are lists of report format names, which can be
# defined via the WorkerManagerReporter (see `reporter.report_formats` above)
rf_spec:
before_working: [work_status, sweep_info]
while_working: [progress_bar]
task_invoked: []
task_spawned: [progress_bar]
monitor_updated: [progress_bar]
periodic: [progress_bar]
task_finished: [work_status, progress_bar, report_file]
task_skipped: [work_status, progress_bar, report_file]
after_work: [work_status, progress_bar, report_file]
after_cancel: [work_status, progress_bar, report_file]
after_fail: [work_status, report_file]
# Configuration for the WorkerManager.start_working method
run_kwargs:
# Whether task execution order should be shuffled before a run starts.
# This may help in load-balancing when some tasks systematically take longer
# than others, but it means that a potentially specified task priority is
# completely ignored (typically, all tasks have the same priority, though).
shuffle_tasks: false
# Total timeout (in s) of a run; to ignore, set to ~
timeout: ~
# A list of StopCondition objects to check during the run _for each worker_.
# The entries of the following list are OR-connected, i.e. it suffices that
# one is fulfilled for the corresponding worker to be stopped
stop_conditions: ~
# See docs for how to set these up:
# https://docs.utopia-project.org/html/usage/run/stop-conditions.html
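# A minimal sketch of a stop condition entry (illustrative values; see the
# documentation linked above for the available condition functions):
#
# stop_conditions:
#   - !stop-condition
#     name: wall timeout
#     description: stops a universe after one hour of wall time
#     func: timeout_wall
#     seconds: 3600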
# The defaults for the worker_kwargs
# These are passed to the setup function of each WorkerTask before spawning
worker_kwargs:
# Whether to save the streams of each Universe to a log file
save_streams: true
# This file is saved only after the WorkerTask has finished in order to
# reduce I/O operations on files
# Whether to forward the streams to stdout
forward_streams: in_single_run
# can be: true, false, or 'in_single_run' (print only in single runs)
# Whether to forward the raw stream output or only those lines that were not
# parsable as YAML, i.e. only the lines that did _not_ come from the monitor
forward_raw: true
# The log level at which the streams should be forwarded to stdout
streams_log_lvl: ~ # if None, uses print instead of the logging module
# Arguments to subprocess.Popen
popen_kwargs:
# The encoding of the streams (STDOUT, STDERR) coming from the simulation.
# NOTE If your locale is set to some other encoding, or the simulation uses
# a custom one, overwrite this value accordingly via the user config.
encoding: utf8
# Cluster mode configuration ..................................................
# Whether cluster mode is enabled
cluster_mode: false
# Parameters to configure the cluster mode
cluster_params:
# Specify the workload manager to use.
# The names of environment variables are chosen accordingly.
manager: slurm # available: slurm
# The environment to look for parameters in. If not given, uses os.environ
env: ~
# Specify the name of environment variables for each supported manager
# The resolved values are available at the top level of the dict that is
# returned by Multiverse.resolved_cluster_params
env_var_names:
slurm:
# --- Required variables ---
# ID of the job
job_id: SLURM_JOB_ID
# Number of available nodes
num_nodes: SLURM_JOB_NUM_NODES
# List of node names
node_list: SLURM_JOB_NODELIST
# Name of the current node
node_name: SLURMD_NODENAME # sic!
# This is used for the name of the run
timestamp: RUN_TIMESTAMP
# --- Optional values ---
# Name of the job
job_name: SLURM_JOB_NAME
# Account from which the job is run
job_account: SLURM_JOB_ACCOUNT
# Number of processes on current node
num_procs: SLURM_CPUS_ON_NODE
# Cluster name
cluster_name: SLURM_CLUSTER_NAME
# Custom output directory
custom_out_dir: UTOPIA_CLUSTER_MODE_OUT_DIR
# Could have more managers here, e.g.: docker
# Which parser to use to extract node names from node list
node_list_parser_params:
slurm: condensed # e.g.: node[002,004-011,016]
# Which additional info to include in the name of the run directory, i.e.
# after the timestamp and before the model note. All information that is
# extracted from the environment variables is available as keyword arguments
# to these format strings. Should be a sequence of format strings.
additional_run_dir_fstrs: [ "job{job_id:}" ]
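# With the format string above and an illustrative `job_id` of 1234567, the
# run directory name gains a `job1234567` segment after the timestamp.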
# Data Manager ................................................................
# The DataManager takes care of loading the data into a tree-like structure
# after the simulations are finished.
# It is based on the DataManager class from the dantro package. See there for
# full documentation.
data_manager:
# Where to create the output directory for this DataManager, relative to
# the run directory of the Multiverse.
out_dir: eval/{timestamp:}
# The {timestamp:} placeholder is replaced by the current timestamp such that
# future DataManager instances that operate on the same data directory do
# not create collisions.
# Directories are created recursively, if they do not exist.
# Define the structure of the data tree beforehand; this makes it possible
# to specify the types of groups before content is loaded into them.
# NOTE The strings given to the Cls argument are mapped to a type using a
# class variable of the DataManager
create_groups:
- path: multiverse
Cls: MultiverseGroup
# Where the default tree cache file is located relative to the data
# directory. This is used when calling DataManager.dump and .restore without
# any arguments, as done e.g. in the Utopia CLI.
default_tree_cache_path: data/.tree_cache.d3
# Supply a default load configuration for the DataManager
# This can then be invoked using the dm.load_from_cfg() method.
load_cfg:
# Load the frontend configuration files from the config/ directory
# Each file refers to a level of the configuration that is supplied to
# the Multiverse: base <- user <- model <- run <- update
cfg:
loader: yaml # The loader function to use
glob_str: 'config/*.yml' # Which files to load
ignore: # Which files to ignore
- config/parameter_space.yml
- config/parameter_space_info.yml
- config/full_parameter_space.yml
- config/full_parameter_space_info.yml
- config/git_info_project.yml
- config/git_info_framework.yml
required: true # Whether these files are required
path_regex: config/(\w+)_cfg.yml # Extract info from the file path
target_path: cfg/{match:} # ...and use in target path
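# For example, config/update_cfg.yml matches the regex with `update` and is
# thus placed at the target path: cfg/update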
# Load the parameter space object into the MultiverseGroup attributes
pspace:
loader: yaml_to_object # Load into ObjectContainer
glob_str: config/parameter_space.yml
required: true
load_as_attr: true
unpack_data: true # ... and store as ParamSpace obj.
target_path: multiverse
# Load the configuration files that are generated for _each_ simulation
# These hold all information that is available to a single simulation and
# are in an explicit, human-readable form.
uni_cfg:
loader: yaml
glob_str: data/uni*/config.yml
required: true
path_regex: data/uni(\d+)/config.yml
target_path: multiverse/{match:}/cfg
parallel:
enabled: true
min_files: 1000
min_total_size: 1048576 # 1 MiB
# Example: Load the binary output data from each simulation.
# data:
# loader: hdf5_proxy
# glob_str: data/uni*/data.h5
# required: true
# path_regex: data/uni(\d+)/data.h5
# target_path: multiverse/{match:}/data
# enable_mapping: true # see DataManager for content -> type mapping
# # Options for loading data in parallel (speeds up CPU-limited loading)
# parallel:
# enabled: false
# # Number of processes to use; negative is deduced from os.cpu_count()
# processes: ~
# # Threshold values for parallel loading; if any is below these
# # numbers, loading will *not* be in parallel.
# min_files: 5
# min_total_size: 104857600 # 100 MiB
# The resulting data tree is then:
# └┬ cfg
#  └┬ base
#   ├ meta
#   ├ model
#   ├ run
#   └ update
# └ multiverse
#   └┬ 0
#    └┬ cfg
#     └ data
#       └─ ...
#   ├ 1
#   ...
# Plot Manager ................................................................
# The PlotManager, also from the dantro package, supplies plotting capabilities
# using the data in the DataManager.
plot_manager:
# Save the plots to the same directory as that of the data manager
out_dir: ""
# Whether to raise exceptions for plotting errors. false: only log them
raise_exc: false
# How to handle already existing plot configuration files
cfg_exists_action: raise
# NOTE If in cluster mode, this value is set to 'skip' by the Multiverse
# Save all plot configurations alongside the plots
save_plot_cfg: true
# Include dantro's base plot configuration pool
use_dantro_base_cfg_pool: true
# Base plot configuration pools
# These specify the base plot configurations that are made available for each
# model run. They are updated and extended in the order specified here and are
# themselves based on the dantro base config pool.
#
# In some cases, defining additional pools can be useful, e.g. to generate
# publication-ready output without redundantly defining plots or styles.
#
# This is expected to be a list of 2-tuples in form (name, dict or path).
# If the second entry is a string, it may be a format string and it will have
# access to `model_name` and the model's `paths` dict.
# If there is no file available at the given path, will warn about it and use
# an empty pool for that entry.
#
# There are some special keys, which can be used instead of the 2-tuple:
# `utopya_base`, `framework_base`, `project_base`, `model_base`
# These expand to a respective configuration file path, depending on the
# framework, project, or model that is being used.
base_cfg_pools:
- utopya_base
- framework_base
- project_base
- model_base
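# An additional custom pool could be appended as a 2-tuple, e.g. (with an
# illustrative name and path; the path is a format string with access to
# `model_name` and the model's `paths` dict):
#
# - [my_pub_plots, "~/plot_cfgs/{model_name}_pub_plots.yml"]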
# Initialization arguments for all creators
shared_creator_init_kwargs:
style:
figure.figsize: [8., 5.] # 16:10
# Can set creator-specific initialization arguments here
creator_init_kwargs:
pyplot: {}
universe: {}
multiverse: {}
# Parameter Space .............................................................
# Only entries below this one will be available to the model executable.
#
# The content of the `parameter_space` level is parsed by the frontend and then
# dumped to a file, the path to which is passed to the binary as a positional
# argument.
parameter_space:
# Set a default PRNG seed
seed: 42
# Number of steps to perform
num_steps: 3
# At which step the write_data method should be invoked for the first time
write_start: 0
# Starting from write_start, how frequently write_data should be called
write_every: 1
# NOTE `write_start` and `write_every` are passed along to sub-models. Every
# sub-model can overwrite this entry by adding an entry at their model
# configuration level (analogous to `log_levels`).
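# For example, with the defaults above (num_steps: 3, write_start: 0,
# write_every: 1), write_data is invoked at steps 0, 1, 2, and 3, i.e. four
# times in total, including once for the initial state.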
# Log levels
# NOTE The framework may define further levels in here but may also choose
# to ignore these entries altogether. The `model` and `backend` keys
# are those that are accessible from the utopya CLI.
log_levels:
model: info
backend: warning
# TODO Implement setting this via CLI… perhaps even more general?
# Coolest would be: allow frameworks to provide a mapping of each CLI
# debug level to an update dict.
# Monitoring
# How frequently to send a monitoring message to the frontend; note that the
# timing needs to be implemented by the model itself
monitor_emit_interval: 2.
# The path to the config file to load
# output_path: /abs/path/to/uni<#>/cfg.yml
# NOTE This entry is always added by the frontend. Depending on which
# universe is to be simulated, the <#> is set.
# Below here, the model configuration starts, i.e. the config that is used by
# a model instance. It is meant to be nested under the model name itself;
# a node of that name will always be added.
# <model_name>:
# ... more parameters ...