# config.ini.in

# Bohrium version: ${BH_VERSION_STRING}

##########################################################################
# Stack configurations: each is a comma-separated list of components.    #
# NB: 'stacks' is a reserved section name and 'default'                  #
#     is used when 'BH_STACKS' is unset.                                 #
#     The bridge is never part of the list                               #
##########################################################################
[stacks]
# Each key names a stack; its value is the comma-separated list of components in that stack.
# Every component name used below has a matching [section] in this file.
# 'default' is currently identical to 'openmp', and every stack ends with the 'openmp' component.
# NOTE(review): proxy_opencl and proxy_cuda use bcexp_cpu whereas opencl and cuda use bcexp_gpu -- confirm
# that this difference is intended.
default      = bcexp_cpu, bccon, node, openmp
openmp       = bcexp_cpu, bccon, node, openmp
opencl       = bcexp_gpu, bccon, node, opencl, openmp
cuda         = bcexp_gpu, bccon, node, cuda, openmp
proxy_openmp = bcexp_cpu, bccon, proxy, node, openmp
proxy_opencl = bcexp_cpu, bccon, proxy, node, opencl, openmp
proxy_cuda   = bcexp_cpu, bccon, proxy, node, cuda, openmp

############
# Managers #
############
[node]
# The 'node' component appears in every stack defined in [stacks].
# Path to this component's shared library (install prefix, library dir and suffix are template placeholders)
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_vem_node${CMAKE_SHARED_LIBRARY_SUFFIX}
# NOTE(review): not documented here; presumably toggles timing output for this component -- confirm.
timing = false

[proxy]
# The 'proxy' component is only used by the proxy_* stacks in [stacks].
# Network address and port setting for the proxy component.
# NOTE(review): this file does not state which side (client or server) reads these values -- confirm.
address = localhost
port = 4200
# Path to this component's shared library (install prefix, library dir and suffix are template placeholders)
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_vem_proxy${CMAKE_SHARED_LIBRARY_SUFFIX}
# Filled in from the BH_PROXY_LIBS placeholder; presumably a list of extension methods, as documented for
# `libs` in the engine sections -- confirm.
libs = ${BH_PROXY_LIBS}


#############################
# Filters - Helpers / Tools #
#############################
[pprint]
# This filter is not referenced by any of the stacks defined in [stacks].
# Path to this component's shared library (install prefix, library dir and suffix are template placeholders)
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_pprint${CMAKE_SHARED_LIBRARY_SUFFIX}

###################################
# Filters - Bytecode transformers #
###################################
[bccon]
# The 'bccon' filter appears in every stack defined in [stacks].
# Path to this component's shared library (install prefix, library dir and suffix are template placeholders)
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_bccon${CMAKE_SHARED_LIBRARY_SUFFIX}
# Boolean switches. NOTE(review): their meaning is not documented in this file; presumably each one enables
# a single bytecode transformation of this filter -- confirm against the filter's source.
collect = true
stupidmath = true
muladd = true
reduction = false
find_repeats = false
# NOTE(review): presumably diagnostic output toggles -- confirm.
timing = false
verbose = false

[bcexp_cpu]
# Uses the same libbh_filter_bcexp library as [bcexp_gpu]; the two sections differ only in `reduce1d`.
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_bcexp${CMAKE_SHARED_LIBRARY_SUFFIX}
# Boolean switches. NOTE(review): their meaning is not documented in this file -- confirm against the
# filter's source.
powk = true
sign = false
repeat = false
# [bcexp_gpu] documents `reduce1d` as transforming 1d reductions into 2d reductions by array reshaping.
# NOTE(review): 0 presumably disables that transformation here -- confirm.
reduce1d = 0
# NOTE(review): presumably diagnostic output toggles -- confirm.
timing = false
verbose = false

[bcexp_gpu]
# Uses the same libbh_filter_bcexp library as [bcexp_cpu]; the two sections differ only in `reduce1d`.
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_bcexp${CMAKE_SHARED_LIBRARY_SUFFIX}
# Boolean switches. NOTE(review): their meaning is not documented in this file -- confirm against the
# filter's source.
powk = true
sign = false
repeat = false
# Transform 1d reductions into 2d reductions by array reshaping.
# NOTE(review): the value is presumably a size threshold -- confirm its exact meaning and unit.
reduce1d = 32000
# NOTE(review): presumably diagnostic output toggles -- confirm.
timing = false
verbose = false

[noneremover]
# This filter is not referenced by any of the stacks defined in [stacks].
# Path to this component's shared library (install prefix, library dir and suffix are template placeholders)
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_noneremover${CMAKE_SHARED_LIBRARY_SUFFIX}
# NOTE(review): presumably diagnostic output toggles -- confirm.
timing = false
verbose = false

###########
# Engines #
###########
[openmp]
# The 'openmp' engine is the last component of every stack defined in [stacks].
# Path to this component's shared library (install prefix, library dir and suffix are template placeholders)
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_ve_openmp${CMAKE_SHARED_LIBRARY_SUFFIX}
verbose = false
# Profiling statistics
prof = false
# NOTE(review): left empty by default; presumably the file the profiling statistics are written to -- confirm.
prof_filename =
# Write a Graphviz graph for each kernel
graph = false
# Directory for temporary files (e.g. /tmp/). Default: NONE, which is `boost::filesystem::temp_directory_path()`
tmp_dir = NONE
# Directory for cache files (persistent between executions). Default: NONE, which disables the cache
cache_dir = ${BIN_KERNEL_CACHE_DIR}
# Maximum number of cache files to keep in the cache dir (use -1 for infinity)
cache_file_max = 50000
# Set to true if no files should be written to the cache. When combining Bohrium and MPI, use this option to avoid
# write conflicts by only having rank zero write to the cache dir.
cache_readonly = false
# Set the size limit of the malloc cache as a percentage of the unused system memory.
# NB: if the amount of unused memory cannot be determined, 20% of the total system memory is used.
malloc_cache_limit = 80
# The command to execute the compiler where {OUT} is replaced with the binary file output and {IN} with the source file
compiler_cmd = "${VE_OPENMP_COMPILER_CMD} ${VE_OPENMP_COMPILER_FLG} ${VE_OPENMP_COMPILER_INC} {IN} -o {OUT}"
# JIT compile options
compiler_openmp = ${_VE_OPENMP_COMPILER_OPENMP}
compiler_openmp_simd = ${_VE_OPENMP_COMPILER_OPENMP_SIMD}
# List of extension methods
libs = ${BH_OPENMP_LIBS}
# The pre-fuser to use ('none' or 'lossy')
pre_fuser = lossy
# List of instruction fusers/transformers (comma-separated)
fuser_list = greedy, collapse_redundant_axes
# Number of edges in the fusion graph that makes the greedy fuser use the `reshapable_first` fuser instead
greedy_threshold = 10000
# *_as_var specifies whether to hard-code variables or have them as variables
index_as_var = true
strides_as_var = true
const_as_var = true
# Monolithic combines all blocks into one shared library rather than a block-nest per shared library
monolithic = false

[opencl]
# The 'opencl' engine is used by the opencl and proxy_opencl stacks in [stacks], each time followed by 'openmp'.
# Path to this component's shared library (install prefix, library dir and suffix are template placeholders)
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_ve_opencl${CMAKE_SHARED_LIBRARY_SUFFIX}
verbose = false
# Profiling statistics
prof = false
# NOTE(review): left empty by default; presumably the file the profiling statistics are written to -- confirm.
prof_filename =
# Write a Graphviz graph for each kernel
graph = false
# Directory for temporary files (e.g. /tmp/). Default: NONE, which is `boost::filesystem::temp_directory_path()`
tmp_dir = NONE
# Directory for cache files (persistent between executions). Default: NONE, which disables the cache
cache_dir = ${BIN_KERNEL_CACHE_DIR}
# Maximum number of cache files to keep in the cache dir (use -1 for infinity)
cache_file_max = 50000
# Set to true if no files should be written to the cache. When combining Bohrium and MPI, use this option to avoid
# write conflicts by only having rank zero write to the cache dir.
cache_readonly = false
# Set the size limit of the malloc cache as a percentage of the total GPU memory.
# NB: if the device is a CPU, only 10% of the total memory will be used.
malloc_cache_limit = 90
# Bohrium sorts all found devices by type ('gpu', 'cpu', or 'accelerator'). Set the device number to the device
# Bohrium should use (0 means first). PS: use `python -m bohrium_api --info` to get the available devices.
device_number = 0
# Additional options given to the OpenCL compiler. See the documentation for clBuildProgram.
# NOTE(review): the key name suggests an include directory rather than general compiler options -- confirm
# which of the two this value actually is.
compiler_inc_dir = "${VE_OPENCL_COMPILER_INC_DIR}"
# List of extension methods
libs = ${BH_OPENCL_LIBS}
# The pre-fuser to use ('none' or 'lossy')
pre_fuser = lossy
# List of instruction fusers/transformers (comma-separated)
fuser_list = greedy, push_reductions_inwards, split_for_threading, collapse_redundant_axes
# Number of edges in the fusion graph that makes the greedy fuser use the `reshapable_first` fuser instead
greedy_threshold = 10000
# *_as_var specifies whether to hard-code variables or have them as variables
index_as_var = true
strides_as_var = true
const_as_var = true
# OpenCL work group sizes (one value per axis for 1d, 2d and 3d kernels, judging by the key suffixes)
work_group_size_1dx = 128
work_group_size_2dx = 32
work_group_size_2dy = 4
work_group_size_3dx = 32
work_group_size_3dy = 2
work_group_size_3dz = 2
# Maximum number of threads to use (use 0 for infinity)
num_threads = 0
# Use round robin instead of block parallelization when limiting the number of threads.
num_threads_round_robin = false
# Optimize instructions for GPU access (column-major)
to_col_major = false

[cuda]
# The 'cuda' engine is used by the cuda and proxy_cuda stacks in [stacks], each time followed by 'openmp'.
# Path to this component's shared library (install prefix, library dir and suffix are template placeholders)
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_ve_cuda${CMAKE_SHARED_LIBRARY_SUFFIX}
verbose = false
# Profiling statistics
prof = false
# NOTE(review): left empty by default; presumably the file the profiling statistics are written to -- confirm.
prof_filename =
# Write a Graphviz graph for each kernel
graph = false
# Directory for temporary files (e.g. /tmp/). Default: NONE, which is `boost::filesystem::temp_directory_path()`
tmp_dir = NONE
# Directory for cache files (persistent between executions). Default: NONE, which disables the cache
cache_dir = ${BIN_KERNEL_CACHE_DIR}
# Maximum number of cache files to keep in the cache dir (use -1 for infinity)
cache_file_max = 50000
# Set to true if no files should be written to the cache. When combining Bohrium and MPI, use this option to avoid
# write conflicts by only having rank zero write to the cache dir.
cache_readonly = false
# Set the size limit of the malloc cache as a percentage of the total GPU memory.
malloc_cache_limit = 90
# The command to execute the compiler where {OUT} is replaced with the binary file output and {IN} with the source file
# Additionally, {MAJOR} and {MINOR} are dynamically replaced with the compute capability version of the device
# NOTE(review): the include flags come from the VE_OPENMP_COMPILER_INC placeholder (the OpenMP engine's variable),
# not from a CUDA-specific one -- confirm that this reuse is intended.
compiler_cmd = "${CUDA_NVCC_EXECUTABLE} --cubin -m64 -arch=sm_{MAJOR}{MINOR} -O3 --disable-warnings ${VE_OPENMP_COMPILER_INC} {IN} -o {OUT}"
# List of extension methods
libs = ${BH_CUDA_LIBS}
# The pre-fuser to use ('none' or 'lossy')
pre_fuser = lossy
# List of instruction fusers/transformers (comma-separated)
fuser_list = greedy, push_reductions_inwards, split_for_threading, collapse_redundant_axes
# Number of edges in the fusion graph that makes the greedy fuser use the `reshapable_first` fuser instead
greedy_threshold = 10000
# *_as_var specifies whether to hard-code variables or have them as variables
index_as_var = true
strides_as_var = true
const_as_var = true
# CUDA work group sizes (one value per axis for 1d, 2d and 3d kernels, judging by the key suffixes)
work_group_size_1dx = 128
work_group_size_2dx = 32
work_group_size_2dy = 4
work_group_size_3dx = 32
work_group_size_3dy = 2
work_group_size_3dz = 2
# Optimize instructions for GPU access (column-major)
to_col_major = false