-
Notifications
You must be signed in to change notification settings - Fork 31
/
config.ini.in
211 lines (199 loc) · 8.35 KB
/
config.ini.in
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# Bohrium version: ${BH_VERSION_STRING}
##########################################################################
# Stack configurations, which are comma-separated lists of components.   #
# NB: 'stacks' is a reserved section name and 'default' #
# is used when 'BH_STACKS' is unset. #
# The bridge is never part of the list #
##########################################################################
[stacks]
# Each key is a stack name; its value is a comma-separated list of component names.
# Every component named here has a section of the same name further down in this file.
default = bcexp_cpu, bccon, node, openmp
openmp = bcexp_cpu, bccon, node, openmp
opencl = bcexp_gpu, bccon, node, opencl, openmp
cuda = bcexp_gpu, bccon, node, cuda, openmp
# NOTE(review): the proxy_opencl and proxy_cuda stacks use bcexp_cpu, whereas the opencl and cuda
# stacks use bcexp_gpu - confirm that this difference is intentional.
proxy_openmp = bcexp_cpu, bccon, proxy, node, openmp
proxy_opencl = bcexp_cpu, bccon, proxy, node, opencl, openmp
proxy_cuda = bcexp_cpu, bccon, proxy, node, cuda, openmp
############
# Managers #
############
[node]
# Installed path of the shared library for this component
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_vem_node${CMAKE_SHARED_LIBRARY_SUFFIX}
# NOTE(review): presumably enables timing output for this component - confirm
timing = false
[proxy]
# NOTE(review): presumably the host name and TCP port used for the proxy connection - confirm
address = localhost
port = 4200
# Installed path of the shared library for this component
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_vem_proxy${CMAKE_SHARED_LIBRARY_SUFFIX}
# NOTE(review): presumably a list of extension methods, as for the `libs` key of the engines - confirm
libs = ${BH_PROXY_LIBS}
#############################
# Filters - Helpers / Tools #
#############################
[pprint]
# Installed path of the shared library for this filter
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_pprint${CMAKE_SHARED_LIBRARY_SUFFIX}
###################################
# Filters - Bytecode transformers #
###################################
[bccon]
# Installed path of the shared library for this filter
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_bccon${CMAKE_SHARED_LIBRARY_SUFFIX}
# NOTE(review): the flags below presumably switch individual bytecode transformations on or off;
# their exact semantics are not documented in this file - confirm against the filter's source.
collect = true
stupidmath = true
muladd = true
reduction = false
find_repeats = false
# NOTE(review): presumably diagnostic switches (timing and verbose output) - confirm
timing = false
verbose = false
[bcexp_cpu]
# Installed path of the shared library for this filter (the same library as used by [bcexp_gpu])
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_bcexp${CMAKE_SHARED_LIBRARY_SUFFIX}
# NOTE(review): presumably switches for individual expansions - semantics not documented here; confirm
powk = true
sign = false
repeat = false
# NOTE(review): [bcexp_gpu] sets this key to 32000; presumably 0 disables the transformation - confirm
reduce1d = 0
timing = false
verbose = false
[bcexp_gpu]
# Installed path of the shared library for this filter (the same library as used by [bcexp_cpu])
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_bcexp${CMAKE_SHARED_LIBRARY_SUFFIX}
# NOTE(review): presumably switches for individual expansions - semantics not documented here; confirm
powk = true
sign = false
repeat = false
# Transform 1d reductions into 2d reductions by array reshaping
# NOTE(review): the unit and meaning of the number (presumably a size threshold) are not stated here - confirm
reduce1d = 32000
timing = false
verbose = false
[noneremover]
# Installed path of the shared library for this filter
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_filter_noneremover${CMAKE_SHARED_LIBRARY_SUFFIX}
# NOTE(review): presumably diagnostic switches (timing and verbose output) - confirm
timing = false
verbose = false
###########
# Engines #
###########
[openmp]
# Installed path of the shared library for this engine
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_ve_openmp${CMAKE_SHARED_LIBRARY_SUFFIX}
verbose = false
# Profiling statistics
prof = false
# NOTE(review): presumably the file that the profiling statistics are written to (empty by default) - confirm
prof_filename =
# Write a Graphviz graph for each kernel
graph = false
# Directory for temporary files (e.g. /tmp/). Default: NONE, which means `boost::filesystem::temp_directory_path()`
tmp_dir = NONE
# Directory for cache files (persistent between executions). Default: NONE, which disables the cache
cache_dir = ${BIN_KERNEL_CACHE_DIR}
# Maximum number of cache files to keep in the cache dir (use -1 for infinity)
cache_file_max = 50000
# Set to true if no files should be written to the cache. When combining Bohrium and MPI, use this option to avoid
# write conflicts by having only rank zero write to the cache dir.
cache_readonly = false
# Set the size limit of the malloc cache as a percentage of the unused system memory.
# NB: if the amount of unused memory cannot be determined, 20% of the total system memory is used.
malloc_cache_limit = 80
# The command to execute the compiler where {OUT} is replaced with the binary file output and {IN} with the source file
compiler_cmd = "${VE_OPENMP_COMPILER_CMD} ${VE_OPENMP_COMPILER_FLG} ${VE_OPENMP_COMPILER_INC} {IN} -o {OUT}"
# JIT compile options
compiler_openmp = ${_VE_OPENMP_COMPILER_OPENMP}
compiler_openmp_simd = ${_VE_OPENMP_COMPILER_OPENMP_SIMD}
# List of extension methods
libs = ${BH_OPENMP_LIBS}
# The pre-fuser to use ('none' or 'lossy')
pre_fuser = lossy
# List of instruction fusers/transformers
fuser_list = greedy, collapse_redundant_axes
# Number of edges in the fusion graph that makes the greedy fuser use the `reshapable_first` fuser instead
greedy_threshold = 10000
# *_as_var specifies whether to hard-code variables or have them as variables
index_as_var = true
strides_as_var = true
const_as_var = true
# Monolithic combines all blocks into one shared library rather than one shared library per block-nest
monolithic = false
[opencl]
# Installed path of the shared library for this engine
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_ve_opencl${CMAKE_SHARED_LIBRARY_SUFFIX}
verbose = false
# Profiling statistics
prof = false
# NOTE(review): presumably the file that the profiling statistics are written to (empty by default) - confirm
prof_filename =
# Write a Graphviz graph for each kernel
graph = false
# Directory for temporary files (e.g. /tmp/). Default: NONE, which means `boost::filesystem::temp_directory_path()`
tmp_dir = NONE
# Directory for cache files (persistent between executions). Default: NONE, which disables the cache
cache_dir = ${BIN_KERNEL_CACHE_DIR}
# Maximum number of cache files to keep in the cache dir (use -1 for infinity)
cache_file_max = 50000
# Set to true if no files should be written to the cache. When combining Bohrium and MPI, use this option to avoid
# write conflicts by having only rank zero write to the cache dir.
cache_readonly = false
# Set the size limit of the malloc cache as a percentage of the total GPU memory.
# NB: if the device is a CPU, only 10% of the total memory will be used.
malloc_cache_limit = 90
# Bohrium sorts all found devices by type ('gpu', 'cpu', or 'accelerator'). Set the device number to the device
# Bohrium should use (0 means first). PS: use `python -m bohrium_api --info` to get the available devices.
device_number = 0
# Additional options given to the OpenCL compiler. See documentation for clBuildProgram
compiler_inc_dir = "${VE_OPENCL_COMPILER_INC_DIR}"
# List of extension methods
libs = ${BH_OPENCL_LIBS}
# The pre-fuser to use ('none' or 'lossy')
pre_fuser = lossy
# List of instruction fusers/transformers
fuser_list = greedy, push_reductions_inwards, split_for_threading, collapse_redundant_axes
# Number of edges in the fusion graph that makes the greedy fuser use the `reshapable_first` fuser instead
greedy_threshold = 10000
# *_as_var specifies whether to hard-code variables or have them as variables
index_as_var = true
strides_as_var = true
const_as_var = true
# OpenCL work group sizes
work_group_size_1dx = 128
work_group_size_2dx = 32
work_group_size_2dy = 4
work_group_size_3dx = 32
work_group_size_3dy = 2
work_group_size_3dz = 2
# Maximum number of threads to use (use 0 for infinity)
num_threads = 0
# Use round robin instead of block parallelization when limiting the number of threads.
num_threads_round_robin = false
# Optimize instructions for GPU access (column-major)
to_col_major = false
[cuda]
# Installed path of the shared library for this engine
impl = ${CMAKE_INSTALL_PREFIX}/${LIBDIR}/libbh_ve_cuda${CMAKE_SHARED_LIBRARY_SUFFIX}
verbose = false
# Profiling statistics
prof = false
# NOTE(review): presumably the file that the profiling statistics are written to (empty by default) - confirm
prof_filename =
# Write a Graphviz graph for each kernel
graph = false
# Directory for temporary files (e.g. /tmp/). Default: NONE, which means `boost::filesystem::temp_directory_path()`
tmp_dir = NONE
# Directory for cache files (persistent between executions). Default: NONE, which disables the cache
cache_dir = ${BIN_KERNEL_CACHE_DIR}
# Maximum number of cache files to keep in the cache dir (use -1 for infinity)
cache_file_max = 50000
# Set to true if no files should be written to the cache. When combining Bohrium and MPI, use this option to avoid
# write conflicts by having only rank zero write to the cache dir.
cache_readonly = false
# Set the size limit of the malloc cache as a percentage of the total GPU memory.
malloc_cache_limit = 90
# The command to execute the compiler where {OUT} is replaced with the binary file output and {IN} with the source file
# Additionally, {MAJOR} and {MINOR} are dynamically replaced with the compute capability version of the device
# NOTE(review): the include flags are taken from the OpenMP engine's VE_OPENMP_COMPILER_INC variable rather than
# from a CUDA-specific one - confirm that sharing this variable is intentional.
compiler_cmd = "${CUDA_NVCC_EXECUTABLE} --cubin -m64 -arch=sm_{MAJOR}{MINOR} -O3 --disable-warnings ${VE_OPENMP_COMPILER_INC} {IN} -o {OUT}"
# List of extension methods
libs = ${BH_CUDA_LIBS}
# The pre-fuser to use ('none' or 'lossy')
pre_fuser = lossy
# List of instruction fusers/transformers
fuser_list = greedy, push_reductions_inwards, split_for_threading, collapse_redundant_axes
# Number of edges in the fusion graph that makes the greedy fuser use the `reshapable_first` fuser instead
greedy_threshold = 10000
# *_as_var specifies whether to hard-code variables or have them as variables
index_as_var = true
strides_as_var = true
const_as_var = true
# CUDA work group sizes
work_group_size_1dx = 128
work_group_size_2dx = 32
work_group_size_2dy = 4
work_group_size_3dx = 32
work_group_size_3dy = 2
work_group_size_3dz = 2
# Optimize instructions for GPU access (column-major)
to_col_major = false