Hello Team Pyfr,
This is in regards to my thesis work.
I have to run the T106C test case for my work, but the simulation is taking longer than expected for a warm-up of 100 time units at order 1.
The following are some initial test runs on CPU, done before I decide on a suitable number of partitions:
Increasing or decreasing the number of partitions or the memory for the job did not give me much speed-up.
Attached are my Slurm batch file, output file and INI file for the 32-partition run.
Could you please tell me if I am doing something wrong, and how to rectify it?
Thank you in advance.
Regards,
Manu.
<Slurm batch file>
#!/usr/local_rwth/bin/zsh
### Slurm batch script: launches a PyFR run with the OpenMP backend on the
### 32-partition mesh T106C-32p.pyfrm using the configuration T106C-R80K.ini.
### All #SBATCH directives must stay above the first executable command.

### E-mail notifications; the address line is disabled by the extra '##'.
###SBATCH --mail-user=
#SBATCH --mail-type=ALL

### Wall-clock limit: 0 days, 30 minutes.
#SBATCH -t 0-00:30:00

### The number of tasks must equal the number of MPI ranks (= number of mesh partitions).
#SBATCH --ntasks=32

### For best performance and to avoid over-threading, the number of MPI ranks (= number of
### partitions) should equal the number of sockets. Most nodes on the RWTH cluster have
### 2 sockets, so 16 nodes are requested for 32 MPI ranks.
#SBATCH -N 16

### With the OpenMP backend each MPI rank is itself multi-threaded, so all CPUs of a socket
### are requested. Most nodes on the RWTH cluster have 48 CPUs: 16 * 48 = 768 CPUs in total,
### i.e. 768 / 32 tasks = 24 CPUs per task.
#SBATCH --cpus-per-task=24

### Memory per allocated CPU (Slurm interprets a bare number as megabytes).
#SBATCH --mem-per-cpu=3900

### Job name and output file (%J expands to the job id).
#SBATCH -J T106C
#SBATCH --output=output.%J.txt

### Make conda available in this non-interactive shell.
export CONDA_ROOT=$HOME/miniconda3
. $CONDA_ROOT/etc/profile.d/conda.sh
export PATH="$CONDA_ROOT/bin:$PATH"

### Location of the libxsmm shared library for PyFR.
export PYFR_XSMM_LIBRARY_PATH=/home/vgXXXXX/miniconda3/envs/env3.10/libxsmm/lib/libxsmm.so

### Debug switch, kept disabled. Shell comments need '#': a leading ';' is only a command
### separator, so the former ';export ...' line was not a comment and the variable could
### still be exported. Judging by its name it turns off the OpenMP backend's kernel cache.
#export PYFR_DEBUG_OMP_DISABLE_CACHE=true

conda activate env3.10

### NOTE(review): these modules are loaded after 'conda activate'; if the python module
### prepends its own bin directory to PATH it may shadow the conda environment's python/pyfr.
### Check with 'which python' and 'which pyfr' inside a job.
module load python
module load gcc

### NOTE(review): OMP_NUM_THREADS is not set here, so the thread count per rank is whatever
### the OpenMP runtime derives from the allocation - confirm it matches --cpus-per-task.

### Alternative launcher, kept for reference:
###mpiexec -n 32 pyfr run -b openmp -p T106C-32p.pyfrm T106C-R80K.ini

### Launch through the site-provided MPI wrapper variables.
$MPIEXEC $FLAGS_MPI_BATCH pyfr run -b openmp -p T106C-32p.pyfrm T106C-R80K.ini
<INI file>
; PyFR configuration attached to the post (presumably the T106C-R80K.ini used by the batch script).
; Backend-wide settings: single-precision arithmetic, linear rank allocation.
[backend]
precision = single
rank-allocator = linear
; OpenMP backend settings; cc selects gcc as the C compiler.
[backend-openmp]
cc = gcc
; The two options below are commented out and therefore inactive.
;gimmik-max-nnz = 100000000000
;cblas-type=parallel
; Named constants; units / non-dimensionalisation are not stated in this file.
[constants]
gamma = 1.4
Pr = 0.71
mu = 0.0000075321
cpTref = 3.20
cpTs = 1.29
; Navier-Stokes system at order 1 with the Sutherland viscosity correction.
[solver]
system = navier-stokes
order = 1
viscosity-correction = sutherland
; Anti-aliasing is commented out and therefore inactive.
;anti-alias = flux, surf-flux
; rk45 scheme with a PI controller, running from t = 0 to t = 100
; (the 100-time-unit warm-up described in the post).
[solver-time-integrator]
formulation = std
scheme = rk45
controller = pi
tstart = 0.0
; NOTE(review): with controller = pi this is presumably only the initial step size - confirm.
dt = 0.01
tend = 100
; Absolute and relative tolerances, presumably the error targets of the PI controller.
atol = 0.00001
rtol = 0.000001
; Interface treatment: Rusanov Riemann solver plus the LDG parameters beta and tau.
[solver-interfaces]
riemann-solver = rusanov
ldg-beta = 0.5
ldg-tau = 0.1
; Point sets for quadrilateral interfaces.
; NOTE(review): quad-deg / quad-pts here and in the element sections below presumably only
; take effect when anti-aliasing is enabled, which it is not in [solver] - confirm.
[solver-interfaces-quad]
flux-pts = gauss-legendre
quad-deg = 11
quad-pts = gauss-legendre
; Point sets for quadrilateral elements.
[solver-elements-quad]
soln-pts = gauss-legendre
quad-deg = 11
quad-pts = gauss-legendre
; Point sets for hexahedral elements (same choices as for quadrilaterals).
[solver-elements-hex]
soln-pts = gauss-legendre
quad-deg = 11
quad-pts = gauss-legendre
; Solution writer: output every 5 time units into the Results directory.
[soln-plugin-writer]
dt-out = 5
basedir = Results
; {t:.2f} is a format placeholder, presumably the output time with two decimals.
basename = T106C_R80-P1-{t:.2f}
region = *
; Boundary "blade": type no-slp-adia-wall (presumably a no-slip adiabatic wall).
[soln-bcs-blade]
type = no-slp-adia-wall
; Boundary "inlet": type sub-in-ftpttang with pt, cpTt and the angles theta and phi.
; NOTE(review): the angles are presumably in degrees - confirm.
[soln-bcs-inlet]
type = sub-in-ftpttang
pt = 1
cpTt = 3.5
theta = 32.7
phi = 90
; Boundary "outlet": type char-riem-inv with the state rho, u, v, w, p given below.
[soln-bcs-outlet]
type = char-riem-inv
rho = 0.8164437585
u = 0.3990132891
v = -0.6215270726
w = 0
p = 0.752
; Initial conditions: a uniform state in rho, u, v, w, p.
[soln-ics]
rho = 0.85
u = 0.32
v = 0
w = 0
p = 0.8
Slurm batch output