# KU Leuven/UHasselt Tier-2 High Performance Computing Infrastructure (VSC)
NB: You will need an account to use the HPC cluster to run the pipeline.
- Install Nextflow on the cluster:

```bash
conda create --name nf-core python=3.12 nf-core nextflow
```
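A quick way to verify the installation before going further (a sketch; both tools print their versions if the environment resolved correctly):

```bash
conda activate nf-core
nextflow -version
nf-core --version
```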
:::note
A Nextflow module is available and can be loaded with `module load Nextflow`, but it does not support plugins, so it is not recommended.
:::
- Set up the environment variables in `~/.bashrc` or `~/.bash_profile`:
:::note
If you have access to dedicated nodes, you can export these as a comma-separated list. These queues will only be used if a task's requirements cannot be met in the normal partitions but are available in the dedicated partitions. AMD is considered a dedicated partition.
:::
```bash
export SLURM_ACCOUNT="<your-credential-account>"

# Comma-separated list of available dedicated partitions (if any)
# For example: export VSC_DEDICATED_QUEUES="dedicated_big_bigmem,dedicated_big_gpu"
export VSC_DEDICATED_QUEUES="<available-dedicated-partitions>"

# Needed for running Nextflow jobs
export NXF_HOME="$VSC_SCRATCH/.nextflow"
export NXF_WORK="$VSC_SCRATCH/work"

# Needed for running Apptainer containers
export APPTAINER_CACHEDIR="$VSC_SCRATCH/.apptainer/cache"
export APPTAINER_TMPDIR="$VSC_SCRATCH/.apptainer/tmp"
export NXF_CONDA_CACHEDIR="$VSC_SCRATCH/miniconda3/envs"

# Optional Tower access token
# export TOWER_ACCESS_TOKEN="<your_tower_access_token>"
# export NXF_VER="<version>" # make sure it's 24.10.1 or later
```
:::warning
The current config is set up with array jobs. Make sure your Nextflow version is >= 24.10.1 (see the Nextflow documentation on array jobs). You can pin the version with:

```bash
export NXF_VER=24.10.1
```
:::
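Most of these directories are created on demand, but pre-creating them on scratch is harmless and makes a first run more predictable (a minimal sketch, assuming `$VSC_SCRATCH` is set by the cluster environment):

```bash
# Pre-create the scratch-backed cache and work directories
mkdir -p "$VSC_SCRATCH"/{.nextflow,work,.apptainer/cache,.apptainer/tmp}
# Reload the shell configuration so the exports take effect in this session
source ~/.bashrc
```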
- Make the submission script.
NB: you should go to the cluster you want to run the pipeline on. You can check what clusters have the most free space using following command
sinfo --cluster wice|genius
.
```bash
$ more job.slurm
#!/bin/bash -l
#SBATCH --account=...
#SBATCH --chdir=...
#SBATCH --partition=batch_long
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1

# module load Nextflow # does not support plugins
conda activate nf-core
nextflow run <pipeline> -profile vsc_kul_uhasselt,<CLUSTER> <Add your other parameters>
```
NB: You have to specify your credential account by setting

```bash
export SLURM_ACCOUNT="<your-credential-account>"
```

otherwise the jobs will fail!
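A defensive check you could add near the top of the submission script to fail fast (illustrative only, not part of the profile):

```bash
# Abort early if the credential account was not exported
[[ -n "$SLURM_ACCOUNT" ]] || { echo "SLURM_ACCOUNT is not set" >&2; exit 1; }
```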
Here the cluster options are:

- `genius`
- `genius_gpu`
- `wice`
- `wice_gpu`
- `superdome`
NB: The `vsc_kul_uhasselt` profile is based on a selected set of SLURM partitions. The profile will, to the best of its ability, select the most appropriate partition for each job. Modules with a label containing `gpu` will be allocated to a GPU partition even when the 'normal' `genius` profile is selected. Select the `genius_gpu` or `wice_gpu` profile to force jobs onto a GPU partition.

NB: If a module does not have `accelerator` set, the profile determines the number of GPUs from the requested resources (see the `clusterOptions` closures in the config below).
Use the `--cluster` option to specify the cluster you intend to use when submitting the job:

```bash
sbatch --cluster=wice|genius job.slurm
```
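To follow the submitted jobs on the chosen cluster (assuming a Slurm version recent enough to support `--me`):

```bash
squeue --cluster=wice --me   # or --cluster=genius
```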
All of the intermediate files required to run the pipeline are stored in the `work/` directory. It is recommended to delete this directory after the pipeline has finished successfully, as it can get quite large; all of the main output files are saved in the `results/` directory anyway.
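For example, once the run has finished successfully (assuming `NXF_WORK` points at `$VSC_SCRATCH/work` as exported above):

```bash
# Reclaim scratch space; results/ holds the published outputs
rm -rf "$VSC_SCRATCH/work"
```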
## Config file

```groovy
// Default to /tmp directory if $VSC_SCRATCH scratch env is not available,
// see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
def SCRATCH_DIR = System.getenv("VSC_SCRATCH") ?: "/tmp"
def TIER2_PROJECT = System.getenv("SLURM_ACCOUNT") ?: null
def DEDICATED_QUEUES = System.getenv("VSC_DEDICATED_QUEUES") ?: ""
def AVAILABLE_QUEUES = DEDICATED_QUEUES.toString().split(',')
// Perform work directory cleanup when the run has successfully completed
// cleanup = true
// Reduce the job submission rate to about 50 per minute so the scheduler
// is not flooded with jobs, and limit queueSize to avoid submit timeouts
executor {
submitRateLimit = '50/1min'
queueSize = 50
exitReadTimeout = "10min"
}
// Add backoff strategy to catch cluster timeouts and proper symlinks of files in scratch to the work directory
process {
executor = 'slurm'
stageInMode = "symlink"
stageOutMode = "rsync"
errorStrategy = { sleep(Math.pow(2, task.attempt ?: 1) * 200 as long); return 'retry' }
maxRetries = 3
array = 30
}
// Specify that singularity should be used and where the cache dir will be for the images
singularity {
enabled = true
autoMounts = true
cacheDir = "$SCRATCH_DIR/.singularity"
pullTimeout = "30 min"
}
params {
config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be'
config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
}
env {
APPTAINER_TMPDIR="$SCRATCH_DIR/.apptainer/tmp"
APPTAINER_CACHEDIR="$SCRATCH_DIR/.apptainer/cache"
}
// AWS maximum retries for errors (This way the pipeline doesn't fail if the download fails one time)
aws {
maxErrorRetry = 3
}
/*
* Queue Selection Utility Functions for HPC Environments
* ==================================================
* This module provides functions to determine appropriate HPC queues based on task requirements
* for both GENIUS and WICE clusters.
*/
/*
* Constants:
* ----------
* TIME_THRESHOLD: 72 hours - Threshold for determining long-running jobs
* MEMORY_THRESHOLD (GENIUS): 175GB - Memory threshold for bigmem queues
* MEMORY_THRESHOLD (WICE): 239GB - Memory threshold for high-memory queues
*/
def TIME_THRESHOLD = 72.h
def MEMORY_THRESHOLD_GENIUS = 175.GB
def MEMORY_THRESHOLD_WICE = 239.GB
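// Worked example: a GENIUS task requesting 200.GB for 80.h exceeds both
// thresholds above, so determineGeniusQueue routes it to 'dedicated_big_bigmem'
// when that queue is available, and to 'bigmem_long' otherwise.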
/*
* ---------
* Functions:
* ----------
* These functions are designed to select the appropriate HPC queues of
* VSC_KUL_UHASSELT based on task requirements. They handle both standard
* and GPU queues, considering memory requirements, execution time, and
* queue availability.
*/
/*
* limitTaskTime(time, maxTime)
* Ensures task time doesn't exceed the maximum allowed time
* @param time Current task time
* @param maxTime Maximum allowed time
* @return Limited task time
*/
def limitTaskTime(time, maxTime) {
return time > maxTime ? maxTime : time
}
/*
* determineGeniusQueue(task)
* Selects appropriate CPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements
*/
def determineGeniusQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_GENIUS
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedBigmem = AVAILABLE_QUEUES.contains('dedicated_big_bigmem')
if (isHighMemory) {
return isLongRunning ?
(hasDedicatedBigmem ? 'dedicated_big_bigmem' : 'bigmem_long') :
'bigmem'
}
return isLongRunning ? 'batch_long' : 'batch'
}
/*
* determineGeniusGpuQueue(task)
* Selects appropriate GPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineGeniusGpuQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_GENIUS
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedGpu = AVAILABLE_QUEUES.contains('dedicated_rega_gpu')
def hasAmdGpu = AVAILABLE_QUEUES.contains('amd')
if (isHighMemory) {
return isLongRunning ? 'gpu_v100_long' : 'gpu_v100'
}
if (isLongRunning) {
if (hasDedicatedGpu) return 'dedicated_rega_gpu'
if (hasAmdGpu) return 'amd_long'
return 'gpu_p100_long'
}
return hasAmdGpu ? 'amd' : 'gpu_p100'
}
/*
* determineWiceQueue(task)
* Selects appropriate CPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements and availability
*/
def determineWiceQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedQueue = AVAILABLE_QUEUES.contains('dedicated_big_bigmem')
if (isHighMemory) {
if (isLongRunning && hasDedicatedQueue) {
return 'dedicated_big_bigmem'
}
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
return 'bigmem,hugemem'
}
return isLongRunning ?
'batch_long,batch_icelake_long,batch_sapphirerapids_long' :
'batch,batch_sapphirerapids,batch_icelake'
}
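// Worked example: a WICE task requesting 100.GB for 24.h is below both
// thresholds and is submitted to 'batch,batch_sapphirerapids,batch_icelake'.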
/*
* determineWiceGpuQueue(task)
* Selects appropriate GPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineWiceGpuQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedQueue = isHighMemory ?
AVAILABLE_QUEUES.contains('dedicated_big_gpu_h100') :
AVAILABLE_QUEUES.contains('dedicated_big_gpu')
if (isLongRunning && !hasDedicatedQueue) {
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
}
if (isHighMemory) {
return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu_h100' : 'gpu_h100'
}
return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
/*
* ========
* Profiles
* ========
* These profiles define the resource limits, queue selection, and cluster options
* for WICE and GENIUS clusters. They also include GPU-specific configurations.
* Details of the resource limits can be found in for genius at
* https://docs.vscentrum.be/leuven/tier2_hardware/genius_hardware.html
* and for wice at https://docs.vscentrum.be/leuven/tier2_hardware/wice_hardware.html
*/
// Define profiles for each cluster
profiles {
genius {
params.config_profile_description = 'genius profile for use on the genius cluster of the VSC HPC.'
process {
// 768 GB per node minus 65 GB overhead; scheduler maximum is 720000 MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
beforeScript = { 'module load cluster/genius/' + determineGeniusQueue(task).toString().split(',')[0] }
queue = { determineGeniusQueue(task) }
clusterOptions = {
determineGeniusQueue(task) =~ /dedicated/ ?
"--clusters=genius --account=lp_big_genius_cpu" :
"--clusters=genius --account=$TIER2_PROJECT"
}
withLabel: '.*gpu.*' {
resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ]
beforeScript = { 'module load cluster/genius/' + determineGeniusGpuQueue(task).toString().split(',')[0] }
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
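// No 'accelerator' set: fall back to one GPU per 9 requested CPUs (minimum 1)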
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
}
}
}
genius_gpu {
params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
process {
// 768 GB per node minus 65 GB overhead; scheduler maximum is 720000 MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h]
beforeScript = { 'module load cluster/genius/' + determineGeniusGpuQueue(task).toString().split(',')[0] }
queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
}
}
wice {
params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.'
process {
// Scheduler maximum is 2016000 MB
resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
beforeScript = { 'module load cluster/wice/' + determineWiceQueue(task).toString().split(',')[0] }
queue = { determineWiceQueue(task) }
clusterOptions = {
determineWiceQueue(task) =~ /dedicated/ ?
"--clusters=wice --account=lp_big_wice_cpu" :
"--clusters=wice --account=$TIER2_PROJECT"
}
withLabel: '.*gpu.*' {
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
beforeScript = { 'module load cluster/wice/' + determineWiceGpuQueue(task).toString().split(',')[0] }
queue = { determineWiceGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
def queueValue = determineWiceGpuQueue(task)
queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
}
wice_gpu {
params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
process {
// 768 GB per node minus 65 GB overhead; scheduler maximum is 720000 MB
beforeScript = { 'module load cluster/wice/' + determineWiceGpuQueue(task).toString().split(',')[0] }
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
queue = { determineWiceGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
def queueValue = determineWiceGpuQueue(task)
queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
superdome {
params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.'
process {
clusterOptions = {"--clusters=genius --account=$TIER2_PROJECT"}
beforeScript = 'module load cluster/genius/superdome'
// 6000 GB minus 228 GB overhead; scheduler maximum is 5910888 MB
resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h]
queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' }
}
}
}
```