49 changes: 49 additions & 0 deletions README.md
@@ -83,4 +83,53 @@ PYTHONPATH='.' python3 -u ${FRIDATA_PATH}/fridata.py \
-i ${IDS_PATH} \
--input-path ${AFDB_PATH} \
-e ${EMBEDDER_TYPE}
```

## Running on HPC

Running FRIdata on HPC differs between CPU and GPU nodes. These instructions apply to HPC clusters hosted on the PLGrid infrastructure; running on other infrastructures may require additional adjustments.

### CPU

Prerequisites:
- An active grant valid on the HPC
- All mandatory ENV vars set (ideally in `.bashrc`; see the example snippet after this list):
  - `DEEPFRI_PATH`: should always point to the parent directory of this repo
  - `IDS_PATH`: path to a text file listing AFDB indexes
  - `AFDB_PATH`: path to AFDB structures (can be an empty directory; structures will be fetched into it)
  - `DATA_PATH`: path to the parent directory of all generated output data
- Optional ENV vars with default values:
  - `COMMON_SLURM_PATH`: path to `common_slurm_cpu.sh`, defaults to `$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/common_slurm_cpu.sh`
  - `LAUNCH_WORKER_SLURM_PATH`: path to `launch_workers_slurm_cpu.sh`, defaults to `$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/launch_workers_slurm_cpu.sh`
  - `MEMORY_LIMIT`: memory limit per Dask worker, defaults to `288GiB`
  - `IP_INTERFACE`: network interface that Dask workers bind to, defaults to `ens1f0`
  - `CONDA_ENV_PATH`: path to the conda environment, defaults to `$DEEPFRI_PATH/conda_dev`
- The `miniconda3` module available on the cluster
- The `gcc` module available on the cluster
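
A minimal `.bashrc` sketch, assuming illustrative placeholder paths (adjust them to your cluster layout):

```
# Example ~/.bashrc snippet -- all paths are illustrative placeholders
export DEEPFRI_PATH="$HOME/deepfri"                # parent directory of the FRIdata repo
export IDS_PATH="$DEEPFRI_PATH/ids.txt"            # text file listing AFDB indexes
export AFDB_PATH="$DEEPFRI_PATH/afdb_structures"   # AFDB structures (fetched here if missing)
export DATA_PATH="$DEEPFRI_PATH/data"              # parent directory for generated output data

# Optional overrides (defaults shown)
export MEMORY_LIMIT="288GiB"
export IP_INTERFACE="ens1f0"
export CONDA_ENV_PATH="$DEEPFRI_PATH/conda_dev"
```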

Steps:

1. Clone the repo

```
git clone https://github.com/Tomasz-Lab/FRIdata.git
cd FRIdata
```

2. Make the CPU scripts executable

```
chmod u+x -R scripts/hpc/cpu
```

3. Run `initialize_slurm_cpu.sh`. Pass as an argument the path to the directory where the `.conda` directory should be created, and specify the `--cpu` flag

```
./scripts/hpc/cpu/initialize_slurm_cpu.sh <path to .conda> --cpu
```

4. Submit the sbatch script to the HPC scheduler with all arguments specified

```
sbatch --cpus-per-task=<cpus> --time=<HH:MM:SS> --nodes=<nodes> --account=<grant name> scripts/hpc/cpu/run_slurm_cpu.sh
```
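
For example (the values below are illustrative; choose them to match your grant and workload):

```
sbatch --cpus-per-task=48 --time=12:00:00 --nodes=4 --account=plgexample-cpu scripts/hpc/cpu/run_slurm_cpu.sh
```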
59 changes: 0 additions & 59 deletions initialize_hpc.sh

This file was deleted.

56 changes: 0 additions & 56 deletions initialize_hpc_helios.sh

This file was deleted.

22 changes: 0 additions & 22 deletions run_hpc.sh

This file was deleted.

91 changes: 91 additions & 0 deletions scripts/hpc/cpu/common_slurm_cpu.sh
@@ -0,0 +1,91 @@
#!/bin/bash

# Define the start_computation function
start_computation() {
# Check if a command was provided
if [ -z "$1" ]; then
echo "Error: No Python command provided."
exit 1
fi

export DASK_LOGGING__DISTRIBUTED="WARNING"

# Store the provided Python command
local python_command="$1"

# Print the IP address of the network interface that the Dask workers will use
echo 'IP'
if [[ ! -v IP_INTERFACE ]]; then
IP_INTERFACE='ens1f0'
fi
ip a sh dev $IP_INTERFACE | grep -oP '(?<=inet )\S+' | cut -d'/' -f1

cd $DEEPFRI_PATH

module load gcc
module load miniconda3
eval "$(conda shell.bash hook)"
conda activate $CONDA_ENV_PATH

echo "Start time: `date`"
start_time=$(date +%s)

cd ./FRIdata

nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

if [ -e $DEEPFRI_PATH/scheduler.json ]; then
rm $DEEPFRI_PATH/scheduler.json
fi

# Start the Dask scheduler on the head node; its connection info is written to scheduler.json
dask scheduler --scheduler-file $DEEPFRI_PATH/scheduler.json --preload ./toolbox/worker_setup.py &

# Wait until the scheduler has written its connection file
while [[ ! -e $DEEPFRI_PATH/scheduler.json ]]; do
sleep 10
done

if [[ ! -v LAUNCH_WORKER_SLURM_PATH ]]; then
LAUNCH_WORKER_SLURM_PATH="$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/launch_workers_slurm_cpu.sh"
fi

chmod +x $LAUNCH_WORKER_SLURM_PATH

# Launch Dask workers on the head node
$LAUNCH_WORKER_SLURM_PATH $SLURM_CPUS_PER_TASK ${nodes_array[0]} &

echo "Head node workers started"

# Launch Dask workers on the remaining nodes
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
node_i=${nodes_array[$i]}
srun -w "$node_i" -c $SLURM_CPUS_PER_TASK $LAUNCH_WORKER_SLURM_PATH $SLURM_CPUS_PER_TASK $node_i &
echo "$node_i started srun workers"
done

# Reset the start time so the reported duration covers only the Python command
start_time=$(date +%s)

echo "eval python command"

# Execute the provided Python command
eval "$python_command"

end_time=$(date +%s)
echo "End time: `date`"

duration=$((end_time - start_time))

# Convert seconds to hours, minutes, and seconds
hours=$((duration / 3600))
minutes=$(( (duration % 3600) / 60 ))
seconds=$((duration % 60))

# Print the formatted duration
printf "Computation time: %02d:%02d:%02d\n" $hours $minutes $seconds
}

# Check if this script is being run directly and not sourced
if [ "${BASH_SOURCE[0]}" == "${0}" ]; then
# Call the function to execute the script with arguments
start_computation "$@"
fi
21 changes: 17 additions & 4 deletions install.sh → scripts/hpc/cpu/initialize_slurm_cpu.sh
@@ -24,23 +24,36 @@ if [ -z "$GROUP_DIR" ]; then
exit 1
fi

# Check if DEEPFRI_PATH is set, if not then throw an error
if [[ ! -v DEEPFRI_PATH ]]; then
echo "Error: DEEPFRI_PATH environment variable is not set."
exit 1
fi

# Check if CONDA_ENV_PATH is set, if not then set a default path
if [[ ! -v CONDA_ENV_PATH ]]; then
CONDA_ENV_PATH="$DEEPFRI_PATH/conda_dev"
fi

CONDA_DIR="$GROUP_DIR/.conda"

module load miniconda3
conda config --add pkgs_dirs "$CONDA_DIR"

# Create environment from base YAML (without PyTorch)
conda env create --prefix $ENV_PATH --file "toolbox_env_conda.yml"
conda env create --prefix $CONDA_ENV_PATH --file "$DEEPFRI_PATH/FRIdata/toolbox_env_conda.yml"

conda config --set auto_activate_base false

source activate $ENV_PATH
source activate $CONDA_ENV_PATH

# Install PyTorch based on mode
if [ "$CPU_ONLY" = true ]; then
echo "Installing CPU-only PyTorch..."
conda install -y pytorch cpuonly -c pytorch -c conda-forge
conda install -y pytorch cpuonly -c pytorch
else
echo "Installing GPU-enabled PyTorch..."
conda install -y pytorch-gpu -c conda-forge
conda install -y pytorch-gpu
fi

# Install ESM (requires PyTorch to be installed first)
16 changes: 16 additions & 0 deletions scripts/hpc/cpu/launch_workers_slurm_cpu.sh
@@ -0,0 +1,16 @@
#!/bin/bash

export DASK_LOGGING__DISTRIBUTED="WARNING"

# Args: $1 = number of Dask workers to start on this node, $2 = node name
WORKERS_COUNT=$(($1))
if [[ ! -v MEMORY_LIMIT ]]; then
MEMORY_LIMIT='288GiB'
fi

mkdir -p $SCRATCH/slurm_jobdir/$SLURM_JOB_ID/dask-workers

# Per-node local directory for Dask worker scratch files
local_dir=$SCRATCH/slurm_jobdir/$SLURM_JOB_ID/dask-workers/$2

mkdir -p $local_dir

dask worker --scheduler-file $DEEPFRI_PATH/scheduler.json --nworkers $WORKERS_COUNT --nthreads 1 --memory-limit $MEMORY_LIMIT --local-directory $local_dir --preload $DEEPFRI_PATH/FRIdata/toolbox/worker_setup.py
13 changes: 13 additions & 0 deletions scripts/hpc/cpu/run_slurm_cpu.sh
@@ -0,0 +1,13 @@
#!/bin/bash
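# SBATCH entry point for CPU runs: sources common_slurm_cpu.sh and runs the FRIdata pipeline via start_computation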

EMBEDDER_TYPE=esm2_t33_650M_UR50D

if [[ ! -v COMMON_SLURM_PATH ]]; then
COMMON_SLURM_PATH="$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/common_slurm_cpu.sh"
fi

source $COMMON_SLURM_PATH

PYTHON_COMMAND="PYTHONPATH='.' python3 -u ${DEEPFRI_PATH}/FRIdata/fridata.py input_generation -t sequences,coordinates -d AFDB -c subset --overwrite --version 1_test_dask -i ${IDS_PATH} --input-path ${AFDB_PATH} -e ${EMBEDDER_TYPE} --slurm --verbose"

start_computation "$PYTHON_COMMAND"