From 97b5aa3adc003f82db0e88fd97c8622fa955b007 Mon Sep 17 00:00:00 2001
From: ChrisGadek1
Date: Thu, 5 Mar 2026 23:56:43 +0100
Subject: [PATCH] Add HPC CPU scripts and update README with usage instructions

---
 README.md                                   | 49 ++++++++++
 initialize_hpc.sh                           | 59 ------------
 initialize_hpc_helios.sh                    | 56 ------------
 run_hpc.sh                                  | 22 -----
 scripts/hpc/cpu/common_slurm_cpu.sh         | 91 +++++++++++++++++++
 .../hpc/cpu/initialize_slurm_cpu.sh         | 21 ++++-
 scripts/hpc/cpu/launch_workers_slurm_cpu.sh | 16 ++++
 scripts/hpc/cpu/run_slurm_cpu.sh            | 13 +++
 8 files changed, 186 insertions(+), 141 deletions(-)
 delete mode 100644 initialize_hpc.sh
 delete mode 100644 initialize_hpc_helios.sh
 delete mode 100644 run_hpc.sh
 create mode 100644 scripts/hpc/cpu/common_slurm_cpu.sh
 rename install.sh => scripts/hpc/cpu/initialize_slurm_cpu.sh (62%)
 create mode 100644 scripts/hpc/cpu/launch_workers_slurm_cpu.sh
 create mode 100644 scripts/hpc/cpu/run_slurm_cpu.sh

diff --git a/README.md b/README.md
index 4e7f268..d4f655f 100644
--- a/README.md
+++ b/README.md
@@ -83,4 +83,53 @@ PYTHONPATH='.' python3 -u ${FRIDATA_PATH}/fridata.py \
 -i ${IDS_PATH} \
 --input-path ${AFDB_PATH} \
 -e ${EMBEDDER_TYPE}
+```
+
+## Running on HPC
+
+Running FRIdata on HPC differs between CPU and GPU nodes. These instructions apply to HPC clusters hosted in the PLGrid infrastructure; running on other infrastructures may require additional adjustments.
+
+### CPU
+
+Prerequisites:
+- An active grant valid on the HPC
+- All mandatory ENV vars set (ideally in `.bashrc`):
+  - `DEEPFRI_PATH`: the parent directory of this repo
+  - `IDS_PATH`: path to a text file listing AFDB indexes
+  - `AFDB_PATH`: path to AFDB structures (can be an empty directory; structures will be fetched into it)
+  - `DATA_PATH`: path to the parent directory of all generated output data
+  - Optional ENV vars with default values:
+    - `COMMON_SLURM_PATH`: path to common_slurm_cpu.sh, defaults to `$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/common_slurm_cpu.sh`
+    - `LAUNCH_WORKER_SLURM_PATH`: path to launch_workers_slurm_cpu.sh, defaults to `$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/launch_workers_slurm_cpu.sh`
+    - `MEMORY_LIMIT`: memory limit per Dask worker, defaults to `288GiB`
+    - `IP_INTERFACE`: network interface the Dask workers connect over, defaults to `ens1f0`
+    - `CONDA_ENV_PATH`: path to the conda environment, defaults to `$DEEPFRI_PATH/conda_dev`
+- The miniconda3 module available on the cluster
+- The gcc module available on the cluster
+
+Steps:
+
+1. Clone the repo
+
+```
+git clone https://github.com/Tomasz-Lab/FRIdata.git
+cd FRIdata
+```
+
+2. Make the scripts executable
+
+```
+chmod u+x -R scripts/hpc/cpu
+```
+
+3. Run `initialize_slurm_cpu.sh` with the `--cpu` flag, passing as an argument the path to the directory where the `.conda` directory should be installed
+
+```
+./scripts/hpc/cpu/initialize_slurm_cpu.sh <path-to-conda-parent-dir> --cpu
+```
+
+4. Submit the sbatch script to the HPC with all required arguments specified
+
+```
+sbatch --cpus-per-task=<cpus> --time=<time> --nodes=<nodes> --account=<account> scripts/hpc/cpu/run_slurm_cpu.sh
 ```
\ No newline at end of file
diff --git a/initialize_hpc.sh b/initialize_hpc.sh
deleted file mode 100644
index 4c275ab..0000000
--- a/initialize_hpc.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-GROUP_DIR="$PLG_GROUPS_STORAGE/plggdeepfri2"
-
-set -x
-
-if [[ ! -d "deepfri" ]]
-then
-    mkdir deepfri
-fi
-
-cd deepfri
-PROJECT_DIR="$GROUP_DIR/deepfri"
-DATA_DIR="$PROJECT_DIR/dev_data"
-mkdir "$DATA_DIR"
-
-module load python/3.10
-module load miniconda3
-
-ENV_PATH="$PROJECT_DIR/dev_env"
-EMBEDDING_ENV_PATH="$PROJECT_DIR/embedding_env"
-
-if [[ -d "deepFRI2-toolbox-dev" ]]
-then
-    cd "deepFRI2-toolbox-dev"
-    git pull
-
-    source activate $ENV_PATH
-
-else
-    git clone https://github.com/youngdashu/deepFRI2-toolbox-dev.git
-    cd "deepFRI2-toolbox-dev"
-    echo "DATA_PATH=$PLG_GROUPS_STORAGE/plggdeepfri2/$DATA_DIR" > .env
-    echo "SEPARATOR=-" >> .env
-
-    CONDA_DIR="$GROUP_DIR/.conda"
-    mkdir -p "$CONDA_DIR"
-    conda config --add pkgs_dirs "$CONDA_DIR"
-
-    conda env create --prefix $ENV_PATH --file "dev_env_conda.yml"
-
-    conda config --set auto_activate_base false
-
-    source activate $ENV_PATH
-
-    pip install bio~=1.7.0
-    pip install foldcomp~=0.0.7
-
-    conda deactivate
-
-    git submodule init
-    git submodule update
-
-fi
-
-
-
-
-
-
-
diff --git a/initialize_hpc_helios.sh b/initialize_hpc_helios.sh
deleted file mode 100644
index 3ade729..0000000
--- a/initialize_hpc_helios.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-GROUP_DIR="$PLG_GROUPS_STORAGE/plggsano/tomaszlab"
-
-set -x
-set -e
-
-if [[ ! -d "deepfri" ]]
-then
-    mkdir deepfri
-fi
-
-cd deepfri
-
-DATA_DIR="deepfri/dev_data"
-mkdir "$GROUP_DIR/$DATA_DIR"
-
-module load GCCcore
-module load git
-module load Miniconda3
-eval "$(conda shell.bash hook)"
-
-
-ENV_PATH="$GROUP_DIR/deepfri/dev_env"
-
-if [[ -d "deepFRI2-toolbox-dev" ]]
-then
-    cd "deepFRI2-toolbox-dev"
-    git pull
-
-    conda activate $ENV_PATH
-
-else
-    git clone https://github.com/youngdashu/deepFRI2-toolbox-dev.git
-    cd "deepFRI2-toolbox-dev"
-    echo "DATA_PATH=$GROUP_DIR/$DATA_DIR" > .env
-    echo "SEPARATOR=-" >> .env
-
-    CONDA_DIR="$GROUP_DIR/.conda"
-    mkdir -p "$CONDA_DIR"
-    conda config --add pkgs_dirs "$CONDA_DIR"
-
-    conda env create --prefix $ENV_PATH --file "dev_env_conda.yml"
-
-    conda config --set auto_activate_base false
-
-    conda activate $ENV_PATH
-
-    pip install bio~=1.7.0
-    pip install foldcomp~=0.0.7
-
-    # cd $GROUP_DIR/deepfri
-
-    # conda create -n embedding_env python=3.9 pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia
-
-fi
-
-PYTHONPATH='.' python3 -u ./toolbox/initialize_repository.py
diff --git a/run_hpc.sh b/run_hpc.sh
deleted file mode 100644
index 5a75c63..0000000
--- a/run_hpc.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-#SBATCH --ntasks=1
-#SBATCH --cpus-per-task=10
-#SBATCH --time=2:00:00
-
-#SBATCH -p plgrid
-
-#SBATCH -A plgdf2storage-cpu
-
-
-cd $PLG_GROUPS_STORAGE/plggdeepfri2/
-
-cd deepfri
-
-module load miniconda3
-conda activate ./dev_env
-
-cd ./deepFRI2-toolbox-dev
-
-#PYTHONPATH='.' python3 -u toolbox/monitoring/scrap-metrics.py &>scrapping_logs.txt &
-
-PYTHONPATH='.' python3 ./fridata.py -d AFDB -c part -t afdb_swissprot_v4
diff --git a/scripts/hpc/cpu/common_slurm_cpu.sh b/scripts/hpc/cpu/common_slurm_cpu.sh
new file mode 100644
index 0000000..ded8c6b
--- /dev/null
+++ b/scripts/hpc/cpu/common_slurm_cpu.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+# Define the start_computation function
+start_computation() {
+    # Check if a command was provided
+    if [ -z "$1" ]; then
+        echo "Error: No Python command provided."
+        exit 1
+    fi
+
+    export DASK_LOGGING__DISTRIBUTED="WARNING"
+
+    # Store the provided Python command
+    local python_command="$1"
+
+    echo 'IP'
+    if [[ ! -v IP_INTERFACE ]]; then
+        IP_INTERFACE='ens1f0'
+    fi
+    ip a sh dev $IP_INTERFACE | grep -oP '(?<=inet )\S+' | cut -d'/' -f1
+
+    cd $DEEPFRI_PATH
+
+    module load gcc
+    module load miniconda3
+    eval "$(conda shell.bash hook)"
+    conda activate $CONDA_ENV_PATH
+
+    echo "Start time: `date`"
+    start_time=$(date +%s)
+
+    cd ./FRIdata
+
+    nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+    nodes_array=($nodes)
+
+    if [ -e $DEEPFRI_PATH/scheduler.json ]; then
+        rm $DEEPFRI_PATH/scheduler.json
+    fi
+
+    dask scheduler --scheduler-file $DEEPFRI_PATH/scheduler.json --preload ./toolbox/worker_setup.py &
+
+    while [[ ! -e $DEEPFRI_PATH/scheduler.json ]]; do
+        sleep 10
+    done
+
+    if [[ ! -v LAUNCH_WORKER_SLURM_PATH ]]; then
+        LAUNCH_WORKER_SLURM_PATH="$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/launch_workers_slurm_cpu.sh"
+    fi
+
+    chmod +x $LAUNCH_WORKER_SLURM_PATH
+
+    $LAUNCH_WORKER_SLURM_PATH $SLURM_CPUS_PER_TASK ${nodes_array[0]} &
+
+    echo "Head node workers"
+
+    worker_num=$((SLURM_JOB_NUM_NODES - 1))
+
+    for ((i = 1; i <= worker_num; i++)); do
+        node_i=${nodes_array[$i]}
+        srun -w "$node_i" -c $SLURM_CPUS_PER_TASK $LAUNCH_WORKER_SLURM_PATH $SLURM_CPUS_PER_TASK $node_i &
+        echo "$node_i started srun workers"
+    done
+
+    # Record start time
+    start_time=$(date +%s)
+
+    echo "eval python command"
+
+    # Execute the provided Python command
+    eval "$python_command"
+
+    end_time=$(date +%s)
+    echo "End time: `date`"
+
+    duration=$((end_time - start_time))
+
+    # Convert seconds to hours, minutes, and seconds
+    hours=$((duration / 3600))
+    minutes=$(( (duration % 3600) / 60 ))
+    seconds=$((duration % 60))
+
+    # Print the formatted duration
+    printf "Computation time: %02d:%02d:%02d\n" $hours $minutes $seconds
+}
+
+# Check if this script is being run directly and not sourced
+if [ "${BASH_SOURCE[0]}" == "${0}" ]; then
+    # Call the function to execute the script with arguments
+    start_computation "$@"
+fi
\ No newline at end of file
diff --git a/install.sh b/scripts/hpc/cpu/initialize_slurm_cpu.sh
similarity index 62%
rename from install.sh
rename to scripts/hpc/cpu/initialize_slurm_cpu.sh
index 0b1700c..da999b1 100644
--- a/install.sh
+++ b/scripts/hpc/cpu/initialize_slurm_cpu.sh
@@ -24,23 +24,36 @@ if [ -z "$GROUP_DIR" ]; then
   exit 1
 fi
 
+# Check if DEEPFRI_PATH is set, if not then throw an error
+if [[ ! -v DEEPFRI_PATH ]]; then
+  echo "Error: DEEPFRI_PATH environment variable is not set."
+  exit 1
+fi
+
+# Check if CONDA_ENV_PATH is set, if not then set a default path
+if [[ ! -v CONDA_ENV_PATH ]]; then
+  CONDA_ENV_PATH="$DEEPFRI_PATH/conda_dev"
+fi
+
 CONDA_DIR="$GROUP_DIR/.conda"
+
+module load miniconda3
 conda config --add pkgs_dirs "$CONDA_DIR"
 
 # Create environment from base YAML (without PyTorch)
-conda env create --prefix $ENV_PATH --file "toolbox_env_conda.yml"
+conda env create --prefix $CONDA_ENV_PATH --file "$DEEPFRI_PATH/FRIdata/toolbox_env_conda.yml"
 
 conda config --set auto_activate_base false
 
-source activate $ENV_PATH
+source activate $CONDA_ENV_PATH
 
 # Install PyTorch based on mode
 if [ "$CPU_ONLY" = true ]; then
     echo "Installing CPU-only PyTorch..."
-    conda install -y pytorch cpuonly -c pytorch -c conda-forge
+    conda install -y pytorch cpuonly -c pytorch
 else
     echo "Installing GPU-enabled PyTorch..."
-    conda install -y pytorch-gpu -c conda-forge
+    conda install -y pytorch-gpu
 fi
 
 # Install ESM (requires PyTorch to be installed first)
diff --git a/scripts/hpc/cpu/launch_workers_slurm_cpu.sh b/scripts/hpc/cpu/launch_workers_slurm_cpu.sh
new file mode 100644
index 0000000..989e06f
--- /dev/null
+++ b/scripts/hpc/cpu/launch_workers_slurm_cpu.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+export DASK_LOGGING__DISTRIBUTED="WARNING"
+
+WORKERS_COUNT=$(($1))
+if [[ ! -v MEMORY_LIMIT ]]; then
+    MEMORY_LIMIT='288GiB'
+fi
+
+mkdir -p $SCRATCH/slurm_jobdir/$SLURM_JOB_ID/dask-workers
+
+local_dir=$SCRATCH/slurm_jobdir/$SLURM_JOB_ID/dask-workers/$2
+
+mkdir $local_dir
+
+dask worker --scheduler-file $DEEPFRI_PATH/scheduler.json --nworkers $WORKERS_COUNT --nthreads 1 --memory-limit $MEMORY_LIMIT --local-directory $local_dir --preload $DEEPFRI_PATH/FRIdata/toolbox/worker_setup.py
\ No newline at end of file
diff --git a/scripts/hpc/cpu/run_slurm_cpu.sh b/scripts/hpc/cpu/run_slurm_cpu.sh
new file mode 100644
index 0000000..20fdda0
--- /dev/null
+++ b/scripts/hpc/cpu/run_slurm_cpu.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+EMBEDDER_TYPE=esm2_t33_650M_UR50D
+
+if [[ ! -v COMMON_SLURM_PATH ]]; then
+    COMMON_SLURM_PATH="$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/common_slurm_cpu.sh"
+fi
+
+source $COMMON_SLURM_PATH
+
+PYTHON_COMMAND="PYTHONPATH='.' python3 -u ${DEEPFRI_PATH}/FRIdata/fridata.py input_generation -t sequences,coordinates -d AFDB -c subset --overwrite --version 1_test_dask -i ${IDS_PATH} --input-path ${AFDB_PATH} -e ${EMBEDDER_TYPE} --slurm --verbose"
+
+start_computation "$PYTHON_COMMAND"
\ No newline at end of file
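
A note on the README's Prerequisites and step 4 above: the sketch below shows what the environment setup and job submission could look like end to end. It is illustrative only; every path, the account name, and the resource numbers are placeholders, not values taken from this repository.

```
# Hypothetical values -- adjust to your own PLGrid grant and storage layout.
export DEEPFRI_PATH="$PLG_GROUPS_STORAGE/plggexample/deepfri"  # parent directory of the FRIdata checkout
export IDS_PATH="$DEEPFRI_PATH/afdb_ids.txt"                   # text file listing AFDB indexes
export AFDB_PATH="$DEEPFRI_PATH/afdb_structures"               # may start empty; structures are fetched here
export DATA_PATH="$DEEPFRI_PATH/output"                        # parent directory for generated output data

# Optional overrides (defaults: 288GiB, ens1f0, $DEEPFRI_PATH/conda_dev).
export MEMORY_LIMIT="180GiB"
export IP_INTERFACE="eth0"

# Submit run_slurm_cpu.sh; resources and the account go on the sbatch command line,
# since the script itself carries no #SBATCH directives.
sbatch --cpus-per-task=48 --time=12:00:00 --nodes=2 \
       --account=plgexample-cpu \
       "$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/run_slurm_cpu.sh"
```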
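
Separately, common_slurm_cpu.sh only defines start_computation (it runs it when executed directly), so run_slurm_cpu.sh is just one possible entry point: any SBATCH script can source it and hand a different command to start_computation, which starts the Dask scheduler, launches workers on every allocated node, and finally evals that command. A minimal sketch of such a wrapper, reusing the fridata.py flags already present in run_slurm_cpu.sh; the file name custom_run_slurm_cpu.sh and the --version value are made up for illustration.

```
#!/bin/bash
# custom_run_slurm_cpu.sh -- hypothetical wrapper around common_slurm_cpu.sh

EMBEDDER_TYPE=esm2_t33_650M_UR50D

# Fall back to the default location documented in the README.
if [[ ! -v COMMON_SLURM_PATH ]]; then
    COMMON_SLURM_PATH="$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/common_slurm_cpu.sh"
fi

source "$COMMON_SLURM_PATH"

# Same input_generation invocation as run_slurm_cpu.sh; only the run label differs.
PYTHON_COMMAND="PYTHONPATH='.' python3 -u ${DEEPFRI_PATH}/FRIdata/fridata.py input_generation -t sequences,coordinates -d AFDB -c subset --overwrite --version my_custom_run -i ${IDS_PATH} --input-path ${AFDB_PATH} -e ${EMBEDDER_TYPE} --slurm --verbose"

start_computation "$PYTHON_COMMAND"
```

Such a wrapper would be submitted the same way as run_slurm_cpu.sh, e.g. `sbatch --cpus-per-task=<cpus> --time=<time> --nodes=<nodes> --account=<account> custom_run_slurm_cpu.sh`.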