diff --git a/README.md b/README.md
index 644c481..d338504 100644
--- a/README.md
+++ b/README.md
@@ -90,8 +90,6 @@ PYTHONPATH='.' python3 -u ${FRIDATA_PATH}/fridata.py \
 Running FRIdata on HPC differs on CPU and GPU nodes. This instruction set is valid for HPC hosted in PLGrid infrastructure.
 Running on other infrastructures may require additional adjustments.
 
-### CPU
-
 Prerequisites:
 - Having active grant valid on the HPC
 - Having a full list of mandatory ENV vars set (ideally in .bashrc):
@@ -100,8 +98,8 @@ Prerequisites:
   - `AFDB_PATH`: path to AFDB structures (can be empty directory - structures will be fetched there)
   - `DATA_PATH`: path to the parent diretory of all generated output data
 - Optional ENV vars with default values:
-  - `COMMON_SLURM_PATH`: path to common_slurn_cpu.sh, defaults to `$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/common_slurm_cpu.sh`
-  - `LAUNCH_WORKER_SLURM_PATH`: path to launch_worker_slurm_cpu.sh, defaults to `$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/launch_workers_slurm_cpu.sh`
+  - `COMMON_SLURM_PATH`: path to common_slurm.sh, defaults to `$DEEPFRI_PATH/FRIdata/scripts/hpc/common_slurm.sh`
+  - `LAUNCH_WORKER_SLURM_PATH`: path to launch_workers_slurm.sh, defaults to `$DEEPFRI_PATH/FRIdata/scripts/hpc/launch_workers_slurm.sh`
   - `MEMORY_LIMIT`: memory limit per Dask worker, defaults to `288GiB`
   - `IP_INTERFACE`: network unix interface, where dask workers are connected. Defaults to `ens1f0`
   - `CONDA_ENV_PATH`: path to conda environment, defaults to `$DEEPFRI_PATH/conda_dev`
@@ -123,14 +121,21 @@ cd FRIdata
 chmod u+x -R scripts/hpc/cpu
 ```
 
-3. Run `initialize_slurm_cpu.sh`. As an argument put the path into directory, where `.conda` directory should be installed and specify `--cpu` flag
+3. Run `initialize_slurm.sh`. Pass the path to the directory where the `.conda` directory should be installed as an argument, and add the `--cpu` flag if the script is run on a CPU cluster.
 
 ```
-./scripts/hpc/cpu/initialize_slurm_cpu.sh --cpu
+./scripts/hpc/initialize_slurm.sh [--cpu]
 ```
 
-4. Schedule SBatch script into the HPC with all the args specified
+4. Submit the sbatch script to the HPC with all required arguments specified. Available operations are: `sequences`, `coordinates`, `embeddings`.
+
+
+For CPU:
+```
+sbatch --cpus-per-task= --time= --nodes= --account= scripts/hpc/run_slurm.sh sequences,coordinates
+```
+For GPU:
 ```
-sbatch --cpus-per-task= --time= --nodes= --account= scripts/hpc/cpu/run_slurm_cpu.sh
+sbatch --gres=gpu[:gpu-number] --time= --account= --nodes=1 --partition= --cpus-per-task= scripts/hpc/run_slurm.sh embeddings
 ```
\ No newline at end of file
diff --git a/scripts/hpc/cpu/common_slurm_cpu.sh b/scripts/hpc/common_slurm.sh
similarity index 64%
rename from scripts/hpc/cpu/common_slurm_cpu.sh
rename to scripts/hpc/common_slurm.sh
index ded8c6b..5d83e7f 100644
--- a/scripts/hpc/cpu/common_slurm_cpu.sh
+++ b/scripts/hpc/common_slurm.sh
@@ -21,8 +21,39 @@ start_computation() {
 
     cd $DEEPFRI_PATH
 
-    module load gcc
-    module load miniconda3
+    # Robustly try to load GCC and a Conda/Miniconda module (handle varied names)
+    LOADED_GCC=false
+    LOADED_CONDA=false
+    if command -v module >/dev/null 2>&1; then
+        GCC_CANDIDATES=(gcc GCC)
+        for MOD in "${GCC_CANDIDATES[@]}"; do
+            if module load "$MOD" >/dev/null 2>&1; then
+                echo "Loaded module: $MOD"
+                LOADED_GCC=true
+                break
+            fi
+        done
+
+        CONDA_CANDIDATES=(miniconda3 Miniconda3 miniconda Anaconda3 anaconda3)
+        for MOD in "${CONDA_CANDIDATES[@]}"; do
+            if module load "$MOD" >/dev/null 2>&1; then
+                echo "Loaded module: $MOD"
+                LOADED_CONDA=true
+                break
+            fi
+        done
+    fi
+
+    if [ "$LOADED_GCC" = false ]; then
+        echo "Error: Could not load a GCC module."
+        exit 1
+    fi
+
+    if [ "$LOADED_CONDA" = false ]; then
+        echo "Error: Could not load a Conda module."
+        exit 1
+    fi
+
     eval "$(conda shell.bash hook)"
     conda activate $CONDA_ENV_PATH
 
@@ -45,12 +76,12 @@ start_computation() {
     done
 
     if [[ ! -v LAUNCH_WORKER_SLURM_PATH ]]; then
-        LAUNCH_WORKER_SLURM_PATH="$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/launch_workers_slurm_cpu.sh"
+        LAUNCH_WORKER_SLURM_PATH="$DEEPFRI_PATH/FRIdata/scripts/hpc/launch_workers_slurm.sh"
     fi
 
-    chmod +x $LAUNCH_WORKER_SLURM_PATH/launch_workers_slurm_cpu.sh
+    chmod +x $LAUNCH_WORKER_SLURM_PATH
 
-    $LAUNCH_WORKER_SLURM_PATH/launch_workers_slurm_cpu.sh $SLURM_CPUS_PER_TASK ${nodes_array[0]} &
+    $LAUNCH_WORKER_SLURM_PATH $SLURM_CPUS_PER_TASK ${nodes_array[0]} &
 
     echo "Head node workers"
 
@@ -58,7 +89,7 @@
     for ((i = 1; i <= worker_num; i++)); do
        node_i=${nodes_array[$i]}
 
-        srun -w "$node_i" -c $SLURM_CPUS_PER_TASK $LAUNCH_WORKER_SLURM_PATH/launch_workers_slurm_cpu.sh $SLURM_CPUS_PER_TASK $node_i &
+        srun -w "$node_i" -c $SLURM_CPUS_PER_TASK $LAUNCH_WORKER_SLURM_PATH $SLURM_CPUS_PER_TASK $node_i &
 
        echo "$node_i started srun workers"
    done
diff --git a/scripts/hpc/cpu/run_slurm_cpu.sh b/scripts/hpc/cpu/run_slurm_cpu.sh
deleted file mode 100644
index 281fffa..0000000
--- a/scripts/hpc/cpu/run_slurm_cpu.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-EMBEDDER_TYPE=esm2_t33_650M_UR50D
-
-if [[ ! -v COMMON_SLURM_PATH ]]; then
-    COMMON_SLURM_PATH="$DEEPFRI_PATH/FRIdata/scripts/hpc/cpu/common_slurm_cpu.sh"
-fi
-
-source $COMMON_SLURM_PATH/common_slurm_cpu.sh
-
-PYTHON_COMMAND="PYTHONPATH='.' python3 -u ${DEEPFRI_PATH}/FRIdata/fridata.py generate_data -t sequences,coordinates -d AFDB -c subset --overwrite --version 1_test_dask -i ${IDS_PATH} --input-path ${AFDB_PATH} -e ${EMBEDDER_TYPE} --slurm --verbose"
-
-start_computation "$PYTHON_COMMAND"
\ No newline at end of file
diff --git a/scripts/hpc/cpu/initialize_slurm_cpu.sh b/scripts/hpc/initialize_slurm.sh
similarity index 70%
rename from scripts/hpc/cpu/initialize_slurm_cpu.sh
rename to scripts/hpc/initialize_slurm.sh
index da999b1..96c733d 100644
--- a/scripts/hpc/cpu/initialize_slurm_cpu.sh
+++ b/scripts/hpc/initialize_slurm.sh
@@ -37,7 +37,24 @@ fi
 
 CONDA_DIR="$GROUP_DIR/.conda"
 
-module load miniconda3
+# Try loading a Conda/Miniconda module in a robust way (handle varied names)
+LOADED_MODULE=false
+if command -v module >/dev/null 2>&1; then
+    MODULE_CANDIDATES=(miniconda3 Miniconda3 miniconda Anaconda3 anaconda3)
+    for MOD in "${MODULE_CANDIDATES[@]}"; do
+        if module load "$MOD" >/dev/null 2>&1; then
+            echo "Loaded module: $MOD"
+            LOADED_MODULE=true
+            break
+        fi
+    done
+fi
+
+if [ "$LOADED_MODULE" = false ]; then
+    echo "Error: Could not load a Conda module."
+    exit 1
+fi
+
 conda config --add pkgs_dirs "$CONDA_DIR"
 
 # Create environment from base YAML (without PyTorch)
@@ -45,7 +62,8 @@ conda env create --prefix $CONDA_ENV_PATH --file "$DEEPFRI_PATH/FRIdata/toolbox_
 
 conda config --set auto_activate_base false
 
-source activate $CONDA_ENV_PATH
+eval "$(conda shell.bash hook)"
+conda activate $CONDA_ENV_PATH
 
 # Install PyTorch based on mode
 if [ "$CPU_ONLY" = true ]; then
diff --git a/scripts/hpc/cpu/launch_workers_slurm_cpu.sh b/scripts/hpc/launch_workers_slurm.sh
similarity index 100%
rename from scripts/hpc/cpu/launch_workers_slurm_cpu.sh
rename to scripts/hpc/launch_workers_slurm.sh
diff --git a/scripts/hpc/run_slurm.sh b/scripts/hpc/run_slurm.sh
new file mode 100644
index 0000000..849e7ae
--- /dev/null
+++ b/scripts/hpc/run_slurm.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+EMBEDDER_TYPE=esm2_t33_650M_UR50D
+
+if [[ ! -v COMMON_SLURM_PATH ]]; then
+    COMMON_SLURM_PATH="$DEEPFRI_PATH/FRIdata/scripts/hpc/common_slurm.sh"
+fi
+
+source $COMMON_SLURM_PATH
+
+PYTHON_COMMAND="PYTHONPATH='.' python3 -u ${DEEPFRI_PATH}/FRIdata/fridata.py input_generation -t $1 -d AFDB -c subset --overwrite --version 1_test_dask -i ${IDS_PATH} --input-path ${AFDB_PATH} -e ${EMBEDDER_TYPE} --slurm --verbose"
+
+start_computation "$PYTHON_COMMAND"
\ No newline at end of file
diff --git a/toolbox/worker_setup.py b/toolbox/worker_setup.py
index 79a3b86..978a138 100644
--- a/toolbox/worker_setup.py
+++ b/toolbox/worker_setup.py
@@ -10,6 +10,10 @@ dotenv.load_dotenv()
 
 data_path = os.getenv("DATA_PATH")
-data_path = pathlib.Path(data_path).parent / "fridata"
 sys.path.append(str(data_path))
+
+here = pathlib.Path(__file__).resolve()
+repo_root = here.parents[1]
+if str(repo_root) not in sys.path:
+    sys.path.insert(0, str(repo_root))
\ No newline at end of file
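
Side note on the module-probing logic added above: `common_slurm.sh` and `initialize_slurm.sh` repeat the same candidate loop. The sketch below only illustrates that pattern factored into one reusable Bash helper; the name `try_load_module` is hypothetical and is not part of this diff or the repository.

```
#!/bin/bash
# Illustrative sketch only (not part of this diff): the module-probing pattern
# from common_slurm.sh and initialize_slurm.sh as a single helper function.
# The name try_load_module is hypothetical.
try_load_module() {
    # Try each candidate module name in turn; succeed on the first that loads.
    local candidate
    for candidate in "$@"; do
        if module load "$candidate" >/dev/null 2>&1; then
            echo "Loaded module: $candidate"
            return 0
        fi
    done
    return 1
}

# Usage mirroring the scripts above, assuming the `module` command is available:
if command -v module >/dev/null 2>&1; then
    try_load_module gcc GCC || { echo "Error: Could not load a GCC module."; exit 1; }
    try_load_module miniconda3 Miniconda3 miniconda Anaconda3 anaconda3 \
        || { echo "Error: Could not load a Conda module."; exit 1; }
fi
```

Passing the candidate names as arguments lets the GCC and Conda probes share one code path while each call keeps its own error message.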