diff --git a/.github/workflows/behave-cloudberry.yml b/.github/workflows/behave-cloudberry.yml new file mode 100644 index 00000000000..b78a80f4412 --- /dev/null +++ b/.github/workflows/behave-cloudberry.yml @@ -0,0 +1,744 @@ +# +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# -------------------------------------------------------------------- +# GitHub Actions Workflow: Apache Cloudberry Behave Pipeline +# -------------------------------------------------------------------- +# Description: +# +# This workflow runs Apache Cloudberry gpMgmt Behave tests on Rocky Linux 9. +# It is intentionally separated from the main build/installcheck workflow so +# that Behave-specific matrix expansion, environment setup, result parsing, +# and iterative test stabilization do not disturb the primary CI path. +# +# Workflow Overview: +# 1. **Prepare Behave Matrix**: +# - Expands the selected Behave command-level test matrix. +# - Supports manual filtering through `test_selection`. +# +# 2. **Build Job**: +# - Builds Apache Cloudberry and creates source/RPM artifacts for reuse +# within this workflow. +# +# 3. 
**Behave Job (Matrix)**: +# - Creates a demo cluster for each Behave matrix entry. +# - Runs the selected gpMgmt feature file(s) in isolation. +# - Parses Behave summaries and uploads logs/metadata artifacts. +# +# 4. **Report Job**: +# - Aggregates build and Behave job status into a final workflow summary. +# +# Execution Environment: +# - **Runs On**: ubuntu-22.04 with Rocky Linux 9 containers. +# - **Primary Test Scope**: `gpMgmt/test/behave/mgmt_utils` +# +# Notes: +# - Trigger mode: push, pull_request, and manual `workflow_dispatch`. +# - Behave tests are split by command to reduce cross-feature environment +# pollution. +# - This workflow currently focuses on single-host CI-compatible Behave tests. +# - Logs and parsed summaries are uploaded as artifacts for each matrix entry. +# -------------------------------------------------------------------- + +name: Apache Cloudberry Behave + +on: + push: + branches: [main, REL_2_STABLE] + pull_request: + branches: [main, REL_2_STABLE] + types: [opened, synchronize, reopened, edited] + workflow_dispatch: + inputs: + test_selection: + description: 'Select Behave tests to run (comma-separated). 
Examples: ic-behave-gpconfig,ic-behave-gpstart' + required: false + default: 'all' + type: string + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + +permissions: + contents: read + packages: read + actions: write + checks: read + pull-requests: read + +env: + LOG_RETENTION_DAYS: 7 + ENABLE_DEBUG: false + +jobs: + prepare-behave-matrix: + runs-on: ubuntu-22.04 + outputs: + behave-matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - id: set-matrix + # Hand the user-supplied selection to the script through the environment + # instead of expanding it into the script text. + env: + TEST_SELECTION: ${{ github.event.inputs.test_selection || 'all' }} + run: | + echo "=== Behave Matrix Preparation Diagnostics ===" + echo "Event type: ${{ github.event_name }}" + echo "Test selection input: '${TEST_SELECTION}'" + + ALL_BEHAVE_TESTS='{ + "include": [ + {"test":"ic-behave-analyzedb","behave_features":["test/behave/mgmt_utils/analyzedb.feature"]}, + {"test":"ic-behave-gp-bash-functions","behave_features":["test/behave/mgmt_utils/gp_bash_functions.feature"]}, + {"test":"ic-behave-gpactivatestandby","behave_features":["test/behave/mgmt_utils/gpactivatestandby.feature"]}, + {"test":"ic-behave-gpaddmirrors", + "behave_features":["test/behave/mgmt_utils/gpaddmirrors.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpcheckcat", + "behave_features":["test/behave/mgmt_utils/gpcheckcat.feature"], + "behave_args":"--tags ~@extended" + }, + {"test":"ic-behave-gpcheckperf", + "behave_features":["test/behave/mgmt_utils/gpcheckperf.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpconfig","behave_features":["test/behave/mgmt_utils/gpconfig.feature"]}, + {"test":"ic-behave-gpinitstandby", + "behave_features":["test/behave/mgmt_utils/gpinitstandby.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpinitsystem", + "behave_features":["test/behave/mgmt_utils/gpinitsystem.feature"], + "behave_args":"--tags ~@extended" + }, + {"test":"ic-behave-gpmovemirrors", + 
"behave_features":["test/behave/mgmt_utils/gpmovemirrors.feature"], + "behave_args":"--tags ~@concourse_cluster --tags ~@extended" + }, + {"test":"ic-behave-gprecoverseg", + "behave_features":["test/behave/mgmt_utils/gprecoverseg.feature"], + "behave_args":"--tags ~@concourse_cluster --tags ~@extended" + }, + {"test":"ic-behave-gpreload","behave_features":["test/behave/mgmt_utils/gpreload.feature"]}, + {"test":"ic-behave-gpstart", + "behave_features":["test/behave/mgmt_utils/gpstart.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpstate", + "behave_features":["test/behave/mgmt_utils/gpstate.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpstop","behave_features":["test/behave/mgmt_utils/gpstop.feature"]}, + {"test":"ic-behave-gpssh", + "behave_features":["test/behave/mgmt_utils/gpssh.feature"], + "behave_args":"--tags ~@requires_netem" + }, + {"test":"ic-behave-minirepro", + "behave_features":["test/behave/mgmt_utils/minirepro.feature"], + "behave_args":"--tags ~@extended" + }, + {"test":"ic-behave-replication-slots", + "behave_features":["test/behave/mgmt_utils/replication_slots.feature"], + "behave_args":"--tags ~@extended" + } + ] + }' + + VALID_TESTS=$(echo "$ALL_BEHAVE_TESTS" | jq -r '.include[].test') + IFS=',' read -ra SELECTED_TESTS <<< "${TEST_SELECTION}" + + if [[ "${SELECTED_TESTS[*]}" == "all" || -z "${SELECTED_TESTS[*]}" ]]; then + mapfile -t SELECTED_TESTS <<< "$VALID_TESTS" + fi + + INVALID_TESTS=() + FILTERED_TESTS=() + for TEST in "${SELECTED_TESTS[@]}"; do + TEST=$(echo "$TEST" | tr -d '[:space:]') + # Whole-line literal match: -w would also accept fragments such as "gpstart". + if echo "$VALID_TESTS" | grep -qxF -- "$TEST"; then + FILTERED_TESTS+=("$TEST") + else + INVALID_TESTS+=("$TEST") + fi + done + + if [[ ${#INVALID_TESTS[@]} -gt 0 ]]; then + echo "::error::Invalid Behave test(s) selected: ${INVALID_TESTS[*]}" + echo "Valid tests are: $(echo "$VALID_TESTS" | tr '\n' ', ')" + exit 1 + fi + + RESULT='{"include":[' + 
FIRST=true + for TEST in "${FILTERED_TESTS[@]}"; do + CONFIG=$(jq -c --arg test "$TEST" '.include[] | select(.test == $test)' <<< "$ALL_BEHAVE_TESTS") + if [[ "$FIRST" == true ]]; then + FIRST=false + else + RESULT="${RESULT}," + fi + RESULT="${RESULT}${CONFIG}" + done + RESULT="${RESULT}]}" + + echo "Final behave matrix configuration:" + echo "$RESULT" | jq . + + # Publish the matrix as single-line compact JSON so that no multi-line + # output delimiter is needed. + MATRIX_JSON=$(jq -c . <<< "$RESULT") + echo "matrix=${MATRIX_JSON}" >> "$GITHUB_OUTPUT" + + build: + name: Build Apache Cloudberry RPM + env: + JOB_TYPE: build + runs-on: ubuntu-22.04 + timeout-minutes: 120 + outputs: + build_timestamp: ${{ steps.set_timestamp.outputs.timestamp }} + container: + image: apache/incubator-cloudberry:cbdb-build-rocky9-latest + options: >- + --user root + -h cdw + -v /usr/share:/host_usr_share + -v /usr/local:/host_usr_local + -v /opt:/host_opt + steps: + - name: Free Disk Space + run: | + echo "=== Disk space before cleanup ===" + df -h / + + rm -rf /host_opt/hostedtoolcache || true + rm -rf /host_usr_local/lib/android || true + rm -rf /host_usr_share/dotnet || true + rm -rf /host_opt/ghc || true + rm -rf /host_usr_local/.ghcup || true + rm -rf /host_usr_share/swift || true + rm -rf /host_usr_local/share/powershell || true + rm -rf /host_usr_local/share/chromium || true + rm -rf /host_usr_share/miniconda || true + rm -rf /host_opt/az || true + rm -rf /host_usr_share/sbt || true + + echo "=== Disk space after cleanup ===" + df -h / + + - name: Set build timestamp + id: set_timestamp + run: | + timestamp=$(date +'%Y%m%d_%H%M%S') + echo "timestamp=$timestamp" | tee -a "$GITHUB_OUTPUT" + echo "BUILD_TIMESTAMP=$timestamp" | tee -a "$GITHUB_ENV" + + - name: Checkout Apache Cloudberry + uses: actions/checkout@v4 + with: + fetch-depth: 1 + submodules: true + + - name: Cloudberry Environment Initialization + env: + LOGS_DIR: build-logs + run: | + set -eo pipefail + if ! 
su - gpadmin -c "/tmp/init_system.sh"; then + echo "::error::Container initialization failed" + exit 1 + fi + + mkdir -p "${LOGS_DIR}/details" + chown -R gpadmin:gpadmin . + chmod -R 755 . + chmod 777 "${LOGS_DIR}" + + df -kh / + rm -rf /__t/* + df -kh / + + df -h | tee -a "${LOGS_DIR}/details/disk-usage.log" + free -h | tee -a "${LOGS_DIR}/details/memory-usage.log" + + { + echo "=== Environment Information ===" + uname -a + df -h + free -h + env + } | tee -a "${LOGS_DIR}/details/environment.log" + + echo "SRC_DIR=${GITHUB_WORKSPACE}" | tee -a "$GITHUB_ENV" + + - name: Run Apache Cloudberry configure script + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ENABLE_DEBUG=${{ env.ENABLE_DEBUG }} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh"; then + echo "::error::Configure script failed" + exit 1 + fi + + - name: Run Apache Cloudberry build script + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/build-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/build-cloudberry.sh"; then + echo "::error::Build script failed" + exit 1 + fi + + - name: Create Source tarball, create RPM and verify artifacts + env: + CBDB_VERSION: 99.0.0 + BUILD_NUMBER: 1 + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + tar czf "${SRC_DIR}"/../apache-cloudberry-incubating-src.tgz -C "${SRC_DIR}"/.. 
./cloudberry + mv "${SRC_DIR}"/../apache-cloudberry-incubating-src.tgz "${SRC_DIR}" + + rpmdev-setuptree + ln -s "${SRC_DIR}"/devops/build/packaging/rpm/apache-cloudberry-db-incubating.spec "${HOME}"/rpmbuild/SPECS/apache-cloudberry-db-incubating.spec + cp "${SRC_DIR}"/LICENSE /usr/local/cloudberry-db + + DEBUG_RPMBUILD_OPT="" + DEBUG_IDENTIFIER="" + if [ "${{ env.ENABLE_DEBUG }}" = "true" ]; then + DEBUG_RPMBUILD_OPT="--with-debug" + DEBUG_IDENTIFIER=".debug" + fi + + "${SRC_DIR}"/devops/build/packaging/rpm/build-rpm.sh --version "${CBDB_VERSION}" --release "${BUILD_NUMBER}" "${DEBUG_RPMBUILD_OPT}" + + os_version=$(grep -oP '(?<=^VERSION_ID=")[0-9]' /etc/os-release) + RPM_FILE="${HOME}"/rpmbuild/RPMS/x86_64/apache-cloudberry-db-incubating-"${CBDB_VERSION}"-"${BUILD_NUMBER}""${DEBUG_IDENTIFIER}".el"${os_version}".x86_64.rpm + cp "${RPM_FILE}" "${SRC_DIR}" + RPM_DEBUG="${HOME}"/rpmbuild/RPMS/x86_64/apache-cloudberry-db-incubating-debuginfo-"${CBDB_VERSION}"-"${BUILD_NUMBER}""${DEBUG_IDENTIFIER}".el"${os_version}".x86_64.rpm + cp "${RPM_DEBUG}" "${SRC_DIR}" + + - name: Upload build logs + uses: actions/upload-artifact@v4 + with: + name: behave-build-logs-${{ env.BUILD_TIMESTAMP }} + path: | + build-logs/ + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + - name: Upload Cloudberry RPM build artifacts + uses: actions/upload-artifact@v4 + with: + name: apache-cloudberry-db-incubating-rpm-build-artifacts + retention-days: ${{ env.LOG_RETENTION_DAYS }} + if-no-files-found: error + path: | + *.rpm + + - name: Upload Cloudberry source build artifacts + uses: actions/upload-artifact@v4 + with: + name: apache-cloudberry-db-incubating-source-build-artifacts + retention-days: ${{ env.LOG_RETENTION_DAYS }} + if-no-files-found: error + path: | + apache-cloudberry-incubating-src.tgz + + behave: + name: ${{ matrix.test }} + needs: [build, prepare-behave-matrix] + if: | + !cancelled() && + needs.build.result == 'success' + runs-on: ubuntu-22.04 + timeout-minutes: 120 + strategy: + 
fail-fast: false + matrix: ${{ fromJson(needs.prepare-behave-matrix.outputs.behave-matrix) }} + container: + image: apache/incubator-cloudberry:cbdb-build-rocky9-latest + options: >- + --privileged + --user root + --hostname cdw + --shm-size=2gb + --ulimit core=-1 + --cgroupns=host + -v /sys/fs/cgroup:/sys/fs/cgroup:rw + -v /usr/share:/host_usr_share + -v /usr/local:/host_usr_local + -v /opt:/host_opt + steps: + - name: Free Disk Space + run: | + echo "=== Disk space before cleanup ===" + df -h / + + rm -rf /host_opt/hostedtoolcache || true + rm -rf /host_usr_local/lib/android || true + rm -rf /host_usr_share/dotnet || true + rm -rf /host_opt/ghc || true + rm -rf /host_usr_local/.ghcup || true + rm -rf /host_usr_share/swift || true + rm -rf /host_usr_local/share/powershell || true + rm -rf /host_usr_local/share/chromium || true + rm -rf /host_usr_share/miniconda || true + rm -rf /host_opt/az || true + rm -rf /host_usr_share/sbt || true + + echo "=== Disk space after cleanup ===" + df -h / + + - name: Cloudberry Environment Initialization + env: + LOGS_DIR: build-logs + run: | + set -eo pipefail + if ! su - gpadmin -c "/tmp/init_system.sh"; then + echo "::error::Container initialization failed" + exit 1 + fi + + mkdir -p "${LOGS_DIR}/details" + chown -R gpadmin:gpadmin . + chmod -R 755 . + chmod 777 "${LOGS_DIR}" + + df -kh / + rm -rf /__t/* + df -kh / + + df -h | tee -a "${LOGS_DIR}/details/disk-usage.log" + free -h | tee -a "${LOGS_DIR}/details/memory-usage.log" + + { + echo "=== Environment Information ===" + uname -a + df -h + free -h + env + } | tee -a "${LOGS_DIR}/details/environment.log" + + echo "SRC_DIR=${GITHUB_WORKSPACE}" | tee -a "$GITHUB_ENV" + + # Ensure hostname resolves to IPv4, not ::1, so that gpinitsystem + # generates pg_hba.conf entries that gpstop/gpstart can use. 
+ echo '127.0.0.1 cdw' >> /etc/hosts + + - name: Generate Behave Job Summary Start + if: always() + run: | + { + echo "# Behave Job Summary: ${{ matrix.test }}" + echo "## Environment" + echo "- Start Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + echo "- OS Version: $(cat /etc/redhat-release)" + } >> "$GITHUB_STEP_SUMMARY" + + - name: Download Cloudberry RPM build artifacts + uses: actions/download-artifact@v4 + with: + name: apache-cloudberry-db-incubating-rpm-build-artifacts + path: ${{ github.workspace }}/rpm_build_artifacts + merge-multiple: false + run-id: ${{ github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download Cloudberry Source build artifacts + uses: actions/download-artifact@v4 + with: + name: apache-cloudberry-db-incubating-source-build-artifacts + path: ${{ github.workspace }}/source_build_artifacts + merge-multiple: false + run-id: ${{ github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Verify downloaded artifacts + id: verify-artifacts + run: | + set -eo pipefail + + SRC_TARBALL_FILE=$(ls "${GITHUB_WORKSPACE}"/source_build_artifacts/apache-cloudberry-incubating-src.tgz) + if [ ! -f "${SRC_TARBALL_FILE}" ]; then + echo "::error::SRC TARBALL file not found" + exit 1 + fi + echo "src_tarball_file=${SRC_TARBALL_FILE}" >> "$GITHUB_OUTPUT" + + RPM_FILE=$(ls "${GITHUB_WORKSPACE}"/rpm_build_artifacts/apache-cloudberry-db-incubating-[0-9]*.rpm | grep -v "debuginfo") + if [ ! -f "${RPM_FILE}" ]; then + echo "::error::RPM file not found" + exit 1 + fi + echo "rpm_file=${RPM_FILE}" >> "$GITHUB_OUTPUT" + + - name: Install Cloudberry RPM + if: success() + env: + RPM_FILE: ${{ steps.verify-artifacts.outputs.rpm_file }} + run: | + set -eo pipefail + + dnf clean all + dnf makecache --refresh || dnf makecache + rm -rf /usr/local/cloudberry-db + + if ! 
time dnf install -y --setopt=retries=10 --releasever=9 "${RPM_FILE}"; then + echo "::error::RPM installation failed" + exit 1 + fi + + rm -rf "${GITHUB_WORKSPACE}"/rpm_build_artifacts + + - name: Extract source tarball + if: success() + env: + SRC_TARBALL_FILE: ${{ steps.verify-artifacts.outputs.src_tarball_file }} + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + if ! time tar zxf "${SRC_TARBALL_FILE}" -C "${SRC_DIR}"/.. ; then + echo "::error::Source extraction failed" + exit 1 + fi + + rm -rf "${GITHUB_WORKSPACE}"/source_build_artifacts + + - name: Create Apache Cloudberry demo cluster + if: success() + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh + + if ! time su - gpadmin -c "cd ${SRC_DIR} && NUM_PRIMARY_MIRROR_PAIRS='3' SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh"; then + echo "::error::Demo cluster creation failed" + exit 1 + fi + + - name: Run Behave Tests + if: success() + env: + SRC_DIR: ${{ github.workspace }} + shell: bash {0} + run: | + set -o pipefail + + mkdir -p build-logs/details + config_log="build-logs/details/make-${{ matrix.test }}-config0.log" + behave_targets="${{ join(matrix.behave_features, ' ') }}" + behave_args="${{ matrix.behave_args || '' }}" + + mkdir -p "/tmp/cloudberry-cores" + chmod 1777 "/tmp/cloudberry-cores" + sysctl -w kernel.core_pattern="/tmp/cloudberry-cores/core-%e-%s-%u-%g-%p-%t" + + dnf install -y libffi-devel || echo "Warning: failed to install libffi-devel" + su - gpadmin -c "pip3 install --user -r ${SRC_DIR}/gpMgmt/requirements-dev.txt || pip install --user -r ${SRC_DIR}/gpMgmt/requirements-dev.txt" + + echo "Running features:" + for feature in $behave_targets; do + echo "- $feature" + done + if [[ -n "$behave_args" ]]; then + echo "Behave args: $behave_args" + fi + + if ! 
time su - gpadmin -c "cd ${SRC_DIR}/gpMgmt && source /usr/local/cloudberry-db/cloudberry-env.sh && source ${SRC_DIR}/gpAux/gpdemo/gpdemo-env.sh && PYTHONPATH=${SRC_DIR}/gpMgmt:\$PYTHONPATH behave $behave_args $behave_targets" \ + 2>&1 | tee -a "$config_log"; then + echo "::warning::Behave execution reported failures" + exit 1 + fi + + - name: Parse Behave Results + if: always() + shell: bash {0} + run: | + set -o pipefail + + config_log="build-logs/details/make-${{ matrix.test }}-config0.log" + behave_cmd="behave ${{ matrix.behave_args || '' }} ${{ join(matrix.behave_features, ' ') }}" + if [ ! -f "$config_log" ]; then + { + echo "MAKE_COMMAND=\"${behave_cmd}\"" + echo "STATUS=missing_log" + echo "TOTAL_TESTS=0" + echo "FAILED_TESTS=0" + echo "PASSED_TESTS=0" + echo "IGNORED_TESTS=0" + } | tee "test_results.${{ matrix.test }}.0.txt" + exit 1 + fi + + features_line=$(grep -E '^[0-9]+ feature(s)? passed, [0-9]+ failed, [0-9]+ skipped$' "$config_log" | tail -n 1) + scenarios_line=$(grep -E '^[0-9]+ scenario(s)? passed, [0-9]+ failed, [0-9]+ skipped(, [0-9]+ untested)?$' "$config_log" | tail -n 1) + steps_line=$(grep -E '^[0-9]+ step(s)? passed, [0-9]+ failed, [0-9]+ skipped, [0-9]+ undefined(, [0-9]+ untested)?$' "$config_log" | tail -n 1) + + if [[ -z "$scenarios_line" ]]; then + { + echo "MAKE_COMMAND=\"${behave_cmd}\"" + echo "STATUS=parse_error" + echo "TOTAL_TESTS=0" + echo "FAILED_TESTS=0" + echo "PASSED_TESTS=0" + echo "IGNORED_TESTS=0" + } | tee "test_results.${{ matrix.test }}.0.txt" + exit 1 + fi + + scenario_counts=$(echo "$scenarios_line" | sed -E 's/^([0-9]+) scenario(s)? 
passed, ([0-9]+) failed, ([0-9]+) skipped(, ([0-9]+) untested)?$/\1 \3 \4 \6/') + read -r scenarios_passed scenarios_failed scenarios_skipped scenarios_untested <<< "$scenario_counts" + scenarios_untested=${scenarios_untested:-0} + total_scenarios=$((scenarios_passed + scenarios_failed + scenarios_skipped)) + + { + echo "MAKE_COMMAND=\"${behave_cmd}\"" + if [[ "$scenarios_failed" -eq 0 ]]; then + echo "STATUS=passed" + else + echo "STATUS=failed" + fi + echo "TOTAL_TESTS=${total_scenarios}" + echo "FAILED_TESTS=${scenarios_failed}" + echo "PASSED_TESTS=${scenarios_passed}" + echo "IGNORED_TESTS=${scenarios_skipped}" + echo "BEHAVE_UNTESTED_SCENARIOS=${scenarios_untested}" + echo "BEHAVE_FEATURES_SUMMARY=\"${features_line:-unavailable}\"" + echo "BEHAVE_SCENARIOS_SUMMARY=\"${scenarios_line}\"" + echo "BEHAVE_STEPS_SUMMARY=\"${steps_line:-unavailable}\"" + } | tee "test_results.${{ matrix.test }}.0.txt" + + if [[ "$scenarios_failed" -eq 0 ]]; then + exit 0 + fi + exit 1 + + - name: Generate Behave Job Summary End + if: always() + shell: bash {0} + run: | + { + echo "## Test Results" + echo "- End Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + + if [[ ! -f "test_results.${{ matrix.test }}.0.txt" ]]; then + echo "### Result Status" + echo "⚠️ No results file found" + exit 0 + fi + + . 
"test_results.${{ matrix.test }}.0.txt" + + echo "### Command" + echo "\`$MAKE_COMMAND\`" + echo "" + + echo "### Status" + case "${STATUS:-unknown}" in + passed) + echo "✅ All scenarios passed" + ;; + failed) + echo "❌ Some scenarios failed" + ;; + parse_error) + echo "⚠️ Could not parse Behave results" + ;; + missing_log) + echo "⚠️ Behave log file missing" + ;; + *) + echo "⚠️ Unknown status: ${STATUS:-unknown}" + ;; + esac + + echo "" + echo "### Scenario Counts" + echo "| Metric | Count |" + echo "|--------|-------|" + echo "| Total Scenarios | ${TOTAL_TESTS:-0} |" + echo "| Passed Scenarios | ${PASSED_TESTS:-0} |" + echo "| Failed Scenarios | ${FAILED_TESTS:-0} |" + echo "| Skipped Scenarios | ${IGNORED_TESTS:-0} |" + echo "| Untested Scenarios | ${BEHAVE_UNTESTED_SCENARIOS:-0} |" + + echo "" + echo "### Behave Summary" + echo "| Metric | Summary |" + echo "|--------|---------|" + echo "| Features | ${BEHAVE_FEATURES_SUMMARY:-unavailable} |" + echo "| Scenarios | ${BEHAVE_SCENARIOS_SUMMARY:-unavailable} |" + echo "| Steps | ${BEHAVE_STEPS_SUMMARY:-unavailable} |" + } >> "$GITHUB_STEP_SUMMARY" || true + + - name: Upload behave logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: behave-logs-${{ matrix.test }}-${{ needs.build.outputs.build_timestamp || github.run_id }} + path: | + build-logs/ + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + - name: Upload Behave Metadata + if: always() + uses: actions/upload-artifact@v4 + with: + name: behave-metadata-${{ matrix.test }} + path: | + test_results*.txt + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + report: + name: Generate Apache Cloudberry Behave Report + needs: [build, prepare-behave-matrix, behave] + if: always() + runs-on: ubuntu-22.04 + steps: + - name: Generate Final Report + run: | + { + echo "# Apache Cloudberry Behave Report" + echo "## Job Status" + echo "- Build Job: ${{ needs.build.result }}" + echo "- Behave Job: ${{ needs.behave.result }}" + echo "- Completion Time: $(date 
-u +'%Y-%m-%d %H:%M:%S UTC')" + + if [[ "${{ needs.build.result }}" == "success" && + "${{ needs.behave.result }}" =~ ^(success|skipped)$ ]]; then + echo "✅ Pipeline completed successfully" + else + echo "⚠️ Pipeline completed with failures" + fi + } >> "$GITHUB_STEP_SUMMARY" + + - name: Notify on failure + if: | + needs.build.result != 'success' || + !contains(fromJson('["success","skipped"]'), needs.behave.result) + run: | + echo "::error::Behave pipeline failed! Check job summaries and logs for details" + echo "Timestamp: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + echo "Build Result: ${{ needs.build.result }}" + echo "Behave Result: ${{ needs.behave.result }}" diff --git a/.github/workflows/build-cloudberry.yml b/.github/workflows/build-cloudberry.yml index 0d76fa0da51..60bfb38bea4 100644 --- a/.github/workflows/build-cloudberry.yml +++ b/.github/workflows/build-cloudberry.yml @@ -223,7 +223,6 @@ jobs: DEFAULT_ENABLE_CGROUPS=false DEFAULT_ENABLE_CORE_CHECK=true DEFAULT_PG_SETTINGS_OPTIMIZER="" - # Define base test configurations ALL_TESTS='{ "include": [ @@ -1606,8 +1605,6 @@ jobs: continue fi - # Parse this configuration's results - MAKE_NAME="${{ matrix.test }}-config$i" \ "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/parse-test-results.sh "$config_log" status_code=$? diff --git a/gpAux/gpdemo/demo_cluster.sh b/gpAux/gpdemo/demo_cluster.sh index 225bb76a5ee..7397894d359 100755 --- a/gpAux/gpdemo/demo_cluster.sh +++ b/gpAux/gpdemo/demo_cluster.sh @@ -314,8 +314,10 @@ cat >> $CLUSTER_CONFIG <<-EOF COORDINATOR_PORT=${COORDINATOR_DEMO_PORT} - # Shell to use to execute commands on all hosts - TRUSTED_SHELL="$(dirname "$0")/lalshell" + # Shell to use to execute commands on all hosts. Use an absolute path here + # because this file is later sourced by gpinitsystem, where \$0 is no longer + # demo_cluster.sh. 
+ TRUSTED_SHELL="$(pwd)/lalshell" ENCODING=UNICODE EOF diff --git a/gpMgmt/bin/analyzedb b/gpMgmt/bin/analyzedb index 48d8e16872c..823f99e358f 100755 --- a/gpMgmt/bin/analyzedb +++ b/gpMgmt/bin/analyzedb @@ -49,6 +49,12 @@ ANALYZE_ROOT_SQL = """analyze rootpartition %s""" REPORTS_ARE_STALE_AFTER_N_DAYS = 8 NUM_REPORTS_TO_SAVE = 3 + +def safe_log_string(value): + # Stringify first so that non-str objects (for example a Command, whose + # string form may contain UTF-8) are escaped too, not only plain str values. + return str(value).encode('ascii', 'backslashreplace').decode('ascii') + GET_ALL_DATA_TABLES_SQL = """ select n.nspname as schemaname, c.relname as tablename from pg_class c, pg_namespace n @@ -430,7 +436,7 @@ class AnalyzeDb(Operation): target = self._get_tablename_with_cols(can_schema, can_table, input_col_dict) else: # can in root_partition_col_dict target = self._get_tablename_with_cols(can_schema, can_table, root_partition_col_dict) - logger.info(target) + logger.info(safe_log_string(target)) target_list.append(target) logger.info("---------------------------------------------------") @@ -951,7 +957,10 @@ class AnalyzeDb(Operation): # Create a Command object that executes a query using psql. def create_psql_command(dbname, query): psql_cmd = """psql %s -c %s""" % (pipes.quote(dbname), pipes.quote(query)) - return Command(query, psql_cmd) + # Keep the command text intact for execution, but make the display name + # ASCII-safe so logger/output paths do not choke on UTF-8 identifiers. 
+ safe_query_display = safe_log_string(query) + return Command(safe_query_display, psql_cmd) def run_sql(conn, query): @@ -1402,13 +1411,13 @@ class AnalyzeWorker(Worker): self.cmd = None return elif self.pool.should_stop: - self.logger.debug("[%s] got cmd and pool is stopped: %s" % (self.name, self.cmd)) + self.logger.debug("[%s] got cmd and pool is stopped: %s" % (self.name, safe_log_string(self.cmd))) self.pool.markTaskDone() self.cmd = None else: # run the command # get rid of the gucs for displaying in the log - cmd_display = re.sub(r'set .*;\s*', '', self.cmd.name) + cmd_display = safe_log_string(re.sub(r'set .*;\s*', '', self.cmd.name)) self.logger.info("[%s] started %s" % (self.name, cmd_display)) start_time = time.time() self.cmd.run() @@ -1425,7 +1434,7 @@ class AnalyzeWorker(Worker): except Exception as e: self.logger.exception(e) if self.cmd: - self.logger.debug("[%s] finished cmd with exception: %s" % (self.name, self.cmd)) + self.logger.debug("[%s] finished cmd with exception: %s" % (self.name, safe_log_string(self.cmd))) self.pool.addFinishedWorkItem(self.cmd) self.cmd = None diff --git a/gpMgmt/bin/gppylib/commands/base.py b/gpMgmt/bin/gppylib/commands/base.py index d455c6e2d13..477c0ba1a75 100755 --- a/gpMgmt/bin/gppylib/commands/base.py +++ b/gpMgmt/bin/gppylib/commands/base.py @@ -37,6 +37,12 @@ CMD_CACHE = {} + +def _safe_log_string(value): + # Stringify first so that non-str objects (for example a Command, whose + # string form may contain UTF-8) are escaped too, not only plain str values. + return str(value).encode('ascii', 'backslashreplace').decode('ascii') + # Maximum retries if sshd rejects the connection due to too many # unauthenticated connections. 
SSH_MAX_RETRY = 10 @@ -86,7 +92,7 @@ def markTaskDone(self): self.work_queue.task_done() def addCommand(self, cmd): - self.logger.debug("Adding cmd to work_queue: %s" % cmd.cmdStr) + self.logger.debug("Adding cmd to work_queue: %s" % _safe_log_string(cmd.cmdStr)) self.work_queue.put(cmd) self._assigned += 1 @@ -272,20 +278,20 @@ def run(self): self.cmd = None return elif self.pool.should_stop: - self.logger.debug("[%s] got cmd and pool is stopped: %s" % (self.name, self.cmd)) + self.logger.debug("[%s] got cmd and pool is stopped: %s" % (self.name, _safe_log_string(self.cmd))) self.pool.markTaskDone() self.cmd = None else: - self.logger.debug("[%s] got cmd: %s" % (self.name, self.cmd.cmdStr)) + self.logger.debug("[%s] got cmd: %s" % (self.name, _safe_log_string(self.cmd.cmdStr))) self.cmd.run() - self.logger.debug("[%s] finished cmd: %s" % (self.name, self.cmd)) + self.logger.debug("[%s] finished cmd: %s" % (self.name, _safe_log_string(self.cmd))) self.pool.addFinishedWorkItem(self.cmd) self.cmd = None except Exception as e: self.logger.exception(e) if self.cmd: - self.logger.debug("[%s] finished cmd with exception: %s" % (self.name, self.cmd)) + self.logger.debug("[%s] finished cmd with exception: %s" % (self.name, _safe_log_string(self.cmd))) self.pool.addFinishedWorkItem(self.cmd) self.cmd = None @@ -548,9 +554,9 @@ def __init__(self, name, cmdStr, ctxt=LOCAL, remoteHost=None, stdin=None, gphome def __str__(self): if self.results: - return "%s cmdStr='%s' had result: %s" % (self.name, self.cmdStr, self.results) + return "%s cmdStr='%s' had result: %s" % (self.name, _safe_log_string(self.cmdStr), self.results) else: - return "%s cmdStr='%s'" % (self.name, self.cmdStr) + return "%s cmdStr='%s'" % (self.name, _safe_log_string(self.cmdStr)) # Start a process that will execute the command but don't wait for # it to complete. Return the Popen object instead. 
@@ -559,7 +565,7 @@ def runNoWait(self): return self.exec_context.proc def run(self, validateAfter=False): - self.logger.debug("Running Command: %s" % self.cmdStr) + self.logger.debug("Running Command: %s" % _safe_log_string(self.cmdStr)) self.exec_context.execute(self, pickled=self.pickled, start_new_session=self.start_new_session) if validateAfter: diff --git a/gpMgmt/bin/gppylib/test/unit/test_unit_analyzedb.py b/gpMgmt/bin/gppylib/test/unit/test_unit_analyzedb.py new file mode 100644 index 00000000000..be3b33efd66 --- /dev/null +++ b/gpMgmt/bin/gppylib/test/unit/test_unit_analyzedb.py @@ -0,0 +1,22 @@ +import imp +import os + +from gppylib.test.unit.gp_unittest import GpTestCase, run_tests + + +class AnalyzeDbTestCase(GpTestCase): + def setUp(self): + analyzedb_file = os.path.abspath(os.path.dirname(__file__) + "/../../../analyzedb") + self.subject = imp.load_source('analyzedb', analyzedb_file) + + def test_create_psql_command_keeps_utf8_sql_but_uses_ascii_safe_display_name(self): + query = 'analyze "public"."spiegelungssätze"' + + cmd = self.subject.create_psql_command('special_encoding_db', query) + + self.assertEqual(cmd.name, 'analyze "public"."spiegelungss\\xe4tze"') + self.assertIn('spiegelungssätze', cmd.cmdStr) + + +if __name__ == '__main__': + run_tests() diff --git a/gpMgmt/test/behave/mgmt_utils/analyzedb.feature b/gpMgmt/test/behave/mgmt_utils/analyzedb.feature index 5809c7745a9..c757726dcbc 100644 --- a/gpMgmt/test/behave/mgmt_utils/analyzedb.feature +++ b/gpMgmt/test/behave/mgmt_utils/analyzedb.feature @@ -1775,6 +1775,7 @@ Feature: Incrementally analyze the database And the user runs "dropdb schema_with_temp_table" And the user drops the named connection "default" + @extended Scenario: analyzedb can handle the table name with special utf-8 characters. 
Given database "special_encoding_db" is dropped and recreated And the user connects to "special_encoding_db" with named connection "default" diff --git a/gpMgmt/test/behave/mgmt_utils/environment.py b/gpMgmt/test/behave/mgmt_utils/environment.py index d79f9c18acc..a8a2f9be828 100644 --- a/gpMgmt/test/behave/mgmt_utils/environment.py +++ b/gpMgmt/test/behave/mgmt_utils/environment.py @@ -11,6 +11,7 @@ from steps.gpssh_exkeys_mgmt_utils import GpsshExkeysMgmtContext from steps.mgmt_utils import backup_bashrc, restore_bashrc from gppylib.db import dbconn +from gppylib.commands.base import Command, REMOTE def before_all(context): if list(map(int, behave.__version__.split('.'))) < [1,2,6]: @@ -62,19 +63,27 @@ def before_feature(context, feature): dbconn.execSQL(context.conn, 'create table t1(a integer, b integer)') dbconn.execSQL(context.conn, 'create table t2(c integer, d integer)') dbconn.execSQL(context.conn, 'create table t3(e integer, f integer)') + dbconn.execSQL(context.conn, 'create table spiegelungssätze(col_ä integer, 列2 integer)') dbconn.execSQL(context.conn, 'create view v1 as select a, b from t1, t3 where t1.a=t3.e') dbconn.execSQL(context.conn, 'create view v2 as select c, d from t2, t3 where t2.c=t3.f') dbconn.execSQL(context.conn, 'create view v3 as select a, d from v1, v2 where v1.a=v2.c') dbconn.execSQL(context.conn, 'insert into t1 values(1, 2)') dbconn.execSQL(context.conn, 'insert into t2 values(1, 3)') dbconn.execSQL(context.conn, 'insert into t3 values(1, 4)') + dbconn.execSQL(context.conn, 'insert into spiegelungssätze values(1, 5)') + # minirepro tests require statistical data about the contents of the database + # we should execute 'ANALYZE' to fill the pg_statistic catalog table. 
+ dbconn.execSQL(context.conn, 'analyze t1') + dbconn.execSQL(context.conn, 'analyze t2') + dbconn.execSQL(context.conn, 'analyze t3') + dbconn.execSQL(context.conn, 'analyze spiegelungssätze') + dbconn.execSQL(context.conn, 'create or replace function select_one() returns integer as $$ select 1 $$ language sql') context.conn.commit() if 'gppkg' in feature.tags: run_command(context, 'bash demo/gppkg/generate_sample_gppkg.sh buildGppkg') run_command(context, 'cp -f /tmp/sample-gppkg/sample.gppkg test/behave/mgmt_utils/steps/data/') - def after_feature(context, feature): if 'analyzedb' in feature.tags: context.conn.close() @@ -102,6 +111,9 @@ def before_scenario(context, scenario): if 'gprecoverseg' in context.feature.tags: context.mirror_context = MirrorMgmtContext() + if 'gprecoverseg_newhost' in context.feature.tags: + context.mirror_context = MirrorMgmtContext() + if 'gpconfig' in context.feature.tags: context.gpconfig_context = GpConfigContext() @@ -146,11 +158,17 @@ def after_scenario(context, scenario): return tags_to_cleanup = ['gpmovemirrors', 'gpssh-exkeys'] - if set(context.feature.tags).intersection(tags_to_cleanup): + if set(context.feature.tags).intersection(tags_to_cleanup) and "skip_cleanup" not in scenario.effective_tags: if 'temp_base_dir' in context and os.path.exists(context.temp_base_dir): os.chmod(context.temp_base_dir, 0o700) shutil.rmtree(context.temp_base_dir) + if 'umount_required' in context and context.umount_required: + context.execute_steps(''' + # unmounting all mounter filesystem in concourse cluster + Then umount all mounted filesystem + ''') + tags_to_not_restart_db = ['analyzedb', 'gpssh-exkeys'] if not set(context.feature.tags).intersection(tags_to_not_restart_db): start_database_if_not_started(context) @@ -182,3 +200,12 @@ def after_scenario(context, scenario): execute_sql('postgres', create_fault_query) reset_fault_query = "SELECT gp_inject_fault_infinite('all', 'reset', dbid) FROM gp_segment_configuration WHERE status='u';" 
execute_sql('postgres', reset_fault_query) + + if os.getenv('SUSPEND_PG_REWIND') is not None: + del os.environ['SUSPEND_PG_REWIND'] + + if "remove_rsync_bash" in scenario.effective_tags: + for host in context.hosts_with_rsync_bash: + cmd = Command(name='remove /usr/local/bin/rsync', cmdStr="sudo rm /usr/local/bin/rsync", remoteHost=host, + ctxt=REMOTE) + cmd.run(validateAfter=True) diff --git a/gpMgmt/test/behave/mgmt_utils/gpaddmirrors.feature b/gpMgmt/test/behave/mgmt_utils/gpaddmirrors.feature index e65e782938d..9817183b579 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpaddmirrors.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpaddmirrors.feature @@ -5,6 +5,8 @@ Feature: Tests for gpaddmirrors And a tablespace is created with data When gpaddmirrors adds 3 mirrors And an FTS probe is triggered + #gpaddmirrors triggers full recovery where old replication slot is dropped and new one is created + And verify replication slot internal_wal_replication_slot is available on all the segments And the segments are synchronized Then verify the database has mirrors And the tablespace is valid @@ -24,6 +26,8 @@ Feature: Tests for gpaddmirrors And an FTS probe is triggered And the segments are synchronized And verify the database has mirrors + #gpaddmirrors triggers full recovery where old replication slot is dropped and new one is created + And verify replication slot internal_wal_replication_slot is available on all the segments And the tablespace is valid And user stops all primary processes And user can start transactions @@ -49,8 +53,10 @@ Feature: Tests for gpaddmirrors When gpaddmirrors adds 3 mirrors Then gpaddmirrors should return a return code of 0 + And gpaddmirrors should not print "Unable to kill walsender on primary" to stdout And verify the database has mirrors And the segments are synchronized + And check segment conf: postgresql.conf And user can start transactions Scenario: gpaddmirrors setup recovery part two @@ -162,7 +168,7 @@ Feature: Tests for gpaddmirrors 
And the user reset the walsender on the primary on content 0 And the user waits until saved async process is completed And recovery_progress.file should not exist in gpAdminLogs in gpAdminLogs - And the user waits until mirror on content 0,1,2 is up + And verify that mirror on content 0,1,2 is up And check if mirrors on content 0,1,2 are moved to new location on input file And verify there are no recovery backout files @@ -174,7 +180,19 @@ Feature: Tests for gpaddmirrors And all the segments are running And the segments are synchronized + And check segment conf: postgresql.conf + And all files in gpAdminLogs directory are deleted + Scenario: gpaddmirrors errors out if the directory for the mirror to be added is not empty + Given the cluster is generated with "3" primaries only + And all files in gpAdminLogs directory are deleted + And a gaddmirrors directory under '/tmp' with mode '0700' is created + And a gpaddmirrors input file is created + And edit the input file to add mirror with content 0,1,2 to a new non-empty directory with mode 0700 + When the user runs gpaddmirrors with input file and additional args "-a" + Then gpaddmirrors should print "Segment directory '/tmp/.*' exists but is not empty!" 
to stdout + And all the segments are running + And check segment conf: postgresql.conf And all files in gpAdminLogs directory are deleted @@ -191,7 +209,7 @@ Feature: Tests for gpaddmirrors # And the user waits until recovery_progress.file is created in gpAdminLogs and verifies its format # And the user waits until saved async process is completed # And recovery_progress.file should not exist in gpAdminLogs -# And the user waits until mirror on content 0,1,2 is up +# And verify that mirror on content 0,1,2 is up # # And check if mirrors on content 0,1,2 are moved to new location on input file # And verify there are no recovery backout files @@ -214,20 +232,21 @@ Feature: Tests for gpaddmirrors Scenario: spread mirroring configuration Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a cluster is created with "spread" segment mirroring on "mdw" and "sdw1, sdw2, sdw3" + And a cluster is created with "spread" segment mirroring on "cdw" and "sdw1, sdw2, sdw3" Then verify that mirror segments are in "spread" configuration Given a preferred primary has failed When the user runs "gprecoverseg -a" Then gprecoverseg should return a return code of 0 And all the segments are running And the segments are synchronized + And check segment conf: postgresql.conf And the user runs "gpstop -aqM fast" @concourse_cluster Scenario Outline: gpaddmirrors can add mirrors even if mirrors failed during basebackup Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a cluster is created with no mirrors on "mdw" and "sdw1, sdw2" + And a cluster is created with no mirrors on "cdw" and "sdw1, sdw2" And all files in gpAdminLogs directory are deleted on all hosts in the cluster And a gpaddmirrors directory under '/tmp' with mode '0700' is created And a gpaddmirrors input file is created @@ -269,7 +288,7 @@ Feature: Tests for gpaddmirrors Scenario Outline: gpaddmirrors can add mirrors even if start fails 
for mirrors Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a cluster is created with no mirrors on "mdw" and "sdw1, sdw2" + And a cluster is created with no mirrors on "cdw" and "sdw1, sdw2" And all files in gpAdminLogs directory are deleted on all hosts in the cluster And a gpaddmirrors directory under '/tmp' with mode '0700' is created And a gpaddmirrors input file is created @@ -309,13 +328,14 @@ Feature: Tests for gpaddmirrors Scenario: gprecoverseg works correctly on a newly added mirror with HBA_HOSTNAMES=0 Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And with HBA_HOSTNAMES "0" a cluster is created with no mirrors on "mdw" and "sdw1, sdw2" + And with HBA_HOSTNAMES "0" a cluster is created with no mirrors on "cdw" and "sdw1, sdw2" And pg_hba file "/tmp/gpaddmirrors/data/primary/gpseg0/pg_hba.conf" on host "sdw1" contains only cidr addresses And gpaddmirrors adds mirrors And pg_hba file "/tmp/gpaddmirrors/data/primary/gpseg0/pg_hba.conf" on host "sdw1" contains only cidr addresses And pg_hba file "/tmp/gpaddmirrors/data/primary/gpseg0/pg_hba.conf" on host "sdw1" contains entries for "samehost" And verify that the file "pg_hba.conf" in each segment data directory has "no" line starting with "host.*replication.*\(127.0.0\|::1\).*trust" Then verify the database has mirrors + And gpaddmirrors should not print "Unable to kill walsender on primary" to stdout Then the mirror on content 0 is stopped with the immediate flag And an FTS probe is triggered @@ -348,11 +368,12 @@ Feature: Tests for gpaddmirrors Scenario: gprecoverseg works correctly on a newly added mirror with HBA_HOSTNAMES=1 Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And with HBA_HOSTNAMES "1" a cluster is created with no mirrors on "mdw" and "sdw1, sdw2" - And pg_hba file "/tmp/gpaddmirrors/data/primary/gpseg0/pg_hba.conf" on host "sdw1" contains entries 
for "mdw, sdw1" + And with HBA_HOSTNAMES "1" a cluster is created with no mirrors on "cdw" and "sdw1, sdw2" + And pg_hba file "/tmp/gpaddmirrors/data/primary/gpseg0/pg_hba.conf" on host "sdw1" contains entries for "cdw, sdw1" And gpaddmirrors adds mirrors with options "--hba-hostnames" - And pg_hba file "/tmp/gpaddmirrors/data/primary/gpseg0/pg_hba.conf" on host "sdw1" contains entries for "mdw, sdw1, sdw2, samehost" + And pg_hba file "/tmp/gpaddmirrors/data/primary/gpseg0/pg_hba.conf" on host "sdw1" contains entries for "cdw, sdw1, sdw2, samehost" Then verify the database has mirrors + And gpaddmirrors should not print "Unable to kill walsender on primary" to stdout When the mirror on content 0 is stopped with the immediate flag And an FTS probe is triggered @@ -385,50 +406,56 @@ Feature: Tests for gpaddmirrors Scenario: gpaddmirrors puts mirrors on the same hosts when there is a standby configured Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a cluster is created with no mirrors on "mdw" and "sdw1, sdw2, sdw3" + And a cluster is created with no mirrors on "cdw" and "sdw1, sdw2, sdw3" And gpaddmirrors adds mirrors - Then verify the database has mirrors + Then gpaddmirrors should not print "Unable to kill walsender on primary" to stdout + And verify the database has mirrors And save the gparray to context And the database is not running - And a cluster is created with no mirrors on "mdw" and "sdw1, sdw2, sdw3" + And a cluster is created with no mirrors on "cdw" and "sdw1, sdw2, sdw3" And the user runs gpinitstandby with options " " Then gpinitstandby should return a return code of 0 And gpaddmirrors adds mirrors Then mirror hostlist matches the one saved in context + And check segment conf: postgresql.conf And the user runs "gpstop -aqM fast" @concourse_cluster Scenario: gpaddmirrors puts mirrors on different host Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a 
cluster is created with no mirrors on "mdw" and "sdw1, sdw2, sdw3" + And a cluster is created with no mirrors on "cdw" and "sdw1, sdw2, sdw3" And gpaddmirrors adds mirrors in spread configuration Then verify that mirror segments are in "spread" configuration + And check segment conf: postgresql.conf And the user runs "gpstop -aqM fast" @concourse_cluster Scenario: gpaddmirrors with a default coordinator data directory Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And gpaddmirrors adds mirrors - Then verify the database has mirrors + Then gpaddmirrors should not print "Unable to kill walsender on primary" to stdout + And verify the database has mirrors + And check segment conf: postgresql.conf And the user runs "gpstop -aqM fast" @concourse_cluster Scenario: gpaddmirrors with a given coordinator data directory [-d ] Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And gpaddmirrors adds mirrors with temporary data dir Then verify the database has mirrors + And check segment conf: postgresql.conf And the user runs "gpstop -aqM fast" @concourse_cluster Scenario: gpaddmirrors mirrors are recognized after a cluster restart Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" When gpaddmirrors adds mirrors Then verify the database has mirrors When an FTS probe is triggered @@ -438,13 +465,25 @@ Feature: Tests for gpaddmirrors And wait until the process "gpstart" goes down Then all the segments are running And the segments are synchronized + And check segment conf: 
postgresql.conf + And the user runs "gpstop -aqM fast" + + @concourse_cluster + Scenario: gpaddmirrors should create consistent port entry on mirrors postgresql.conf file + Given a working directory of the test as '/tmp/gpaddmirrors' + And the database is not running + And a cluster is created with no mirrors on "cdw" and "sdw1" + When gpaddmirrors adds mirrors + Then gpaddmirrors should not print "Unable to kill walsender on primary" to stdout + And verify the database has mirrors + And check segment conf: postgresql.conf And the user runs "gpstop -aqM fast" @concourse_cluster Scenario: gpaddmirrors when the primaries have data Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And database "gptest" exists And there is a "heap" table "public.heap_table" in "gptest" with "100" rows And there is a "ao" table "public.ao_table" in "gptest" with "100" rows @@ -463,10 +502,11 @@ Feature: Tests for gpaddmirrors Scenario: tablespaces work on a multi-host environment Given a working directory of the test as '/tmp/gpaddmirrors' And the database is not running - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And a tablespace is created with data When gpaddmirrors adds mirrors - Then verify the database has mirrors + Then gpaddmirrors should not print "Unable to kill walsender on primary" to stdout + And verify the database has mirrors When an FTS probe is triggered And the segments are synchronized diff --git a/gpMgmt/test/behave/mgmt_utils/gpcheckcat.feature b/gpMgmt/test/behave/mgmt_utils/gpcheckcat.feature index d9b91838909..3b7d9683597 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpcheckcat.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpcheckcat.feature @@ -9,6 +9,9 @@ Feature: gpcheckcat tests Given database "all_good" is dropped 
and recreated Then the user runs "gpcheckcat -A" Then gpcheckcat should return a return code of 0 + When the user runs "gpcheckcat -C pg_class" + Then gpcheckcat should return a return code of 0 + And gpcheckcat should not print "Execution error:" to stdout And the user runs "dropdb all_good" Scenario: gpcheckcat should drop leaked schemas @@ -125,6 +128,46 @@ Feature: gpcheckcat tests Then gpcheckcat should print "Extra" to stdout And gpcheckcat should print "Table miss_attr_db4.public.heap_table.1" to stdout + Scenario: gpcheckcat should report inconsistent pg_fastsequence.lastrownums values with gp_fastsequence for AO tables + Given database "errorneous_lastrownums" is dropped and recreated + And the user runs "psql errorneous_lastrownums -c "create table errlastrownum(a int) using ao_row; insert into errlastrownum select * from generate_series(1,100);"" + And the user runs "psql errorneous_lastrownums -c "alter table errlastrownum add column newcol int;"" + When the user runs "gpcheckcat -R ao_lastrownums errorneous_lastrownums" + Then gpcheckcat should return a return code of 0 + When the user runs sql "set allow_system_table_mods=on; update gp_fastsequence set last_sequence = 0 where last_sequence > 0;" in "errorneous_lastrownums" on first primary segment + When the user runs "gpcheckcat -R ao_lastrownums errorneous_lastrownums" + Then gpcheckcat should return a return code of 3 + And gpcheckcat should print "Failed test\(s\) that are not reported here: ao_lastrownums" to stdout + Given database "errorneous_lastrownums" is dropped and recreated + And the user runs "psql errorneous_lastrownums -c "create table errlastrownum(a int) using ao_row; insert into errlastrownum select * from generate_series(1,10);"" + And the user runs "psql errorneous_lastrownums -c "alter table errlastrownum add column newcol int;"" + When the user runs "gpcheckcat -R ao_lastrownums errorneous_lastrownums" + Then gpcheckcat should return a return code of 0 + Then the user runs sql 
"set allow_system_table_mods=on; delete from gp_fastsequence where last_sequence > 0;" in "errorneous_lastrownums" on first primary segment + When the user runs "gpcheckcat -R ao_lastrownums errorneous_lastrownums" + Then gpcheckcat should return a return code of 3 + And gpcheckcat should print "Failed test\(s\) that are not reported here: ao_lastrownums" to stdout + + Scenario: gpcheckcat should report inconsistent pg_fastsequence.lastrownums values with gp_fastsequence for AOCO tables + Given database "errorneous_lastrownums" is dropped and recreated + And the user runs "psql errorneous_lastrownums -c "create table errlastrownum(a int) using ao_column; insert into errlastrownum select * from generate_series(1,100);"" + And the user runs "psql errorneous_lastrownums -c "alter table errlastrownum add column newcol int;"" + When the user runs "gpcheckcat -R ao_lastrownums errorneous_lastrownums" + Then gpcheckcat should return a return code of 0 + When the user runs sql "set allow_system_table_mods=on; update gp_fastsequence set last_sequence = 0 where last_sequence > 0;" in "errorneous_lastrownums" on first primary segment + When the user runs "gpcheckcat -R ao_lastrownums errorneous_lastrownums" + Then gpcheckcat should return a return code of 3 + And gpcheckcat should print "Failed test\(s\) that are not reported here: ao_lastrownums" to stdout + Given database "errorneous_lastrownums" is dropped and recreated + And the user runs "psql errorneous_lastrownums -c "create table errlastrownum(a int) using ao_column; insert into errlastrownum select * from generate_series(1,10);"" + And the user runs "psql errorneous_lastrownums -c "alter table errlastrownum add column newcol int;"" + When the user runs "gpcheckcat -R ao_lastrownums errorneous_lastrownums" + Then gpcheckcat should return a return code of 0 + Then the user runs sql "set allow_system_table_mods=on; delete from gp_fastsequence where last_sequence > 0;" in "errorneous_lastrownums" on first primary segment 
+ When the user runs "gpcheckcat -R ao_lastrownums errorneous_lastrownums" + Then gpcheckcat should return a return code of 3 + And gpcheckcat should print "Failed test\(s\) that are not reported here: ao_lastrownums" to stdout + Scenario: gpcheckcat should report and repair owner errors and produce timestamped repair scripts Given database "owner_db1" is dropped and recreated And database "owner_db2" is dropped and recreated @@ -155,6 +198,27 @@ Feature: gpcheckcat tests And the user runs "dropdb owner_db2" And the path "gpcheckcat.repair.*" is removed from current working directory + Scenario: gpcheckcat should report and repair owner errors on appendonly tables and its indexes + Given database "owner_db" is dropped and recreated + And the path "gpcheckcat.repair.*" is removed from current working directory + And there is a "ao" table "public.gpadmin_ao_tbl" in "owner_db" with data + And the user runs "psql owner_db -c "CREATE INDEX gpadmin_ao_tbl_idx on gpadmin_ao_tbl (column1);"" + And the user runs sql "alter table gpadmin_ao_tbl OWNER TO wolf" in "owner_db" on first primary segment + Then psql should return a return code of 0 + + When the user runs "gpcheckcat -R owner owner_db" + Then gpcheckcat should return a return code of 3 + Then the path "gpcheckcat.repair.*" is found in cwd "1" times + + When the user runs all the repair scripts in the dir "gpcheckcat.repair.*" + And the path "gpcheckcat.repair.*" is removed from current working directory + And the user runs "gpcheckcat -R owner owner_db" + Then Then gpcheckcat should return a return code of 0 + Then the path "gpcheckcat.repair.*" is found in cwd "0" times + + And the user runs "dropdb owner_db" + And the path "gpcheckcat.repair.*" is removed from current working directory + Scenario: gpcheckcat should report and repair invalid constraints Given database "constraint_db" is dropped and recreated And the path "gpcheckcat.repair.*" is removed from current working directory @@ -435,7 +499,7 @@ Feature: 
gpcheckcat tests Then gpcheckcat should print "Table pg_type has a dependency issue on oid .* at content 0" to stdout And the user runs "dropdb gpcheckcat_dependency" - Scenario: gpcheckcat should report no inconsistency of pg_extension between Master and Segements + Scenario: gpcheckcat should report no inconsistency of pg_extension between Coordinator and Segments Given database "pgextension_db" is dropped and recreated And the user runs sql "set allow_system_table_mods=true;update pg_extension set extconfig='{2130}', extcondition='{2130}';" in "pgextension_db" on first primary segment Then the user runs "gpcheckcat -R inconsistent pgextension_db" @@ -651,11 +715,6 @@ Feature: gpcheckcat tests And the user runs "dropdb check_dependency_error" And the user runs "psql -d postgres -c "DROP ROLE foo"" - -########################### @concourse_cluster tests ########################### -# The @concourse_cluster tag denotes the scenario that requires a remote cluster - - @concourse_cluster Scenario Outline: gpcheckcat should discover missing attributes for external tables Given database "miss_attr_db3" is dropped and recreated And the user runs "echo > /tmp/backup_gpfdist_dummy" @@ -675,42 +734,42 @@ Feature: gpcheckcat tests | attrname | tablename | | ftrelid | pg_foreign_table | -# GPDB_12_MERGE_FIXME: -# 1, this case is removed because 12 partitioning implementation will not record pg_constraint, right? -# 2, gpcheckcat in the concourse only runs 1 or 2 tests, how about merging into another task?
-# -# @concourse_cluster -# Scenario Outline: gpcheckcat should discover missing attributes for external tables -# Given database "miss_attr_db3" is dropped and recreated -# And the user runs "echo > /tmp/backup_gpfdist_dummy" -# And the user runs "gpfdist -p 8098 -d /tmp &" -# And there is a partition table "part_external" has external partitions of gpfdist with file "backup_gpfdist_dummy" on port "8098" in "miss_attr_db3" with data -# Then data for partition table "part_external" with leaf partition distributed across all segments on "miss_attr_db3" -# When the user runs "gpcheckcat miss_attr_db3" -# And gpcheckcat should return a return code of 0 -# Then gpcheckcat should not print "Missing" to stdout -# And the user runs "psql miss_attr_db3 -c "SET allow_system_table_mods=true; DELETE FROM where ='part_external_1_prt_p_2'::regclass::oid;"" -# Then psql should return a return code of 0 -# When the user runs "gpcheckcat miss_attr_db3" -# Then gpcheckcat should print "Missing" to stdout -# And gpcheckcat should print "part_external_1_prt_p_2_check" to stdout -# Examples: -# | attrname | tablename | -# | conrelid | pg_constraint | -# - - Scenario: gpcheckcat should discover missing attributes of pg_description catalogue table - Given there is a "heap" table "public.heap_table" in "miss_attr_db5" with data and description - When the user runs "gpcheckcat -v miss_attr_db5" + Scenario Outline: gpcheckcat should discover missing attributes for external tables + Given database "miss_attr_db3" is dropped and recreated + And the user runs "echo > /tmp/backup_gpfdist_dummy" + And the user runs "gpfdist -p 8098 -d /tmp &" + And there is a partition table "part_external" has external partitions of gpfdist with file "backup_gpfdist_dummy" on port "8098" in "miss_attr_db3" with data + Then data for partition table "part_external" with leaf partition distributed across all segments on "miss_attr_db3" + When the user runs "gpcheckcat miss_attr_db3" And gpcheckcat should return a 
return code of 0 Then gpcheckcat should not print "Missing" to stdout - And the user runs "psql miss_attr_db5 -c "SET allow_system_table_mods=true; DELETE FROM pg_description where objoid='heap_table'::regclass::oid;"" + And the user runs "psql miss_attr_db3 -c "SET allow_system_table_mods=true; DELETE FROM where ='part_external_1_prt_p_2';"" Then psql should return a return code of 0 - When the user runs "gpcheckcat -v miss_attr_db5" + When the user runs "gpcheckcat miss_attr_db3" + Then gpcheckcat should print "Missing" to stdout + And gpcheckcat should print "Name of test which found this issue: missing_extraneous_pg_class" to stdout + And gpcheckcat should print "Relation name: part_external_1_prt_p_2" to stdout + Examples: + | attrname | tablename | + | relname | pg_class | + + Scenario: gpcheckcat should discover missing attributes of pg_description and pg_shdescription catalogue table without errors + Given database "miss_attr_db5" is dropped and recreated + And there is a "heap" table "public.heap_table" in "miss_attr_db5" with data and description + And a tablespace is created with data and description + When the user runs "gpcheckcat miss_attr_db5" + Then gpcheckcat should return a return code of 0 + And gpcheckcat should not print "Missing" to stdout + When the user runs "psql miss_attr_db5 -c "SET allow_system_table_mods=true; DELETE FROM pg_description where objoid='heap_table'::regclass::oid;"" + Then psql should return a return code of 0 + When the user runs "psql miss_attr_db5 -c "SET allow_system_table_mods=true; DELETE FROM pg_shdescription where objoid=(SELECT oid from pg_tablespace where spcname='outerspace');"" + Then psql should return a return code of 0 + When the user runs "gpcheckcat miss_attr_db5" Then gpcheckcat should print "Missing description metadata of {.*} on content -1" to stdout And gpcheckcat should not print "Execution error:" to stdout And gpcheckcat should print "Name of test which found this issue: 
missing_extraneous_pg_description" to stdout - + Then gpcheckcat should print "Missing shdescription metadata of {.*} on content -1" to stdout + And gpcheckcat should print "Name of test which found this issue: missing_extraneous_pg_shdescription" to stdout Scenario: set multiple GUC at session level in gpcheckcat Given database "all_good" is dropped and recreated @@ -741,5 +800,96 @@ Feature: gpcheckcat tests And "gpstop -m" should return a return code of 0 And the user runs "gpstart -a" - - + Scenario: Validate if gpcheckcat throws error when there are tables created using mix distribution policy + Given database "hashops_db" is dropped and recreated + And the user runs "psql hashops_db -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_legacy_hash_ops_tables.sql" + Then psql should return a return code of 0 + And the user runs "psql hashops_db -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_non_legacy_hashops_tables.sql" + Then psql should return a return code of 0 + When the user runs "gpcheckcat -R mix_distribution_policy hashops_db " + And gpcheckcat should print "Found tables created using both legacy and non legacy hashops in distribution policy." to stdout + And gpcheckcat should print "Please run the gpcheckcat.distpolicy.sql file to list the tables."
to stdout + And the user runs "dropdb hashops_db" + + Scenario: Validate if gpcheckcat succeeds and there are no tables + Given database "hashops_db" is dropped and recreated + When the user runs "gpcheckcat -R mix_distribution_policy hashops_db" + And gpcheckcat should print "PASSED" to stdout + And the user runs "dropdb hashops_db" + + Scenario: Validate if gpcheckcat throws error when GUC gp_use_legacy_hashops is on and there are non legacy tables + Given database "hashops_db" is dropped and recreated + And the user runs "psql hashops_db -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_non_legacy_hashops_tables.sql" + Then psql should return a return code of 0 + And the user runs "gpconfig -c gp_use_legacy_hashops -v on --skipvalidation" + Then gpconfig should return a return code of 0 + And the user runs "gpstop -a" + Then gpstop should return a return code of 0 + And the user runs "gpstart -a" + When the user runs "gpcheckcat -R mix_distribution_policy hashops_db" + And gpcheckcat should print "GUC gp_use_legacy_hashops is on." to stdout + And gpcheckcat should print "all newly created tables will use legacy hash ops by default for hash distributed table," to stdout + And gpcheckcat should print "but there are tables using non-legacy hash ops in the cluster." to stdout + And gpcheckcat should print "Please run the gpcheckcat.distpolicy.sql file to list the tables." 
to stdout + And the user runs "dropdb hashops_db" + + Scenario: Validate if gpcheckcat succeeds when GUC gp_use_legacy_hashops is on and there are legacy tables + Given database "hashops_db" is dropped and recreated + And the user runs "psql hashops_db -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_legacy_hash_ops_tables.sql" + Then psql should return a return code of 0 + And the user runs "gpconfig -c gp_use_legacy_hashops -v on --skipvalidation" + Then gpconfig should return a return code of 0 + And the user runs "gpstop -a" + Then gpstop should return a return code of 0 + And the user runs "gpstart -a" + When the user runs "gpcheckcat -R mix_distribution_policy hashops_db" + And gpcheckcat should print "PASSED" to stdout + And the user runs "dropdb hashops_db" + + Scenario: Validate if gpcheckcat throws error when GUC gp_use_legacy_hashops is off and there are legacy tables + Given database "hashops_db" is dropped and recreated + And the user runs "psql hashops_db -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_legacy_hash_ops_tables.sql" + And the user runs "gpconfig -c gp_use_legacy_hashops -v off --skipvalidation" + Then gpconfig should return a return code of 0 + And the user runs "gpstop -a" + Then gpstop should return a return code of 0 + And the user runs "gpstart -a" + When the user runs "gpcheckcat -R mix_distribution_policy hashops_db" + And gpcheckcat should print "GUC gp_use_legacy_hashops is off." to stdout + And gpcheckcat should print "all newly created tables will use non legacy hash ops by default for hash distributed table," to stdout + And gpcheckcat should print "but there are tables using legacy hash ops in the cluster." to stdout + And gpcheckcat should print "Please run the gpcheckcat.distpolicy.sql file to list the tables." 
to stdout + And the user runs "dropdb hashops_db" + + Scenario: Validate if gpcheckcat succeeds when GUC gp_use_legacy_hashops is off and there are non legacy tables + Given database "hashops_db" is dropped and recreated + And the user runs "psql hashops_db -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_non_legacy_hashops_tables.sql" + And the user runs "gpconfig -c gp_use_legacy_hashops -v off --skipvalidation" + Then gpconfig should return a return code of 0 + And the user runs "gpstop -a" + Then gpstop should return a return code of 0 + And the user runs "gpstart -a" + When the user runs "gpcheckcat -R mix_distribution_policy hashops_db" + And gpcheckcat should print "PASSED" to stdout + And the user runs "dropdb hashops_db" + + Scenario: gpcheckcat -l should report mix_distribution_policy to stdout + When the user runs "gpcheckcat -l " + And gpcheckcat should print "mix_distribution_policy" to stdout + + Scenario: gpcheckcat report all tables created using legacy opclass on multiple database + Given database "hashops_db" is dropped and recreated + And the user runs "psql hashops_db -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_legacy_hash_ops_tables.sql" + And the user runs "psql hashops_db -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_non_legacy_hashops_tables.sql" + Then psql should return a return code of 0 + Given database "hashops_db2" is dropped and recreated + And the user runs "psql hashops_db2 -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_legacy_hash_ops_tables.sql" + And the user runs "psql hashops_db2 -f test/behave/mgmt_utils/steps/data/gpcheckcat/create_non_legacy_hashops_tables.sql" + Then psql should return a return code of 0 + When the user runs "gpcheckcat -A -R mix_distribution_policy" + And gpcheckcat should print "Found tables created using both legacy and non legacy hashops in distribution policy." to stdout + And gpcheckcat should print "Please run the gpcheckcat.distpolicy.sql file to list the tables." 
to stdout + Then gpcheckcat should print "Completed 1 test(s) on database 'hashops_db'" to logfile with latest timestamp + Then gpcheckcat should print "Completed 1 test(s) on database 'hashops_db2'" to logfile with latest timestamp + And the user runs "dropdb hashops_db" + And the user runs "dropdb hashops_db2" diff --git a/gpMgmt/test/behave/mgmt_utils/gpexpand.feature b/gpMgmt/test/behave/mgmt_utils/gpexpand.feature index 90bc88836d4..6bf58246def 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpexpand.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpexpand.feature @@ -7,11 +7,11 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And the coordinator pid has been saved And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" When the user runs gpexpand interview to add 2 new segment and 0 new host "ignored.host" Then the number of segments have been saved And user has created expansiontest tables @@ -33,11 +33,11 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And the user runs gpinitstandby with options " " And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" When the user runs 
gpexpand interview to add 2 new segment and 0 new host "ignored.host" Then user has created expansiontest tables And 4000000 rows are inserted into table "expansiontest0" in schema "public" with column type list "int" @@ -53,10 +53,10 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" When the user runs gpexpand interview to add 2 new segment and 0 new host "ignored.host" Then the number of segments have been saved And user has created expansiontest tables @@ -76,10 +76,10 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" When the user runs gpexpand interview to add 2 new segment and 0 new host "ignored.host" Then the number of segments have been saved When the user runs gpexpand with the latest gpexpand_inputfile with additional parameters "--silent" @@ -91,10 +91,10 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under 
"/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1,sdw2" + And the cluster is setup for an expansion on hosts "cdw,sdw1,sdw2" And the new host "sdw2" is ready to go When the user runs gpexpand interview to add 0 new segment and 1 new host "sdw2" Then the number of segments have been saved @@ -107,10 +107,10 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1,sdw2" + And the cluster is setup for an expansion on hosts "cdw,sdw1,sdw2" And the new host "sdw2" is ready to go When the user runs gpexpand interview to add 1 new segment and 1 new host "sdw2" Then the number of segments have been saved @@ -123,10 +123,10 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with mirrors on "mdw" and "sdw1" + And a cluster is created with mirrors on "cdw" and "sdw1" And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" And the number of segments have been saved When the user runs gpexpand with a static inputfile for a single-node cluster with mirrors 
Then verify that the cluster has 4 new segments @@ -137,9 +137,9 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with mirrors on "mdw" and "sdw1" + And a cluster is created with mirrors on "cdw" and "sdw1" And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" And the user runs gpexpand with a static inputfile for a two-node cluster with mirrors And expanded preferred primary on segment "3" has failed When the user runs "gprecoverseg -a" @@ -157,10 +157,10 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with mirrors on "mdw" and "sdw1" + And a cluster is created with mirrors on "cdw" and "sdw1" And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1,sdw2,sdw3" + And the cluster is setup for an expansion on hosts "cdw,sdw1,sdw2,sdw3" And the new host "sdw2,sdw3" is ready to go When the user runs gpexpand interview to add 0 new segment and 2 new host "sdw2,sdw3" Then the number of segments have been saved @@ -174,11 +174,11 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with mirrors on "mdw" and "sdw1" + And a cluster is created with mirrors on "cdw" and "sdw1" And the user runs gpinitstandby with options " " And database "gptest" exists And there are no 
gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1,sdw2,sdw3" + And the cluster is setup for an expansion on hosts "cdw,sdw1,sdw2,sdw3" And the new host "sdw2,sdw3" is ready to go When the user runs gpexpand interview to add 1 new segment and 2 new host "sdw2,sdw3" Then the number of segments have been saved @@ -192,13 +192,13 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with mirrors on "mdw" and "sdw1" + And a cluster is created with mirrors on "cdw" and "sdw1" And the user runs gpinitstandby with options " " And database "gptest" exists And a tablespace is created with data And another tablespace is created with data And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1,sdw2,sdw3" + And the cluster is setup for an expansion on hosts "cdw,sdw1,sdw2,sdw3" And the new host "sdw2,sdw3" is ready to go When the user runs gpexpand interview to add 1 new segment and 2 new host "sdw2,sdw3" Then the number of segments have been saved @@ -234,6 +234,59 @@ Feature: expand the cluster by adding more segments When the user runs gpexpand to redistribute Then the tablespace is valid after gpexpand + @gpexpand_icproxy + Scenario: Cluster expansion failed (no new proxy address) with IC proxy mode enabled + Given the database is not running + And a working directory of the test as '/data/gpdata/gpexpand' + And the user runs command "rm -rf /data/gpdata/gpexpand/*" + And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into + And a cluster is created with no mirrors on "cdw" and "sdw1" + And the coordinator pid has been saved + And database "gptest" exists + And there are no gpexpand_inputfiles + And the cluster is running in IC proxy mode + And the cluster is setup 
for an expansion on hosts "cdw" + And the user runs gpexpand interview to add 1 new segment and 0 new host "ignore.host" + And the number of segments have been saved + When the user runs gpexpand with the latest gpexpand_inputfile without ret code check + Then gpexpand should return a return code of 3 + And gpexpand should print "Checking ICProxy addresses failed" to stdout + + @gpexpand_icproxy + Scenario: Cluster expansion failed (bind an wrong proxy address) with IC proxy mode enabled + Given the database is not running + And a working directory of the test as '/data/gpdata/gpexpand' + And the user runs command "rm -rf /data/gpdata/gpexpand/*" + And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into + And a cluster is created with no mirrors on "cdw" and "sdw1" + And the coordinator pid has been saved + And database "gptest" exists + And there are no gpexpand_inputfiles + And the cluster is running in IC proxy mode with new proxy address 4:2:cdw:16502 + And the cluster is setup for an expansion on hosts "cdw" + And the user runs gpexpand interview to add 1 new segment and 0 new host "ignore.host" + And the number of segments have been saved + When the user runs gpexpand with the latest gpexpand_inputfile without ret code check + Then gpexpand should return a return code of 3 + And gpexpand should print "The ic_proxy process failed to bind or listen" to stdout + + @gpexpand_icproxy + Scenario: Cluster expansion successful with IC proxy mode enabled + Given the database is not running + And a working directory of the test as '/data/gpdata/gpexpand' + And the user runs command "rm -rf /data/gpdata/gpexpand/*" + And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into + And a cluster is created with no mirrors on "cdw" and "sdw1" + And the coordinator pid has been saved + And database "gptest" exists + And there are no gpexpand_inputfiles + And the cluster is running in IC proxy mode with new proxy address 
4:2:sdw1:16502 + And the cluster is setup for an expansion on hosts "cdw" + And the user runs gpexpand interview to add 1 new segment and 0 new host "ignore.host" + And the number of segments have been saved + When the user runs gpexpand with the latest gpexpand_inputfile without ret code check + Then gpexpand should return a return code of 0 + @gpexpand_verify_redistribution Scenario: Verify data is correctly redistributed after expansion Given the database is not running @@ -278,11 +331,11 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with mirrors on "mdw" and "sdw1" + And a cluster is created with mirrors on "cdw" and "sdw1" And database "gptest" exists And the user runs psql with "-f /home/gpadmin/sqldump/dump.sql" against database "gptest" And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1,sdw2,sdw3" + And the cluster is setup for an expansion on hosts "cdw,sdw1,sdw2,sdw3" And the new host "sdw2,sdw3" is ready to go And the user runs gpexpand interview to add 1 new segment and 2 new host "sdw2,sdw3" And the number of segments have been saved @@ -297,11 +350,11 @@ Feature: expand the cluster by adding more segments And a working directory of the test as '/data/gpdata/gpexpand' And the user runs command "rm -rf /data/gpdata/gpexpand/*" And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And the coordinator pid has been saved And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" And the user runs gpexpand 
interview to add 2 new segment and 0 new host "ignored.host" And the number of segments have been saved And user has created test table @@ -321,11 +374,11 @@ Feature: expand the cluster by adding more segments And a working directory of the test as '/data/gpdata/gpexpand' And the user runs command "rm -rf /data/gpdata/gpexpand/*" And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And the coordinator pid has been saved And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw" + And the cluster is setup for an expansion on hosts "cdw" And the user runs gpexpand interview to add 1 new segment and 0 new host "ignore.host" And the number of segments have been saved When the user runs gpexpand with the latest gpexpand_inputfile with additional parameters "--silent" @@ -343,14 +396,14 @@ Feature: expand the cluster by adding more segments And a working directory of the test as '/data/gpdata/gpexpand' And the user runs command "rm -rf /data/gpdata/gpexpand/*" And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And the coordinator pid has been saved And database "gptest" exists And user has created test table And 20 rows are inserted into table "test" in schema "public" with column type list "int" And a long-run read-only transaction exists on "test" And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw" + And the cluster is setup for an expansion on hosts "cdw" And the user runs gpexpand interview to add 1 new segment and 0 new host "ignore.host" And the number of segments have been saved When the user runs gpexpand with the latest gpexpand_inputfile with 
additional parameters "--silent" @@ -368,12 +421,12 @@ Feature: expand the cluster by adding more segments And a working directory of the test as '/data/gpdata/gpexpand' And the user runs command "rm -rf /data/gpdata/gpexpand/*" And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And the coordinator pid has been saved And database "gptest" exists And a long-run transaction starts And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw" + And the cluster is setup for an expansion on hosts "cdw" And the user runs gpexpand interview to add 1 new segment and 0 new host "ignore.host" And the number of segments have been saved When the user runs gpexpand with the latest gpexpand_inputfile with additional parameters "--silent" @@ -391,11 +444,11 @@ Feature: expand the cluster by adding more segments And a working directory of the test as '/data/gpdata/gpexpand' And the user runs command "rm -rf /data/gpdata/gpexpand/*" And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And the coordinator pid has been saved And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw" + And the cluster is setup for an expansion on hosts "cdw" And the user runs gpexpand interview to add 1 new segment and 0 new host "ignore.host" And the number of segments have been saved And the transactions are started for dml @@ -418,11 +471,11 @@ Feature: expand the cluster by adding more segments And a working directory of the test as '/tmp/gpexpand_behave' And the user runs command "rm -rf /tmp/gpexpand_behave/*" And a temporary directory under "/tmp/gpexpand_behave/expandedData" to expand 
into - And a cluster is created with no mirrors on "mdw" and "sdw1" + And a cluster is created with no mirrors on "cdw" and "sdw1" And database "gptest" exists And create database schema table with special character And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" And the number of segments have been saved And the user runs gpexpand interview to add 1 new segment and 0 new host "ignored.host" When the user runs gpexpand with the latest gpexpand_inputfile without ret code check @@ -492,10 +545,10 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with mirrors on "mdw" and "sdw1" + And a cluster is created with mirrors on "cdw" and "sdw1" And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1,sdw2,sdw3" + And the cluster is setup for an expansion on hosts "cdw,sdw1,sdw2,sdw3" And the new host "sdw2,sdw3" is ready to go When the user runs gpexpand interview to add 0 new segment and 2 new host "sdw2,sdw3" Then the number of segments have been saved @@ -507,11 +560,11 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with mirrors on "mdw" and "sdw1" + And a cluster is created with mirrors on "cdw" and "sdw1" And the user runs gpinitstandby with options " " And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" And the primary on 
content 0 is stopped And user can start transactions And an FTS probe is triggered @@ -524,11 +577,11 @@ Feature: expand the cluster by adding more segments Given the database is not running And a working directory of the test as '/data/gpdata/gpexpand' And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into - And a cluster is created with mirrors on "mdw" and "sdw1" + And a cluster is created with mirrors on "cdw" and "sdw1" And the user runs gpinitstandby with options " " And database "gptest" exists And there are no gpexpand_inputfiles - And the cluster is setup for an expansion on hosts "mdw,sdw1" + And the cluster is setup for an expansion on hosts "cdw,sdw1" And the primary on content 0 is stopped And user can start transactions And an FTS probe is triggered @@ -540,3 +593,117 @@ Feature: expand the cluster by adding more segments When the user runs gpexpand with a static inputfile for a single-node cluster with mirrors without ret code check Then gpexpand should return a return code of 0 And gpexpand should print "One or more segments are either down or not in preferred role." 
to stdout + + @gpexpand_no_mirrors + + @gpexpand_segment + Scenario: Gpexpand should succeed when there has event trigger + Given the database is not running + And a working directory of the test as '/data/gpdata/gpexpand' + And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into + And the cluster is generated with "1" primaries only + And database "gptest" exists + And the user runs psql with "-c 'create table t(a int)'" against database "gptest" + And create event trigger function + And the user runs psql with "-c 'create event trigger log_alter on ddl_command_end execute function notcie_ddl()'" against database "gptest" + And there are no gpexpand_inputfiles + And the cluster is setup for an expansion on hosts "localhost" + When the user runs gpexpand interview to add 1 new segment and 0 new host "ignored.host" + Then the number of segments have been saved + When the user runs gpexpand with the latest gpexpand_inputfile with additional parameters "--silent" + Then verify that the cluster has 1 new segments + And the user runs psql with "-c 'alter table t add column b int'" against database "gptest" + + @gpexpand_segment + Scenario: expand a cluster and verify necessary catalog tables are copied to new segments + Given the database is not running + And a working directory of the test as '/data/gpdata/gpexpand' + And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into + And a cluster is created with no mirrors on "cdw" and "sdw1" + And database "gptest" exists + And the user runs psql with "-c 'CREATE ROLE abc; ALTER ROLE abc DENY DAY 0 DENY DAY 2 DENY BETWEEN DAY 4 AND DAY 5;'" against database "gptest" + And there are no gpexpand_inputfiles + And the cluster is setup for an expansion on hosts "cdw,sdw1" + When the user runs gpexpand interview to add 2 new segment and 0 new host "ignored.host" + Then the number of segments have been saved + When the user runs gpexpand with the latest gpexpand_inputfile with 
additional parameters "--silent" + Then verify that the cluster has 2 new segments + And verify that "pg_description" catalog table is present on new segments + And verify that "pg_shdescription" catalog table is present on new segments + And verify that "pg_auth_time_constraint" catalog table is present on new segments + When the user runs "gpcheckcat gptest" + Then gpcheckcat should return a return code of 0 + And the user runs psql with "-c 'DROP ROLE abc'" against database "gptest" + + @gpexpand_mirrors + @gpexpand_segment + @gpexpand_verify_catalogs + Scenario: expand a cluster that has mirrors and check that gpexpand does not copy extra data directories from master + Given the database is not running + # need to remove this log because otherwise SCAN_LOG may pick up a previous error/warning in the log + And the user runs command "rm -rf ~/gpAdminLogs/gpinitsystem*" + And a working directory of the test as '/data/gpdata/gpexpand' + And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into + And a cluster is created with mirrors on "cdw" and "sdw1" + And database "gptest" exists + And the user runs command "analyzedb -d gptest -a" + And there are no gpexpand_inputfiles + And the cluster is setup for an expansion on hosts "cdw,sdw1" + And the number of segments have been saved + When the user runs gpexpand with a static inputfile for a single-node cluster with mirrors + Then verify that the cluster has 4 new segments + And verify that the path "db_dumps" in each segment data directory does not exist + And verify that the path "gpperfmon/data" in each segment data directory does not exist + And verify that the path "gpperfmon/logs" in each segment data directory does not exist + And verify that the path "promote" in each segment data directory does not exist + And verify that the path "db_analyze" in each segment data directory does not exist + + @gpexpand_no_mirrors + @gpexpand_segment + Scenario: gpexpand should skip already 
expanded/broken tables when redistributing + Given the database is not running + And a working directory of the test as '/data/gpdata/gpexpand' + And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into + And a cluster is created with no mirrors on "cdw" and "sdw1" + And database "gptest" exists + And the user runs psql with "-c 'CREATE TABLE test_good_1(a int)'" against database "gptest" + And the user runs psql with "-c 'CREATE TABLE test_already_expanded(a int)'" against database "gptest" + And the user runs psql with "-c 'CREATE TABLE test_broken(a int)'" against database "gptest" + And the user runs psql with "-c 'CREATE TABLE test_good_2(a int)'" against database "gptest" + And the user runs sql "DROP TABLE test_broken" in "gptest" on primary segment with content 0 + And there are no gpexpand_inputfiles + And the cluster is setup for an expansion on hosts "cdw,sdw1" + When the user runs gpexpand interview to add 2 new segment and 0 new host "ignored.host" + Then the number of segments have been saved + When the user runs gpexpand with the latest gpexpand_inputfile with additional parameters "--silent" + Then verify that the cluster has 2 new segments + And the user runs psql with "-c 'ALTER TABLE test_already_expanded expand table'" against database "gptest" + When the user runs gpexpand to redistribute + Then gpexpand should print "[WARNING]:-Encountered unexpected issue when expanding table gptest.public.test_broken, skipping" escaped to stdout + And gpexpand should print "[INFO]:-Table gptest.public.test_already_expanded seems to be already expanded, marking as done" escaped to stdout + And table "test_good_1" should be marked as expanded + And table "test_good_2" should be marked as expanded + And table "test_already_expanded" should be marked as expanded + And table "test_broken" should not be marked as expanded + + @gpexpand_no_mirrors + @gpexpand_verify_dtx + Scenario: Gpexpand should succeed when xlog has DTX info + Given 
the database is not running + And a working directory of the test as '/data/gpdata/gpexpand' + And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into + And the cluster is generated with "3" primaries only + And database "gptest" exists + And the user runs psql with "-c 'create extension IF NOT EXISTS gp_inject_fault;create table ttt(tc1 int);'" against database "gptest" + And the user runs psql with "-c "SELECT gp_inject_fault('before_notify_commited_dtx_transaction', 'suspend', dbid) FROM gp_segment_configuration WHERE content = -1 AND role = 'p';"" against database "gptest" + And the user runs the command "psql gptest -c 'insert into ttt select generate_series(1,100);'" in the background without sleep + And waiting "1" seconds + And there are no gpexpand_inputfiles + And the cluster is setup for an expansion on hosts "localhost" + When the user runs gpexpand interview to add 1 new segment and 0 new host "ignored.host" + Then the number of segments have been saved + When the user runs gpexpand with the latest gpexpand_inputfile with additional parameters "--silent" + And the user runs psql with "-c "SELECT gp_inject_fault('before_notify_commited_dtx_transaction', 'reset', dbid) FROM gp_segment_configuration WHERE content = -1 AND role = 'p';"" against database "gptest" + And waiting "1" seconds + And the user runs psql with "-c 'drop table ttt;'" against database "gptest" + Then verify that the cluster has 1 new segments diff --git a/gpMgmt/test/behave/mgmt_utils/gpinitsystem.feature b/gpMgmt/test/behave/mgmt_utils/gpinitsystem.feature index 1d69a5403ff..0c6e3dbfa19 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpinitsystem.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpinitsystem.feature @@ -17,6 +17,23 @@ Feature: gpinitsystem tests And gpconfig should print "Coordinator value: off" to stdout And gpconfig should print "Segment value: off" to stdout + Scenario: gpinitsystem should import the system collations + Given the database is not 
running + And create demo cluster config + When the user runs command "gpinitsystem -a -c ../gpAux/gpdemo/clusterConfigFile" + Then gpinitsystem should return a return code of 0 + And the user runs "psql postgres -c \"create table collationimport1 as select * from pg_collation where collnamespace = 'pg_catalog'::regnamespace\"" + # no more collation is imported + When the user runs "psql postgres -c \"select pg_import_system_collations('pg_catalog')\"" + Then psql should return a return code of 0 + And psql should print "0" to stdout + And psql should print "(1 row)" to stdout + And the user runs "psql postgres -c \"create table collationimport2 as select * from pg_collation where collnamespace = 'pg_catalog'::regnamespace\"" + # no difference is before import and after import + When the user runs "psql postgres -c \"select * from collationimport1 except select * from collationimport2\"" + Then psql should return a return code of 0 + And psql should print "(0 rows)" to stdout + Scenario: gpinitsystem creates a cluster when the user set LC_ALL env variable Given create demo cluster config And the environment variable "LC_ALL" is set to "en_US.UTF-8" @@ -70,6 +87,7 @@ Feature: gpinitsystem tests Given the user runs "gpstate" Then gpstate should return a return code of 0 + @extended Scenario: gpinitsystem creates a backout file when gpinitsystem process terminated Given create demo cluster config And all files in gpAdminLogs directory are deleted @@ -84,6 +102,7 @@ Feature: gpinitsystem tests And gpinitsystem should return a return code of 0 And gpintsystem logs should not contain lines about running backout script + @extended Scenario: gpinitsystem creates a backout file when gpcreateseg process terminated Given create demo cluster config And all files in gpAdminLogs directory are deleted @@ -97,6 +116,7 @@ Feature: gpinitsystem tests And gpinitsystem should return a return code of 0 And gpintsystem logs should not contain lines about running backout script + 
@extended Scenario: gpinitsystem does not create or need backout file when user terminated very early Given create demo cluster config And all files in gpAdminLogs directory are deleted @@ -333,4 +353,3 @@ Feature: gpinitsystem tests When the user runs command "grep -q '.*gpcreateseg\.sh.*Completed ssh.*' ~/gpAdminLogs/gpinitsystem*log" Then grep should return a return code of 0 And the user runs command "mv ../gpAux/gpdemo/clusterConfigFile.bak ../gpAux/gpdemo/clusterConfigFile" - diff --git a/gpMgmt/test/behave/mgmt_utils/gpmovemirrors.feature b/gpMgmt/test/behave/mgmt_utils/gpmovemirrors.feature index ed6dda775f4..62565fa3245 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpmovemirrors.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpmovemirrors.feature @@ -26,23 +26,31 @@ Feature: Tests for gpmovemirrors Given a standard local demo cluster is created And a gpmovemirrors directory under '/tmp/gpmovemirrors' with mode '0700' is created And a 'good' gpmovemirrors file is created + And verify replication slot internal_wal_replication_slot is available on all the segments When the user runs gpmovemirrors Then gpmovemirrors should return a return code of 0 And verify the database has mirrors + #gpmovemirrors triggers full recovery where old replication slot is dropped and new one is created + And verify replication slot internal_wal_replication_slot is available on all the segments And all the segments are running And the segments are synchronized + And check segment conf: postgresql.conf And verify that mirrors are recognized after a restart Scenario: gpmovemirrors can change the port of mirrors within a single host Given a standard local demo cluster is created And a gpmovemirrors directory under '/tmp/gpmovemirrors' with mode '0700' is created And a 'samedir' gpmovemirrors file is created + And verify replication slot internal_wal_replication_slot is available on all the segments When the user runs gpmovemirrors Then gpmovemirrors should return a return code of 0 And 
verify the database has mirrors + #gpmovemirrors triggers full recovery where old replication slot is dropped and new one is created + And verify replication slot internal_wal_replication_slot is available on all the segments And all the segments are running And the segments are synchronized And verify that mirrors are recognized after a restart + And check segment conf: postgresql.conf Scenario: gpmovemirrors gives a warning when passed identical attributes for new and old mirrors Given a standard local demo cluster is created @@ -56,6 +64,7 @@ Feature: Tests for gpmovemirrors And the segments are synchronized And verify that mirrors are recognized after a restart + @skip_cleanup Scenario: tablespaces work Given a standard local demo cluster is created And a tablespace is created with data @@ -69,6 +78,7 @@ Feature: Tests for gpmovemirrors And verify that mirrors are recognized after a restart And the tablespace is valid + @skip_cleanup Scenario Outline: gpmovemirrors limits number of parallel processes correctly Given the database is running And all the segments are running @@ -102,7 +112,7 @@ Feature: Tests for gpmovemirrors add a validation error like both hosts recoverying to the same port - so that the triplet code fails assert that gp_seg_config wasn't updated """ - + @skip_cleanup Scenario Outline: user can if mirrors failed to move initially Given the database is running And all the segments are running @@ -178,6 +188,7 @@ Feature: Tests for gpmovemirrors And gprecoverseg should return a return code of 0 And all the segments are running And the segments are synchronized + And check segment conf: postgresql.conf And user can start transactions @@ -199,7 +210,7 @@ Feature: Tests for gpmovemirrors And the user reset the walsender on the primary on content 0 And the user waits until saved async process is completed And recovery_progress.file should not exist in gpAdminLogs - And the user waits until mirror on content 0,1 is up + And verify that mirror on 
content 0,1 is up And check if mirrors on content 0,1 are moved to new location on input file And user can start transactions And all files in gpAdminLogs directory are deleted on all hosts in the cluster @@ -223,10 +234,129 @@ Feature: Tests for gpmovemirrors And the user reset the walsender on the primary on content 0 And the user waits until saved async process is completed And recovery_progress.file should not exist in gpAdminLogs - And the user waits until mirror on content 0,1,2 is up + And verify that mirror on content 0,1,2 is up And check if mirrors on content 0,1,2 are moved to new location on input file And user can start transactions And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And the cluster is recovered in full and rebalanced + + @demo_cluster + @concourse_cluster + @skip_cleanup + Scenario: gpmovemirrors gives warning if pg_basebackup is already running for one of the mirrors to be moved + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And the information of contents 0,1,2 is saved + And user immediately stops all mirror processes for content 0,1,2 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + And the user asynchronously runs "gprecoverseg -aF" and the process is saved + And the user just waits until recovery_progress.file is created in gpAdminLogs + And user waits until gp_stat_replication table has no pg_basebackup entries for content 1,2 + And an FTS probe is triggered + And the user waits until mirror on content 1,2 is up + And verify that mirror on content 0 is down + And the gprecoverseg lock directory is removed + And user immediately stops all mirror processes for content 1,2 + And the user waits until mirror on content 1,2 is down + And a gpmovemirrors directory under '/tmp' with mode '0700' is created + And a 
gpmovemirrors input file is created + And edit the input file to recover mirror with content 0,1,2 to a new directory with mode 0700 + When the user runs gpmovemirrors with input file and additional args " " + Then gprecoverseg should print "Found pg_basebackup running for segments with contentIds [0], skipping recovery of these segments" to logfile + And gprecoverseg should return a return code of 0 + And gpmovemirrors should return a return code of 0 + And verify that mirror on content 1,2 is up + And verify that mirror on content 0 is down + And check if mirrors on content 1,2 are moved to new location on input file + And check if mirrors on content 0 are in their original configuration + And the user reset the walsender on the primary on content 0 + And the user waits until saved async process is completed + And recovery_progress.file should not exist in gpAdminLogs + And verify that mirror on content 0 is up + And the cluster is recovered in full and rebalanced + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + + @demo_cluster + @concourse_cluster + @skip_cleanup + Scenario: gpmovemirrors gives warning if pg_basebackup is already running for some of the mirrors to be moved + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And the information of contents 0,1,2 is saved + And user immediately stops all mirror processes for content 0,1,2 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + And the user suspend the walsender on the primary on content 1 + And the user asynchronously runs "gprecoverseg -aF" and the process is saved + And the user just waits until recovery_progress.file is created in gpAdminLogs + And user waits until gp_stat_replication table has no pg_basebackup entries for content 2 + And the user waits until mirror on content 2 is up 
+ And verify that mirror on content 0,1 is down + And the gprecoverseg lock directory is removed + And user immediately stops all mirror processes for content 2 + And the user waits until mirror on content 2 is down + And a gpmovemirrors directory under '/tmp' with mode '0700' is created + And a gpmovemirrors input file is created + And edit the input file to recover mirror with content 0,1,2 to a new directory with mode 0700 + When the user runs gpmovemirrors with input file and additional args " " + Then gprecoverseg should print "Found pg_basebackup running for segments with contentIds [0, 1], skipping recovery of these segments" to logfile + And gprecoverseg should return a return code of 0 + And gpmovemirrors should return a return code of 0 + And verify that mirror on content 2 is up + And verify that mirror on content 0,1 is down + And check if mirrors on content 2 are moved to new location on input file + And check if mirrors on content 0,1 are in their original configuration + And the user reset the walsender on the primary on content 0 + And the user reset the walsender on the primary on content 1 + And the user waits until saved async process is completed + And recovery_progress.file should not exist in gpAdminLogs + And verify that mirror on content 0,1 is up + And the cluster is recovered in full and rebalanced + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + + @demo_cluster + @concourse_cluster + @skip_cleanup + Scenario: gpmovemirrors gives warning if pg_basebackup is already running for all mirrors to be moved + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And the information of contents 0,1,2 is saved + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover mirror with content 0 to a 
new directory on remote host with mode 0700 + And edit the input file to recover mirror with content 1 to a new directory on remote host with mode 0700 + And edit the input file to recover mirror with content 2 to a new directory on remote host with mode 0700 + And user immediately stops all mirror processes for content 0,1,2 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + And the user suspend the walsender on the primary on content 1 + And the user suspend the walsender on the primary on content 2 + When the user asynchronously runs gprecoverseg with input file and additional args "-a" and the process is saved + And the user just waits until recovery_progress.file is created in gpAdminLogs + And verify that mirror on content 0,1,2 is down + And the gprecoverseg lock directory is removed + Given a gpmovemirrors directory under '/tmp' with mode '0700' is created + And a gpmovemirrors input file is created + And edit the input file to recover mirror with content 0,1,2 to a new directory with mode 0700 + When the user runs gpmovemirrors with input file and additional args "-v" + And gprecoverseg should return a return code of 0 + And gpmovemirrors should return a return code of 0 + Then gprecoverseg should print "Found pg_basebackup running for segments with contentIds [0, 1, 2], skipping recovery of these segments" to logfile + And the user reset the walsender on the primary on content 0 + And the user reset the walsender on the primary on content 1 + And the user reset the walsender on the primary on content 2 + And the user waits until saved async process is completed + And recovery_progress.file should not exist in gpAdminLogs + And verify that mirror on content 0,1,2 is up + And the cluster is recovered in full and rebalanced + And all files in gpAdminLogs directory are deleted on all hosts in the cluster ########################### @concourse_cluster tests ########################### @@ -236,7 +366,7 @@ 
Feature: Tests for gpmovemirrors Scenario: gpmovemirrors can change from group mirroring to spread mirroring Given verify that mirror segments are in "group" configuration And pg_hba file "/data/gpdata/primary/gpseg1/pg_hba.conf" on host "sdw1" contains only cidr addresses - And a sample gpmovemirrors input file is created in "spread" configuration + And a sample gpmovemirrors input file is created in "spread" configuration on "old" parent directory When the user runs "gpmovemirrors --input=/tmp/gpmovemirrors_input_spread" Then gpmovemirrors should return a return code of 0 # Verify that mirrors are functional in the new configuration @@ -268,19 +398,23 @@ Feature: Tests for gpmovemirrors Then gprecoverseg should return a return code of 0 And all the segments are running And the segments are synchronized + And check segment conf: postgresql.conf @concourse_cluster Scenario: gpmovemirrors can change from spread mirroring to group mirroring Given verify that mirror segments are in "spread" configuration - And a sample gpmovemirrors input file is created in "group" configuration + And a sample gpmovemirrors input file is created in "group" configuration on "old" parent directory When the user runs "gpmovemirrors --input=/tmp/gpmovemirrors_input_group --hba-hostnames" Then gpmovemirrors should return a return code of 0 # Verify that mirrors are functional in the new configuration Then verify the database has mirrors And all the segments are running And the segments are synchronized + And saving host IP address of "sdw3" # gpmovemirrors_input_group moves mirror on sdw3 to sdw2, corresponding primary should now have sdw2 entry And pg_hba file "/data/gpdata/primary/gpseg1/pg_hba.conf" on host "sdw1" contains entries for "sdw2" + And pg_hba file on primary of mirrors on "sdw2" with "1" contains no replication entries for "sdw3" + And verify that only replication connection primary has is to "sdw2" And verify that mirror segments are in "group" configuration And verify that 
mirrors are recognized after a restart And the information of a "mirror" segment on a remote host is saved @@ -305,12 +439,13 @@ Feature: Tests for gpmovemirrors Then gprecoverseg should return a return code of 0 And all the segments are running And the segments are synchronized + And check segment conf: postgresql.conf @concourse_cluster Scenario: tablespaces work on a multi-host environment Given verify that mirror segments are in "group" configuration And a tablespace is created with data - And a sample gpmovemirrors input file is created in "spread" configuration + And a sample gpmovemirrors input file is created in "spread" configuration on "old" parent directory When the user runs "gpmovemirrors --input=/tmp/gpmovemirrors_input_spread" Then gpmovemirrors should return a return code of 0 And verify the tablespace directories on host "sdw2" for content "1" are deleted @@ -353,13 +488,14 @@ Feature: Tests for gpmovemirrors And gprecoverseg should print "Initiating segment recovery." to stdout And check if mirrors on content 0,1,2 are moved to new location on input file - And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts + And gpAdminLogs directory has "pg_basebackup*" files on respective hosts only for content 0,1,2 And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts And the mode of all the created data directories is changed to 0700 And the cluster is recovered in full and rebalanced + And check segment conf: postgresql.conf And the row count from table "test_movemirrors" in "postgres" is verified against the saved data @concourse_cluster @@ -370,6 +506,7 @@ Feature: Tests for gpmovemirrors And the segments are synchronized And all files in gpAdminLogs directory are deleted on all hosts in the cluster And the information of contents 0,1,2 is saved + And check 
segment conf: postgresql.conf And sql "DROP TABLE if exists test_movemirrors; CREATE TABLE test_movemirrors AS SELECT generate_series(1,10000) AS i" is executed in "postgres" db And the "test_movemirrors" table row count in "postgres" is saved @@ -387,13 +524,15 @@ Feature: Tests for gpmovemirrors And check if mirrors on content 0 are in their original configuration And check if mirrors on content 1,2 are moved to new location on input file And verify that mirror on content 1,2,3,4,5 is up - And gpAdminLogs directory has "pg_basebackup*" files on respective hosts only for content 0 + And gpAdminLogs directory has "pg_basebackup*" files on respective hosts only for content 0,1,2 And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts + And check segment conf: postgresql.conf And the mode of all the created data directories is changed to 0700 And the cluster is recovered in full and rebalanced + And check segment conf: postgresql.conf And the row count from table "test_movemirrors" in "postgres" is verified against the saved data @concourse_cluster @@ -404,6 +543,7 @@ Feature: Tests for gpmovemirrors And the segments are synchronized And all files in gpAdminLogs directory are deleted on all hosts in the cluster And the information of contents 0,1,2,3,4,5 is saved + And check segment conf: postgresql.conf And sql "DROP TABLE if exists test_movemirrors; CREATE TABLE test_movemirrors AS SELECT generate_series(1,10000) AS i" is executed in "postgres" db And the "test_movemirrors" table row count in "postgres" is saved @@ -424,9 +564,11 @@ Feature: Tests for gpmovemirrors And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts + And check 
segment conf: postgresql.conf And the mode of all the created data directories is changed to 0700 And the cluster is recovered in full and rebalanced + And check segment conf: postgresql.conf And the row count from table "test_movemirrors" in "postgres" is verified against the saved data @concourse_cluster @@ -437,6 +579,7 @@ Feature: Tests for gpmovemirrors And the segments are synchronized And all files in gpAdminLogs directory are deleted on all hosts in the cluster And the information of contents 0,1,2,3,4,5 is saved + And check segment conf: postgresql.conf And sql "DROP TABLE if exists test_movemirrors; CREATE TABLE test_movemirrors AS SELECT generate_series(1,10000) AS i" is executed in "postgres" db And the "test_movemirrors" table row count in "postgres" is saved @@ -459,7 +602,63 @@ Feature: Tests for gpmovemirrors And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts + And check segment conf: postgresql.conf And the mode of all the created data directories is changed to 0700 And the cluster is recovered in full and rebalanced + And check segment conf: postgresql.conf And the row count from table "test_movemirrors" in "postgres" is verified against the saved data + + @concourse_cluster + Scenario: gpmovemirrors removes the stale replication entries from pg_hba when moving mirrors to another host + Given a working directory of the test as '/tmp/gpmovemirrors' + And the database is not running + And a cluster is created with "spread" segment mirroring on "cdw" and "sdw1, sdw2, sdw3" + And verify that mirror segments are in "spread" configuration + And a gpmovemirrors directory under '/tmp' with mode '0700' is created + And create an input file to move mirrors from "sdw1" to "sdw3" in "same" data directory + When the user runs "gpmovemirrors -a --input=/tmp/gpmovemirrors_input_sdw1_sdw3" + 
Then gpmovemirrors should return a return code of 0 + Then verify the database has mirrors + And all the segments are running + And the segments are synchronized + And saving host IP address of "sdw1" + And pg_hba file on primary of mirrors on "sdw3" with "3,4" contains no replication entries for "sdw1" + And verify that only replication connection primary has is to "sdw3" + + @concourse_cluster + Scenario: gpmovemirrors fails if the target host does not have enough free disk space to move mirror from source host + Given the database is running + And all the segments are running + And the segments are synchronized + And a tablespace is created with data + And mount a filesystem with min total capacity + And a gpmovemirrors input file is created + And edit the input file to move mirror with content 0 to a new directory on remote host with mode 0700 + And edit the input file to move mirror with content 1 to a new directory on remote host with mode 0700 + And edit the input file to move mirror with content 2 to a new directory on remote host with mode 0700 + And edit the input file to move mirror with content 3 to a new directory on remote host with mode 0700 + And edit the input file to move mirror with content 4 to a new directory on remote host with mode 0700 + And edit the input file to move mirror with content 5 to a new directory on remote host with mode 0700 + + When the user runs gpmovemirrors + Then gpmovemirrors should return a return code of 3 + And gpmovemirrors should print "Insufficient disk space on target mirror hosts." 
to stdout + And all the segments are running + And the segments are synchronized + + @concourse_cluster + Scenario: gpmovemirrors fails if the target host does not have enough free disk space to move mirror to new host + Given the database is running + And all the segments are running + And the segments are synchronized + And a tablespace is created with data + And mount a filesystem with min total capacity + And create an input file to move mirrors from "sdw2" to "sdw3" in "context" data directory + When the user runs "gpmovemirrors --input=/tmp/gpmovemirrors_input_sdw2_sdw3" + + Then gpmovemirrors should return a return code of 3 + And gpmovemirrors should print "Insufficient disk space on target mirror hosts." to stdout + And all the segments are running + And the segments are synchronized + diff --git a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature index 3d28dfc11d5..d8491f2da38 100644 --- a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature +++ b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature @@ -1,22 +1,72 @@ @gprecoverseg Feature: gprecoverseg tests - Scenario: incremental recovery works with tablespaces + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg recovery with a recovery configuration file and differential flag + Given the database is running + And all the segments are running + And the segments are synchronized + And user immediately stops all mirror processes for content 0,1,2 + And the user waits until mirror on content 0,1,2 is down + And user can start transactions + And the gprecoverseg input file "recover_config_file" is cleaned up + When a gprecoverseg input file "recover_config_file" is created with all the failed segments and valid recovery type + And the user runs "gprecoverseg -i /tmp/recover_config_file -a --differential" + Then gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is up + And gprecoverseg should print
"Synchronization mode.* = Differential" to stdout 2 times + And gprecoverseg should print "Synchronization mode.* = Full" to stdout 1 times + And all the segments are running + And the segments are synchronized + + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg with a recovery configuration file specifying the recovery type + Given the database is running + And all the segments are running + And the segments are synchronized + And user immediately stops all mirror processes for content 0,1,2 + And the user waits until mirror on content 0,1,2 is down + And user can start transactions + And the gprecoverseg input file "recover_config_file" is cleaned up + When a gprecoverseg input file "recover_config_file" is created with all the failed segments and invalid recovery type + And the user runs "gprecoverseg -i /tmp/recover_config_file -a" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "Invalid recovery type provided, please provide any of I,D,F,i,d,f as recovery_type" to stdout + And verify that mirror on content 0,1,2 is down + When a gprecoverseg input file "recover_config_file" is created with all the failed segments and valid recovery type + And the user runs "gprecoverseg -i /tmp/recover_config_file -a" + Then gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is up + And gprecoverseg should print "Synchronization mode.*= Incremental" to stdout 1 times + And gprecoverseg should print "Synchronization mode.* = Differential" to stdout 1 times + And gprecoverseg should print "Synchronization mode.* = Full" to stdout 1 times + And all the segments are running + And the segments are synchronized + + Scenario Outline: recovery works with tablespaces Given the database is running - And a tablespace is created with data And user stops all primary processes And user can start transactions - When the user runs "gprecoverseg -a" + And a tablespace is created with data + When the user
runs "gprecoverseg <args>" Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Future gprecoverseg executions might remove the currently created pg_basebackup/pg_rewind/rsync progress files, please save these files if needed." to stdout And the segments are synchronized + And verify replication slot internal_wal_replication_slot is available on all the segments And the tablespace is valid + And the tablespace has valid symlink And the database segments are in execute mode Given another tablespace is created with data When the user runs "gprecoverseg -ra" Then gprecoverseg should return a return code of 0 And the segments are synchronized + And verify replication slot internal_wal_replication_slot is available on all the segments And the tablespace is valid + And the tablespace has valid symlink And the other tablespace is valid And the database segments are in execute mode Examples: @@ -25,28 +75,137 @@ Feature: gprecoverseg tests | differential | -a --differential | | full | -aF | - Scenario: full recovery works with tablespaces + + @demo_cluster + @concourse_cluster + Scenario: differential recovery runs successfully Given the database is running - And a tablespace is created with data + And the segments are synchronized + And verify replication slot internal_wal_replication_slot is available on all the segments And user stops all primary processes And user can start transactions - When the user runs "gprecoverseg -a -F" + When the user runs "gprecoverseg -av --differential" Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Successfully dropped replication slot internal_wal_replication_slot" to stdout + And gprecoverseg should print "Successfully created replication slot internal_wal_replication_slot" to stdout + And gprecoverseg should print "Segments successfully recovered" to stdout + And verify that mirror on content 0,1,2 is up + And verify replication slot internal_wal_replication_slot is available
on all the segments And the segments are synchronized - And the tablespace is valid + And the cluster is rebalanced - Given another tablespace is created with data - When the user runs "gprecoverseg -ra" + + Scenario: differential recovery shows error message if run with the wrong argument + Given the database is running + And user stops all primary processes + And user can start transactions + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + When the user runs "gprecoverseg -a --differential -F" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "Only one of -F and --differential may be specified" to stdout + When the user runs "gprecoverseg -a --differential -p localhost" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "Only one of -p, -r and --differential may be specified" to stdout + When the user runs "gprecoverseg -a --differential -o outputConfigFile" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "Invalid -o provided with --differential argument" to stdout + When the user runs "gprecoverseg -a --differential" + Then gprecoverseg should return a return code of 0 + And the segments are synchronized + And the cluster is rebalanced + + + @demo_cluster + @concourse_cluster + Scenario: Differential recovery succeeds if previous incremental recovery failed + Given the database is running + And user stops all primary processes + And user can start transactions + And all files in pg_wal directory are deleted from data directory of preferred primary of content 0,1,2 + When the user runs "gprecoverseg -a" + Then gprecoverseg should return a return code of 1 + And user can start transactions + And verify that mirror on content 0,1,2 is down + When the user runs "gprecoverseg -a --differential" Then gprecoverseg should return a return code of 0 - And the segments are synchronized - And the 
tablespace is valid - And the other tablespace is valid + And verify that mirror on content 0,1,2 is up + And verify replication slot internal_wal_replication_slot is available on all the segments + And the cluster is rebalanced + + @demo_cluster + @concourse_cluster + Scenario: Differential recovery succeeds if previous full recovery failed + Given the database is running + And user stops all primary processes + And user can start transactions + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover mirror with content 0 incremental + And edit the input file to recover mirror with content 1 full inplace + And edit the input file to recover mirror with content 2 to a new directory on remote host with mode 0000 + When the user runs gprecoverseg with input file and additional args "-a" + Then gprecoverseg should return a return code of 1 + And user can start transactions + And verify that mirror on content 0,1 is up + And verify that mirror on content 2 is down + When the user runs "gprecoverseg -a --differential" + Then gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is up + And verify replication slot internal_wal_replication_slot is available on all the segments + And the cluster is rebalanced + + + @concourse_cluster + Scenario: gpstate track of differential recovery for single host + Given the database is running + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all mirror processes for content 0 + And the user waits until mirror on content 0 is down + And user can start transactions + And sql "DROP TABLE IF EXISTS test_recoverseg; CREATE TABLE test_recoverseg AS SELECT generate_series(1,100000000) AS a;" is executed in "postgres" db + And sql "DROP TABLE IF EXISTS test_recoverseg_1; CREATE TABLE test_recoverseg_1 AS SELECT generate_series(1,100000000) AS a;" is executed 
in "postgres" db + When the user asynchronously runs "gprecoverseg -a --differential" and the process is saved + Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies that all dbids progress with pg_data are present + When the user runs "gpstate -e" + Then gpstate should print "Segments in recovery" to stdout + And gpstate output contains "differential" entries for mirrors of content 0 + And gpstate output looks like + | Segment | Port | Recovery type | Stage | Completed bytes \(kB\) | Percentage completed | + | \S+ | [0-9]+ | differential | Syncing pg_data of dbid 6 | ([\d,]+)[ \t] | \d+% | + And the user waits until saved async process is completed + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And sql "DROP TABLE IF EXISTS test_recoverseg;" is executed in "postgres" db + And sql "DROP TABLE IF EXISTS test_recoverseg_1;" is executed in "postgres" db + And the cluster is rebalanced + + + @concourse_cluster + Scenario: check Tablespace Recovery Progress with gpstate + Given the database is running + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all mirror processes for content 0 + And user can start transactions + And a tablespace is created with data + And insert additional data into the tablespace + When the user asynchronously runs "gprecoverseg -a --differential" and the process is saved + Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies that all dbids progress with tablespace are present + When the user runs "gpstate -e" + Then gpstate should print "Segments in recovery" to stdout + And gpstate output contains "differential" entries for mirrors of content 0 + And gpstate output looks like + | Segment | Port | Recovery type | Stage | Completed bytes \(kB\) | Percentage completed | + | \S+ | [0-9]+ | differential | Syncing tablespace of dbid 6 for oid \d+ | ([\d,]+)[ \t] | \d+% | + And the user 
waits until saved async process is completed + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And the cluster is rebalanced + Scenario Outline: full recovery limits number of parallel processes correctly - Given a standard local demo cluster is created + Given the database is running And 2 gprecoverseg directory under '/tmp/recoverseg' with mode '0700' is created And a good gprecoverseg input file is created for moving 2 mirrors - When the user runs gprecoverseg with input file and additional args "-a -F -v " + When the user runs gprecoverseg with input file and additional args "-a -v " Then gprecoverseg should return a return code of 0 And gprecoverseg should only spawn up to workers in WorkerPool And check if gprecoverseg ran "$GPHOME/sbin/gpsegsetuprecovery.py" 1 times with args "-b " @@ -55,6 +214,7 @@ Feature: gprecoverseg tests And gpsegrecovery should only spawn up to workers in WorkerPool And check if gprecoverseg ran "$GPHOME/sbin/gpsegstop.py" 1 times with args "-b " And the segments are synchronized + And check segment conf: postgresql.conf Examples: | args | coordinator_workers | segHost_workers | @@ -62,6 +222,21 @@ Feature: gprecoverseg tests | -B 2 -b 1 | 2 | 1 | | -B 1 -b 2 | 1 | 2 | + Scenario: Differential recovery limits number of parallel processes correctly + Given the database is running + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + When the user runs "gprecoverseg -av --differential -B 1 -b 2" + Then gprecoverseg should return a return code of 0 + And gprecoverseg should only spawn up to 1 workers in WorkerPool + And check if gprecoverseg ran "$GPHOME/sbin/gpsegsetuprecovery.py" 1 times with args "-b 2" + And check if gprecoverseg ran "$GPHOME/sbin/gpsegrecovery.py" 1 times with args "-b 2" + And gpsegsetuprecovery should only spawn up to 2 workers in WorkerPool + And gpsegrecovery should only spawn up to 2 workers in WorkerPool + And the segments are 
synchronized + And check segment conf: postgresql.conf + And the cluster is rebalanced + Scenario Outline: Rebalance correctly limits the number of concurrent processes Given the database is running And user stops all primary processes @@ -78,6 +253,7 @@ Feature: gprecoverseg tests And check if gprecoverseg ran "$GPHOME/sbin/gpsegrecovery.py" 1 times with args "-b " And check if gprecoverseg ran "$GPHOME/sbin/gpsegstop.py" 1 times with args "-b " And the segments are synchronized + And check segment conf: postgresql.conf Examples: | args | coordinator_workers | segHost_workers | @@ -98,72 +274,45 @@ Feature: gprecoverseg tests When the user runs "gprecoverseg -ra" Then gprecoverseg should return a return code of 0 And gprecoverseg should not print "Unhandled exception in thread started by recovery displays pg_controldata success info Given the database is running And all the segments are running And the segments are synchronized And user stops all mirror processes When user can start transactions - And the user runs "gprecoverseg -F -a" + And the user runs "gprecoverseg " Then gprecoverseg should return a return code of 0 And gprecoverseg should print "Successfully finished pg_controldata.* for dbid.*" to stdout And the segments are synchronized + And verify replication slot internal_wal_replication_slot is available on all the segments And check segment conf: postgresql.conf - Scenario: gprecoverseg incremental recovery displays pg_controldata success info - Given the database is running - And all the segments are running - And the segments are synchronized - And user stops all mirror processes - When user can start transactions - And the user runs "gprecoverseg -a" - Then gprecoverseg should return a return code of 0 - And gprecoverseg should print "Successfully finished pg_controldata.* for dbid.*" to stdout - And the segments are synchronized - And check segment conf: postgresql.conf + Examples: + | scenario | args | + | incremental | -a | + | differential | -a 
--differential | + | full | -aF | Scenario: gprecoverseg mixed recovery displays pg_basebackup and rewind progress to the user Given the database is running @@ -186,8 +335,8 @@ Feature: gprecoverseg tests And gprecoverseg should print "Segments successfully recovered" to stdout And check if gprecoverseg ran gpsegsetuprecovery.py 1 times with the expected args And check if gprecoverseg ran gpsegrecovery.py 1 times with the expected args - And gpAdminLogs directory has no "pg_basebackup*" files - And gpAdminLogs directory has no "pg_rewind*" files + And gpAdminLogs directory has "pg_basebackup*" files + And gpAdminLogs directory has "pg_rewind*" files And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts And the old data directories are cleaned up for content 0 @@ -208,7 +357,7 @@ Feature: gprecoverseg tests When the user runs "gprecoverseg -a -s" Then gprecoverseg should return a return code of 0 And gprecoverseg should print "pg_rewind: Done!" to stdout for each mirror - And gpAdminLogs directory has no "pg_rewind*" files + And gpAdminLogs directory has "pg_rewind*" files And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts @@ -226,7 +375,7 @@ Feature: gprecoverseg tests Then gprecoverseg should return a return code of 0 And gprecoverseg should print "Initiating segment recovery. Upon completion, will start the successfully recovered segments" to stdout And gprecoverseg should not print "pg_basebackup: base backup completed" to stdout - And gpAdminLogs directory has no "pg_basebackup*" files + And gpAdminLogs directory has "pg_basebackup*" files And all the segments are running And the segments are synchronized @@ -268,9 +417,7 @@ Feature: gprecoverseg tests And gprecoverseg should print "Initiating segment recovery. 
Upon completion, will start the successfully recovered segments" to stdout And gprecoverseg should print "total size" to stdout for each mirror And gprecoverseg should print "Segments successfully recovered" to stdout - And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts - And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts - And gpAdminLogs directory has no "rsync*" files on all segment hosts + And gpAdminLogs directory has "rsync*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files And gpAdminLogs directory has "gpsegsetuprecovery*" files And all the segments are running @@ -290,9 +437,7 @@ Feature: gprecoverseg tests And gprecoverseg should print "Initiating segment recovery. Upon completion, will start the successfully recovered segments" to stdout And gprecoverseg should not print "total size is .* speedup is .*" to stdout And gprecoverseg should print "Segments successfully recovered" to stdout - And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts - And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts - And gpAdminLogs directory has no "rsync*" files on all segment hosts + And gpAdminLogs directory has "rsync*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files And gpAdminLogs directory has "gpsegsetuprecovery*" files And all the segments are running @@ -371,11 +516,57 @@ Feature: gprecoverseg tests When the user runs "gprecoverseg -a -s" And gprecoverseg should print "skipping pg_rewind on mirror as standby.signal is present" to stdout Then gprecoverseg should return a return code of 0 - And gpAdminLogs directory has no "pg_rewind*" files + And gpAdminLogs directory has "pg_rewind*" files + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + Scenario: gprecoverseg should drop existing slot on full recovery + Given the database is running + And all the segments 
are running + And the segments are synchronized + And verify replication slot internal_wal_replication_slot is available on all the segments + And user stops all mirror processes + And user can start transactions + And the user waits until mirror on content 0,1,2 is down + When the user runs "gprecoverseg -a -F -v" + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Checking if slot internal_wal_replication_slot exists" to stdout + And gprecoverseg should print "Successfully dropped replication slot internal_wal_replication_slot" to stdout + And gprecoverseg should print "pg_basebackup: base backup completed" to stdout + And gprecoverseg should print "Segments successfully recovered" to stdout + And verify that mirror on content 0,1,2 is up + And verify replication slot internal_wal_replication_slot is available on all the segments And all the segments are running And the segments are synchronized And the cluster is rebalanced + Scenario Outline: recovery should not try to drop slot if slot does not exist + Given the database is running + And all the segments are running + And the segments are synchronized + And verify replication slot internal_wal_replication_slot is available on all the segments + And the mirror on content 0 is stopped + And user can start transactions + And the status of the mirror on content 0 should be "d" + And the user runs sql "select pg_drop_replication_slot('internal_wal_replication_slot');" in "postgres" on first primary segment + When the user runs "gprecoverseg " + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Checking if slot internal_wal_replication_slot exists" to stdout + And gprecoverseg should print "Slot internal_wal_replication_slot does not exist" to stdout + And gprecoverseg should not print "Successfully dropped replication slot internal_wal_replication_slot" to stdout + And gprecoverseg should print "Segments successfully recovered" to stdout + And 
verify that mirror on content 0 is up + And verify replication slot internal_wal_replication_slot is available on all the segments + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + Examples: + | scenario | args | + | differential | -av --differential | + | full | -avF | + @backup_restore_bashrc Scenario: gprecoverseg should not return error when banner configured on host Given the database is running @@ -390,6 +581,49 @@ Feature: gprecoverseg tests And the segments are synchronized And the cluster is rebalanced + Scenario: gprecoverseg errors out with restricted options + Given the database is running + And user stops all primary processes + And user can start transactions + When the user runs "gprecoverseg -a -F -r" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "-F option is not supported with -r option" to stdout + When the user runs "gprecoverseg -a -p localhost -F" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "-F option is not supported with -p option" to stdout + When the user runs "gprecoverseg xyz" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "Recovers a primary or mirror segment instance" to stdout + And gprecoverseg should print "too many arguments: only options may be specified" to stdout + When the user runs "gprecoverseg -a" + Then gprecoverseg should return a return code of 0 + And the segments are synchronized + And the cluster is rebalanced + + Scenario: gprecoverseg recovers segment for valid max-rate options and errors out for others + Given the database is running + And all the segments are running + And the segments are synchronized + When user stops all primary processes + And user can start transactions + And the user runs "gprecoverseg -aF --max-rate 30" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "error: transfer rate 30 is out 
of range" to stdout + When the user runs "gprecoverseg -aF --max-rate k35" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "error: transfer rate k35 is not a valid value" to stdout + When the user runs "gprecoverseg -aF --max-rate 0" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "error: Transfer rate must be greater than zero" to stdout + When the user runs "gprecoverseg -aF --max-rate 32G" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "error: Invalid --max-rate unit: G" to stdout + When the user runs "gprecoverseg -aF --max-rate 104857.6k" + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Segments successfully recovered" to stdout + And gprecoverseg should print "Maximum Transfer Rate.*= 104857.6k" to stdout + And the segments are synchronized + And the cluster is rebalanced ########################### @concourse_cluster tests ########################### # The @concourse_cluster tag denotes the scenario that requires a remote cluster @@ -430,47 +664,71 @@ Feature: gprecoverseg tests And the cluster is returned to a good state Examples: - | scenario | args | - | incremental | -a | - | full | -aF | + | scenario | args | + | incremental | -a | + | differential | -a --differential | + | full | -aF | + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg creates output sample config file correctly when failed segment hosts are unreachable + Given the database is running + And all the segments are running + And the segments are synchronized + And the primary on content 1 is stopped + And the primary on content 2 is stopped + And user can start transactions + And the status of the primary on content 1 should be "d" + And the status of the primary on content 2 should be "d" + And the host for the primary on content 1 is made unreachable + When the user runs "gprecoverseg -o /tmp/output_config" + Then gprecoverseg 
should return a return code of 0 + And gprecoverseg should print "One or more hosts are not reachable via SSH." to stdout + And gprecoverseg should print "Host invalid_host is unreachable" to stdout + And the created config file /tmp/output_config contains the commented row for unreachable failed segment + And the cluster is returned to a good state + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg throws exception when -o flag used with invalid flags + Given the database is running + And all the segments are running + And the segments are synchronized + When the user runs "gprecoverseg -o output_config -i input_config" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "Invalid -i provided with -o argument" to stdout + When the user runs "gprecoverseg -o /tmp/output_config -r" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "Invalid -r provided with -o argument" to stdout @concourse_cluster - Scenario: incremental recovery works with tablespaces on a multi-host environment + Scenario Outline: recovery works with tablespaces on a multi-host environment Given the database is running - And a tablespace is created with data And user stops all primary processes And user can start transactions - When the user runs "gprecoverseg -a" + And a tablespace is created with data + When the user runs "gprecoverseg " Then gprecoverseg should return a return code of 0 And the segments are synchronized And the tablespace is valid + And the tablespace has valid symlink And the database segments are in execute mode Given another tablespace is created with data When the user runs "gprecoverseg -ra" Then gprecoverseg should return a return code of 0 And the segments are synchronized + And verify replication slot internal_wal_replication_slot is available on all the segments And the tablespace is valid + And the tablespace has valid symlink And the other tablespace is valid And the database segments are 
in execute mode - @concourse_cluster - Scenario: full recovery works with tablespaces on a multi-host environment - Given the database is running - And a tablespace is created with data - And user stops all primary processes - And user can start transactions - When the user runs "gprecoverseg -a -F" - Then gprecoverseg should return a return code of 0 - And the segments are synchronized - And the tablespace is valid - - Given another tablespace is created with data - When the user runs "gprecoverseg -ra" - Then gprecoverseg should return a return code of 0 - And the segments are synchronized - And the tablespace is valid - And the other tablespace is valid + Examples: + | scenario | args | + | incremental | -a | + | differential | -a --differential | + | full | -aF | @concourse_cluster Scenario: recovering a host with tablespaces succeeds @@ -511,6 +769,7 @@ Feature: gprecoverseg tests # verify the data And the tablespace is valid + And the tablespace has valid symlink And the row count from table "public.before_host_is_down" in "gptest" is verified against the saved data And the row count from table "public.after_host_is_down" in "gptest" is verified against the saved data @@ -525,14 +784,14 @@ Feature: gprecoverseg tests And sql "DROP TABLE IF EXISTS test_recoverseg; CREATE TABLE test_recoverseg AS SELECT generate_series(1,100000000) AS a;" is executed in "postgres" db When the user asynchronously runs "gprecoverseg -a" and the process is saved Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies its format - And an FTS probe is triggered And the user waits until saved async process is completed And recovery_progress.file should not exist in gpAdminLogs And the user waits until mirror on content 0,1,2 is up And user can start transactions And all files in gpAdminLogs directory are deleted on all hosts in the cluster And a sample recovery_progress.file is created from saved lines - Then a sample gprecoverseg.lock directory is 
created in coordinator_data_directory + And we run a sample background script to generate a pid on "coordinator" segment + Then a sample gprecoverseg.lock directory is created using the background pid in coordinator_data_directory When the user runs "gpstate -e" Then gpstate should print "Segments in recovery" to stdout # And gpstate output contains "incremental,incremental,incremental" entries for mirrors of content 0,1,2 @@ -542,6 +801,7 @@ Feature: gprecoverseg tests # | \S+ | [0-9]+ | incremental | [0-9]+ | [0-9]+ | [0-9]+\% | # | \S+ | [0-9]+ | incremental | [0-9]+ | [0-9]+ | [0-9]+\% | And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And the background pid is killed on "coordinator" segment Then the gprecoverseg lock directory is removed And the cluster is rebalanced @@ -552,20 +812,14 @@ Feature: gprecoverseg tests And the user suspend the walsender on the primary on content 0 Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies its format And verify that lines from recovery_progress.file are present in segment progress files in gpAdminLogs - + When the user runs "gpstate -e" + Then gpstate should print "Segments in recovery" to stdout And the user reset the walsender on the primary on content 0 And the user waits until saved async process is completed And recovery_progress.file should not exist in gpAdminLogs - And an FTS probe is triggered - And the user waits until mirror on content 0,1,2 is up + And verify that mirror on content 0,1,2 is up And user can start transactions - - And a sample recovery_progress.file is created from saved lines - Then a sample gprecoverseg.lock directory is created in coordinator_data_directory - When the user runs "gpstate -e" - Then gpstate should print "Segments in recovery" to stdout And all files in gpAdminLogs directory are deleted on all hosts in the cluster - Then the gprecoverseg lock directory is removed @demo_cluster @concourse_cluster @@ -583,7 
+837,7 @@ Feature: gprecoverseg tests And the user reset the walsender on the primary on content 0 And the user waits until saved async process is completed And recovery_progress.file should not exist in gpAdminLogs - And the user waits until mirror on content 0,1,2 is up + And verify that mirror on content 0,1,2 is up And user can start transactions And all files in gpAdminLogs directory are deleted on all hosts in the cluster @@ -602,10 +856,27 @@ Feature: gprecoverseg tests And the user reset the walsender on the primary on content 0 And the user waits until saved async process is completed And recovery_progress.file should not exist in /tmp/custom_logdir - And the user waits until mirror on content 0,1,2 is up + And verify that mirror on content 0,1,2 is up And user can start transactions And all files in "/tmp/custom_logdir" directory are deleted on all hosts in the cluster + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg creates recovery_progress.file in gpAdminLogs for differential recovery of mirrors + Given the database is running + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all mirror processes for content 0,1,2 + And the user waits until mirror on content 0,1,2 is down + And user can start transactions + And sql "DROP TABLE IF EXISTS test_recoverseg; CREATE TABLE test_recoverseg AS SELECT generate_series(1,100000000) AS a;" is executed in "postgres" db + When the user asynchronously runs "gprecoverseg -a --differential" and the process is saved + Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies its format + And verify that lines from recovery_progress.file are present in segment progress files in gpAdminLogs + And the user waits until saved async process is completed + And recovery_progress.file should not exist in gpAdminLogs + And verify that mirror on content 0,1,2 is up + And user can start transactions + And all files in gpAdminLogs 
directory are deleted on all hosts in the cluster @demo_cluster @concourse_cluster @@ -631,11 +902,12 @@ Feature: gprecoverseg tests And the user waits until mirror on content 0,1,2 is up And the old data directories are cleaned up for content 0 And user can start transactions + And check segment conf: postgresql.conf And all files in gpAdminLogs directory are deleted on all hosts in the cluster @demo_cluster @concourse_cluster - Scenario: SIGHUP on gprecoverseg should not display progress in gpstate -e + Scenario: SIGKILL on gprecoverseg should not display progress in gpstate -e Given the database is running And all the segments are running And the segments are synchronized @@ -649,13 +921,13 @@ Feature: gprecoverseg tests Then verify if the gprecoverseg.lock directory is present in coordinator_data_directory When the user runs "gpstate -e" Then gpstate should print "Segments in recovery" to stdout - When the user asynchronously sets up to end gprecoverseg process with SIGHUP + When the user asynchronously sets up to end gprecoverseg process with SIGKILL And the user waits until saved async process is completed - Then the gprecoverseg lock directory is removed When the user runs "gpstate -e" Then gpstate should not print "Segments in recovery" to stdout Then the user reset the walsender on the primary on content 0 And the user waits until mirror on content 0,1,2 is up + And the gprecoverseg lock directory is removed And the cluster is rebalanced @demo_cluster @@ -678,20 +950,179 @@ Feature: gprecoverseg tests When the user asynchronously runs gprecoverseg with input file and additional args "-a" and the process is saved Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies its format And user waits until gp_stat_replication table has no pg_basebackup entries for content 1 - And an FTS probe is triggered And the user waits until mirror on content 1,2 is up And verify that mirror on content 0 is down And user can start transactions 
And verify that lines from recovery_progress.file are present in segment progress files in gpAdminLogs And the user reset the walsender on the primary on content 0 And the user waits until saved async process is completed - And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts - And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts + And gpAdminLogs directory has "pg_basebackup*" files on respective hosts only for content 0,1 + And gpAdminLogs directory has "pg_rewind*" files on respective hosts only for content 2 And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts And the cluster is recovered in full and rebalanced And the row count from table "test_recoverseg" in "postgres" is verified against the saved data + @demo_cluster + Scenario: gprecoverseg should not give warning if pg_basebackup is running for the up segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1 + And the user suspend the walsender on the primary on content 2 + And the user asynchronously runs pg_basebackup with primary of content 2 as source and the process is saved + And an FTS probe is triggered + And gp_stat_replication table has pg_basebackup entry for content 2 + When the user runs "gprecoverseg -avF" + Then gprecoverseg should not print "No basebackup running" to stdout + And gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1 is up + And gp_stat_replication table has pg_basebackup entry for content 2 + And the user reset the walsender on the primary on content 2 + And the user waits until saved async process is completed + And verify that mirror on content 2 is up + And user can start transactions + And all 
files in gpAdminLogs directory are deleted on all hosts in the cluster + And the cluster is rebalanced + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg gives warning if pg_basebackup already running for one of the failed segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + And the user asynchronously runs "gprecoverseg -aF" and the process is saved + And the user just waits until recovery_progress.file is created in gpAdminLogs + And user waits until gp_stat_replication table has no pg_basebackup entries for content 1,2 + And the user waits until mirror on content 1,2 is up + And verify that mirror on content 0 is down + And the gprecoverseg lock directory is removed + And user immediately stops all primary processes for content 1,2 + And the user waits until mirror on content 1,2 is down + When the user runs "gprecoverseg -avF" + Then gprecoverseg should print "Found pg_basebackup running for segments with contentIds [0], skipping recovery of these segments" to logfile + And gprecoverseg should return a return code of 0 + And verify that mirror on content 1,2 is up + And verify that mirror on content 0 is down + And the user reset the walsender on the primary on content 0 + And the user waits until saved async process is completed + And recovery_progress.file should not exist in gpAdminLogs + And verify that mirror on content 0 is up + And the user runs "gprecoverseg -avF" + Then gprecoverseg should print "No basebackup running" to stdout + And gprecoverseg should return a return code of 0 + And the cluster is rebalanced + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg gives warning if pg_basebackup already running for some of the 
failed segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + And the user suspend the walsender on the primary on content 1 + And the user asynchronously runs "gprecoverseg -aF" and the process is saved + And the user just waits until recovery_progress.file is created in gpAdminLogs + And user waits until gp_stat_replication table has no pg_basebackup entries for content 2 + And the user waits until mirror on content 2 is up + And verify that mirror on content 0,1 is down + And the gprecoverseg lock directory is removed + And user immediately stops all primary processes for content 2 + And the user waits until mirror on content 2 is down + When the user runs "gprecoverseg -avF" + Then gprecoverseg should print "Found pg_basebackup running for segments with contentIds [0, 1], skipping recovery of these segments" to logfile + And gprecoverseg should return a return code of 0 + And verify that mirror on content 2 is up + And verify that mirror on content 0,1 is down + And the user reset the walsender on the primary on content 0 + And the user reset the walsender on the primary on content 1 + And the user waits until saved async process is completed + And recovery_progress.file should not exist in gpAdminLogs + And verify that mirror on content 0,1 is up + And the user runs "gprecoverseg -avF" + Then gprecoverseg should print "No basebackup running" to stdout + And gprecoverseg should return a return code of 0 + And the cluster is rebalanced + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg -aF gives warning if pg_basebackup already running for all of the failed segments + Given the database is running + And all the segments are running + And the 
segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + And the user suspend the walsender on the primary on content 1 + And the user suspend the walsender on the primary on content 2 + And the user asynchronously runs "gprecoverseg -aF" and the process is saved + And the user just waits until recovery_progress.file is created in gpAdminLogs + And verify that mirror on content 0,1,2 is down + And the gprecoverseg lock directory is removed + When the user runs "gprecoverseg -aF" + Then gprecoverseg should print "Found pg_basebackup running for segments with contentIds [0, 1, 2], skipping recovery of these segments" to logfile + And gprecoverseg should print "No segments to recover" to stdout + And gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is down + And the user reset the walsender on the primary on content 0 + And the user reset the walsender on the primary on content 1 + And the user reset the walsender on the primary on content 2 + And the user waits until saved async process is completed + And recovery_progress.file should not exist in gpAdminLogs + And verify that mirror on content 0,1,2 is up + And the user runs "gprecoverseg -avF" + Then gprecoverseg should print "No basebackup running" to stdout + And gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is up + And the cluster is rebalanced + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg -i gives warning if pg_basebackup already running for all failed segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for 
content 0,1,2 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + And the user suspend the walsender on the primary on content 1 + And the user suspend the walsender on the primary on content 2 + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover mirror with content 0 full inplace + And edit the input file to recover mirror with content 1 full inplace + And edit the input file to recover mirror with content 2 full inplace + When the user asynchronously runs gprecoverseg with input file and additional args "-a" and the process is saved + Then the user just waits until recovery_progress.file is created in gpAdminLogs + And verify that mirror on content 0,1,2 is down + And the gprecoverseg lock directory is removed + When the user runs gprecoverseg with input file and additional args "-a" + Then gprecoverseg should print "Found pg_basebackup running for segments with contentIds [0, 1, 2], skipping recovery of these segments" to logfile + And gprecoverseg should print "No segments to recover" to stdout + And gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is down + And the user reset the walsender on the primary on content 0 + And the user reset the walsender on the primary on content 1 + And the user reset the walsender on the primary on content 2 + And the user waits until saved async process is completed + And recovery_progress.file should not exist in gpAdminLogs + And verify that mirror on content 0,1,2 is up + And user can start transactions + When the user runs gprecoverseg with input file and additional args "-av" + Then gprecoverseg should print "No basebackup running" to stdout + And gprecoverseg should return a return code of 0 + Then the cluster is rebalanced + @demo_cluster @concourse_cluster Scenario: gprecoverseg incremental recovery segments come up even if one rewind 
fails @@ -709,9 +1140,9 @@ Feature: gprecoverseg tests And user can start transactions And check if incremental recovery failed for mirrors with content 0 for gprecoverseg - And gprecoverseg should print "Failed to recover the following segments. You must run gprecoverseg -F for all incremental failures" to stdout + And gprecoverseg should print "Failed to recover the following segments. You must run either gprecoverseg --differential or gprecoverseg -F for all incremental failures" to stdout And check if incremental recovery was successful for mirrors with content 1,2 - And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts + And gpAdminLogs directory has "pg_rewind*" files on all segment hosts And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts @@ -736,7 +1167,7 @@ Feature: gprecoverseg tests And gprecoverseg should print "Failed to recover the following segments. 
You must run either gprecoverseg --differential or gprecoverseg -F for all differential failures" to stdout And verify that mirror on content 1,2 is up And the segments are synchronized for content 1,2 - And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts + And gpAdminLogs directory has "rsync*" files on all segment hosts And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts And the temporary directory is removed @@ -944,19 +1375,25 @@ Feature: gprecoverseg tests When the user runs gprecoverseg with input file and additional args "-a" Then gprecoverseg should return a return code of 1 And user can start transactions + And check segment conf: postgresql.conf + And check if incremental recovery failed for mirrors with content 0 for gprecoverseg And check if full recovery was successful for mirrors with content 1 And check if full recovery failed for mirrors with content 2 for gprecoverseg + And gprecoverseg should print "error:.*required WAL directory ""pg_wal"" does not exist" to stdout + And gprecoverseg should print "error: pg_basebackup: error: could not access directory.* Permission denied" to stdout And gprecoverseg should not print "Segments successfully recovered" to stdout And check if mirrors on content 0,1,2 are in their original configuration And the gp_configuration_history table should contain a backout entry for the primary segment for contents 2 And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts + And check segment conf: postgresql.conf And the mode of all the created data directories is changed to 0700 And the cluster is recovered in full and rebalanced + And check segment conf: postgresql.conf And the row count from table "test_recoverseg" in "postgres" is verified against the saved data @demo_cluster @@ -990,8 +1427,8 @@ 
Feature: gprecoverseg tests And check if incremental recovery was successful for mirrors with content 2 And check if mirrors on content 0 are moved to new location on input file And check if mirrors on content 1,2 are in their original configuration - And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts - And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts + And gpAdminLogs directory has "pg_basebackup*" files on respective hosts only for content 0,1 + And gpAdminLogs directory has "pg_rewind*" files on respective hosts only for content 2 And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts And verify there are no recovery backout files @@ -1000,9 +1437,11 @@ Feature: gprecoverseg tests And the mode of all the created data directories is changed to 0700 Then the user runs "gprecoverseg -a" And gprecoverseg should return a return code of 0 + And all previous progress files are removed from gpAdminLogs directory on respective hosts only for content 0 And user can start transactions And the segments are synchronized And the cluster is rebalanced + And check segment conf: postgresql.conf And the row count from table "test_recoverseg" in "postgres" is verified against the saved data @demo_cluster @@ -1035,8 +1474,7 @@ Feature: gprecoverseg tests And verify that mirror on content 0,1,2 is down And check if mirrors on content 0,1,2 are moved to new location on input file - And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts - And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts + And gpAdminLogs directory has "pg_basebackup*" files on all segment hosts And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts And verify there are no recovery backout files @@ -1047,8 +1485,71 @@ Feature: 
gprecoverseg tests And user can start transactions And the segments are synchronized And the cluster is rebalanced + And check segment conf: postgresql.conf And the row count from table "test_recoverseg" in "postgres" is verified against the saved data + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg differential recovery gives warning if any of the failed segment's source is in backup already + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And the user runs sql "select pg_start_backup('test')" in "postgres" on primary segment with content 0 + When the user runs "gprecoverseg -a --differential" + Then gprecoverseg should return a return code of 0 + And verify that mirror on content 1,2 is up + And verify that mirror on content 0 is down + Then gprecoverseg should print "Found differential recovery running for segments with contentIds [0], skipping recovery of these segments" to logfile + And the user runs sql "select pg_stop_backup()" in "postgres" on primary segment with content 0 + When the user runs "gprecoverseg -av --differential" + Then gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is up + And the cluster is rebalanced + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg differential recovery gives warning if some of the failed segment's source is in backup already + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And the user runs sql "select pg_start_backup('test')" in "postgres" on primary segment with content 0,1 + When 
the user runs "gprecoverseg -a --differential" + Then gprecoverseg should return a return code of 0 + And verify that mirror on content 2 is up + And verify that mirror on content 0,1 is down + Then gprecoverseg should print "Found differential recovery running for segments with contentIds [0, 1], skipping recovery of these segments" to logfile + And the user runs sql "select pg_stop_backup()" in "postgres" on primary segment with content 0,1 + When the user runs "gprecoverseg -av --differential" + Then gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is up + And the cluster is rebalanced + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg differential recovery gives warning if all of the failed segment's source is in backup already + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And the user runs sql "select pg_start_backup('test')" in "postgres" on primary segment with content 0,1,2 + When the user runs "gprecoverseg -a --differential" + Then gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is down + Then gprecoverseg should print "Found differential recovery running for segments with contentIds [0, 1, 2], skipping recovery of these segments" to logfile + And the user runs sql "select pg_stop_backup()" in "postgres" on primary segment with content 0,1,2 + When the user runs "gprecoverseg -av --differential" + Then gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is up + And the cluster is rebalanced + @concourse_cluster Scenario: gprecoverseg behave test requires a cluster with at least 2 hosts Given the database is running @@ -1056,7 +1557,7 @@ Feature: gprecoverseg tests And the information of a 
"mirror" segment on a remote host is saved @concourse_cluster - Scenario: When gprecoverseg full recovery is executed and an existing postmaster.pid on the killed primary segment corresponds to a non postgres process + Scenario Outline: When gprecoverseg recovery is executed and an existing postmaster.pid on the killed primary segment corresponds to a non postgres process Given the database is running And all the segments are running And the segments are synchronized @@ -1066,7 +1567,7 @@ Feature: gprecoverseg tests When user can start transactions And we run a sample background script to generate a pid on "primary" segment And we generate the postmaster.pid file with the background pid on "primary" segment - And the user runs "gprecoverseg -F -a" + And the user runs "gprecoverseg " Then gprecoverseg should return a return code of 0 And gprecoverseg should not print "Unhandled exception in thread started by when utility mode is set to @@ -125,8 +122,25 @@ Feature: gpstart behave tests | test_scenarios | utility_mode | psql_cmd | | super user connections | True | -c '\l' | | non-super user connections | True | -U foouser -c '\l' | - | super user connections | False | -c '\l' | - | non-super user connections | False | -U foouser -c '\l' | + + @concourse_cluster + @demo_cluster + Scenario Outline: "gpstart -m" accepts <test_scenarios> when utility mode is set to <utility_mode> + Given the database is not running + And the user runs "gpstart -ma" + And "gpstart -ma" should return a return code of 0 + + When The user runs psql "<psql_cmd>" against database "postgres" when utility mode is set to "<utility_mode>" + Then psql_cmd should return a return code of <return_code> + And psql_cmd should print "<error_msg>" error message + + And the user runs "gpstop -mai" + And "gpstop -mai" should return a return code of 0 + + Examples: + | test_scenarios | utility_mode | psql_cmd | return_code | error_msg | + | super user connections | False | -c '\l' | 2 | psql: error: FATAL: System was started in single node mode - only utility mode connections are allowed 
| + | non-super user connections | False | -U foouser -c '\l' | 2 | psql: error: FATAL: System was started in single node mode - only utility mode connections are allowed | @concourse_cluster @demo_cluster @@ -143,11 +157,11 @@ Feature: gpstart behave tests And "gpstop -mai" should return a return code of 0 Examples: - | test_scenarios | utility_mode | psql_cmd | return_code | database | error_out_state | error_msg | - | super user connections | True | -c '\l' | 0 | accepts | should not | psql: error: FATAL: remaining connection slots are reserved for non-replication superuser connections | - | non-super user connections | True | -U foouser -c '\l' | 2 | rejects | should | psql: error: FATAL: remaining connection slots are reserved for non-replication superuser connections | - | super user connections | False | -c '\l' | 0 | accepts | should not | psql: error: FATAL: remaining connection slots are reserved for non-replication superuser connections | - | non-super user connections | False | -U foouser -c '\l' | 2 | rejects | should | psql: error: FATAL: remaining connection slots are reserved for non-replication superuser connections | + | test_scenarios | utility_mode | psql_cmd | return_code | database | error_out_state | error_msg | + | super user connections | True | -c '\l' | 0 | accepts | should not | psql: error: FATAL: remaining connection slots are reserved for non-replication superuser connections | + | non-super user connections | True | -U foouser -c '\l' | 2 | rejects | should | psql: error: FATAL: remaining connection slots are reserved for non-replication superuser connections | + | super user connections | False | -c '\l' | 2 | accepts | should | psql: error: FATAL: System was started in single node mode - only utility mode connections are allowed | + | non-super user connections | False | -U foouser -c '\l' | 2 | rejects | should | psql: error: FATAL: System was started in single node mode - only utility mode connections are allowed | 
@concourse_cluster @demo_cluster @@ -187,5 +201,3 @@ Feature: gpstart behave tests When the user runs "gpstart -a -B 1" Then "gpstart -a -B 1" should return a return code of 0 And gpcheckcat should not print "Number of segments which failed to start:.*" to stdout - - diff --git a/gpMgmt/test/behave/mgmt_utils/gpstate.feature b/gpMgmt/test/behave/mgmt_utils/gpstate.feature index e03c7f7bd8e..49fc475c0e2 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpstate.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpstate.feature @@ -251,7 +251,7 @@ Feature: gpstate tests Scenario: gpstate -m logs mirror details Given a standard local demo cluster is running When the user runs "gpstate -m" - Then gpstate should print "Current GPDB mirror list and status" to stdout + Then gpstate should print "Current CBDB mirror list and status" to stdout And gpstate output looks like | Mirror | Datadir | Port | Status | Data Status | | \S+ | .*/dbfast_mirror1/demoDataDir0 | [0-9]+ | Passive | Synchronized | @@ -263,7 +263,7 @@ Feature: gpstate tests And user stops all primary processes And user can start transactions When the user runs "gpstate -m" - Then gpstate should print "Current GPDB mirror list and status" to stdout + Then gpstate should print "Current CBDB mirror list and status" to stdout And gpstate output looks like | Mirror | Datadir | Port | Status | Data Status | | \S+ | .*/dbfast_mirror1/demoDataDir0 | [0-9]+ | Acting as Primary | Not In Sync | diff --git a/gpMgmt/test/behave/mgmt_utils/gpstop.feature b/gpMgmt/test/behave/mgmt_utils/gpstop.feature index aed47ea7103..097374bceff 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpstop.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpstop.feature @@ -5,8 +5,10 @@ Feature: gpstop behave tests @demo_cluster Scenario: gpstop succeeds Given the database is running + And running postgres processes are saved in context When the user runs "gpstop -a" Then gpstop should return a return code of 0 + And verify no postgres process is running on all hosts 
@demo_cluster Scenario: gpstop runs with given coordinator data directory option @@ -33,20 +35,24 @@ Feature: gpstop behave tests Scenario: when there are user connections gpstop waits to shutdown until user switches to fast mode Given the database is running And the user asynchronously runs "psql postgres" and the process is saved + And running postgres processes are saved in context When the user runs gpstop -a -t 4 --skipvalidation and selects f And gpstop should print "'\(s\)mart_mode', '\(f\)ast_mode', '\(i\)mmediate_mode'" to stdout Then gpstop should return a return code of 0 + And verify no postgres process is running on all hosts @concourse_cluster @demo_cluster Scenario: when there are user connections gpstop waits to shutdown until user connections are disconnected Given the database is running And the user asynchronously runs "psql postgres" and the process is saved - And the user asynchronously sets up to end that process in 6 seconds + And the user asynchronously sets up to end that process in 15 seconds + And running postgres processes are saved in context When the user runs gpstop -a -t 2 --skipvalidation and selects s And gpstop should print "There were 1 user connections at the start of the shutdown" to stdout And gpstop should print "'\(s\)mart_mode', '\(f\)ast_mode', '\(i\)mmediate_mode'" to stdout Then gpstop should return a return code of 0 + And verify no postgres process is running on all hosts @demo_cluster Scenario: gpstop succeeds even if the standby host is unreachable @@ -58,6 +64,149 @@ Feature: gpstop behave tests And gpstop should return a return code of 0 And the standby host is made reachable + @demo_cluster + Scenario: gpstop succeeds when pg_ctl command fails + Given the database is running + And the user runs psql with "-c "CREATE EXTENSION IF NOT EXISTS gp_inject_fault;"" against database "postgres" + And the user runs psql with "-c "SELECT gp_inject_fault('checkpoint', 'sleep', '', '', '', 1, -1, 3600, dbid) FROM 
gp_segment_configuration"" against database "postgres" + And running postgres processes are saved in context + When the user runs "gpstop -a -M fast" + And gpstop should print "Failed to shutdown coordinator with pg_ctl." to stdout + And gpstop should return a return code of 0 + And verify no postgres process is running on all hosts + + @demo_cluster + Scenario: gpstop succeeds with immediate option + Given the database is running + And the user asynchronously runs "psql postgres" and the process is saved + And the user asynchronously sets up to end that process in 15 seconds + And running postgres processes are saved in context + When the user runs "gpstop -a -M immediate" + And gpstop should print "Commencing Coordinator instance shutdown with mode='immediate'" to stdout + Then gpstop should return a return code of 0 + And verify no postgres process is running on all hosts + + @concourse_cluster + @demo_cluster + Scenario Outline: when the first gpstop interrupted and second gpstop handles the unfinished state with mode + Given the database is running + And the user asynchronously runs "psql postgres" and the process is saved + And running postgres processes are saved in context + When the user runs gpstop -a, selects s and interrupt the process + Then verify if the gpstop.lock directory is present in coordinator_data_directory + And the user runs gpstop -a and selects