From c4fca9b8aa93bf15e4967142b10a1f190fda5f14 Mon Sep 17 00:00:00 2001
From: "jonas.brami" <jonasbrami@gmail.com>
Date: Wed, 11 Mar 2026 11:55:20 +0000
Subject: [PATCH 1/3] [fix] Use cursor.description to detect result-returning
 queries

Replace brittle sql_upper.startswith() keyword check with
cursor.description, which reliably detects any statement that
returns a result set (SELECT, SHOW, DESCRIBE, EXPLAIN, CTEs, etc.)
without maintaining a hardcoded list of SQL keywords.
---
 doris_mcp_server/utils/db.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/doris_mcp_server/utils/db.py b/doris_mcp_server/utils/db.py
index cd7b0ce..2a95140 100644
--- a/doris_mcp_server/utils/db.py
+++ b/doris_mcp_server/utils/db.py
@@ -95,15 +95,9 @@ async def execute(self, sql: str, params: tuple | None = None, auth_context=None
             async with self.connection.cursor(aiomysql.DictCursor) as cursor:
                 await cursor.execute(sql, params)
 
-                # Check if it's a query statement (statement that returns result set)
-                # FIX for Issue #62 Bug 5: Added WITH support for Common Table Expressions (CTE)
-                sql_upper = sql.strip().upper()
-                if (sql_upper.startswith("SELECT") or
-                    sql_upper.startswith("SHOW") or
-                    sql_upper.startswith("DESCRIBE") or
-                    sql_upper.startswith("DESC") or
-                    sql_upper.startswith("EXPLAIN") or
-                    sql_upper.startswith("WITH")):  # FIX: Support CTE queries
+                # cursor.description is set by the DB driver for any statement that returns rows,
+                # avoiding a brittle hardcoded keyword list (e.g. missing WITH/CTE, comments before keywords).
+                if cursor.description:
                     data = await cursor.fetchall()
                     row_count = len(data)
                 else:

From 54a6ee7f04926d4c3dfc1239cf019c1ec7c81a4c Mon Sep 17 00:00:00 2001
From: mortalBibo <mortalBibo@gmail.com>
Date: Thu, 21 May 2026 17:57:57 +0800
Subject: [PATCH 2/3] Fix max_rows bypass for comment-prefixed SELECT in
 execute_sql_for_mcp

Same root cause as #75: sql.upper().startswith("SELECT") at
query_executor.py:689 returns False when SQL begins with a leading
'--' or '/* */' comment, so the auto-injected LIMIT {max_rows} is
silently skipped and large queries can be dispatched unbounded.

This callsite runs before cursor.execute, so cursor.description is not
available. Use a small helper that strips leading SQL comments before
extracting the first keyword.
---
 doris_mcp_server/utils/db.py             | 17 +++++++++++++++++
 doris_mcp_server/utils/query_executor.py |  9 ++++++---
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/doris_mcp_server/utils/db.py b/doris_mcp_server/utils/db.py
index 2a95140..a48811f 100644
--- a/doris_mcp_server/utils/db.py
+++ b/doris_mcp_server/utils/db.py
@@ -24,6 +24,7 @@
 
 import asyncio
 import logging
+import re
 import time
 from contextlib import asynccontextmanager
 from dataclasses import dataclass
@@ -36,6 +37,22 @@
 from .logger import get_logger
 
 
+_SQL_COMMENT_RE = re.compile(r"/\*.*?\*/|--[^\n]*", re.DOTALL)
+
+
+def get_first_sql_keyword(sql: str) -> str:
+    """Return the first SQL keyword (uppercase), ignoring leading comments/whitespace.
+
+    Strips `--` line comments and `/* */` block comments before extracting
+    the first token. A leading comment must not change how a statement is
+    classified (e.g. `-- note\\nSELECT 1` is still a SELECT).
+    """
+    if not sql:
+        return ""
+    stripped = _SQL_COMMENT_RE.sub("", sql).strip()
+    if not stripped:
+        return ""
+    return stripped.split(None, 1)[0].upper()
 
 
 @dataclass
diff --git a/doris_mcp_server/utils/query_executor.py b/doris_mcp_server/utils/query_executor.py
index 75abf34..b919cad 100644
--- a/doris_mcp_server/utils/query_executor.py
+++ b/doris_mcp_server/utils/query_executor.py
@@ -35,7 +35,7 @@
 
 import sqlparse
 
-from .db import DorisConnectionManager, QueryResult
+from .db import DorisConnectionManager, QueryResult, get_first_sql_keyword
 from .logger import get_logger
 from .sql_security_utils import get_auth_context
 
@@ -685,8 +685,11 @@ async def execute_sql_for_mcp(
                 else:
                     self.logger.warning("Security configuration not found, proceeding without validation")
 
-                # Add LIMIT if not present and it's a SELECT query
-                if sql.upper().startswith("SELECT") and "LIMIT" not in sql.upper():
+                # Add LIMIT if not present and it's a SELECT query.
+                # get_first_sql_keyword skips leading comments so `-- note\nSELECT ...`
+                # still gets the LIMIT cap (sql.startswith would silently bypass it).
+                sql_upper = sql.upper()
+                if get_first_sql_keyword(sql) == "SELECT" and "LIMIT" not in sql_upper:
                     if sql.endswith(";"):
                         sql = sql[:-1]
                     sql = f"{sql} LIMIT {limit}"

From 249ecff53aa2f4b2a69afac8f5f5a5b0ecb29892 Mon Sep 17 00:00:00 2001
From: mortalBibo <mortalBibo@gmail.com>
Date: Thu, 21 May 2026 17:57:57 +0800
Subject: [PATCH 3/3] Add regression tests for result-set detection contract

These tests pin the user-facing contract of DorisConnection.execute():
any SQL the driver reports as producing a result set must return its
rows, regardless of how the statement is phrased.

Guards against regression of:
- Issue #62 Bug 5 (CTE / WITH returning empty data)
- The leading-comment bug fixed in #75 (data: [] with row_count > 0
  for SELECT prefixed by '--' or '/* */')
- The same-root-cause LIMIT bypass fixed in the previous commit

Also adds unit tests for the new get_first_sql_keyword helper.
---
 test/utils/test_db.py | 161 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 2 deletions(-)

diff --git a/test/utils/test_db.py b/test/utils/test_db.py
index 4ad721a..e12d507 100644
--- a/test/utils/test_db.py
+++ b/test/utils/test_db.py
@@ -1,7 +1,11 @@
-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock
 import pytest
 
-from doris_mcp_server.utils.db import DorisConnection, DorisSessionCache
+from doris_mcp_server.utils.db import (
+    DorisConnection,
+    DorisSessionCache,
+    get_first_sql_keyword,
+)
 
 
 @pytest.fixture
@@ -76,3 +80,156 @@ def test_clear_cache(self, session_cache):
         connection_manager.release_connection.assert_any_call("query", mock_conn1)
         connection_manager.release_connection.assert_any_call("system", mock_conn2)
         assert connection_manager.release_connection.call_count == 2
+
+
+class TestGetFirstSqlKeyword:
+    """Unit tests for get_first_sql_keyword.
+
+    Used by query_executor.py:689 to detect SELECT before cursor.execute
+    (where cursor.description is not yet available), so the auto-injected
+    LIMIT {max_rows} cap also works when the SQL is comment-prefixed.
+    """
+
+    def test_plain_select(self):
+        assert get_first_sql_keyword("SELECT 1") == "SELECT"
+
+    def test_leading_whitespace(self):
+        assert get_first_sql_keyword("   \n\t SELECT 1") == "SELECT"
+
+    def test_lowercase(self):
+        assert get_first_sql_keyword("select 1") == "SELECT"
+
+    def test_line_comment_then_select(self):
+        sql = "-- a leading note\nSELECT 1"
+        assert get_first_sql_keyword(sql) == "SELECT"
+
+    def test_block_comment_then_select(self):
+        sql = "/* note */ SELECT 1"
+        assert get_first_sql_keyword(sql) == "SELECT"
+
+    def test_multiline_block_comment_then_select(self):
+        sql = "/*\n multi\n line\n*/\nSELECT 1"
+        assert get_first_sql_keyword(sql) == "SELECT"
+
+    def test_mixed_whitespace_and_comments(self):
+        sql = "  -- one\n  /* two */ \n  SELECT 1"
+        assert get_first_sql_keyword(sql) == "SELECT"
+
+    def test_comment_then_with_cte(self):
+        sql = "-- note\nWITH x AS (SELECT 1) SELECT * FROM x"
+        assert get_first_sql_keyword(sql) == "WITH"
+
+    def test_non_select_unaffected(self):
+        assert get_first_sql_keyword("INSERT INTO t VALUES (1)") == "INSERT"
+        assert get_first_sql_keyword("-- c\nINSERT INTO t VALUES (1)") == "INSERT"
+
+    def test_empty_and_only_comments(self):
+        assert get_first_sql_keyword("") == ""
+        assert get_first_sql_keyword("   ") == ""
+        assert get_first_sql_keyword("-- only a comment") == ""
+        assert get_first_sql_keyword("/* only */") == ""
+
+
+def _make_doris_connection(cursor_description, fetchall_rows, rowcount=0):
+    """Build a DorisConnection whose underlying cursor returns the given values.
+
+    The driver-level cursor is fully mocked: only `description`, `fetchall()`
+    and `rowcount` matter for the result-set-detection branch we want to test.
+    """
+    cursor = MagicMock()
+    cursor.execute = AsyncMock(return_value=None)
+    cursor.fetchall = AsyncMock(return_value=fetchall_rows)
+    cursor.description = cursor_description
+    cursor.rowcount = rowcount
+
+    cursor_ctx = MagicMock()
+    cursor_ctx.__aenter__ = AsyncMock(return_value=cursor)
+    cursor_ctx.__aexit__ = AsyncMock(return_value=None)
+
+    raw_connection = MagicMock()
+    raw_connection.cursor = MagicMock(return_value=cursor_ctx)
+
+    return DorisConnection(connection=raw_connection, session_id="test")
+
+
+class TestExecuteResultSetDetection:
+    """Behavior contract for DorisConnection.execute().
+
+    These tests pin the user-facing contract: any statement the driver
+    reports as producing a result set must have its rows returned, and any
+    statement that does not produce a result set must report rowcount.
+
+    Guards against regression of:
+    - Issue #62 Bug 5 (CTE / WITH returning empty data)
+    - The leading-comment bug (SELECT prefixed by `--` or `/* */` returning
+      empty data while row_count was non-zero)
+    - Future "missing keyword in the whitelist" bugs of the same class
+
+    The tests deliberately do not assert anything about how the SQL text is
+    parsed — they only assert that when `cursor.description` is populated,
+    rows are fetched, regardless of the SQL phrasing.
+    """
+
+    @pytest.mark.parametrize(
+        "sql",
+        [
+            "SELECT 1",
+            "   SELECT 1",
+            "-- leading line comment\nSELECT 1",
+            "/* leading block comment */ SELECT 1",
+            "/*\n multi\n line\n*/\nSELECT 1",
+            "  -- one\n  /* two */ \n  SELECT 1",
+            "(SELECT 1)",
+            "WITH t AS (SELECT 1) SELECT * FROM t",
+            "-- comment\nWITH t AS (SELECT 1) SELECT * FROM t",
+            "SHOW TABLES",
+            "DESC some_table",
+            "EXPLAIN SELECT 1",
+        ],
+        ids=[
+            "plain_select",
+            "leading_whitespace",
+            "line_comment_then_select",
+            "block_comment_then_select",
+            "multiline_block_comment",
+            "mixed_whitespace_and_comments",
+            "parenthesized_select",
+            "with_cte",
+            "comment_then_with_cte",
+            "show",
+            "desc",
+            "explain",
+        ],
+    )
+    async def test_returns_rows_when_driver_reports_result_set(self, sql):
+        rows = [{"col": 1}]
+        conn = _make_doris_connection(
+            cursor_description=[("col", None, None, None, None, None, None)],
+            fetchall_rows=rows,
+        )
+
+        result = await conn.execute(sql)
+
+        assert result.data == rows
+        assert result.row_count == len(rows)
+
+    @pytest.mark.parametrize(
+        "sql, affected",
+        [
+            ("INSERT INTO t VALUES (1)", 1),
+            ("UPDATE t SET x = 1", 5),
+            ("DELETE FROM t WHERE x = 1", 3),
+            ("CREATE TABLE t (x INT)", 0),
+        ],
+    )
+    async def test_no_fetch_when_driver_reports_no_result_set(self, sql, affected):
+        conn = _make_doris_connection(
+            cursor_description=None,
+            fetchall_rows=[],
+            rowcount=affected,
+        )
+
+        result = await conn.execute(sql)
+
+        assert result.data == []
+        assert result.row_count == affected