From c4fca9b8aa93bf15e4967142b10a1f190fda5f14 Mon Sep 17 00:00:00 2001 From: "jonas.brami" Date: Wed, 11 Mar 2026 11:55:20 +0000 Subject: [PATCH 1/3] [fix] Use cursor.description to detect result-returning queries Replace brittle sql_upper.startswith() keyword check with cursor.description, which reliably detects any statement that returns a result set (SELECT, SHOW, DESCRIBE, EXPLAIN, CTEs, etc.) without maintaining a hardcoded list of SQL keywords. --- doris_mcp_server/utils/db.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/doris_mcp_server/utils/db.py b/doris_mcp_server/utils/db.py index cd7b0ce..2a95140 100644 --- a/doris_mcp_server/utils/db.py +++ b/doris_mcp_server/utils/db.py @@ -95,15 +95,9 @@ async def execute(self, sql: str, params: tuple | None = None, auth_context=None async with self.connection.cursor(aiomysql.DictCursor) as cursor: await cursor.execute(sql, params) - # Check if it's a query statement (statement that returns result set) - # FIX for Issue #62 Bug 5: Added WITH support for Common Table Expressions (CTE) - sql_upper = sql.strip().upper() - if (sql_upper.startswith("SELECT") or - sql_upper.startswith("SHOW") or - sql_upper.startswith("DESCRIBE") or - sql_upper.startswith("DESC") or - sql_upper.startswith("EXPLAIN") or - sql_upper.startswith("WITH")): # FIX: Support CTE queries + # cursor.description is set by the DB driver for any statement that returns rows, + # avoiding a brittle hardcoded keyword list (e.g. missing WITH/CTE, comments before keywords). + if cursor.description: data = await cursor.fetchall() row_count = len(data) else: From 54a6ee7f04926d4c3dfc1239cf019c1ec7c81a4c Mon Sep 17 00:00:00 2001 From: mortalBibo Date: Thu, 21 May 2026 17:57:57 +0800 Subject: [PATCH 2/3] Fix max_rows bypass for comment-prefixed SELECT in execute_sql_for_mcp Same root cause as #75: sql.upper().startswith("SELECT") at query_executor.py:689 returns False when SQL begins with a leading '--' or '/* */' comment, so the auto-injected LIMIT {max_rows} is silently skipped and large queries can be dispatched unbounded. This callsite runs before cursor.execute, so cursor.description is not available. Use a small helper that strips leading SQL comments before extracting the first keyword. --- doris_mcp_server/utils/db.py | 17 +++++++++++++++++ doris_mcp_server/utils/query_executor.py | 9 ++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/doris_mcp_server/utils/db.py b/doris_mcp_server/utils/db.py index 2a95140..a48811f 100644 --- a/doris_mcp_server/utils/db.py +++ b/doris_mcp_server/utils/db.py @@ -24,6 +24,7 @@ import asyncio import logging +import re import time from contextlib import asynccontextmanager from dataclasses import dataclass @@ -36,6 +37,22 @@ from .logger import get_logger +_SQL_COMMENT_RE = re.compile(r"/\*.*?\*/|--[^\n]*", re.DOTALL) + + +def get_first_sql_keyword(sql: str) -> str: + """Return the first SQL keyword (uppercase), ignoring leading comments/whitespace. + + Strips `--` line comments and `/* */` block comments before extracting + the first token. A leading comment must not change how a statement is + classified (e.g. `-- note\\nSELECT 1` is still a SELECT). + """ + if not sql: + return "" + stripped = _SQL_COMMENT_RE.sub("", sql).strip() + if not stripped: + return "" + return stripped.split(None, 1)[0].upper() @dataclass diff --git a/doris_mcp_server/utils/query_executor.py b/doris_mcp_server/utils/query_executor.py index 75abf34..b919cad 100644 --- a/doris_mcp_server/utils/query_executor.py +++ b/doris_mcp_server/utils/query_executor.py @@ -35,7 +35,7 @@ import sqlparse -from .db import DorisConnectionManager, QueryResult +from .db import DorisConnectionManager, QueryResult, get_first_sql_keyword from .logger import get_logger from .sql_security_utils import get_auth_context @@ -685,8 +685,11 @@ async def execute_sql_for_mcp( else: self.logger.warning("Security configuration not found, proceeding without validation") - # Add LIMIT if not present and it's a SELECT query - if sql.upper().startswith("SELECT") and "LIMIT" not in sql.upper(): + # Add LIMIT if not present and it's a SELECT query. + # get_first_sql_keyword skips leading comments so `-- note\nSELECT ...` + # still gets the LIMIT cap (sql.startswith would silently bypass it). + sql_upper = sql.upper() + if get_first_sql_keyword(sql) == "SELECT" and "LIMIT" not in sql_upper: if sql.endswith(";"): sql = sql[:-1] sql = f"{sql} LIMIT {limit}" From 249ecff53aa2f4b2a69afac8f5f5a5b0ecb29892 Mon Sep 17 00:00:00 2001 From: mortalBibo Date: Thu, 21 May 2026 17:57:57 +0800 Subject: [PATCH 3/3] Add regression tests for result-set detection contract These tests pin the user-facing contract of DorisConnection.execute(): any SQL the driver reports as producing a result set must return its rows, regardless of how the statement is phrased. Guards against regression of: - Issue #62 Bug 5 (CTE / WITH returning empty data) - The leading-comment bug fixed in #75 (data: [] with row_count > 0 for SELECT prefixed by '--' or '/* */') - The same-root-cause LIMIT bypass fixed in the previous commit Also adds unit tests for the new get_first_sql_keyword helper. --- test/utils/test_db.py | 161 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 2 deletions(-) diff --git a/test/utils/test_db.py b/test/utils/test_db.py index 4ad721a..e12d507 100644 --- a/test/utils/test_db.py +++ b/test/utils/test_db.py @@ -1,7 +1,11 @@ -from unittest.mock import MagicMock +from unittest.mock import AsyncMock, MagicMock import pytest -from doris_mcp_server.utils.db import DorisConnection, DorisSessionCache +from doris_mcp_server.utils.db import ( + DorisConnection, + DorisSessionCache, + get_first_sql_keyword, +) @pytest.fixture @@ -76,3 +80,156 @@ def test_clear_cache(self, session_cache): connection_manager.release_connection.assert_any_call("query", mock_conn1) connection_manager.release_connection.assert_any_call("system", mock_conn2) assert connection_manager.release_connection.call_count == 2 + + +class TestGetFirstSqlKeyword: + """Unit tests for get_first_sql_keyword. + + Used by query_executor.py:689 to detect SELECT before cursor.execute + (where cursor.description is not yet available), so the auto-injected + LIMIT {max_rows} cap also works when the SQL is comment-prefixed. + """ + + def test_plain_select(self): + assert get_first_sql_keyword("SELECT 1") == "SELECT" + + def test_leading_whitespace(self): + assert get_first_sql_keyword(" \n\t SELECT 1") == "SELECT" + + def test_lowercase(self): + assert get_first_sql_keyword("select 1") == "SELECT" + + def test_line_comment_then_select(self): + sql = "-- a leading note\nSELECT 1" + assert get_first_sql_keyword(sql) == "SELECT" + + def test_block_comment_then_select(self): + sql = "/* note */ SELECT 1" + assert get_first_sql_keyword(sql) == "SELECT" + + def test_multiline_block_comment_then_select(self): + sql = "/*\n multi\n line\n*/\nSELECT 1" + assert get_first_sql_keyword(sql) == "SELECT" + + def test_mixed_whitespace_and_comments(self): + sql = " -- one\n /* two */ \n SELECT 1" + assert get_first_sql_keyword(sql) == "SELECT" + + def test_comment_then_with_cte(self): + sql = "-- note\nWITH x AS (SELECT 1) SELECT * FROM x" + assert get_first_sql_keyword(sql) == "WITH" + + def test_non_select_unaffected(self): + assert get_first_sql_keyword("INSERT INTO t VALUES (1)") == "INSERT" + assert get_first_sql_keyword("-- c\nINSERT INTO t VALUES (1)") == "INSERT" + + def test_empty_and_only_comments(self): + assert get_first_sql_keyword("") == "" + assert get_first_sql_keyword(" ") == "" + assert get_first_sql_keyword("-- only a comment") == "" + assert get_first_sql_keyword("/* only */") == "" + + +def _make_doris_connection(cursor_description, fetchall_rows, rowcount=0): + """Build a DorisConnection whose underlying cursor returns the given values. + + The driver-level cursor is fully mocked: only `description`, `fetchall()` + and `rowcount` matter for the result-set-detection branch we want to test. + """ + cursor = MagicMock() + cursor.execute = AsyncMock(return_value=None) + cursor.fetchall = AsyncMock(return_value=fetchall_rows) + cursor.description = cursor_description + cursor.rowcount = rowcount + + cursor_ctx = MagicMock() + cursor_ctx.__aenter__ = AsyncMock(return_value=cursor) + cursor_ctx.__aexit__ = AsyncMock(return_value=None) + + raw_connection = MagicMock() + raw_connection.cursor = MagicMock(return_value=cursor_ctx) + + return DorisConnection(connection=raw_connection, session_id="test") + + +class TestExecuteResultSetDetection: + """Behavior contract for DorisConnection.execute(). + + These tests pin the user-facing contract: any statement the driver + reports as producing a result set must have its rows returned, and any + statement that does not produce a result set must report rowcount. + + Guards against regression of: + - Issue #62 Bug 5 (CTE / WITH returning empty data) + - The leading-comment bug (SELECT prefixed by `--` or `/* */` returning + empty data while row_count was non-zero) + - Future "missing keyword in the whitelist" bugs of the same class + + The tests deliberately do not assert anything about how the SQL text is + parsed — they only assert that when `cursor.description` is populated, + rows are fetched, regardless of the SQL phrasing. + """ + + @pytest.mark.parametrize( + "sql", + [ + "SELECT 1", + " SELECT 1", + "-- leading line comment\nSELECT 1", + "/* leading block comment */ SELECT 1", + "/*\n multi\n line\n*/\nSELECT 1", + " -- one\n /* two */ \n SELECT 1", + "(SELECT 1)", + "WITH t AS (SELECT 1) SELECT * FROM t", + "-- comment\nWITH t AS (SELECT 1) SELECT * FROM t", + "SHOW TABLES", + "DESC some_table", + "EXPLAIN SELECT 1", + ], + ids=[ + "plain_select", + "leading_whitespace", + "line_comment_then_select", + "block_comment_then_select", + "multiline_block_comment", + "mixed_whitespace_and_comments", + "parenthesized_select", + "with_cte", + "comment_then_with_cte", + "show", + "desc", + "explain", + ], + ) + async def test_returns_rows_when_driver_reports_result_set(self, sql): + rows = [{"col": 1}] + conn = _make_doris_connection( + cursor_description=[("col", None, None, None, None, None, None)], + fetchall_rows=rows, + ) + + result = await conn.execute(sql) + + assert result.data == rows + assert result.row_count == len(rows) + + @pytest.mark.parametrize( + "sql, affected", + [ + ("INSERT INTO t VALUES (1)", 1), + ("UPDATE t SET x = 1", 5), + ("DELETE FROM t WHERE x = 1", 3), + ("CREATE TABLE t (x INT)", 0), + ], + ) + async def test_no_fetch_when_driver_reports_no_result_set(self, sql, affected): + conn = _make_doris_connection( + cursor_description=None, + fetchall_rows=[], + rowcount=affected, + ) + + result = await conn.execute(sql) + + assert result.data == [] + assert result.row_count == affected