From e1ce8ad0776932ff03ebf12cbc7333f487fbc040 Mon Sep 17 00:00:00 2001 From: Anupam Yadav Date: Sun, 7 Jun 2026 02:05:24 +0000 Subject: [PATCH] GH-3601: Cache shouldIgnoreStatistics version parsing result --- .../org/apache/parquet/CorruptStatistics.java | 19 +++++++ .../apache/parquet/CorruptStatisticsTest.java | 52 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java index c5846f9efa..d6b9ca4f74 100644 --- a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java @@ -18,6 +18,7 @@ */ package org.apache.parquet; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.parquet.SemanticVersion.SemanticVersionParseException; import org.apache.parquet.VersionParser.ParsedVersion; @@ -35,6 +36,8 @@ */ public class CorruptStatistics { private static final AtomicBoolean alreadyLogged = new AtomicBoolean(false); + private static final ConcurrentHashMap CACHE = new ConcurrentHashMap<>(); + private static final int MAX_CACHE_SIZE = 64; private static final Logger LOG = LoggerFactory.getLogger(CorruptStatistics.class); @@ -68,6 +71,12 @@ public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName return true; } + return CACHE.size() >= MAX_CACHE_SIZE + ? shouldIgnoreStatisticsForBinary(createdBy) + : CACHE.computeIfAbsent(createdBy, CorruptStatistics::shouldIgnoreStatisticsForBinary); + } + + private static boolean shouldIgnoreStatisticsForBinary(String createdBy) { try { ParsedVersion version = VersionParser.parse(createdBy); @@ -114,4 +123,14 @@ private static void warnOnce(String message) { LOG.warn(message); } } + + // Visible for testing + static int cacheSize() { + return CACHE.size(); + } + + // Visible for testing + static void clearCache() { + CACHE.clear(); + } } diff --git a/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java b/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java index e897b38512..4f8a30cb27 100644 --- a/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java +++ b/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java @@ -18,14 +18,21 @@ */ package org.apache.parquet; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.junit.Before; import org.junit.Test; public class CorruptStatisticsTest { + @Before + public void setUp() { + CorruptStatistics.clearCache(); + } + @Test public void testOnlyAppliesToBinary() { assertTrue(CorruptStatistics.shouldIgnoreStatistics( @@ -124,4 +131,49 @@ public void testDistributionCorruptStatistics() { assertTrue(CorruptStatistics.shouldIgnoreStatistics( "parquet-mr version 1.7.0 (build abcd)", PrimitiveTypeName.BINARY)); } + + @Test + public void testCachingBehavior() { + assertEquals(0, CorruptStatistics.cacheSize()); + + // Call many times with the same createdBy — cache should store exactly one entry + String createdBy = "parquet-mr version 1.6.0 (build cache-test)"; + for (int i = 0; i < 100; i++) { + CorruptStatistics.shouldIgnoreStatistics(createdBy, PrimitiveTypeName.BINARY); + } + assertEquals(1, CorruptStatistics.cacheSize()); + + // A different createdBy should add a second entry + CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.9.0 (build abcd)", PrimitiveTypeName.BINARY); + assertEquals(2, CorruptStatistics.cacheSize()); + } + + @Test + public void testCorrectnessWhenCacheIsFull() { + CorruptStatistics.clearCache(); + + // Fill cache to capacity with 64 distinct corrupt-version strings + for (int i = 0; i < 64; i++) { + assertTrue( + "Corrupt version should be ignored", + CorruptStatistics.shouldIgnoreStatistics( + "parquet-mr version 1.6." + i + " (build x)", PrimitiveTypeName.BINARY)); + } + assertEquals(64, CorruptStatistics.cacheSize()); + + // 65th distinct string bypasses cache -- must still return correct result + assertTrue( + "Cache-bypass path must still return correct result for corrupt version", + CorruptStatistics.shouldIgnoreStatistics( + "parquet-mr version 1.6.99 (build bypass)", PrimitiveTypeName.BINARY)); + + // Non-corrupt version also returns correct result when cache is full + assertFalse( + "Non-corrupt version must return false even when cache is full", + CorruptStatistics.shouldIgnoreStatistics( + "parquet-mr version 1.10.0 (build bypass2)", PrimitiveTypeName.BINARY)); + + // Cache did not grow beyond cap + assertEquals(64, CorruptStatistics.cacheSize()); + } }