diff --git a/public/cost_2026_01_08.csv b/public/cost_2026_01_08.csv new file mode 100644 index 0000000..3c5fbdf --- /dev/null +++ b/public/cost_2026_01_08.csv @@ -0,0 +1,19 @@ +model,AMPS_Hard,code_completion,code_generation,connections,consecutive_events,integrals_with_game,javascript,logic_with_navigation,math_comp,olympiad,paraphrase,plot_unscrambling,python,simplify,spatial,story_generation,summarize,tablejoin,tablereformat,theory_of_mind,typescript,typos,zebra_puzzle +deepseek-v4-pro,0.2504,0.2927,0.4335,1.1945,4.8001,7.6773,,1.3015,1.5286,2.905,1.2097,0.8195,,1.035,1.1736,0.8422,0.941,1.4651,0.1512,2.0217,,0.1168,3.8612 +gemini-3-flash-preview-high,1.5257,2.1738,2.8599,2.6343,5.9797,11.9349,,3.7343,4.7163,2.4212,4.0288,2.1455,,4.1076,2.7227,3.855,3.9197,3.3915,2.1384,2.7283,,2.4754,4.3226 +gemini-3-pro-preview-11-2025-high,2.8986,3.7064,5.7132,4.4913,20.253,34.3327,,9.314,14.3906,4.7474,4.8539,4.1325,,4.7546,4.5769,4.6127,4.4618,7.163,2.6717,6.486,,1.8465,13.6553 +gemini-3.1-pro-preview-high,2.9741,3.0368,6.0119,2.344,28.505,43.6557,,9.9464,8.8926,5.4719,7.4282,10.0822,,6.903,4.9059,6.9235,6.6073,6.4908,3.9294,4.2992,,2.1866,10.8268 +gemini-3.5-flash-high,1.1062,2.9346,4.9094,2.1636,25.9843,36.5173,,8.8282,6.4087,4.6562,7.4329,5.2138,,6.7386,3.6924,6.3501,6.0812,6.9276,3.9423,4.7356,,1.5401,12.8185 +gpt-5.1-2025-11-13-high,0.9311,1.9775,2.8915,4.3942,19.2097,34.805,,9.9141,8.268,2.6258,5.0585,8.8235,,4.0447,4.359,4.1457,4.1656,3.6882,1.9167,10.9247,,1.2402,23.5375 +gpt-5.1-codex-max-high,2.0986,3.5489,4.0357,8.8463,15.0326,49.2791,,11.5988,10.0669,10.3044,7.1546,8.6589,,5.9656,9.3179,5.7747,5.473,7.0886,2.271,11.4198,,2.3984,13.5836 +gpt-5.2-2025-12-11-high,0.455,1.3874,1.9742,1.9068,22.2134,27.061,,4.7384,4.8503,6.1639,6.2153,3.7762,,4.5995,2.0256,4.598,4.9272,1.6977,1.0306,3.9217,,0.5961,12.3162 +gpt-5.2-codex,1.2291,1.5988,1.8486,4.3947,22.8094,24.3917,,7.0443,5.0861,6.762,3.7657,5.8009,,3.3745,3.2051,3.6011,3.424,5.8971,1.1831,13.3448,,0.5076,10.5383 +gpt-5.3-codex-high,0.7865,1.0061,1.5207,1.6306,5.5683,13.8047,,4.2924,2.8525,3.1053,3.5004,4.5567,,2.8114,1.7137,2.5051,2.6617,2.8453,1.3109,4.5989,,0.7884,4.8932 +gpt-5.4-xhigh,2.6687,10.6719,20.5204,7.9421,29.8797,62.4069,,21.3757,10.8401,14.2973,27.31,18.1197,,23.2705,7.9064,21.6154,23.9794,19.6961,5.6004,25.3987,,2.4179,23.4908 +gpt-5.5-xhigh,1.5302,5.6389,12.3428,4.7743,47.4464,51.423,,14.0512,6.3185,10.4094,14.1513,22.1269,,11.1038,5.4863,10.5239,9.7203,9.5914,3.7872,15.8573,,1.5801,23.0294 +grok-build-0.1,0.0491,0.0369,0.0618,0.0298,0.334,0.156,,0.1853,0.119,0.109,0.0474,0.1144,,0.0416,0.0464,0.0467,0.044,0.1049,0.1965,0.077,,0.0406,0.1095 +kimi-k2.6-thinking,0.8528,1.3505,1.9175,5.2211,12.5083,21.9097,,4.2483,4.5579,5.6148,3.6765,4.1842,,3.9214,3.1094,3.0444,2.7652,5.2091,1.0471,4.663,,0.8503,6.0012 +kimi-k2.7-code,0.3807,0.8355,0.8216,1.3963,10.158,11.4094,,2.3841,2.1515,3.1487,2.7538,3.4542,,2.5998,1.3815,1.9603,2.1367,2.6118,0.5836,3.2884,,0.3385,3.7352 +minimax-m3,0.2187,0.3168,0.422,0.7462,3.4049,3.674,,0.5763,0.9437,0.2939,1.1241,0.6972,,0.9813,0.5243,0.8252,0.8864,0.7257,0.1823,0.4291,,0.1648,2.3803 +nemotron-3-ultra-550b-a55b,0.3803,0.3921,0.6255,1.4881,2.1573,4.0329,,0.314,0.9438,0.2226,1.3502,1.3248,,1.3503,0.5632,1.2476,1.3198,1.5851,0.4101,0.5208,,0.2795,0.0869 +qwen3.7-max,1.6922,1.0987,2.0062,1.997,13.7477,36.0024,,5.3916,6.0642,2.3137,4.2576,2.4092,,4.0372,4.4381,3.9201,3.603,1.8632,2.5036,1.5234,,1.112,9.2771 diff --git a/src/App.css b/src/App.css index 95906ef..a1e787d 100644 --- a/src/App.css +++ b/src/App.css @@ -208,6 +208,10 @@ td { td { padding: 6px; } + + .cost { + display: none; + } } /* Style adjustments for medium screens */ @@ -242,6 +246,19 @@ td { } .other-controls { + display: flex; + flex-wrap: wrap; + align-items: center; + justify-content: center; + align-self: stretch; + row-gap: 0.5rem; + margin-bottom: 0.5rem; +} + +.clear-filters-row { + display: flex; + justify-content: center; + align-self: stretch; margin-bottom: 1rem; } @@ -251,7 +268,6 @@ td { border: none; padding: 0.5rem 1rem; cursor: pointer; - margin-left: 1rem; } .section { @@ -302,4 +318,13 @@ thead th:first-child { .liveswebench-callout:hover { background-color: rgb(122, 220, 233); -} \ No newline at end of file +} + +.cost { + display: block; + margin-top: 2px; + font-size: 0.78em; + color: #888; + font-variant-numeric: tabular-nums; + white-space: nowrap; +} diff --git a/src/Table/Averaging.js b/src/Table/Averaging.js index 569ac12..2a8bca0 100644 --- a/src/Table/Averaging.js +++ b/src/Table/Averaging.js @@ -27,3 +27,9 @@ export const calculateAverage = (row, columns, fixedSize) => { const average = validValues.length > 0 ? validValues.reduce((a, b) => a + b, 0) / validValues.length : NaN; return isNaN(average) ? '-' : fixedSize ? average.toFixed(fixedSize) : average; }; + +export const sumColumns = (row, columns) => { + if (!row || !columns) return null; + const validValues = columns.map(col => parseFloat(row[col])).filter(val => !isNaN(val)); + return validValues.length > 0 ? validValues.reduce((a, b) => a + b, 0) : null; +}; diff --git a/src/Table/CSVTable.jsx b/src/Table/CSVTable.jsx index 7afd893..4519558 100644 --- a/src/Table/CSVTable.jsx +++ b/src/Table/CSVTable.jsx @@ -1,7 +1,7 @@ // src/Table/CSVTable.jsx import React, { useState, useEffect, useMemo, useCallback } from 'react'; import Papa from 'papaparse'; -import { calculateAverage, getGlobalAverage } from './Averaging'; +import { calculateAverage, getGlobalAverage, getGlobalAverageColumns, sumColumns } from './Averaging'; import { useTable } from "./SortTable"; import { getModelInfo, getVariantGroup } from './modelLinks'; import { useSearchParams } from 'react-router-dom'; @@ -11,6 +11,7 @@ import Select from 'react-select'; const CSVTable = ({dateStr}) => { const date = new Date(dateStr).toISOString().split('T')[0].replaceAll('-', '_'); const [data, setData] = useState([]); + const [cost, setCost] = useState({}); const [categories, setCategories] = useState({}); const [checkedCategories, setCheckedCategories] = useState({}); const [screenWidth, setScreenWidth] = useState(window.innerWidth); @@ -23,8 +24,9 @@ const CSVTable = ({dateStr}) => { const [showOpenWeights, setShowOpenWeights] = useState(false); const [showVariants, setShowVariants] = useState(false); const [showHighUnseenBias, setShowHighUnseenBias] = useState(true); + const [showCost, setShowCost] = useState(false); - const updateURL = (checkedCategories, newFilter, newSortField = null, newSortOrder = null, newShowProvider = null, newShowApiName = null, newShowReasoners = null, newShowOpenWeights = null, newShowVariants = null, newShowHighUnseenBias = null, newSearchQuery = null) => { + const updateURL = (checkedCategories, newFilter, newSortField = null, newSortOrder = null, newShowProvider = null, newShowApiName = null, newShowReasoners = null, newShowOpenWeights = null, newShowVariants = null, newShowHighUnseenBias = null, newSearchQuery = null, newShowCost = null) => { const params = new URLSearchParams(); let allAverages = true; @@ -72,6 +74,7 @@ const CSVTable = ({dateStr}) => { const effectiveShowOpenWeights = newShowOpenWeights !== null ? newShowOpenWeights : showOpenWeights; const effectiveShowVariants = newShowVariants !== null ? newShowVariants : showVariants; const effectiveShowHighUnseenBias = newShowHighUnseenBias !== null ? newShowHighUnseenBias : showHighUnseenBias; + const effectiveShowCost = newShowCost !== null ? newShowCost : showCost; if (!effectiveShowProvider) params.set('provider', 'false'); if (effectiveShowApiName) params.set('api', 'true'); @@ -79,6 +82,7 @@ const CSVTable = ({dateStr}) => { if (effectiveShowOpenWeights) params.set('openweight', 'true'); if (effectiveShowVariants) params.set('variants', 'true'); if (effectiveShowHighUnseenBias) params.set('highunseenbias', 'true'); + if (effectiveShowCost) params.set('cost', 'true'); if (allAverages && !anySubcategories) { const newParams = new URLSearchParams(); @@ -104,6 +108,7 @@ const CSVTable = ({dateStr}) => { if (effectiveShowOpenWeights) newParams.set('openweight', 'true'); if (effectiveShowVariants) newParams.set('variants', 'true'); if (effectiveShowHighUnseenBias) newParams.set('highunseenbias', 'true'); + if (effectiveShowCost) newParams.set('cost', 'true'); setSearchParams(newParams); return; } @@ -158,6 +163,7 @@ const CSVTable = ({dateStr}) => { const [sortedData, handleSorting, handleSearch, handleFilter, sortField, sortOrder, searchQuery, filter] = useTable(data, columns, checkedCategories, categories, 'model', getModelInfo); useEffect(() => { + setCost({}); fetch(process.env.PUBLIC_URL + `/table_${date}.csv`) .then(response => response.text()) .then(text => { @@ -171,6 +177,28 @@ const CSVTable = ({dateStr}) => { }); }); + fetch(process.env.PUBLIC_URL + `/cost_${date}.csv`) + .then(response => (response.ok ? response.text() : null)) + .then(text => { + if (!text || !text.startsWith('model,')) { + setCost({}); + return; + } + Papa.parse(text, { + header: true, + dynamicTyping: true, + skipEmptyLines: true, + complete: (result) => { + const map = {}; + result.data.forEach(r => { + if (r && r.model != null) map[String(r.model).toLowerCase()] = r; + }); + setCost(map); + } + }); + }) + .catch(() => setCost({})); + fetch(process.env.PUBLIC_URL + `/categories_${date}.json`) .then(response => response.json()) .then(json => { @@ -239,6 +267,9 @@ const CSVTable = ({dateStr}) => { } else if (key === 'highunseenbias') { setShowHighUnseenBias(value === 'true'); return; + } else if (key === 'cost') { + setShowCost(value === 'true'); + return; } else if (Object.keys(categories).includes(key)) { if (value.includes('a')) { updatedCategories[key].average = true; @@ -285,7 +316,7 @@ const CSVTable = ({dateStr}) => { return; } updateURL(checkedCategories, filter); - }, [showProvider, showApiName, showReasoners, showOpenWeights, showVariants, showHighUnseenBias]); + }, [showProvider, showApiName, showReasoners, showOpenWeights, showVariants, showHighUnseenBias, showCost]); const handleCheckboxChange = (clickedCategory, type) => { @@ -421,9 +452,10 @@ const CSVTable = ({dateStr}) => { setShowOpenWeights(false); setShowVariants(false); setShowHighUnseenBias(true); + setShowCost(false); // Update URL with default values (including empty search) - updateURL(defaultCategories, {}, 'ga', 'desc', true, false, true, false, false, false, ''); + updateURL(defaultCategories, {}, 'ga', 'desc', true, false, true, false, false, false, '', false); } // Utility to compute class for sorting @@ -433,6 +465,15 @@ const CSVTable = ({dateStr}) => { const numCheckedCategories = Object.values(checkedCategories).filter(cat => cat.average || cat.allSubcategories).length; + const hasCost = Object.keys(cost).length > 0; + const costOn = showCost && hasCost; + const fmtCost = (v) => { + if (v == null || isNaN(v)) return '—'; + if (v >= 0.1) return `$${v.toFixed(2)}`; + if (v >= 0.01) return `$${v.toFixed(3)}`; + return `$${v.toFixed(4)}`; + }; + const modelProviders = Array.from(new Set(data.map(row => getModelInfo(row.model)?.organization ?? 'Unknown'))).sort(); // Create a map to identify models with duplicate display names @@ -559,6 +600,12 @@ const CSVTable = ({dateStr}) => { setShowHighUnseenBias(!showHighUnseenBias)} id="showHighUnseenBias" /> Show High Unseen Question Bias Models + {hasCost && } + +