diff --git a/scripts/test-emoji-resolution.ts b/scripts/test-emoji-resolution.ts new file mode 100644 index 00000000..780974ad --- /dev/null +++ b/scripts/test-emoji-resolution.ts @@ -0,0 +1,286 @@ +/** + * Diagnostic script: test category emoji resolution against real production data. + * + * For each unique expense category in the DB, shows whether it gets an exact + * emoji match or needs HF semantic matching. When HF_TOKEN is set, actually + * calls the model and displays the matched key + cosine score. + * + * Outputs a full table with: category, count, expected emoji/key (from the + * expectations map below), actual result, and match/mismatch status. + * + * Usage (on server): + * bun run scripts/test-emoji-resolution.ts + * bun run scripts/test-emoji-resolution.ts --dry-run # skip HF calls + */ +import { Database } from 'bun:sqlite'; +import { InferenceClient } from '@huggingface/inference'; +import { + CATEGORY_EMOJIS, + DEFAULT_CATEGORY_EMOJI, + getCategoryEmoji, +} from '../src/config/category-emojis'; + +const SIMILARITY_MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'; +const SIMILARITY_THRESHOLD = 0.5; + +// ────────────────────────────────────────────────────────────────────── +// Expectations table: category → { emoji, matchedKey } we expect the +// resolver to produce. For exact matches matchedKey equals the category +// itself. For HF matches it's the closest known key we anticipate. +// Add new entries here as real prod categories evolve. 
+// ────────────────────────────────────────────────────────────────────── +const EXPECTATIONS: Record = { + // Exact matches (category name exists verbatim in CATEGORY_EMOJIS) + Продукты: { emoji: '🛒', key: 'Продукты' }, + Транспорт: { emoji: '🚗', key: 'Транспорт' }, + Здоровье: { emoji: '💊', key: 'Здоровье' }, + Развлечения: { emoji: '🎮', key: 'Развлечения' }, + Кафе: { emoji: '☕', key: 'Кафе' }, + Ресторан: { emoji: '🍽️', key: 'Ресторан' }, + Такси: { emoji: '🚕', key: 'Такси' }, + Одежда: { emoji: '👕', key: 'Одежда' }, + Подписки: { emoji: '🔄', key: 'Подписки' }, + Аптека: { emoji: '💊', key: 'Аптека' }, + Красота: { emoji: '💄', key: 'Красота' }, + Подарки: { emoji: '🎁', key: 'Подарки' }, + Коммуналка: { emoji: '💡', key: 'Коммуналка' }, + Бензин: { emoji: '⛽', key: 'Бензин' }, + Каршеринг: { emoji: '🚙', key: 'Каршеринг' }, + Алкоголь: { emoji: '🍷', key: 'Алкоголь' }, + Доставка: { emoji: '🛵', key: 'Доставка' }, + Образование: { emoji: '📚', key: 'Образование' }, + Фитнес: { emoji: '💪', key: 'Фитнес' }, + Путешествия: { emoji: '✈️', key: 'Путешествия' }, + 'Без категории': { emoji: '💰', key: 'Без категории' }, + + // HF semantic matches (category not in CATEGORY_EMOJIS, resolved via model) + 'Еда вне дома': { emoji: '🍔', key: 'Еда' }, + Настолки: { emoji: '🎯', key: 'Игры' }, + Стриминг: { emoji: '🔄', key: 'Подписки' }, + Маршрутка: { emoji: '🚌', key: 'Общественный транспорт' }, + Зубной: { emoji: '🦷', key: 'Стоматолог' }, + Уборка: { emoji: '🧹', key: 'Хозтовары' }, + Косметика: { emoji: '💄', key: 'Красота' }, + Бассейн: { emoji: '💪', key: 'Фитнес' }, + 'Корм для кота': { emoji: '🐾', key: 'Питомцы' }, + Подкасты: { emoji: '🔄', key: 'Подписки' }, + Коворкинг: { emoji: '💼', key: 'Работа' }, + Самокат: { emoji: '🚗', key: 'Транспорт' }, + Витамины: { emoji: '💊', key: 'Аптека' }, + Кальян: { emoji: '🍻', key: 'Бар' }, + Цветы: { emoji: '🎁', key: 'Подарки' }, + Штрафы: { emoji: '🧾', key: 'Налоги' }, + Донаты: { emoji: '❤️', key: 'Благотворительность' }, +}; + +// 
────────────────────────────────────────────────────────────────────── + +const dryRun = process.argv.includes('--dry-run'); +const dbPath = process.env['DATABASE_PATH'] || './data/expenses.db'; + +const db = new Database(dbPath, { readonly: true }); + +interface CategoryRow { + category: string; + cnt: number; +} + +const rows = db + .query( + 'SELECT DISTINCT category, COUNT(*) as cnt FROM expenses GROUP BY category ORDER BY cnt DESC', + ) + .all(); + +if (rows.length === 0) { + console.log('\n No expenses in DB.\n'); + process.exit(0); +} + +const keys = Object.keys(CATEGORY_EMOJIS); +const token = process.env['HF_TOKEN']; +const client = token && !dryRun ? new InferenceClient(token) : null; + +// ── Resolve every category ────────────────────────────────────────── + +interface ResolvedCategory { + category: string; + count: number; + method: 'exact' | 'hf' | 'default' | 'skip'; + emoji: string; + matchedKey: string | null; + score: number | null; +} + +const results: ResolvedCategory[] = []; + +for (const { category, cnt } of rows) { + const exact = getCategoryEmoji(category); + + if (exact !== DEFAULT_CATEGORY_EMOJI) { + results.push({ + category, + count: cnt, + method: 'exact', + emoji: exact, + matchedKey: category, + score: null, + }); + continue; + } + + if (!client) { + results.push({ + category, + count: cnt, + method: 'skip', + emoji: DEFAULT_CATEGORY_EMOJI, + matchedKey: null, + score: null, + }); + continue; + } + + try { + const scores = await client.sentenceSimilarity({ + model: SIMILARITY_MODEL, + inputs: { source_sentence: category, sentences: keys }, + }); + + let bestIdx = -1; + let bestScore = Number.NEGATIVE_INFINITY; + for (let i = 0; i < scores.length; i++) { + const s = scores[i]; + if (typeof s === 'number' && s > bestScore) { + bestScore = s; + bestIdx = i; + } + } + + if (bestIdx >= 0 && bestScore >= SIMILARITY_THRESHOLD) { + const key = keys[bestIdx] ?? 
''; + results.push({ + category, + count: cnt, + method: 'hf', + emoji: CATEGORY_EMOJIS[key] ?? DEFAULT_CATEGORY_EMOJI, + matchedKey: key, + score: bestScore, + }); + } else { + results.push({ + category, + count: cnt, + method: 'default', + emoji: DEFAULT_CATEGORY_EMOJI, + matchedKey: null, + score: bestScore, + }); + } + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error(` ❌ HF error for "${category}": ${msg}`); + results.push({ + category, + count: cnt, + method: 'default', + emoji: DEFAULT_CATEGORY_EMOJI, + matchedKey: null, + score: null, + }); + } +} + +db.close(); + +// ── Print full table ──────────────────────────────────────────────── + +const COL = { + cat: 'Category', + cnt: 'Count', + method: 'Method', + actual: 'Actual', + key: 'Matched Key', + score: 'Score', + expected: 'Expected', + status: 'Status', +}; + +// Calculate column widths +const catW = Math.max(COL.cat.length, ...results.map((r) => r.category.length)); +const cntW = Math.max(COL.cnt.length, ...results.map((r) => String(r.count).length)); +const methodW = Math.max(COL.method.length, 7); +const actualW = Math.max(COL.actual.length, 4); +const keyW = Math.max(COL.key.length, ...results.map((r) => (r.matchedKey ?? 
'—').length)); +const scoreW = Math.max(COL.score.length, 5); +const expectedW = Math.max(COL.expected.length, 10); +const statusW = Math.max(COL.status.length, 4); + +function pad(s: string, w: number): string { + return s.padEnd(w); +} +function rpad(s: string, w: number): string { + return s.padStart(w); +} + +const sep = `${'─'.repeat(catW + 2)}┼${'─'.repeat(cntW + 2)}┼${'─'.repeat(methodW + 2)}┼${'─'.repeat(actualW + 2)}┼${'─'.repeat(keyW + 2)}┼${'─'.repeat(scoreW + 2)}┼${'─'.repeat(expectedW + 2)}┼${'─'.repeat(statusW + 2)}`; + +console.log(`\n=== Emoji Resolution Report (${results.length} categories) ===\n`); + +// Header +console.log( + ` ${pad(COL.cat, catW)} │ ${rpad(COL.cnt, cntW)} │ ${pad(COL.method, methodW)} │ ${pad(COL.actual, actualW)} │ ${pad(COL.key, keyW)} │ ${rpad(COL.score, scoreW)} │ ${pad(COL.expected, expectedW)} │ ${pad(COL.status, statusW)} `, +); +console.log(sep); + +let matchCount = 0; +let mismatchCount = 0; +let unknownCount = 0; + +for (const r of results) { + const exp = EXPECTATIONS[r.category]; + let expectedStr: string; + let statusStr: string; + + if (!exp) { + expectedStr = '?'; + statusStr = '—'; + unknownCount++; + } else if (r.emoji === exp.emoji) { + expectedStr = `${exp.emoji} ${exp.key}`; + statusStr = '✅'; + matchCount++; + } else { + expectedStr = `${exp.emoji} ${exp.key}`; + statusStr = '❌'; + mismatchCount++; + } + + const scoreStr = r.score !== null ? r.score.toFixed(3) : '—'; + + console.log( + ` ${pad(r.category, catW)} │ ${rpad(String(r.count), cntW)} │ ${pad(r.method, methodW)} │ ${pad(r.emoji, actualW)} │ ${pad(r.matchedKey ?? 
'—', keyW)} │ ${rpad(scoreStr, scoreW)} │ ${pad(expectedStr, expectedW)} │ ${pad(statusStr, statusW)} `, + ); +} + +// ── Summary ───────────────────────────────────────────────────────── + +const exactCount = results.filter((r) => r.method === 'exact').length; +const hfCount = results.filter((r) => r.method === 'hf').length; +const defaultCount = results.filter((r) => r.method === 'default').length; +const skipCountVal = results.filter((r) => r.method === 'skip').length; + +console.log(`\n=== Summary ===`); +console.log(` Total categories: ${results.length}`); +console.log(` Exact match: ${exactCount}`); +if (client) { + console.log(` HF matched: ${hfCount}`); + console.log(` HF miss (default):${defaultCount}`); +} else { + console.log(` Skipped (${dryRun ? 'dry-run' : 'no HF_TOKEN'}): ${skipCountVal}`); +} +console.log(); +console.log(` vs expectations:`); +console.log(` ✅ Match: ${matchCount}`); +console.log(` ❌ Mismatch: ${mismatchCount}`); +console.log(` — Unknown: ${unknownCount} (not in expectations table)`); +console.log(); diff --git a/src/bot/handlers/callback.handler.ts b/src/bot/handlers/callback.handler.ts index 67e55aa7..8fc1fc31 100644 --- a/src/bot/handlers/callback.handler.ts +++ b/src/bot/handlers/callback.handler.ts @@ -1256,7 +1256,7 @@ async function handleReceiptSummaryAction( break; case 'bulk_edit': - await handleReceiptBulkEdit(ctx, queueItem, bot, messageId, chatId); + await handleReceiptBulkEdit(ctx, queueItem, group, bot, messageId, chatId); break; case 'itemwise': @@ -1329,6 +1329,7 @@ async function handleReceiptAcceptAll( async function handleReceiptBulkEdit( ctx: Ctx['CallbackQuery'], queueItem: PhotoQueueItem, + group: Group, bot: BotInstance, messageId?: number, chatId?: number, @@ -1350,7 +1351,7 @@ async function handleReceiptBulkEdit( ); const summary = buildSummaryFromItems(items); - const summaryText = formatSummaryMessage(summary, items.length); + const summaryText = await formatSummaryMessage(summary, group.id); const 
message = `${summaryText}\n\n✏️ Напишите корректировку текстом, например:\nперенеси салфетки в Хозтовары`; diff --git a/src/bot/handlers/message.handler.ts b/src/bot/handlers/message.handler.ts index 008a62b6..2cdff19b 100644 --- a/src/bot/handlers/message.handler.ts +++ b/src/bot/handlers/message.handler.ts @@ -597,7 +597,7 @@ async function handleBulkCorrectionInput( }); // Format result message - const summaryText = formatSummaryMessage(newSummary, items.length); + const summaryText = await formatSummaryMessage(newSummary, group.id); const message = `${summaryText}\n\n✅ Корректировка применена!`; // Always send NEW message with result and buttons diff --git a/src/bot/services/expense-saver.ts b/src/bot/services/expense-saver.ts index 806c586f..9dbd74d4 100644 --- a/src/bot/services/expense-saver.ts +++ b/src/bot/services/expense-saver.ts @@ -6,6 +6,10 @@ import { database } from '../../database'; import { sendMessage } from '../../services/bank/telegram-sender'; import { convertCurrency, formatAmount, getExchangeRate } from '../../services/currency/converter'; import { googleConn } from '../../services/google/sheets'; +import { + buildReceiptSummaryMessage, + type ReceiptSummaryItem, +} from '../../services/receipt/summary-message'; import { createLogger } from '../../utils/logger.ts'; import { buildMiniAppUrl } from '../../utils/miniapp-url'; import { silentSyncBudgets } from './budget-sync'; @@ -286,9 +290,17 @@ export async function saveReceiptExpenses( // Delete all processed receipt items (confirmed + skipped) database.receiptItems.deleteProcessedByPhotoQueueId(photoQueueId); - // Notify user - const totalItems = confirmedItems.length; - const totalCategories = itemsByCategory.size; + // Notify user using the shared summary builder + const summaryItems: ReceiptSummaryItem[] = confirmedItems + .filter((item) => item.confirmed_category !== null) + .map((item) => ({ + name: item.name_ru, + qty: item.quantity, + price: item.price, + total: item.total, + 
+ category: item.confirmed_category as string, + currency: item.currency, + })); const miniAppUrl = buildMiniAppUrl('scanner', group.telegram_group_id); const scanButton = miniAppUrl @@ -296,9 +308,11 @@ export async function saveReceiptExpenses( : undefined; await sendMessage( - `✅ Чек обработан!\n📦 Товаров: ${totalItems}\n📂 Категорий: ${totalCategories}`, + await buildReceiptSummaryMessage(summaryItems, group.id), scanButton ? { reply_markup: scanButton } : undefined, ); - logger.info(`[RECEIPT] Saved ${totalItems} items from receipt (${totalCategories} categories)`); + logger.info( + `[RECEIPT] Saved ${confirmedItems.length} items from receipt (${itemsByCategory.size} categories)`, + ); } diff --git a/src/config/category-emojis.ts b/src/config/category-emojis.ts index 02b41cc2..5d34ad3b 100644 --- a/src/config/category-emojis.ts +++ b/src/config/category-emojis.ts @@ -1,132 +1,262 @@ /** - * Emoji mappings for common expense categories - * Used in budget and expense displays + * Emoji mappings for common expense categories. + * Category names in this project are user-defined per group, so this is just + * a best-effort lookup for common Russian/English names. Unknown categories + * fall back to the default emoji in getCategoryEmoji(). + * + * Rule of thumb for what belongs here: realistic budget lines users actually + * track (Каршеринг, Страховка, Мебель, Ветеринар). Not items or one-off + * activities (Фрукты, Завтрак) — those fall under broader + * categories like "Продукты" or "Развлечения". 
*/ export const CATEGORY_EMOJIS: Record = { // Food & Dining Еда: '🍔', Продукты: '🛒', - Ресторан: '🍽️', Кафе: '☕', + Ресторан: '🍽️', + Бар: '🍻', + Кофе: '☕', + Алкоголь: '🍷', + Доставка: '🛵', Food: '🍔', Groceries: '🛒', - Restaurant: '🍽️', Cafe: '☕', + Restaurant: '🍽️', + Bar: '🍻', + Coffee: '☕', + Alcohol: '🍷', + Delivery: '🛵', - // Transportation + // Transport Транспорт: '🚗', Такси: '🚕', Бензин: '⛽', Парковка: '🅿️', + Авто: '🚗', + Машина: '🚗', + Автосервис: '🔧', + Каршеринг: '🚙', + Метро: '🚇', + 'Общественный транспорт': '🚌', Transport: '🚗', Taxi: '🚕', Gas: '⛽', Parking: '🅿️', + Car: '🚗', + CarService: '🔧', + Carsharing: '🚙', + Metro: '🚇', + PublicTransport: '🚌', // Entertainment Развлечения: '🎮', Кино: '🎬', Игры: '🎯', + Хобби: '🎨', + Подписки: '🔄', + Концерт: '🎤', + Музыка: '🎵', Entertainment: '🎮', Movies: '🎬', Games: '🎯', + Hobby: '🎨', + Subscriptions: '🔄', + Concert: '🎤', + Music: '🎵', // Health Здоровье: '💊', Аптека: '💊', Врач: '⚕️', + Стоматолог: '🦷', Спорт: '⚽', + Фитнес: '💪', Health: '💊', Pharmacy: '💊', Doctor: '⚕️', + Dentist: '🦷', Sport: '⚽', + Fitness: '💪', Gym: '💪', // Shopping Одежда: '👕', Обувь: '👟', Покупки: '🛍️', + Аксессуары: '👜', Clothes: '👕', Shoes: '👟', Shopping: '🛍️', + Accessories: '👜', // Housing + Жильё: '🏠', Жилье: '🏠', + Дом: '🏠', + Квартира: '🏡', Аренда: '🏡', - Коммуналка: '🔌', + Коммуналка: '💡', + Быт: '🧹', Ремонт: '🔧', + Мебель: '🛋️', + Хозтовары: '🧹', + 'Бытовая химия': '🧴', Housing: '🏠', + Home: '🏠', + Apartment: '🏡', Rent: '🏡', - Utilities: '🔌', + Utilities: '💡', Repair: '🔧', + Furniture: '🛋️', + Household: '🧹', // Personal - Личное: '👤', - Подарки: '🎁', Красота: '💄', - Personal: '👤', - Gifts: '🎁', + Подарки: '🎁', + Личное: '👤', + Личные: '👤', + Парикмахер: '💇', + Салон: '💅', Beauty: '💄', + Gifts: '🎁', + Personal: '👤', + Hairdresser: '💇', + Salon: '💅', // Education Образование: '📚', Книги: '📖', Курсы: '🎓', + Школа: '🏫', + Университет: '🎓', Education: '📚', Books: '📖', Courses: '🎓', + School: '🏫', + University: '🎓', - // Technology + 
// Tech & Communication Техника: '💻', Гаджеты: '📱', - Софт: '💿', + Электроника: '🔌', + Связь: '📱', + Интернет: '🌐', + Телефон: '📱', Tech: '💻', Gadgets: '📱', - Software: '💿', + Electronics: '🔌', + Mobile: '📱', + Internet: '🌐', + Phone: '📱', // Travel Путешествия: '✈️', + Экскурсии: '🧭', + Экскурсия: '🧭', Отель: '🏨', Билеты: '🎫', Travel: '✈️', Hotel: '🏨', Tickets: '🎫', + Excursion: '🧭', - // Family & Kids + // Family & Pets Дети: '👶', Семья: '👨‍👩‍👧', + Игрушки: '🧸', + Питомцы: '🐾', + Ветеринар: '🐾', + // Specific pets — separate emoji per species + Кот: '🐈', + Кошка: '🐈', + Коты: '🐈', + Кошки: '🐈', + Кис: '🐈', + Собака: '🐕‍🦺', + Собаки: '🐕‍🦺', + Пёс: '🐕‍🦺', + Пес: '🐕‍🦺', + Кролик: '🐇', + Зайка: '🐇', + Хомяк: '🐹', + Рыбки: '🐠', + Аквариум: '🐠', + Попугай: '🦜', + Птица: '🦜', + Черепаха: '🐢', Kids: '👶', Family: '👨‍👩‍👧', - - // Pets - Питомцы: '🐾', + Toys: '🧸', Pets: '🐾', + Vet: '🐾', + Cat: '🐈', + Dog: '🐕‍🦺', + Rabbit: '🐇', + Hamster: '🐹', + Fish: '🐠', + Parrot: '🦜', + Turtle: '🐢', + + // Work & Finance + Работа: '💼', + Офис: '💼', + Банк: '🏦', + Налоги: '🧾', + Страховка: '🛡️', + Кредит: '💳', + Кредитка: '💳', + Инвестиции: '📈', + Резерв: '📈', + Благотворительность: '❤️', + Work: '💼', + Office: '💼', + Bank: '🏦', + Taxes: '🧾', + Insurance: '🛡️', + Credit: '💳', + Investments: '📈', + Charity: '❤️', // Other - Другое: '📦', - Разное: '📦', - Other: '📦', - Misc: '📦', + Другое: '💸', + Прочее: '💸', + Разное: '💸', + Затраты: '💸', + 'Без категории': '💰', + Other: '💸', + Misc: '💸', + Uncategorized: '💰', }; +/** Default emoji returned when no match can be found. */ +export const DEFAULT_CATEGORY_EMOJI = '💰'; + /** - * Get emoji for category name (case-insensitive) - * Returns default emoji if category not found + * Look up emoji for a category name by exact (case-insensitive) match. + * Returns null if no exact key matches — lets callers distinguish + * "found" from "default fallback". 
*/ -export function getCategoryEmoji(category: string): string { - // Try exact match first +function lookupExact(category: string): { emoji: string; key: string } | null { if (CATEGORY_EMOJIS[category]) { - return CATEGORY_EMOJIS[category]; + return { emoji: CATEGORY_EMOJIS[category], key: category }; } - // Try case-insensitive match const lowerCategory = category.toLowerCase(); for (const [key, emoji] of Object.entries(CATEGORY_EMOJIS)) { if (key.toLowerCase() === lowerCategory) { - return emoji; + return { emoji, key }; } } - // Default emoji for unknown categories - return '💰'; + return null; +} + +/** + * Synchronous emoji lookup by exact match. Falls back to the default emoji. + * Used in hot paths (budget, sum commands) where async isn't practical. + * For semantic fallback via HF, use resolveCategoryEmoji instead. + */ +export function getCategoryEmoji(category: string): string { + return lookupExact(category)?.emoji ?? DEFAULT_CATEGORY_EMOJI; } diff --git a/src/database/index.ts b/src/database/index.ts index 02a5b939..95bb76d5 100644 --- a/src/database/index.ts +++ b/src/database/index.ts @@ -7,6 +7,7 @@ import { BankCredentialsRepository } from './repositories/bank-credentials.repos import { BankTransactionsRepository } from './repositories/bank-transactions.repository'; import { type BudgetReadRepository, BudgetRepository } from './repositories/budget.repository'; import { CategoryRepository } from './repositories/category.repository'; +import { CategoryEmojiCacheRepository } from './repositories/category-emoji-cache.repository'; import { ChatMessageRepository } from './repositories/chat-message.repository'; import { DevTaskRepository } from './repositories/dev-task.repository'; import { ExpenseRepository } from './repositories/expense.repository'; @@ -31,6 +32,7 @@ export class DatabaseService { public groupSpreadsheets: GroupSpreadsheetRepository; public users: UserRepository; public categories: CategoryRepository; + public categoryEmojiCache: 
CategoryEmojiCacheRepository; public pendingExpenses: PendingExpenseRepository; public expenses: ExpenseRepository; public budgets: BudgetReadRepository; @@ -55,6 +57,7 @@ export class DatabaseService { this.groupSpreadsheets = new GroupSpreadsheetRepository(this.db); this.users = new UserRepository(this.db); this.categories = new CategoryRepository(this.db); + this.categoryEmojiCache = new CategoryEmojiCacheRepository(this.db); this.pendingExpenses = new PendingExpenseRepository(this.db); this.expenses = new ExpenseRepository(this.db); this._budgetWriter = new BudgetRepository(this.db); diff --git a/src/database/repositories/category-emoji-cache.repository.ts b/src/database/repositories/category-emoji-cache.repository.ts new file mode 100644 index 00000000..d4c9c3d3 --- /dev/null +++ b/src/database/repositories/category-emoji-cache.repository.ts @@ -0,0 +1,48 @@ +/** Category emoji cache repository — stores LLM-resolved emoji per group × category */ +import type { Database } from 'bun:sqlite'; + +interface CategoryEmojiCacheRow { + group_id: number; + category: string; + emoji: string; + matched_key: string | null; + created_at: string; +} + +export class CategoryEmojiCacheRepository { + constructor(private db: Database) {} + + /** + * Look up a cached emoji for (group, category). Category is case-insensitive. + * Returns null when nothing cached yet — caller decides whether to call the LLM. + */ + get(groupId: number, category: string): string | null { + const key = category.trim().toLowerCase(); + if (!key) return null; + + const query = this.db.query( + 'SELECT * FROM category_emoji_cache WHERE group_id = ? AND category = ?', + ); + const row = query.get(groupId, key); + return row?.emoji ?? null; + } + + /** + * Store a resolved emoji for (group, category). Upserts on conflict. + * matchedKey records which CATEGORY_EMOJIS entry (or virtual key) was chosen + * for debugging; pass null when the resolver fell back to the default emoji. 
+ */ + set(groupId: number, category: string, emoji: string, matchedKey: string | null): void { + const key = category.trim().toLowerCase(); + if (!key) return; + + const query = this.db.query(` + INSERT INTO category_emoji_cache (group_id, category, emoji, matched_key) + VALUES (?, ?, ?, ?) + ON CONFLICT(group_id, category) DO UPDATE SET + emoji = excluded.emoji, + matched_key = excluded.matched_key + `); + query.run(groupId, key, emoji, matchedKey); + } +} diff --git a/src/database/schema.ts b/src/database/schema.ts index f79fff12..ef63bb15 100644 --- a/src/database/schema.ts +++ b/src/database/schema.ts @@ -1194,6 +1194,24 @@ export function runMigrations(db: Database): void { } }, }, + { + name: '045_create_category_emoji_cache', + up: () => { + // Caches LLM-resolved emoji per group × category. Keyed on group_id so + // the same category name in different groups can resolve to different + // emojis (each group's custom_prompt may define distinct meanings). + db.exec(` + CREATE TABLE IF NOT EXISTS category_emoji_cache ( + group_id INTEGER NOT NULL, + category TEXT NOT NULL, + emoji TEXT NOT NULL, + matched_key TEXT, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (group_id, category) + ); + `); + }, + }, ]; // Check and run migrations diff --git a/src/services/receipt/category-emoji-resolver.ts b/src/services/receipt/category-emoji-resolver.ts new file mode 100644 index 00000000..16fc4c14 --- /dev/null +++ b/src/services/receipt/category-emoji-resolver.ts @@ -0,0 +1,200 @@ +/** + * Async category emoji resolver with LLM fallback. + * + * When a user-defined category has no exact entry in CATEGORY_EMOJIS, we ask + * an LLM to pick the best-matching known emoji key, optionally biased by the + * group's /prompt (participants, custom terms). Result is cached in SQLite + * keyed on (group_id, category) so the LLM is called once per unknown category + * per group. 
+ * + * Virtual keys are used for concepts that aren't in the static map: + * __person_man__ → 👨 (adult male name) + * __person_woman__ → 👩 (adult female name) + * __person_boy__ → 👦 (boy name) + * __person_girl__ → 👧 (girl name) + * __person_baby__ → 👶 (infant/toddler) + * __fallback__ → 💰 (unknown, no reasonable match) + */ +import { InferenceClient } from '@huggingface/inference'; +import { + CATEGORY_EMOJIS, + DEFAULT_CATEGORY_EMOJI, + getCategoryEmoji, +} from '../../config/category-emojis'; +import { env } from '../../config/env'; +import { database } from '../../database'; +import { createLogger } from '../../utils/logger.ts'; + +const logger = createLogger('category-emoji-resolver'); + +const client = new InferenceClient(env.HF_TOKEN); + +const LLM_MODEL = 'Qwen/Qwen3-32B'; +const LLM_PROVIDER = 'cerebras'; + +const VIRTUAL_EMOJIS: Record = { + __person_man__: '👨', + __person_woman__: '👩', + __person_boy__: '👦', + __person_girl__: '👧', + __person_baby__: '👶', + __fallback__: DEFAULT_CATEGORY_EMOJI, +}; + +/** + * Resolve emoji for a category name. Tries in order: + * 1. Exact match in CATEGORY_EMOJIS (case-insensitive) — returned fast. + * 2. Cached LLM resolution for this (group, category). + * 3. Fresh LLM call (if HF_TOKEN is configured). One retry on failure. + * 4. Default emoji. + * + * Paths 3-4 write to the cache, so repeat lookups never hit the LLM. 
+ */ +export async function resolveCategoryEmoji(category: string, groupId: number): Promise { + const trimmed = category.trim(); + if (!trimmed) return DEFAULT_CATEGORY_EMOJI; + + const exact = lookupExact(trimmed); + if (exact) return exact; + + const cached = database.categoryEmojiCache.get(groupId, trimmed); + if (cached) return cached; + + if (env.HF_TOKEN) { + const match = await matchWithLLM(trimmed, groupId); + if (match) { + database.categoryEmojiCache.set(groupId, trimmed, match.emoji, match.key); + return match.emoji; + } + } + + database.categoryEmojiCache.set(groupId, trimmed, DEFAULT_CATEGORY_EMOJI, null); + return DEFAULT_CATEGORY_EMOJI; +} + +/** + * Resolve emojis for many categories at once — useful when formatting a + * receipt summary. Deduplicates inputs before hitting cache/LLM. + */ +export async function resolveCategoryEmojis( + categories: readonly string[], + groupId: number, +): Promise> { + const result = new Map(); + const unique = Array.from(new Set(categories.map((c) => c.trim()).filter(Boolean))); + for (const cat of unique) { + result.set(cat, await resolveCategoryEmoji(cat, groupId)); + } + return result; +} + +function lookupExact(category: string): string | null { + const direct = CATEGORY_EMOJIS[category]; + if (direct) return direct; + + const lower = category.toLowerCase(); + for (const [key, emoji] of Object.entries(CATEGORY_EMOJIS)) { + if (key.toLowerCase() === lower) return emoji; + } + return null; +} + +async function matchWithLLM( + category: string, + groupId: number, +): Promise<{ emoji: string; key: string } | null> { + const knownKeys = Object.keys(CATEGORY_EMOJIS); + const virtualKeys = Object.keys(VIRTUAL_EMOJIS); + const allowedKeys = [...knownKeys, ...virtualKeys]; + + const group = database.groups.findById(groupId); + const customPrompt = group?.custom_prompt?.trim() ?? 
''; + + const systemPrompt = buildSystemPrompt(allowedKeys); + const userPrompt = buildUserPrompt(category, customPrompt); + + for (let attempt = 1; attempt <= 2; attempt++) { + try { + const response = await client.chatCompletion({ + provider: LLM_PROVIDER, + model: LLM_MODEL, + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: userPrompt }, + ], + max_tokens: 200, + temperature: 0.1, + }); + + const content = response.choices[0]?.message?.content?.trim() ?? ''; + if (!content) { + logger.warn({ data: { category, attempt } }, 'LLM returned empty content'); + continue; + } + + const cleaned = content.replace(/[\s\S]*?<\/think>/gi, '').trim(); + const jsonMatch = + cleaned.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/) ?? cleaned.match(/(\{[\s\S]*\})/); + if (!jsonMatch?.[1]) { + logger.warn( + { data: { category, attempt, content: cleaned.slice(0, 200) } }, + 'No JSON in LLM response', + ); + continue; + } + + const parsed = JSON.parse(jsonMatch[1]) as { matched_key?: unknown }; + const key = typeof parsed.matched_key === 'string' ? parsed.matched_key : ''; + const emoji = resolveKey(key); + if (!emoji) { + logger.warn( + { data: { category, attempt, key } }, + 'LLM picked a key not in the allowed set', + ); + continue; + } + + logger.info({ data: { category, key, emoji, attempt } }, 'LLM emoji match'); + return { emoji, key }; + } catch (err) { + logger.warn({ err, data: { category, attempt } }, 'LLM emoji resolve failed'); + } + } + + return null; +} + +function resolveKey(key: string): string | null { + if (!key) return null; + if (VIRTUAL_EMOJIS[key]) return VIRTUAL_EMOJIS[key]; + if (CATEGORY_EMOJIS[key]) return CATEGORY_EMOJIS[key]; + const lower = key.toLowerCase(); + for (const [k, emoji] of Object.entries(CATEGORY_EMOJIS)) { + if (k.toLowerCase() === lower) return emoji; + } + return null; +} + +function buildSystemPrompt(allowedKeys: string[]): string { + return `Ты подбираешь эмоджи для пользовательской категории трат. 
+Верни ТОЛЬКО JSON вида: {"matched_key": "<ключ>"} +Ключ должен быть ровно одним из списка разрешённых ниже. +Никакого текста вне JSON, никаких markdown-блоков, никаких пояснений. + +Правила выбора: +1. Если в контексте группы явно указано, чему соответствует категория — используй это (например, если написано «Ку — это коммунальные услуги», верни "Коммуналка"). +2. Если категория — имя человека: определи пол и возраст по контексту и имени, верни __person_man__ / __person_woman__ / __person_boy__ / __person_girl__ / __person_baby__. Без контекста — по стандартному значению имени (Алексей → мужчина, Елена → женщина). +3. Если категория — конкретный вид питомца (кот, собака, хомяк, рыбки и т.п.) — подбирай ключ этого вида (Кот, Собака, Хомяк, Рыбки). Для общего "питомцы" — Питомцы. +4. Если категория — бытовая статья расхода, совпадающая по смыслу с одним из ключей — выбери самый близкий (например, "Расходыквартиры" → "Коммуналка"). +5. Если совсем ничего не подходит — верни "__fallback__". + +Разрешённые ключи (выбирай ТОЧНО один из этих строк): +${allowedKeys.join(', ')}`; +} + +function buildUserPrompt(category: string, customPrompt: string): string { + const contextBlock = customPrompt ? 
`Контекст группы (/prompt):\n${customPrompt}\n\n` : ''; + return `${contextBlock}Категория: "${category}"\nВерни JSON с matched_key.`; +} + +export { getCategoryEmoji }; diff --git a/src/services/receipt/ocr-extractor.test.ts b/src/services/receipt/ocr-extractor.test.ts index 10e1b8d8..618f4732 100644 --- a/src/services/receipt/ocr-extractor.test.ts +++ b/src/services/receipt/ocr-extractor.test.ts @@ -5,12 +5,7 @@ import { afterEach, describe, expect, it, mock } from 'bun:test'; import path from 'node:path'; -import { - mockFetchError, - mockFetchJson, - mockFetchText, - restoreFetch, -} from '../../test-utils/mocks/fetch'; +import { mockFetchError, mockFetchJson, restoreFetch } from '../../test-utils/mocks/fetch'; import { createMockLogger } from '../../test-utils/mocks/logger'; const logMock = createMockLogger(); @@ -82,7 +77,18 @@ describe('extractTextFromImage', () => { it('throws when HuggingFace API returns 503 service unavailable', async () => { const { extractTextFromImage } = await import('./ocr-extractor'); - mockFetchText('Service Unavailable', 503); + // HF Inference SDK v4 retries infinitely on 503 (unbounded recursion bug in + // utils/request.js — `return innerRequest(...)` with no attempt counter). + // A mock that returns 503 forever → infinite retry loop → 60+ GB memory + // growth and machine crash. First call returns 503 to exercise the retry + // path; subsequent calls throw so the retry terminates and our code sees + // the failure as expected. 
+ let calls = 0; + globalThis.fetch = mock(async () => { + calls++; + if (calls === 1) return new Response('Service Unavailable', { status: 503 }); + throw new Error('simulated downstream failure after one 503'); + }) as unknown as typeof fetch; const fakeBuffer = Buffer.from('fake-image-data'); await expect(extractTextFromImage(fakeBuffer)).rejects.toThrow(); diff --git a/src/services/receipt/ocr-extractor.ts b/src/services/receipt/ocr-extractor.ts index 29e8cb7c..8bbf136d 100644 --- a/src/services/receipt/ocr-extractor.ts +++ b/src/services/receipt/ocr-extractor.ts @@ -17,7 +17,7 @@ export function startTempImageCleanup(): void { const CLEANUP_INTERVAL = 5 * 60 * 1000; // 5 minutes const MAX_AGE = 5 * 60 * 1000; // 5 minutes - setInterval(async () => { + const timer = setInterval(async () => { try { const tempDir = path.join(process.cwd(), 'temp-images'); @@ -54,6 +54,11 @@ export function startTempImageCleanup(): void { } }, CLEANUP_INTERVAL); + // Don't block the event loop from shutting down on this interval. + // Prod: process.exit is the terminator; tests: spec files can finish + // cleanly without hanging forever on the 5-minute timer. 
+ timer.unref?.(); + logger.info('[OCR_CLEANUP] Started periodic temp image cleanup (every 5 minutes)'); } diff --git a/src/services/receipt/photo-processor.ts b/src/services/receipt/photo-processor.ts index 08e4d77b..dfdbb3cf 100644 --- a/src/services/receipt/photo-processor.ts +++ b/src/services/receipt/photo-processor.ts @@ -358,7 +358,7 @@ export async function showReceiptConfirmationOptions( // Build summary from items const summary = buildSummaryFromItems(pendingItems); - const summaryMessage = formatSummaryMessage(summary, pendingItems.length); + const summaryMessage = await formatSummaryMessage(summary, group.id); // Store summary in photo queue database.photoQueue.update(photoQueueId, { diff --git a/src/services/receipt/receipt-summarizer.test.ts b/src/services/receipt/receipt-summarizer.test.ts index a81521b2..006006ea 100644 --- a/src/services/receipt/receipt-summarizer.test.ts +++ b/src/services/receipt/receipt-summarizer.test.ts @@ -2,8 +2,24 @@ // formatSummaryMessage, validateSummaryTotals, summaryToCategoryMap // applyCorrectionWithAI requires HuggingFace; tested separately via fetch mock -import { describe, expect, it } from 'bun:test'; +import { describe, expect, it, mock } from 'bun:test'; +import { getCategoryEmoji } from '../../config/category-emojis'; import type { ReceiptItem } from '../../database/types'; + +// Stub the async resolver used by summary-message — these tests care only +// that the flatten-then-delegate logic works, not about HF or the DB cache. 
+mock.module('./category-emoji-resolver', () => ({ + resolveCategoryEmoji: async (category: string) => getCategoryEmoji(category), + resolveCategoryEmojis: async (categories: readonly string[]) => { + const map = new Map(); + for (const c of categories) { + map.set(c, getCategoryEmoji(c)); + } + return map; + }, + getCategoryEmoji, +})); + import { buildSummaryFromItems, formatSummaryMessage, @@ -140,144 +156,46 @@ describe('buildSummaryFromItems', () => { }); describe('formatSummaryMessage', () => { - const simpleSummary: ReceiptSummary = { - categories: [ - { - name: 'Еда', - items: [ - { name: 'Молоко', total: 100 }, - { name: 'Хлеб', total: 50 }, - ], - }, - ], - totalAmount: 150, - currency: 'EUR', - }; - - describe('structure', () => { - it('starts with receipt header with item count', () => { - const msg = formatSummaryMessage(simpleSummary, 2); - expect(msg).toContain('2'); - expect(msg).toContain('🧾'); - }); - - it('includes total amount at end', () => { - const msg = formatSummaryMessage(simpleSummary, 2); - expect(msg).toContain('150.00'); - expect(msg).toContain('€'); - }); - - it('includes category names', () => { - const msg = formatSummaryMessage(simpleSummary, 2); - expect(msg).toContain('Еда'); - }); - - it('uses HTML bold tags for categories', () => { - const msg = formatSummaryMessage(simpleSummary, 2); - expect(msg).toContain(''); - expect(msg).toContain(''); - }); - - it('includes item names', () => { - const msg = formatSummaryMessage(simpleSummary, 2); - expect(msg).toContain('Молоко'); - expect(msg).toContain('Хлеб'); - }); - - it('returns a string', () => { - const msg = formatSummaryMessage(simpleSummary, 0); - expect(typeof msg).toBe('string'); - }); - }); - - describe('item display limits', () => { - it('shows all items when 3 or fewer', () => { - const summary: ReceiptSummary = { - categories: [ - { - name: 'Cat', - items: [ - { name: 'A', total: 1 }, - { name: 'B', total: 2 }, - { name: 'C', total: 3 }, - ], - }, - ], - totalAmount: 6, 
- currency: 'EUR', - }; - const msg = formatSummaryMessage(summary, 3); - expect(msg).toContain('A'); - expect(msg).toContain('B'); - expect(msg).toContain('C'); - expect(msg).not.toContain('еще'); - }); - - it('truncates to 3 items and shows "и еще N позиций" for more than 3', () => { - const summary: ReceiptSummary = { - categories: [ - { - name: 'Cat', - items: [ - { name: 'Item1', total: 1 }, - { name: 'Item2', total: 2 }, - { name: 'Item3', total: 3 }, - { name: 'Item4', total: 4 }, - { name: 'Item5', total: 5 }, - ], - }, - ], - totalAmount: 15, - currency: 'EUR', - }; - const msg = formatSummaryMessage(summary, 5); - expect(msg).toContain('еще 2'); - }); + // formatSummaryMessage is a thin wrapper that flattens ReceiptSummary + // (which has no qty/price — those are lost after AI correction round-trip) + // and delegates to buildReceiptSummaryMessage. Detailed formatting behavior + // is covered by summary-message.test.ts; these tests only verify the + // flattening + delegation works correctly. + it('flattens categories into items and preserves names, totals, currency', async () => { + const summary: ReceiptSummary = { + categories: [ + { + name: 'Еда', + items: [ + { name: 'Молоко', total: 100 }, + { name: 'Хлеб', total: 50 }, + ], + }, + ], + totalAmount: 150, + currency: 'EUR', + }; + const msg = await formatSummaryMessage(summary, 1); + expect(msg).toContain('Еда'); + expect(msg).toContain('Молоко'); + expect(msg).toContain('Хлеб'); + expect(msg).toContain('150.00'); + expect(msg).toContain('€'); }); - describe('HTML escaping', () => { - it('escapes < and > in category names', () => { - const summary: ReceiptSummary = { - categories: [{ name: '', category: 'A & B' })], + 1, + ); + expect(msg).not.toContain('