diff --git a/.github/codeql/codeql-config.yml b/.github/codeql/codeql-config.yml new file mode 100644 index 0000000..28a1606 --- /dev/null +++ b/.github/codeql/codeql-config.yml @@ -0,0 +1,25 @@ +# CodeQL analysis scope for tg-cli. +# +# The security gate analyzes the SHIPPED product surface — the `tg` / `tg-ctl` +# binaries built from `features/` plus the workflow files. `scripts/` holds +# one-off developer asset-generators (emoji/sticker set builders) that are run +# BY HAND, never bundled into the released CLIs, and never process untrusted +# input. CodeQL's js/file-access-to-http there ("an outbound request URL/body +# depends on local file data") is true and intentional: they read a bot token / +# prompt from a local config file and call an HTTP API with it. That is the job, +# not a vulnerability — and it is out of scope for a gate meant to protect the +# product. +# +# Why a directory `paths-ignore` and not a query-level exclude: `query-filters` +# match on query METADATA only (id/tags/kind), with no per-path scoping, so +# excluding `js/file-access-to-http` by id would also blind the gate to that +# class in shipped `features/` code. Scoping by PATH keeps every query — this one +# included — fully enforced on the product surface; only the by-hand `scripts/` +# dir is out of scope. Tradeoff: a FUTURE script is not auto-analyzed; given the +# dir is three hand-run generators, that is acceptable. Move a script into +# `features/` (or import it from shipped code) and it is gated again. +# +# This is a SCOPE decision (which code the gate guards), not a per-finding +# suppression of a real bug in shipped code. +paths-ignore: + - scripts diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 917cbe3..556151c 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -117,6 +117,9 @@ jobs: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} queries: security-extended + # Scope analysis to the shipped product surface; `scripts/` holds + # by-hand dev asset-generators, not gated code (see the config file). + config-file: ./.github/codeql/codeql-config.yml - name: Perform CodeQL Analysis if: steps.detect.outputs.has_source == 'true' diff --git a/features/auto-attach/transmitter.ts b/features/auto-attach/transmitter.ts index 535d20d..db16615 100644 --- a/features/auto-attach/transmitter.ts +++ b/features/auto-attach/transmitter.ts @@ -13,6 +13,7 @@ // feature is OFF, because they are not part of the feature — they are correct // behavior for any send. +import { decodeHtmlEntities, stripHtmlTags } from '../render/html'; import { isRichHtml, normalizeRichHtml, validateRichHtml } from '../render/rich'; import { splitMessage } from './split'; import { CAPTION_LIMIT, MESSAGE_LIMIT, type Format, type SendItem, type SendPlan } from './types'; @@ -48,14 +49,14 @@ export interface Transport { // not a full HTML length per Telegram's exact rules. export function visibleLength(text: string, format: Format): number { if (format !== 'html') return text.length; - const noTags = text.replace(/<[^>]+>/g, ''); - const unescaped = noTags - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/&/g, '&') - .replace(/"/g, '"') - .replace(/'/g, "'"); - return unescaped.length; + // Strip tags first, then decode entities in a single pass (stripHtmlTags / + // decodeHtmlEntities in render/html). This replaces a `/<[^>]+>/g` strip (which + // left a dangling ` existsSync(d.startsWith('~/') ? join(home, d.slice(2)) : d)); } -function escapeRegex(s: string): string { - return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); -} - function appendMarked(path: string, tool: string, blurb: string): void { mkdirSync(dirname(path), { recursive: true }); const start = ``; const end = ``; - let existing = existsSync(path) ? readFileSync(path, 'utf8') : ''; - const re = new RegExp(escapeRegex(start) + '[\\s\\S]*?' + escapeRegex(end) + '\\n?', 'g'); + let existing = readTextIfExists(path) ?? ''; + const re = new RegExp(escapeRegExp(start) + '[\\s\\S]*?' + escapeRegExp(end) + '\\n?', 'g'); existing = existing.replace(re, ''); const block = `${start}\n${blurb}\n${end}\n`; writeFileSync(path, existing.trim() ? existing.replace(/\s+$/, '') + '\n\n' + block : block); @@ -223,9 +234,14 @@ function appendMarked(path: string, tool: string, blurb: string): void { function ensureSessionStartHook(home: string): boolean { const settingsPath = join(home, '.claude', 'settings.json'); if (!existsSync(join(home, '.claude'))) return false; + // Read the existing settings ONCE into memory, then parse from and back up + // from that same in-memory copy. A separate existsSync()/readFileSync() per + // use would re-stat the path each time, racing with concurrent writers + // (js/file-system-race); a single read snapshots the bytes we then act on. + const original = readTextIfExists(settingsPath); let data: any; try { - data = existsSync(settingsPath) ? JSON.parse(readFileSync(settingsPath, 'utf8')) : {}; + data = original !== undefined ? JSON.parse(original) : {}; } catch { return false; // don't clobber a file we can't parse } @@ -244,7 +260,10 @@ function ensureSessionStartHook(home: string): boolean { } } sessionStart.push({ hooks: [{ type: 'command', command: HOOK_COMMAND }] }); - if (existsSync(settingsPath)) writeFileSync(settingsPath + '.bak', readFileSync(settingsPath)); + // Back up the snapshot we already read (not a fresh read of the path) before + // overwriting, so the .bak is exactly what we parsed and there is no second + // check-then-read race. + if (original !== undefined) writeFileSync(settingsPath + '.bak', original); writeFileSync(settingsPath, JSON.stringify(data, null, 2) + '\n'); return true; } diff --git a/features/render/html.ts b/features/render/html.ts index 15e9a77..2e8151f 100644 --- a/features/render/html.ts +++ b/features/render/html.ts @@ -18,6 +18,38 @@ export function escapeHtml(s: string): string { return s.replace(/&/g, "&").replace(//g, ">") } +// Strip HTML tags for VISIBLE-length / text-content purposes (not a security +// sanitizer — output is never re-inserted into a page). One pass over the +// string removes every `<...>` tag AND any trailing TAG-LIKE fragment that was +// cut off before its closing `>` (e.g. a truncated `]+>/g` leaves that dangling fragment behind, which both +// over-counts text length and is what CodeQL flags as incomplete multi- +// character sanitization. The trailing branch only fires on `<` immediately +// followed by a tag-name start (letter), `/`, or `!` — so a lone literal `<` +// inside ordinary text (e.g. `a < b`) is preserved and not greedily eaten to +// end-of-string. Single pass, deterministic. +export function stripHtmlTags(s: string): string { + return s.replace(/<[^>]*>|<[/!A-Za-z][^>]*$/g, "") +} + +// Decode the small set of HTML entities `tg` itself emits (via escapeHtml plus +// the documented quote/apostrophe forms), in a SINGLE pass. Decoding `&` +// first and `<` second — the obvious chained-replace — double-unescapes +// `&lt;` into `<` (CodeQL js/double-escaping). A single regex with a lookup +// table consumes each entity exactly once, left to right, so no replacement can +// feed the next one. Unknown entities are left verbatim. +const HTML_ENTITIES: Record = { + "&": "&", + "<": "<", + ">": ">", + """: '"', + "'": "'", + "'": "'", +} +export function decodeHtmlEntities(s: string): string { + return s.replace(/&(?:amp|lt|gt|quot|apos|#39);/g, (m) => HTML_ENTITIES[m] ?? m) +} + export function detectHtmlTags(text: string): boolean { const htmlPattern = /<(b|strong|i|em|u|ins|s|strike|del|span|tg-spoiler|a|tg-emoji|tg-time|code|pre|blockquote)\b/i return htmlPattern.test(text) diff --git a/features/render/rich.ts b/features/render/rich.ts index b5f9f07..483abe0 100644 --- a/features/render/rich.ts +++ b/features/render/rich.ts @@ -17,6 +17,9 @@ // text messages go rich vs. plain, and that a rich message is NOT 4096-split // and never rides as a media caption) lives in the transmitter. +import { stripHtmlTags } from './html'; +import { escapeRegExp } from '../util/regex'; + // Rich-ONLY tags: present in the rich-html-style allowlist but NOT in the basic // sendMessage allowlist (features/render/html.ts detectHtmlTags). Seeing any of // these in an HTML body means the message must go via sendRichMessage. The @@ -84,7 +87,7 @@ const RICH_TAG_RE = new RegExp( ' b.length - a.length) - .map((t) => t.replace(/[-]/g, '\\-')) + .map((t) => escapeRegExp(t)) .join('|') + ')(?=[\\s/>]|$)', 'i', @@ -210,6 +213,12 @@ const VOID_TAGS = new Set(['img', 'hr', 'br', 'input', 'tg-map', 'tg-emoji']); // left as-is. const NAMED_ENTITY_RE = /&(?:lt|gt|amp|quot|apos|nbsp|hellip|mdash|ndash|lsquo|rsquo|ldquo|rdquo);/g; const NUMERIC_ENTITY_RE = /&#(?:[0-9]+|x[0-9a-fA-F]+);/g; +// NOT a js/double-escaping chain (unlike the decode this PR replaced elsewhere): +// both replaces map an entity to the SAME placeholder ``, which contains no `&`, +// so the second pass can never re-process the first's output. The two patterns +// also match disjoint sets (named vs `&#…;` numeric). It collapses entities for a +// CHARACTER-COUNT budget only — it must not be turned into decodeHtmlEntities, +// which decodes to real chars and would change the count. function decodeEntitiesForCount(s: string): string { return s.replace(NAMED_ENTITY_RE, '').replace(NUMERIC_ENTITY_RE, ''); } @@ -241,7 +250,7 @@ export function validateRichHtml(html: string): RichValidation { const altText = (html.match(/\balt\s*=\s*"([^"]*)"|alt\s*=\s*'([^']*)'/gi) ?? []) .map((a) => a.replace(/^[^=]*=\s*["']?/, '').replace(/["']$/, '')) .join(''); - const text = decodeEntitiesForCount(html.replace(/<[^>]+>/g, '') + altText); + const text = decodeEntitiesForCount(stripHtmlTags(html) + altText); const chars = [...text].length; if (chars > RICH_LIMITS.maxChars) { return { diff --git a/features/transport/telegram.ts b/features/transport/telegram.ts index 9eb1b7c..4370bcd 100644 --- a/features/transport/telegram.ts +++ b/features/transport/telegram.ts @@ -8,7 +8,7 @@ // exactly the transmitter's Transport (sendMessage/sendPhoto/sendDocument/ // sendMediaGroup); the transmitter drives caption-overflow, the >4096 split and // the photos→text→documents ordering on top of it. -import { readFileSync, statSync } from 'fs'; +import { closeSync, fstatSync, openSync, readSync } from 'fs'; import { BOM_MAX_BYTES, maybeAddBom } from '../auto-attach/encoding'; import { type Transport } from '../auto-attach/transmitter'; import { type SendItem } from '../auto-attach/types'; @@ -50,6 +50,48 @@ export async function checkResponse(resp: Response, method: string): Promise maxBytes) return null; + const buf = Buffer.allocUnsafe(size); + let read = 0; + while (read < size) { + const n = readSync(fd, buf, read, size - read, read); + if (n === 0) break; // file shrank under us mid-read + read += n; + } + // A short read means the file changed beneath us; bail to null so the caller + // streams the file via Bun.file unchanged rather than stitching a BOM onto a + // truncated body (which would defeat the very encoding fix this serves). + if (read !== size) return null; + // Guard the other direction too: if the file GREW after the initial fstat we + // would have read only a prefix while still satisfying read === size. Re-stat + // the same fd and reject a size change so we never BOM a partial body. + if (fstatSync(fd).size !== size) return null; + return new Uint8Array(buf.subarray(0, read)); + } catch { + return null; // unreadable here → let Bun.file surface the real error downstream + } finally { + if (fd !== undefined) { + try { + closeSync(fd); + } catch { + // already closed / invalid fd — nothing to do + } + } + } +} + // Upload bytes for an item. Text DOCUMENTS with non-ASCII UTF-8 content get // a BOM prepended to the uploaded copy (features/auto-attach/encoding.ts) so // Telegram's preview stops guessing legacy codepages for Cyrillic; the file @@ -61,18 +103,13 @@ function blobFor(item: SendItem): { body: Blob | ReturnType; na return { body: new Blob([processed]), name: item.source.filename }; } if (item.type === 'document') { - try { - const size = statSync(item.source.path).size; - if (size > 0 && size <= BOM_MAX_BYTES) { - const bytes = new Uint8Array(readFileSync(item.source.path)); - const processed = maybeAddBom(bytes, item.source.path); - if (processed !== bytes) { - const name = item.source.path.slice(item.source.path.lastIndexOf('/') + 1); - return { body: new Blob([processed]), name }; - } + const bytes = readSmallFile(item.source.path, BOM_MAX_BYTES); + if (bytes && bytes.length > 0) { + const processed = maybeAddBom(bytes, item.source.path); + if (processed !== bytes) { + const name = item.source.path.slice(item.source.path.lastIndexOf('/') + 1); + return { body: new Blob([processed]), name }; } - } catch { - // unreadable here → let Bun.file surface the real error downstream } } return { body: Bun.file(item.source.path) }; diff --git a/features/util/regex.ts b/features/util/regex.ts new file mode 100644 index 0000000..e3d81ac --- /dev/null +++ b/features/util/regex.ts @@ -0,0 +1,12 @@ +// --- Regex utilities (pure) --- +// +// escapeRegExp turns an arbitrary string into a literal that is safe to splice +// into a `new RegExp(...)` source. It escapes the FULL set of ECMAScript regex +// metacharacters — not just one or two — so building a pattern from external or +// structured tokens can never let a stray `.`, `\`, `(`, `[`, etc. change the +// pattern's meaning. A partial escaper (e.g. one that handles `-` but leaves +// `\` untouched) is what CodeQL flags as js/incomplete-sanitization; this is the +// single, complete source of truth both render/rich and install-skill use. +export function escapeRegExp(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\-]/g, "\\$&"); +} diff --git a/tests/html-sanitize.test.ts b/tests/html-sanitize.test.ts new file mode 100644 index 0000000..2af86fe --- /dev/null +++ b/tests/html-sanitize.test.ts @@ -0,0 +1,103 @@ +import { expect, test } from 'bun:test'; +import { decodeHtmlEntities, stripHtmlTags } from '../features/render/html'; +import { escapeRegExp } from '../features/util/regex'; +import { isRichHtml } from '../features/render/rich'; + +// These pin the CodeQL findings the shared helpers were extracted to close: +// js/incomplete-multi-character-sanitization (stripHtmlTags), js/double-escaping +// (decodeHtmlEntities), and js/incomplete-sanitization (escapeRegExp). Each test +// is the exact failure mode CodeQL flagged for the old inline code. + +// --- stripHtmlTags --- + +test('stripHtmlTags removes well-formed tags', () => { + expect(stripHtmlTags('hi there')).toBe('hi there'); + expect(stripHtmlTags('link')).toBe('link'); +}); + +test('stripHtmlTags removes a trailing UNTERMINATED tag (no closing >)', () => { + // The js/incomplete-multi-character-sanitization case: a naive /<[^>]+>/g + // leaves a dangling `c { + expect(stripHtmlTags('plain text')).toBe('plain text'); + // A trailing `<` followed by a space (or digit) is ordinary text, not a + // cut-off tag, so it is NOT greedily consumed to end-of-string. + expect(stripHtmlTags('keep < this')).toBe('keep < this'); + expect(stripHtmlTags('a < b')).toBe('a < b'); + // A balanced `<…>` still reads as a tag and is stripped (matches the prior + // /<[^>]+>/g behavior); only the dangling-trailing case changed. + expect(stripHtmlTags('a < b and c > d')).toBe('a d'); +}); + +// --- decodeHtmlEntities --- + +test('decodeHtmlEntities decodes the documented entity set', () => { + expect(decodeHtmlEntities('a & b < c > d')).toBe('a & b < c > d'); + expect(decodeHtmlEntities('say "hi" 'yo'')).toBe('say "hi" \'yo\''); +}); + +test('decodeHtmlEntities does NOT double-unescape &lt;', () => { + // js/double-escaping: chained `&`→`&` then `<`→`<` turns `&lt;` + // into `<`. A single pass must yield the literal `<`. + expect(decodeHtmlEntities('&lt;')).toBe('<'); + expect(decodeHtmlEntities('&amp;')).toBe('&'); +}); + +test('decodeHtmlEntities leaves unknown entities verbatim', () => { + expect(decodeHtmlEntities('© &unknown; &')).toBe('© &unknown; &'); +}); + +// --- escapeRegExp --- + +test('escapeRegExp escapes the full ECMAScript metacharacter set', () => { + const src = '.*+?^${}()|[]\\-'; + // Embed the escaped source between sentinels and anchor with ^…$, so an + // UNescaped anchor (`^`/`$`) or any metachar that changed the pattern's + // meaning would fail to match the literal sentinel-wrapped string. A bare + // `re.test(src)` would pass even for a broken impl (anchors match empty). + const re = new RegExp('^X' + escapeRegExp(src) + 'X$'); + expect(re.test('X' + src + 'X')).toBe(true); + // And the exact-match guard: the whole input is consumed, not a substring. + const m = ('X' + src + 'X').match(new RegExp(escapeRegExp(src))); + expect(m?.[0]).toBe(src); +}); + +test('escapeRegExp neutralizes a backslash (the js/incomplete-sanitization gap)', () => { + // A hyphen-only escaper left `\` raw, so a token containing `\d` would change + // the pattern. A full escaper treats it literally. + const re = new RegExp(escapeRegExp('a\\d')); + expect(re.test('a\\d')).toBe(true); + expect(re.test('a5')).toBe(false); +}); + +test('escapeRegExp keeps hyphenated rich tag names literal in an alternation', () => { + const re = new RegExp('(?:' + ['tg-math-block', 'tg-math'].map(escapeRegExp).join('|') + ')'); + expect(re.test('tg-math-block')).toBe(true); + expect(re.test('tg-math')).toBe(true); +}); + +test('RICH_TAG_RE (built with escapeRegExp) still detects hyphenated rich tags', () => { + // Pins the real regex in rich.ts, not just escapeRegExp in isolation — so a + // future edit to the escape/sort/lookahead composition fails a test here. + expect(isRichHtml('a E=mc^2 b')).toBe(true); + expect(isRichHtml('x')).toBe(true); + expect(isRichHtml('
x
')).toBe(true); + // A basic-only body must NOT be misdetected as rich. + expect(isRichHtml('bold and link')).toBe(false); +}); + +test('escapeRegExp escapes `-` so it is a literal, not a range, inside a char class', () => { + // `[a-c]` would match `b`; with `-` escaped, the class is the three literal + // chars `a`, `-`, `c` and must NOT match `b`. + const re = new RegExp('^[' + escapeRegExp('a-c') + ']$'); + expect(re.test('a')).toBe(true); + expect(re.test('-')).toBe(true); + expect(re.test('c')).toBe(true); + expect(re.test('b')).toBe(false); +}); diff --git a/tests/read-small-file.test.ts b/tests/read-small-file.test.ts new file mode 100644 index 0000000..3f320dc --- /dev/null +++ b/tests/read-small-file.test.ts @@ -0,0 +1,53 @@ +import { test, expect, beforeEach, afterEach } from 'bun:test'; +import { mkdtempSync, rmSync, writeFileSync } from 'fs'; +import { tmpdir } from 'os'; +import { join } from 'path'; +import { readSmallFile } from '../features/transport/telegram'; + +// readSmallFile replaced a statSync()+readFileSync() pair (js/file-system-race) +// with a single-fd open→fstat→read. These pin its contract: the BOM-prepend +// path in blobFor depends on getting back the FULL bytes for an in-range file +// and null otherwise (so it falls through to a streamed Bun.file). + +let dir: string; + +beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), 'tg-readsmall-')); +}); + +afterEach(() => { + rmSync(dir, { recursive: true, force: true }); +}); + +test('returns the full bytes for a file within the size cap', () => { + const p = join(dir, 'a.txt'); + const content = 'привет mir'; // multi-byte UTF-8 + writeFileSync(p, content); + const bytes = readSmallFile(p, 1024); + expect(bytes).not.toBeNull(); + expect(new TextDecoder().decode(bytes!)).toBe(content); +}); + +test('returns null for an empty file (nothing to BOM)', () => { + const p = join(dir, 'empty.txt'); + writeFileSync(p, ''); + expect(readSmallFile(p, 1024)).toBeNull(); +}); + +test('returns null when the file exceeds the cap (caller should stream it)', () => { + const p = join(dir, 'big.bin'); + writeFileSync(p, Buffer.alloc(2048, 0x41)); + expect(readSmallFile(p, 1024)).toBeNull(); +}); + +test('returns exactly maxBytes-sized file (boundary)', () => { + const p = join(dir, 'exact.bin'); + writeFileSync(p, Buffer.alloc(16, 0x42)); + const bytes = readSmallFile(p, 16); + expect(bytes).not.toBeNull(); + expect(bytes!.length).toBe(16); +}); + +test('returns null for a missing/unreadable path (no throw)', () => { + expect(readSmallFile(join(dir, 'does-not-exist'), 1024)).toBeNull(); +}); diff --git a/tests/table.test.ts b/tests/table.test.ts index a1a7608..e1c27f9 100644 --- a/tests/table.test.ts +++ b/tests/table.test.ts @@ -1,4 +1,5 @@ import { expect, test } from 'bun:test'; +import { decodeHtmlEntities } from '../features/render/html'; import { escapeCell, hasWideGlyph, parseTableRows, renderTable, toTablePre } from '../features/render/table'; // --- parseTableRows --- @@ -121,7 +122,9 @@ test('toTablePre alignment is computed on RAW cells (escaping does not skew colu const body = html.replace(/^
/, '').replace(/<\/pre>$/, '');
   const lines = body.split('\n');
   // The data row, with `&` un-escaped back, must be the same width as the border.
-  const unescaped = lines.map((l) => l.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>'));
+  // Single-pass decode (decodeHtmlEntities) instead of chained replaces, which
+  // double-unescape `&lt;` (js/double-escaping).
+  const unescaped = lines.map((l) => decodeHtmlEntities(l));
   const widths = new Set(unescaped.map((l) => [...l].length));
   expect(widths.size).toBe(1);
 });
diff --git a/tests/transmitter.test.ts b/tests/transmitter.test.ts
index d53f79f..e92917a 100644
--- a/tests/transmitter.test.ts
+++ b/tests/transmitter.test.ts
@@ -8,6 +8,18 @@ test('visibleLength ignores HTML tags + counts unescaped entities', () => {
   expect(visibleLength('a < b & c', 'html')).toBe('a < b & c'.length);
 });
 
+test('visibleLength: single-pass decode does not double-unescape, tag-strip is complete', () => {
+  // js/double-escaping guard: `&lt;` must decode to the literal `<`
+  // (4 chars), never collapse to `<`.
+  expect(visibleLength('&lt;', 'html')).toBe('<'.length);
+  // js/incomplete-multi-character-sanitization guard: a dangling, unterminated
+  // tag at end-of-string contributes ZERO visible length (it is stripped, not
+  // left behind as it was under the old /<[^>]+>/g). An unescaped trailing ` 1024 but VISIBLE <= 1024 still rides as caption', () => {
   const calls: Array<{ method: string }> = [];
   const t: Transport = {