diff --git a/CLAUDE.md b/CLAUDE.md index 06516810..da38b68b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -65,6 +65,23 @@ A new `lib/` is never enough on its own; wire the whole ring or the build/tes - Go values enter scripts via the `predecl` parameter (see `lib/serial`'s use of `startime.Time` / `dataconv.ConvertStruct`). - The default suite must stay **hermetic** (no real network/DNS; `lib/net` uses local stubs) — real-network tests go behind `//go:build integration`. +## Documentation standard for `lib/*` READMEs + +Module READMEs are read by humans skimming and by AI agents parsing — optimize for both: a scannable, complete surface table up front, runnable examples, explicit boundaries. Every script-visible member (function, constant, type) must be documented, and `TestDocCoverage` enforces it (see below). + +Required structure, in order: + +1. **Title + purpose** — `# <module>` then 1–2 sentences: what it does; what it mirrors or succeeds (e.g. "a subset of Python's `re`"); the capability profile (pure / filesystem / network / process / log), so a reader knows the side effects without reading code. +2. **Functions** — a single scannable table listing **every** function, grouped if large: `| function | description |`, with the signature in the function cell as `` `name(args) → result` ``. `try_*` variants may share a row but each name must still appear as a backtick token (`` `try_get` ``) so coverage passes. This table is the contract `TestDocCoverage` checks. +3. **Constants** (if any) — `| constant | meaning |`, every one present. +4. **Types** (if any, e.g. `Pattern`, `Match`) — a subsection per type with a methods/attributes table. +5. **Details & examples** — per function or group: the signature, parameters only where non-obvious, the return, **what it errors on** (the honest-boundary principle), and at least one **runnable example ending in `# Output:`**. +6. **Notes / boundaries** — engine, determinism, limits, differences from the mirrored API.
+ +Style: names always in backticks; lead with the table before prose; examples real, minimal, runnable; drop framework boilerplate (no empty `#### Parameters` table for a one-arg function — fold into a sentence); state errors and edge behavior explicitly; flag any non-snake_case name (e.g. http's `postForm`). + +**Doc coverage check** — `tools/doccov/coverage.star` + `TestDocCoverage` enforce that every member of every `lib/*` module appears in its README (matched as a backtick-quoted identifier). The Go test enumerates the authoritative surface (each module's registered members across the Module/Struct/flat shapes) and runs the `.star` matcher through a Machine, so the check **dogfoods the `regex` module**. Run it with `go test -run TestDocCoverage -v .` — the report lists any undocumented members. (Scope: `lib/*` modules; the go.starlark.net-backed `math`/`struct`/`time` have no lib README and are skipped. Type methods are a standard requirement verified by review, not by the automated member check.) + ## Release discipline - **Never tag or publish autonomously.** Draft the release title + notes, show the user, and tag only after explicit approval. Patch bump by default. A published tag is immutable in the Go module proxy. 
diff --git a/Makefile b/Makefile index 4c04ac22..cf34657a 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,7 @@ export PACK=main export FLAGS="-s -w -X '$(PACK).AppName=$(BINARY)' -X '$(PACK).BuildDate=`date '+%Y-%m-%dT%T%z'`' -X '$(PACK).BuildHost=`hostname`' -X '$(PACK).GoVersion=`go version`' -X '$(PACK).GitBranch=`git symbolic-ref -q --short HEAD`' -X '$(PACK).GitCommit=`git rev-parse --short HEAD`' -X '$(PACK).GitSummary=`git describe --tags --dirty --always`' -X '$(PACK).CIBuildNum=${BUILD_NUM}'" # commands -.PHONY: default ci test test_loop bench build +.PHONY: default ci test test_loop bench build doc-check default: @echo "build target is required for $(BINARY)" @exit 1 @@ -39,6 +39,13 @@ ci: $(GOTEST) -v -race -cover -covermode=atomic -coverprofile=coverage.txt -count 1 ./... $(GOTEST) -v -parallel=4 -run="none" -benchtime="2s" -benchmem -bench=. +# Documentation coverage gate: every script-visible member of every lib/* +# module must be documented in its README. Runs tools/doccov/coverage.star +# through a Machine (dogfoods the regex module). Already part of `make ci` via +# ./... ; this target runs just it, with the undocumented-member report. +doc-check: + $(GOTEST) -v -run TestDocCoverage -count 1 . + build: make -C cmd/starlet build diff --git a/doc_coverage_test.go b/doc_coverage_test.go new file mode 100644 index 00000000..76ad7c20 --- /dev/null +++ b/doc_coverage_test.go @@ -0,0 +1,94 @@ +package starlet_test + +import ( + "os" + "path/filepath" + "sort" + "strings" + "testing" + + "github.com/1set/starlet" + "go.starlark.net/starlarkstruct" +) + +// libReadmeDir maps a builtin module name to its lib/ documentation +// directory. The only non-identity case is go_idiomatic -> goidiomatic; +// modules backed by go.starlark.net (math, struct, time) have no lib README +// and are skipped below. 
+func libReadmeDir(module string) string { + return strings.ReplaceAll(module, "_", "") +} + +// moduleSurface enumerates the script-visible names a module exports, across +// the three registration shapes: a starlarkstruct.Module (its Members), a +// starlarkstruct.Struct (its AttrNames), or a flat StringDict (its keys). +// It returns nil when the module has no registered loader, aborts the test +// if the loader returns an error, and returns the collected names sorted. +// For a Module or Struct value only its members/attributes are added, not +// the top-level key that holds it. +func moduleSurface(t *testing.T, name string) []string { + loader := starlet.GetBuiltinModule(name) + if loader == nil { + return nil + } + sd, err := loader() + if err != nil { + t.Fatalf("load module %q: %v", name, err) + } + var out []string + for k, v := range sd { + switch m := v.(type) { + case *starlarkstruct.Module: + for mk := range m.Members { + out = append(out, mk) + } + case *starlarkstruct.Struct: + out = append(out, m.AttrNames()...) + default: + out = append(out, k) + } + } + sort.Strings(out) + return out +} + +// TestDocCoverage asserts that every script-visible member of every lib/* +// module is documented in that module's README. The matching logic lives in +// tools/doccov/coverage.star and runs through a starlet Machine, so the check +// dogfoods the regex module.
+func TestDocCoverage(t *testing.T) { + script, err := os.ReadFile(filepath.Join("tools", "doccov", "coverage.star")) + if err != nil { + t.Fatalf("read coverage script: %v", err) + } + + surface := map[string]interface{}{} + docs := map[string]interface{}{} + var skipped []string + for _, name := range starlet.GetAllBuiltinModuleNames() { + readme, err := os.ReadFile(filepath.Join("lib", libReadmeDir(name), "README.md")) + if err != nil { + skipped = append(skipped, name) // external module without a lib README + // NOTE(review): every read error takes this branch, not only a missing file, so an unreadable README is skipped rather than reported — confirm that is intended. + continue + } + docs[name] = string(readme) + names := moduleSurface(t, name) + members := make([]interface{}, len(names)) + for i, n := range names { + members[i] = n + } + surface[name] = members + } + + // Hand surface and docs to the Machine under the names "surface" and "docs"; "regex" is the only module name passed. + // The script's results are read back from out: "report" (string) is logged and "missing" (list) decides pass/fail. + m := starlet.NewWithNames(starlet.StringAnyMap{"surface": surface, "docs": docs}, nil, []string{"regex"}) + m.SetScriptContent(script) + out, err := m.Run() + if err != nil { + t.Fatalf("doc coverage script failed: %v", err) + } + if report, ok := out["report"].(string); ok { + t.Log("\n" + report) + } + sort.Strings(skipped) + t.Logf("skipped (go.starlark.net modules, no lib README): %v", skipped) + + // NOTE(review): the failure below fires only when out["missing"] is a non-empty []interface{}; if the script omits it or exports another type the test passes silently — confirm the script's output shape. + if missing, ok := out["missing"].([]interface{}); ok && len(missing) > 0 { + t.Errorf("%d module member(s) are not documented in their README — see the report above", len(missing)) + } +} diff --git a/lib/atom/README.md b/lib/atom/README.md index 6c20471e..2a188d6d 100644 --- a/lib/atom/README.md +++ b/lib/atom/README.md @@ -1,188 +1,117 @@ # atom -atom provides atomic operations for integers, floats, and strings. +`atom` provides atomic operations for integers, floats, and strings — lock-free counters and compare-and-swap cells safe to share across concurrent script work (backed by Go's `sync/atomic` via `go.uber.org/atomic`). Capability profile: **pure** (no filesystem, network, process, or log side effects).
## Functions -### `new_int(value=0) -> AtomicInt` +| function | description | +|----------|-------------| +| `new_int(value=0) -> atom_int` | create an atomic integer cell, optionally seeded with `value` | +| `new_float(value=0.0) -> atom_float` | create an atomic float cell, optionally seeded with `value` | +| `new_string(value="") -> atom_string` | create an atomic string cell, optionally seeded with `value` | -create a new AtomicInt with an optional initial value +The constructors are the module's only top-level members; all mutation happens through methods on the returned cells (see Types). -#### Parameters - -| name | type | description | -|---------|-------|------------------------------| -| `value` | `int` | initial value, defaults to 0 | +## Types -#### Examples +The cells created above are custom Starlark values. Each is truthy when its value is non-zero / non-empty, hashable (usable as a dict key), and ordered (`==`, `!=`, `<`, `<=`, `>`, `>=` compare by current value against another cell of the same type). `str()` renders as ``, ``, ``. -**basic** +### `atom_int` -create a new AtomicInt with default value +An atomic `int64` cell. Methods: -```python -load("atom", "new_int") -ai = new_int() -ai.inc() -print(ai.get()) -# Output: 1 -``` +| method | description | +|--------|-------------| +| `get() -> int` | return the current value | +| `set(value)` | store `value` (int); returns `None` | +| `cas(old, new) -> bool` | atomically set to `new` only if the current value equals `old`; returns whether the swap happened | +| `add(delta) -> int` | add `delta` (int) and return the new value | +| `sub(delta) -> int` | subtract `delta` (int) and return the new value | +| `inc() -> int` | add 1 and return the new value | +| `dec() -> int` | subtract 1 and return the new value | -**with value** +### `atom_float` -create a new AtomicInt with a specific value +An atomic `float64` cell. 
`set`, `cas`, `add`, and `sub` accept either a float or an int (ints are widened to float). Methods: -```python -load("atom", "new_int") -ai = new_int(42) -ai.add(42) -print(ai.get()) -# Output: 84 -``` +| method | description | +|--------|-------------| +| `get() -> float` | return the current value | +| `set(value)` | store `value` (float or int); returns `None` | +| `cas(old, new) -> bool` | atomically set to `new` only if the current value equals `old`; returns whether the swap happened | +| `add(delta) -> float` | add `delta` (float or int) and return the new value | +| `sub(delta) -> float` | subtract `delta` (float or int) and return the new value | -### `new_float(value=0.0) -> AtomicFloat` +### `atom_string` -create a new AtomicFloat with an optional initial value +An atomic string cell. Methods: -#### Parameters +| method | description | +|--------|-------------| +| `get() -> string` | return the current value | +| `set(value)` | store `value` (string); returns `None` | +| `cas(old, new) -> bool` | atomically set to `new` only if the current value equals `old`; returns whether the swap happened | -| name | type | description | -|---------|---------|--------------------------------| -| `value` | `float` | initial value, defaults to 0.0 | +## Details & examples -#### Examples +### `new_int` -**basic** - -create a new AtomicFloat with default value +`new_int(value=0) -> atom_int` — `value` is an optional initial `int` (defaults to `0`). Errors when `value` is not an int (e.g. `new_int('42')` → `new_int: for parameter value: got string, want int`). 
```python -load("atom", "new_float") -af = new_float() -print(af.get()) -# Output: 0.0 +load("atom", "new_int") +x = new_int() +x.inc() +x.set(20) +print(x.add(5), x.sub(3), x.cas(22, 100), x.get()) +# Output: 25 22 True 100 ``` -**with value** +### `new_float` -create a new AtomicFloat with a specific value +`new_float(value=0.0) -> atom_float` — `value` is an optional initial number; an int is accepted and widened to float. Errors when `value` is a non-numeric type (e.g. `new_float('42.1')` → `new_float: for parameter value: got string, want float`). ```python load("atom", "new_float") -af = new_float(3.14) -print(af.get()) -# Output: 3.14 +x = new_float(1) +x.set(20.1) +print(x.add(5), x.cas(22.1, 200.5), x.get()) +# Output: 25.1 False 25.1 ``` -### `new_string(value="") -> AtomicString` - -create a new AtomicString with an optional initial value - -#### Parameters - -| name | type | description | -|---------|----------|--------------------------------------------| -| `value` | `string` | initial value, defaults to an empty string | - -#### Examples +### `new_string` -**basic** - -create a new AtomicString with default value +`new_string(value="") -> atom_string` — `value` is an optional initial `string` (defaults to `""`). Errors when `value` is not a string (e.g. `new_string(1)` → `new_string: for parameter value: got int, want string`). ```python load("atom", "new_string") -as = new_string() -print(as.get()) # Output: "" +x = new_string("hello") +x.set("world") +print(x.cas("world", "new"), x.get(), x.cas("world", "new2"), x.get()) +# Output: True new False new ``` -**with value** +### Methods, errors, and concurrency -create a new AtomicString with a specific value +- `get`, `inc`, and `dec` take no arguments — passing any errors (e.g. `x.get(2)` → `get: got 1 arguments, want 0`). +- `set`, `add`, `sub`, and `cas` validate their argument types and error on a mismatch (e.g. `x.add('2')` on an `atom_int` → `add: for parameter delta: got string, want int`). 
+- An unknown attribute errors via the standard Starlark message (e.g. `x.guess()` → `atom_int has no .guess field or method`). +- Operations are atomic, so a cell can be safely mutated from comprehensions or callbacks sharing it: ```python -load("atom", "new_string") -as = new_string("hello") -print(as.get()) -# Output: "hello" +load("atom", "new_int") +x = new_int() +def work(): + x.inc() +[work() for _ in range(10)] +print(x.get()) +# Output: 10 ``` -## Types - -### `AtomicInt` - -an atomic integer type with various atomic operations - -**Methods** - -#### `get() -> int` - -returns the current value - -#### `set(value: int)` - -sets the value - -#### `cas(old: int, new: int) -> bool` - -compares and swaps the value if it matches old - -#### `add(delta: int) -> int` - -adds delta to the value and returns the new value - -#### `sub(delta: int) -> int` - -subtracts delta from the value and returns the new value - -#### `inc() -> int` - -increments the value by 1 and returns the new value - -#### `dec() -> int` - -decrements the value by 1 and returns the new value - -### `AtomicFloat` - -an atomic float type with various atomic operations - -**Methods** - -#### `get() -> float` - -returns the current value - -#### `set(value: float)` - -sets the value - -#### `cas(old: float, new: float) -> bool` - -compares and swaps the value if it matches old - -#### `add(delta: float) -> float` - -adds delta to the value and returns the new value - -#### `sub(delta: float) -> float` - -subtracts delta from the value and returns the new value - -### `AtomicString` - -an atomic string type with various atomic operations - -**Methods** - -#### `get() -> string` - -returns the current value - -#### `set(value: string)` - -sets the value - -#### `cas(old: string, new: string) -> bool` +## Notes / boundaries -compares and swaps the value if it matches old +- **Type names.** Script-visible type names are `atom_int`, `atom_float`, `atom_string` (as returned by `type()`); the underlying 
Go types are `AtomicInt`, `AtomicFloat`, `AtomicString`. +- **Truthiness.** A cell is falsy when its value is `0` / `0.0` / `""` and truthy otherwise (`bool(new_int(0))` is `False`). +- **Comparison and hashing.** Cells are comparable only against the same cell type and are hashable by current value, so they work as dict keys. Mutating a cell after using it as a key leaves the existing entry under the old hash — treat keyed cells as you would any mutable key. +- **Range.** `atom_int` is a 64-bit signed integer; `atom_float` is IEEE-754 `float64`. `add`/`sub` wrap or lose precision exactly as the underlying 64-bit types do. +- **Float widening.** `atom_float` accepts ints for `set`/`cas`/`add`/`sub` and stores them as floats; `cas` compares with float equality, so seed `old` with the exact stored float. diff --git a/lib/base64/README.md b/lib/base64/README.md index 5a264dc4..dbca596d 100644 --- a/lib/base64/README.md +++ b/lib/base64/README.md @@ -1,53 +1,68 @@ # base64 -`base64` defines base64 encoding & decoding functions, often used to represent binary as text. +`base64` provides base64 encoding and decoding for Starlark, commonly used to represent binary data as ASCII text. It wraps Go's `encoding/base64` and supports the standard and URL-safe alphabets, each with padded and raw (unpadded) variants. Capability profile: **pure** (no filesystem, network, process, or log side effects). ## Functions -### `encode(src,encoding="standard") string` +| function | description | +|----------|-------------| +| `encode(data, encoding="standard") -> string` | base64-encode a string or bytes, returning the encoded text | +| `decode(data, encoding="standard") -> string` | base64-decode a string or bytes, returning the decoded text | -return the base64 encoding of src +## Encoding dialects -#### Parameters +The optional `encoding` argument selects the alphabet and padding. An empty string is treated as `"standard"`. Any value not listed in the table below raises `unsupported encoding format: <value>`.
-| name | type | description | -|------------|----------|-------------------------------------------------------------------------------------------------| -| `src` | `string` | source string to encode to base64 | -| `encoding` | `string` | optional. string to set encoding dialect. allowed values are: standard,standard_raw,url,url_raw | +| encoding | meaning | +|----------|---------| +| `"standard"` | standard base64 with padding, RFC 4648 (default) | +| `"standard_raw"` | standard base64 without padding, RFC 4648 §3.2 | +| `"url"` | URL- and filename-safe base64 with padding, RFC 4648 | +| `"url_raw"` | URL- and filename-safe base64 without padding | -#### Examples +## Details & examples -**basic** +### `encode` -encode a string as base64 +`encode(data, encoding="standard") -> string` + +`data` is the input to encode and accepts a `string` or `bytes`; any other type raises `base64.encode: for parameter data: got , want string or bytes`. The result is always a `string`. Errors on an unknown `encoding` value (see the dialects table). ```python load("base64", "encode") -encoded = encode("hello world!") -print(encoded) -# Output: aGVsbG8gd29ybGQh +print(encode("hello")) +print(encode("hello", encoding="standard_raw")) +print(encode("hello friend!", encoding="url")) +print(encode("hello friend!", encoding="url_raw")) +# Output: +# aGVsbG8= +# aGVsbG8 +# aGVsbG8gZnJpZW5kIQ== +# aGVsbG8gZnJpZW5kIQ ``` -### `decode(src,encoding="standard") string` - -parse base64 input, giving back the plain string representation - -#### Parameters - -| name | type | description | -|------------|----------|-------------------------------------------------------------------------------------------------| -| `src` | `string` | source string of base64-encoded text | -| `encoding` | `string` | optional. string to set decoding dialect. 
allowed values are: standard,standard_raw,url,url_raw | +### `decode` -#### Examples +`decode(data, encoding="standard") -> string` -**basic** - -encode a string as base64 +`data` is the base64-encoded input and accepts a `string` or `bytes`; any other type raises `base64.decode: for parameter data: got , want string or bytes`. The result is the decoded text as a `string`. Errors on an unknown `encoding` value, and on malformed input for the chosen dialect (e.g. decoding the unpadded `"aGVsbG8"` with the default `standard` encoding raises `illegal base64 data at input byte 4` — use `encoding="standard_raw"` for unpadded input). ```python load("base64", "decode") -decoded = decode("aGVsbG8gd29ybGQh") -print(decoded) -# Output: hello world! +print(decode("aGVsbG8=")) +print(decode("aGVsbG8", encoding="standard_raw")) +print(decode("aGVsbG8gZnJpZW5kIQ==", encoding="url")) +print(decode("aGVsbG8gZnJpZW5kIQ", encoding="url_raw")) +# Output: +# hello +# hello +# hello friend! +# hello friend! ``` + +## Notes / boundaries + +- Engine: thin wrapper over Go's standard `encoding/base64`; behavior and error messages follow that package. +- The padded dialects (`standard`, `url`) require correct `=` padding on decode; the raw dialects (`standard_raw`, `url_raw`) require its absence. Mixing them raises an `illegal base64 data` error. +- Deterministic: the same input and dialect always yield the same output. +- `decode` returns a `string`; decoded bytes that are not valid UTF-8 are still returned as a Starlark string holding those bytes. diff --git a/lib/csv/README.md b/lib/csv/README.md index 67e8b63c..1774902e 100644 --- a/lib/csv/README.md +++ b/lib/csv/README.md @@ -1,35 +1,50 @@ # csv -csv parses and writes comma-separated values files (csv). +`csv` parses and writes comma-separated values, mirroring a subset of Go's `encoding/csv`: read a CSV string into rows or header-keyed dicts, and write rows or dicts back to a CSV string. 
 Capability profile: **Pure** — it operates only on in-memory strings and has no filesystem, network, process, or log side effects. -Every function has a `try_`-prefixed variant (`try_read_all`, `try_read_dict`, `try_write_all`, `try_write_dict`) that never aborts the script: it returns a `(value, error)` pair where exactly one side is `None`, the same shape as the `json` module's `try_*` functions. +Every function has a `try_`-prefixed variant (`try_read_all`, `try_read_dict`, `try_write_all`, `try_write_dict`) that never aborts the script: it returns a `(value, error)` tuple where exactly one side is `None` (`error` on success; `value` on failure, in which case `error` is the message string), the same shape as the `json` module's `try_*` functions. ## Functions -### `read_all(source, comma=",", comment="", lazy_quotes=False, trim_leading_space=False, fields_per_record=0, skip=0, limit=0) [][]string` +| function | description | +|----------|-------------| +| `read_all(source, comma=",", comment="", lazy_quotes=False, trim_leading_space=False, fields_per_record=0, skip=0, limit=0) -> list` / `try_read_all(...) -> tuple` | read all rows from a CSV string into a list of string lists | +| `read_dict(source, comma=",", comment="", lazy_quotes=False, trim_leading_space=False, fields_per_record=0, skip=0, limit=0) -> list` / `try_read_dict(...) -> tuple` | read a CSV string whose first row (after `skip`) is the header into a list of dicts keyed by the header fields | +| `write_all(data, comma=",") -> string` / `try_write_all(...) -> tuple` | write a list of lists to a CSV-encoded string | +| `write_dict(data, header, comma=",") -> string` / `try_write_dict(...) -> tuple` | write a list of dicts to a CSV-encoded string using the given header columns | -read all rows from a source string, returning a list of string lists +The `try_*` variants take the same arguments as their base function and return a `(value, error)` tuple: `(result, None)` on success, `(None, "<error message>")` on failure. They never raise.
-Rows are read one at a time: a positive `limit` stops parsing at that many rows (malformed content beyond the limit is never reached), and rows consumed by `skip` do not pin the expected field count when `fields_per_record` is 0 — the first kept row does. +## Constants -#### Parameters +| constant | meaning | +|----------|---------| +| `ModuleName` | Go-side constant (`"csv"`) naming this module for `load()`; not a script-visible member. | -| name | type | description | -|----------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `source` | `string` | input string of csv data | -| `comma` | `string` | comma is the field delimiter, defaults to "," (a comma). comma must be a valid character and must not be \r, \n, or the Unicode replacement character (0xFFFD). | -| `comment` | `string` | comment, if not "", is the comment character. Lines beginning with the comment character without preceding whitespace are ignored. With leading whitespace the comment character becomes part of the field, even if trim_leading_space is True. comment must be a valid character and must not be \r, \n, or the Unicode replacement character (0xFFFD). It must also not be equal to comma. | -| `lazy_quotes` | `bool` | If lazy_quotes is True, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field. | -| `trim_leading_space` | `bool` | If trim_leading_space is True, leading white space in a field is ignored. This is done even if the field delimiter, comma, is white space. | -| `fields_per_record` | `int` | fields_per_record is the number of expected fields per record. 
If fields_per_record is positive, read_all requires each record to have the given number of fields. If fields_per_record is 0, read_all sets it to the number of fields in the first record, so that future records must have the same field count. If fields_per_record is negative, no check is made and records may have a variable number of fields. | -| `skip` | `int` | Number of rows to skip before starting to read, omitting from returned rows. | -| `limit` | `int` | Maximum number of rows to read, stops reading when this limit is reached. If limit is 0, all rows after skip are read. | +## Reading -#### Examples +### `read_all` -**basic** +`read_all(source, comma=",", comment="", lazy_quotes=False, trim_leading_space=False, fields_per_record=0, skip=0, limit=0) -> list` -read a csv string into a list of string lists +Reads `source` and returns a list of rows, each row a list of strings. A UTF-8 BOM at the start of `source` is stripped. `source` may be a string or bytes. + +Parameters: + +| name | type | description | +|------|------|-------------| +| `source` | `string`/`bytes` | input CSV data | +| `comma` | `string` | field delimiter, a single character; defaults to `","`. Must not be `\r`, `\n`, or U+FFFD. | +| `comment` | `string` | if not `""`, a single comment character; lines beginning with it (without preceding whitespace) are skipped. Must not equal `comma`. 
| +| `lazy_quotes` | `bool` | if `True`, a quote may appear in an unquoted field and a non-doubled quote in a quoted field | +| `trim_leading_space` | `bool` | if `True`, leading white space in a field is ignored | +| `fields_per_record` | `int` | expected fields per record: positive requires exactly that many; `0` (default) pins the count to the first kept record; negative disables the check (rows may vary in length) | +| `skip` | `int` | number of rows to skip before reading; skipped rows are omitted from the result | +| `limit` | `int` | maximum number of rows to return; `0` reads all rows after `skip` | + +Rows are read one at a time: a positive `limit` stops parsing at that many rows, so malformed content beyond the limit is never reached. Rows consumed by `skip` do not pin the expected field count when `fields_per_record` is `0` — the first kept row does. + +Errors on: a `comma` or `comment` that is not exactly one character (or `comment` equal to `comma`); a record whose field count violates `fields_per_record`; a malformed/unterminated-quote parse error that is actually reached. An empty `source` returns `[]`. 
```python load("csv", "read_all") @@ -43,9 +58,7 @@ print(data) # Output: [["type", "name", "number_of_legs"], ["dog", "spot", "4"], ["cat", "spot", "3"], ["spider", "samantha", "8"]] ``` -**skip_and_limit** - -read a csv string with skip and limit +With `skip` and `limit` (skip the header, keep one data row): ```python load("csv", "read_all") @@ -59,89 +72,166 @@ print(data) # Output: [["dog", "spot", "4"]] ``` -### `read_dict(source, comma=",", comment="", lazy_quotes=False, trim_leading_space=False, fields_per_record=0, skip=0, limit=0) []dict` +With a custom delimiter and comment character: + +```python +load("csv", "read_all") +csv_string = """a|b|c +#1,2,3 +4|5|6 +7|8|9 +""" +print(read_all(csv_string, comma="|", comment="#")) +# Output: [["a", "b", "c"], ["4", "5", "6"], ["7", "8", "9"]] +``` + +A malformed row past `limit` is never reached, so this succeeds: -read csv data whose first row (after `skip`) is the header, returning a list of dicts keyed by the header fields +```python +load("csv", "read_all") +csv_string = 'a,b\nc,d\n"bad\n' +print(read_all(csv_string, limit=2)) +# Output: [["a", "b"], ["c", "d"]] +``` -Takes the same parameters as `read_all`. `limit` counts data rows (the header is not included). A duplicate header field is an error; an empty source yields an empty list. With the default `fields_per_record=0` every data row must have as many fields as the header. +### `read_dict` -#### Examples +`read_dict(source, comma=",", comment="", lazy_quotes=False, trim_leading_space=False, fields_per_record=0, skip=0, limit=0) -> list` -**basic** +Takes the same parameters as `read_all`. The first remaining row (after `skip`) is treated as the header; each subsequent row becomes a dict keyed by the header fields. `limit` counts data rows only — the header is not included. -read a csv string into a list of dicts +Errors on: a duplicate header field; the same `comma`/`comment`/parse/`fields_per_record` errors as `read_all`. 
An empty `source` returns `[]`. With `fields_per_record=-1`, rows shorter than the header simply omit the missing keys, and cells beyond the header are dropped. ```python load("csv", "read_dict") -data_str = """type,name,number_of_legs -dog,spot,4 -cat,spot,3 +data_str = """a,b +1,2 +3,4 """ -data = read_dict(data_str) -print(data) -# Output: [{"type": "dog", "name": "spot", "number_of_legs": "4"}, {"type": "cat", "name": "spot", "number_of_legs": "3"}] +print(read_dict(data_str)) +# Output: [{"a": "1", "b": "2"}, {"a": "3", "b": "4"}] ``` -### `write_all(source, comma=",") string` +Variable-length rows with `fields_per_record=-1` map only the fields present: + +```python +load("csv", "read_dict") +print(read_dict('a\n1,2\n3\n', fields_per_record=-1)) +# Output: [{"a": "1"}, {"a": "3"}] +``` -write all rows from source to a csv-encoded string +## Writing -Cell values are rendered per type: strings as-is; ints and floats in plain decimal notation (never scientific, e.g. `1000000.0` → `1000000`); `True`/`False` as `true`/`false`; `None` as an empty cell; time values as RFC 3339. Nested lists/dicts and non-finite floats (`nan`, `inf`) are reported as errors instead of being silently written in Go syntax. +Cell values are rendered by type, never with Go's default formatting: -#### Parameters +- `string` — written as-is +- `int` / `float` — plain decimal, never scientific notation (`1000000.0` → `1000000`, `0.00001` → `0.00001`) +- `bool` — `true` / `false` (lowercase, matching `json.encode`) +- `None` — an empty cell +- `time` value — RFC 3339 (e.g. `2023-01-15T12:30:45Z`) -| name | type | description | -|----------|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `source` | `[][]string` | array of arrays of strings to write to csv | -| `comma` | `string` | comma is the field delimiter, defaults to "," (a comma). 
comma must be a valid character and must not be \r, \n, or the Unicode replacement character (0xFFFD). | +A non-finite float (`nan`, `inf`) errors (`float value ... is not representable in CSV`); any other type, including nested lists/dicts, errors (`unsupported cell type ...`) rather than being written as Go syntax. -#### Examples +### `write_all` -**basic** +`write_all(data, comma=",") -> string` -write a list of string lists to a csv string +`data` must be a list of lists (rows of cells). Returns the CSV text, each record terminated by `\n`. + +Errors on: a `comma` that is not exactly one character; `data` that is not an array; a row that is not an array; a cell of an unsupported or non-finite-float type. An unconvertible Starlark value (e.g. a function) errors at the conversion step (`unrecognized starlark type: ...`). ```python load("csv", "write_all") data = [ -["type", "name", "number_of_legs"], -["dog", "spot", "4"], -["cat", "spot", "3"], -["spider", "samantha", "8"], + ["type", "name", "number_of_legs"], + ["dog", "spot", "4"], + ["cat", "spot", "3"], + ["spider", "samantha", "8"], ] -csv_str = write_all(data) -print(csv_str) -# Output: "type,name,number_of_legs\ndog,spot,4\ncat,spot,3\nspider,samantha,8\n" +print(write_all(data)) +# Output: type,name,number_of_legs +# dog,spot,4 +# cat,spot,3 +# spider,samantha,8 +# ``` -### `write_dict(data, header, comma=",") string` - -write a list of dictionaries to a csv string based on the provided header - -Cell values are rendered with the same per-type rules as `write_all`; a key missing from a row produces an empty cell, the same as an explicit `None`. 
+Per-type cell rendering: -#### Parameters +```python +load("csv", "write_all") +print(write_all([[1000000.0, 0.00001, -2.5]])) +print(write_all([[None, True, False]])) +# Output: 1000000,0.00001,-2.5 +# +# ,true,false +# +``` -| name | type | description | -|----------|------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `data` | `[]dict` | array of dictionaries where each dictionary is a row with field names as keys | -| `header` | `[]string` | array of strings representing the header (column names) of the csv | -| `comma` | `string` | comma is the field delimiter, defaults to "," (a comma). comma must be a valid character and must not be \r, \n, or the Unicode replacement character (0xFFFD). | +### `write_dict` -#### Examples +`write_dict(data, header, comma=",") -> string` -**basic** +`data` must be a list of dicts; `header` is an iterable of column-name strings. The header row is written first, then one row per dict, taking cells in `header` order. A key missing from a dict produces an empty cell (the same as an explicit `None`); dict keys not in `header` are ignored. -write a list of dictionaries to a csv string based on header +Errors on: an empty `header`; a `header` element that is not a string; a `comma` that is not exactly one character; `data` that is not an array; an element of `data` that is not a dict; a cell of an unsupported or non-finite-float type. 
```python load("csv", "write_dict") data = [ -{"type": "dog", "name": "spot", "number_of_legs": 4}, -{"type": "cat", "name": "spot", "number_of_legs": 3}, -{"type": "spider", "name": "samantha", "number_of_legs": 8}, + {"type": "dog", "name": "spot", "number_of_legs": 4}, + {"type": "cat", "name": "spot", "number_of_legs": 3}, + {"type": "spider", "name": "samantha", "number_of_legs": 8}, ] -csv_str = write_dict(data, header=["type", "name", "number_of_legs"]) -print(csv_str) -# Output: "type,name,number_of_legs\ndog,spot,4\ncat,spot,3\nspider,samantha,8\n" +print(write_dict(data, header=["type", "name", "number_of_legs"])) +# Output: type,name,number_of_legs +# dog,spot,4 +# cat,spot,3 +# spider,samantha,8 +# +``` + +Missing keys and extra keys (`c` is absent in the second row, so its cell is empty; `a` and `C` are not in `header`, so they are ignored): + +```python +load("csv", "write_dict") +x = write_dict([{"a": 200, "b": 100, "c": 500}, {"b": 1024, "C": 2048}], header=["c", "b"]) +print(x) +# Output: c,b +# 500,100 +# ,1024 +# ``` + +## try_* variants + +Each base function has a `try_`-prefixed variant — `try_read_all`, `try_read_dict`, `try_write_all`, `try_write_dict` — that returns `(value, None)` on success and `(None, "<error message>")` on failure instead of aborting the script. Argument-unpacking errors are captured the same way. + +```python +load("csv", "try_read_all") +rows, err = try_read_all('a,b\nc,d\n') +print(rows, err) +bad, err2 = try_read_all('"bad\n') +print(bad, "parse error" in err2) +# Output: [["a", "b"], ["c", "d"]] None +# None True +``` + +```python +load("csv", "try_write_all") +text, err = try_write_all([[1, 2]]) +print(text, err) +bad, err2 = try_write_all([[[1]]]) +print(bad, "unsupported cell type" in err2) +# Output: 1,2 +# None +# None True +``` + +## Notes / boundaries + +- **Engine.** Backed by Go's `encoding/csv` (RFC 4180 semantics). Reading transparently strips a leading UTF-8 BOM and normalizes lone `\r` line endings. 
+- **Pure.** No filesystem, network, process, or logging effects; functions operate only on the strings/bytes passed in and return strings or values. +- **Honest boundary.** Writing never silently corrupts: unsupported cell types and non-finite floats raise rather than emitting Go-syntax text. Use `try_*` to handle failures inline instead of aborting. +- **Reading returns strings.** Every cell from `read_all` / `read_dict` is a string; no numeric or boolean coercion is performed on input. +- **All member names are snake_case** — no non-standard identifiers in this module. diff --git a/lib/file/README.md b/lib/file/README.md index f902d299..8ee9cbdb 100644 --- a/lib/file/README.md +++ b/lib/file/README.md @@ -1,543 +1,183 @@ # file -`file` provides functions to interact with the file system. The library is inspired by file helpers from Amoy. +`file` provides functions to read, write, append, inspect, and copy files on the local file system, plus a small BOM helper. It is inspired by file helpers from common Go toolkits. Capability profile: **FileSystem** — every function except `trim_bom` touches the host file system (reads, writes, stats, or copies real files). ## Functions -### `trim_bom(rd) string` +### Read + +| function | description | +|----------|-------------| +| `read_bytes(name) -> bytes` | Read the whole file and return its contents as bytes. | +| `read_string(name) -> str` | Read the whole file and return its contents as a string. | +| `read_lines(name) -> list` | Read the whole file and return its lines (without line endings) as a list of strings. | +| `read_json(name) -> value` | Read the file and decode its contents as a single JSON document. | +| `read_jsonl(name) -> list` | Read the file as JSON Lines (one JSON document per line; blank lines are skipped) and return a list of values. | +| `head_lines(name, n) -> list` | Return the first `n` lines of the file (or fewer if the file is shorter). 
| +| `tail_lines(name, n) -> list` | Return the last `n` lines of the file (or fewer if the file is shorter). | +| `count_lines(name) -> int` | Count the number of lines in the file. | + +### Write (overwrite) + +| function | description | +|----------|-------------| +| `write_bytes(name, data) -> None` | Create or truncate the file and write `data` (string or bytes). | +| `write_string(name, data) -> None` | Create or truncate the file and write `data` (string or bytes) as text. | +| `write_lines(name, data) -> None` | Create or truncate the file and write each item of `data` as a line. | +| `write_json(name, data) -> None` | Create or truncate the file and write `data` as a JSON document. | +| `write_jsonl(name, data) -> None` | Create or truncate the file and write each item of `data` as a JSON line. | + +### Append + +| function | description | +|----------|-------------| +| `append_bytes(name, data) -> None` | Append `data` (string or bytes) to the file, creating it if absent. | +| `append_string(name, data) -> None` | Append `data` (string or bytes) as text to the file, creating it if absent. | +| `append_lines(name, data) -> None` | Append each item of `data` as a line, creating the file if absent. | +| `append_json(name, data) -> None` | Append `data` as a JSON document, creating the file if absent. | +| `append_jsonl(name, data) -> None` | Append each item of `data` as a JSON line, creating the file if absent. | + +### Inspect & copy & utility + +| function | description | +|----------|-------------| +| `stat(name, follow=False) -> FileStat` | Return a `FileStat` describing the file or directory. | +| `copyfile(src, dst, overwrite=False) -> str` | Copy a regular file and return the destination path. | +| `trim_bom(rd) -> str \| bytes` | Strip a leading UTF-8 BOM from a string or bytes value (no file I/O). | -Removes the Byte Order Mark (BOM) from a byte literal string or bytes. 
- -#### Parameters - -| name | type | description | -|------|---------|-------------| -| `rd` | `string | byes` | - -#### Examples - -**basic** - -Removes the Byte Order Mark (BOM) from a string. - -```python -load("file", "trim_bom") -s = b'\xef\xbb\xbfhello' -print(trim_bom(s)) -# Output: hello -``` - -### `count_lines(name) int` - -Counts the total lines in a file located at the provided path. - -#### Parameters - -| name | type | description | -|--------|----------|----------------------------------------------------| -| `name` | `string` | The path of the file whose lines are to be counted | - -#### Examples - -**basic** - -Count the lines of a file. - -```python -load("file", "count_lines") -name = 'path/to/file.txt' -print(count_lines(name)) -# Output: 10 -``` - -### `head_lines(name, n) []string` - -Returns the first 'n' lines of a file. - -#### Parameters - -| name | type | description | -|--------|----------|-------------------------------------------------| -| `name` | `string` | The path of the file | -| `n` | `int` | The number of lines from the top to be returned | - -#### Examples - -**basic** - -Get the top 10 lines of a file. - -```python -load('file', 'head_lines') -print(head_lines('path/to/file.txt', 10)) -# Output: ['line1', 'line2', ... 'line10'] -``` - -### `tail_lines(name, n) []string` - -Returns the last 'n' lines of a file. - -#### Parameters - -| name | type | description | -|--------|----------|----------------------------------------------------| -| `name` | `string` | The path of the file | -| `n` | `int` | The number of lines from the bottom to be returned | - -#### Examples - -**basic** - -Get the bottom 10 lines of a file. - -```python -load('file', 'tail_lines') -print(tail_lines('path/to/file.txt', 10)) -# Output: ['line91', 'line92', ... 'line100'] -``` - -### `read_bytes(name)` - -Reads a file and returns its contents as bytes. 
- -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------| -| `name` | `string` | The path of the file to be read | - -#### Examples - -**basic** - -Read a file in bytes. - -```python -load('file', 'read_bytes') -print(read_bytes('path/to/file.txt')) -# Output: b'file_content' -``` - -### `read_string(name)` - -Reads a file and returns its contents as string. - -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------| -| `name` | `string` | The path of the file to be read | - -#### Examples - -**basic** - -Read a file in string. - -```python -load('file', 'read_string') -print(read_string('path/to/file.txt')) -# Output: 'file_content' -``` +## Types -### `read_lines(name)` +### `FileStat` -Reads a file and returns its contents as a list of lines. +Returned by `stat`. A struct (printed type name `file_stat`) carrying file metadata fields plus four hashing methods that read the file on demand. -#### Parameters +| member | type | description | +|--------|------|-------------| +| `name` | `str` | Base name of the file or directory. | +| `path` | `str` | Absolute path of the file or directory. | +| `ext` | `str` | File extension including the dot (e.g. `.txt`), or `''` if none. | +| `size` | `int` | Size in bytes. | +| `type` | `str` | One of `file`, `dir`, `symlink`, `fifo`, `socket`, `char`, `block`, `irregular`, `unknown`. | +| `modified` | `time.Time` | Last modification time. | +| `get_md5() -> str` | method | Hex MD5 of the file contents. | +| `get_sha1() -> str` | method | Hex SHA-1 of the file contents. | +| `get_sha256() -> str` | method | Hex SHA-256 of the file contents. | +| `get_sha512() -> str` | method | Hex SHA-512 of the file contents. 
| -| name | type | description | -|--------|----------|---------------------------------| -| `name` | `string` | The path of the file to be read | +The `get_*` methods open and read the file each time they are called; they error if the path is a directory or unreadable (e.g. `is a directory`, `permission denied`). -#### Examples +## Details & examples -**basic** +### Reading -Get lines of a file in a list. +`read_bytes(name)`, `read_string(name)`, and `read_lines(name)` read the entire file. `read_lines` strips the trailing newline of each line and handles both `\n` and `\r\n` endings; an empty file yields `[]`. All three error if the path does not exist (e.g. `open no-such-file:`). ```python -load('file', 'read_lines') -print(read_lines('path/to/file.txt')) -# Output: ['line1', 'line2', 'line3', ....] +load('file', 'read_bytes', 'read_string', 'read_lines') +print(read_bytes('testdata/aloha.txt') == b'ALOHA\n') # bytes, trailing newline kept +print(read_string('testdata/aloha.txt') == 'ALOHA\n') # same content as a string +print(read_lines('testdata/line_win.txt')) # newline stripped, \r\n handled +# Output: +# True +# True +# ["Line 1", "Line 2", "Line 3"] ``` -### `read_json(name) dict` - -Reads a file and decodes its contents as JSON, returning the corresponding Starlark object (dict or any types). - -#### Parameters - -| name | type | description | -|--------|----------|--------------------------------------| -| `name` | `string` | The path of the JSON file to be read | - -#### Examples - -**basic** - -Read a JSON file. +`read_json(name)` decodes the whole file as one JSON document and returns the matching Starlark value (`dict`, `list`, `int`, `bool`, `None`, etc.). `read_jsonl(name)` decodes one JSON document per line, skipping blank lines, and returns a list. Both error if the file is missing (`open no-such-file:`) or the content is not valid JSON (`read_json` → `json.decode: at offset ...`; `read_jsonl` → `line N: json.decode: at offset ...`). 
```python load('file', 'read_json') -data = read_json('path/to/file.json') -print(data) -# Output: {'key': 'value', 'array': [1, 2, 3]} +data = read_json('testdata/json1.json') +print(data['num'], data['bool'], data['arr']) +# Output: 42 True [1, 2, 3] ``` -### `read_jsonl(name) list` - -Reads a file with each line containing a JSON object and returns a list of Starlark objects. - -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------------| -| `name` | `string` | The path of the JSONL file to be read | - -#### Examples - -**basic** - -Read a JSONL file. +`head_lines(name, n)` and `tail_lines(name, n)` return at most `n` lines from the start or end of the file. `n` must be a positive integer, otherwise they error (`expected positive integer, got -7`); a missing file errors as `open no-such-file:`. `count_lines(name)` returns the line count (0 for an empty file). ```python -load('file', 'read_jsonl') -data = read_jsonl('path/to/file.jsonl') -print(data) -# Output: [{'key1': 'value1'}, {'key2': 'value2'}] +load('file', 'head_lines', 'tail_lines', 'count_lines') +print(head_lines('testdata/line_win.txt', 2)) +print(tail_lines('testdata/line_win.txt', 2)) +print(count_lines('testdata/line_mac.txt')) +# Output: +# ["Line 1", "Line 2"] +# ["Line 2", "Line 3"] +# 3 ``` -### `write_bytes(name, data)` +### Writing and appending -Writes/overwrites bytes or a byte literal string to a file. If the file isn't present, a new file would be created. +The `write_*` functions create the file if absent and truncate it if present; the `append_*` functions create the file if absent and append to it otherwise. All of them return `None` and error if the target cannot be opened (e.g. writing to a directory path: `open testdata/:`). Both arguments are required; calling without `name` or `data` errors with `missing argument for name` / `missing argument for data`. 
-#### Parameters - -| name | type | description | -|--------|----------|------------------------------------------------------------| -| `name` | `string` | The path of the file to be written to | -| `data` | `string` | The byte literal string or bytes to be written to the file | - -#### Examples - -**basic** - -Write a byte string to a file. +- `write_bytes` / `append_bytes` and `write_string` / `append_string` accept a string or bytes value; any other type errors with `expected string or bytes, got <type>`. +- `write_lines` / `append_lines` accept a `list`, `tuple`, or `set` (each item is rendered with one trailing newline); a bare string or bytes is treated as a single line. Other types error with `expected list/tuple/set, got <type>`. Non-string items are stringified (e.g. `123`, `[True, False]`). +- `write_json` / `append_json` write a string or bytes value verbatim; any other value is JSON-encoded. Values that cannot be encoded error (e.g. a lambda: `json.encode: cannot encode function as JSON`). +- `write_jsonl` / `append_jsonl` write a string or bytes value as one line; for a `list`/`tuple`/`set` each item is JSON-encoded onto its own line; any other value is encoded onto a single line. ```python -load('file', 'write_bytes') -name = 'new_file.txt' -data = b'Hello, This is a new file.' -write_bytes(name, data) +load('file', 'write_lines', 'append_lines', 'read_string') +fp = 'out.txt' +write_lines(fp, ['Hello', 'World']) +append_lines(fp, ['Great', 'Job']) +print(read_string(fp)) +# Output: +# Hello +# World +# Great +# Job +# ``` -### `write_string(name, data)` - -Writes/overwrites a string to a file. If the file isn't present, a new file would be created. - -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------------| -| `name` | `string` | The path of the file to be written to | -| `data` | `string` | The string to be written to the file | - -#### Examples - -**basic** - -Write a string to a file. 
- -```python -load('file', 'write_string') -write_string('new_file.txt', 'Hello, This is a new file.') -``` - -### `write_lines(name, data)` - -Writes/overwrites a list, tuple or set of lines to a file. If the file isn't present, a new file would be created. - -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------------| -| `name` | `string` | The path of the file to be written to | -| `data` | `list | set | - -#### Examples - -**List** - -Write a list of lines to a file. - -```python -load('file', 'write_lines') -lines = ['This is line1', 'This is line2', 'This is line3'] -write_lines('new_file.txt', lines) -``` - -### `write_json(name, data)` - -Writes the given Starlark object as JSON to a file. If the file exists, it will be overwritten. - -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------------| -| `name` | `string` | The path of the file to be written to | -| `data` | `dict | list | - -#### Examples - -**basic** - -Write a dictionary as JSON to a file. - -```python -load('file', 'write_json') -data = {"key": "value", "array": [1, 2, 3]} -write_json('new_file.json', data) -``` - -### `write_jsonl(name, data)` - -Writes the given data as JSON lines to a file. If the file exists, it will be overwritten. - -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------------| -| `name` | `string` | The path of the file to be written to | -| `data` | `list | set | - -#### Examples - -**basic** - -Write a list of JSON objects to a file as JSONL. - ```python load('file', 'write_jsonl') -data = [{"key1": "value1"}, {"key2": "value2"}] -write_jsonl('new_file.jsonl', data) -``` - -### `append_bytes(name, data)` - -Appends bytes or a byte literal string to a file. If the file isn't present, a new file would be created. 
- -#### Parameters - -| name | type | description | -|--------|----------|-------------------------------------------------------------| -| `name` | `string` | The path of the file to be written to | -| `data` | `string` | The byte literal string or bytes to be appended to the file | - -#### Examples - -**basic** - -Append a byte string to a file. - -```python -load('file', 'append_bytes') -append_bytes('existing_file.txt', b'Hello, This is appended data.') -``` - -### `append_string(name, data)` - -Appends a string to a file. If the file isn't present, a new file would be created. - -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------------| -| `name` | `string` | The path of the file to be written to | -| `data` | `string` | The string to be appended to the file | - -#### Examples - -**basic** - -Append a string to a file. - -```python -load('file', 'append_string') -append_string('existing_file.txt', 'Hello, This is appended data.') -``` - -### `append_lines(name, data)` - -Appends a list, tuple or set of lines to a file. If the file isn't present, a new file would be created. - -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------------| -| `name` | `string` | The path of the file to be written to | -| `data` | `list | set | - -#### Examples - -**basic** - -Append a list of lines to a file. - -```python -load('file', 'append_lines') -append_lines('existing_file.txt', ['This is line1', 'This is line2', 'This is line3']) -``` - -### `append_json(name, data)` - -Appends the given Starlark object as JSON to a file. If the file does not exist, it will be created. - -#### Parameters - -| name | type | description | -|--------|----------|----------------------------------------| -| `name` | `string` | The path of the file to be appended to | -| `data` | `dict | list | - -#### Examples - -**basic** - -Append a dictionary as JSON to a file. 
- -```python -load('file', 'append_json') -data = {"key": "value"} -append_json('existing_file.json', data) +write_jsonl('out.jsonl', [{'a': 520}, {'b': True}]) +# out.jsonl now contains: +# {"a":520} +# {"b":true} ``` -### `append_jsonl(name, data)` - -Appends the given data as JSON lines to a file. If the file does not exist, it will be created. +### `stat(name, follow=False) -> FileStat` -#### Parameters - -| name | type | description | -|--------|----------|----------------------------------------| -| `name` | `string` | The path of the file to be appended to | -| `data` | `list | set | - -#### Examples - -**basic** - -Append a list of JSON objects to a file as JSONL. - -```python -load('file', 'append_jsonl') -data = [{"key1": "value1"}, {"key2": "value2"}] -append_jsonl('existing_file.jsonl', data) -``` - -### `stat(name, follow=False) FileStat` - -Returns a FileStat object representing information about the given file or directory. - -#### Parameters - -| name | type | description | -|----------|----------|---------------------------------------| -| `name` | `string` | The path of the file or directory. | -| `follow` | `bool` | If true, symbolic links are followed. | - -#### Examples - -**file information** - -Retrieve information about a file. +Returns a `FileStat` for the path. By default symbolic links are reported as links (`lstat`); pass `follow=True` to resolve the link and stat its target. Errors if the path does not exist (`file.stat: lstat : ...`). ```python load('file', 'stat') -info = stat('path/to/file.txt') -print(info.name, info.size, info.type) -# Output: file.txt 3759 file +s = stat('testdata/aloha.txt') +print(s.name, s.size, s.type, s.ext) +print(s.get_md5()) +# Output: +# aloha.txt 6 file .txt +# 6a12867bd5e0810f2dae51da4a51f001 ``` -**directory information** +### `copyfile(src, dst, overwrite=False) -> str` -Retrieve information about a directory. 
- -```python -load('file', 'stat') -info = stat('path/to/folder', follow=True) -print(info.name, info.size, info.type) -# Output: folder 448 dir -``` - -### `copyfile(src, dst, overwrite=False) string` - -Copies a file from source to destination, and returns the destination file path. -If the destination exists and overwrite is set to False, an error is returned. If the destination is a directory, the file is copied into that directory with its original filename. Symbolic links are followed. Mode, access, and modification times are preserved. - -#### Parameters - -| name | type | description | -|-------------|----------|-----------------------------------------------------------------------------------| -| `src` | `string` | The path of the source file to be copied. | -| `dst` | `string` | The path of the destination file or directory. The parent directory must exist. | -| `overwrite` | `bool` | If true, allows overwriting the destination file if it exists. Defaults to False. | - -#### Examples - -**basic copy** - -Copy a file to a new location without overwrite. +Copies the regular file at `src` to `dst` and returns the destination path. If `dst` is an existing directory, the file is copied into it under its original base name. Symbolic links are followed; the file mode and access/modification times are preserved on a best-effort basis (errors setting them are ignored). It errors when: `src` or `dst` is empty (`source path is empty` / `destination path is empty`), `src` is not a regular file (`source file is not a regular file`), `src` and `dst` resolve to the same file (`source and destination are the same file`), or `dst` exists and `overwrite` is `False` (`file already exists`). 
```python load('file', 'copyfile') -src = 'path/to/source.txt' -dst = 'path/to/destination.txt' -copyfile(src, dst) -# The file at 'path/to/source.txt' is copied to 'path/to/destination.txt' +dst = copyfile('testdata/aloha.txt', 'copy.txt') +print(dst) +# Output: copy.txt ``` -**overwrite copy** +### `trim_bom(rd) -> str | bytes` -Copy a file to a new location with overwrite enabled. +Removes a leading UTF-8 byte order mark (`\xef\xbb\xbf`) from a string or bytes value, returning the same type it received. Input without a BOM is returned unchanged. It is the only pure function here (no file I/O). It takes exactly one argument (`takes exactly one argument (0 given)`) and rejects other types (`expected string or bytes, got int`). ```python -load('file', 'copyfile') -src = 'path/to/source.txt' -dst = 'path/to/existing_destination.txt' -copyfile(src, dst, overwrite=True) -# The file at 'path/to/source.txt' is copied to 'path/to/existing_destination.txt', overwriting it. +load('file', 'trim_bom') +print(trim_bom(b'\xef\xbb\xbfhello')) # bytes in, bytes out; print shows the content +print(trim_bom('hello')) # no BOM, returned unchanged +# Output: +# hello +# hello ``` -**copy to directory** - -Copy a file into a directory. - -```python -load('file', 'copyfile') -src = 'path/to/source.txt' -dst = 'path/to/directory' -copyfile(src, dst) -# The file at 'path/to/source.txt' is copied into 'path/to/directory' with its original filename. -``` - -## Types - -### `FileStat` +## Notes / boundaries -Represents information about a file. - -**Fields** - -| name | type | description | -|----------------|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `name` | `string` | The name of the file. 
| -| `path` | `string` | The full path of the file. | -| `ext` | `string` | The file extension. | -| `size` | `int` | The size of the file in bytes. | -| `type` | `string` | The type of the file: `file` for regular file, `dir` for directory, `symlink` for symbolic link, `fifo` for FIFO pipe, `socket` for network socket, `char` for character device file, `block` for block device file, `irregular` for irregular file type, `unknown` for unknown file type. | -| `modified` | `time.Time` | The last modified time of the file. | -| `get_md5()` | `function` | Returns the MD5 hash of the file contents. | -| `get_sha1()` | `function` | Returns the SHA-1 hash of the file contents. | -| `get_sha256()` | `function` | Returns the SHA-256 hash of the file contents. | -| `get_sha512()` | `function` | Returns the SHA-512 hash of the file contents. | +- **Side effects.** All functions except `trim_bom` perform real file system I/O; this is a FileSystem-capability module and must be gated as such by the host. +- **Line endings.** Readers strip line endings and accept both `\n` and `\r\n`; the line writers always emit `\n`. +- **JSON engine.** JSON encoding/decoding goes through `dataconv` (the same engine as the `json` module); only JSON-encodable Starlark values can be written, and decode errors surface the offset. +- **Atomicity.** Writes are not atomic — a truncating write that fails partway can leave a partially written file. Appends use `O_APPEND`. +- All exported names are `snake_case`; the `FileStat` hashing methods (`get_md5`, `get_sha1`, `get_sha256`, `get_sha512`) follow the same convention. diff --git a/lib/goidiomatic/README.md b/lib/goidiomatic/README.md index 436928e4..cd16a9c2 100644 --- a/lib/goidiomatic/README.md +++ b/lib/goidiomatic/README.md @@ -1,554 +1,266 @@ # go_idiomatic -`go_idiomatic` provides a Starlark module that defines Go idiomatic functions and values. 
+`go_idiomatic` provides Go-flavored helpers, constants, and constructors for Starlark scripts — base conversions, `length`/`sum`/`distinct` utilities, struct and shared-dict factories, and `print`-style output. It is **not** a mirror of any single Python module; it borrows the spirit of several Python builtins (`hex`/`oct`/`bin`/`sum`/`bytes.hex`) while adding Star\* idioms. -## Functions - -### `length(obj) int` - -Returns the length of the object, for string it returns the number of Unicode code points, instead of bytes like `len()`. - -#### Examples - -**String** - -Calculate the length of a CJK string. - -```python -load("go_idiomatic", "length") -s = "你好" -print(length(s), len(s)) -# Output: 2 6 -``` - -**Misc** - -Calculate the length of a list, set and map. - -```python -load("go_idiomatic", "length") -print(length([1, 2, 3]), length(set([1, 2])), length({1: 2})) -# Output: 3 2 1 -``` +Capability profile: **Log + Process**. `eprint`/`pprint` write to stderr or the thread's `Print` handler (Log); `sleep` blocks the thread and `exit`/`quit` halt the program (Process). Everything else is pure. -### `sum(iterable, start=0)` +Load name for `load()`: `go_idiomatic`. -Returns the sum of `start` and the items of an iterable from left to right. The iterable's items and the `start` value are normally numbers. - -#### Examples - -**Basic** - -Calculate the sum of a list. - -```python -load("go_idiomatic", "sum") -print(sum([1, 2, 3])) -# Output: 6 -``` - -**Start** - -Calculate the sum of a list with a start value. +## Functions -```python -load("go_idiomatic", "sum") -print(sum([1, 2, 3], 10)) -# Output: 16 +| function | description | +|----------|-------------| +| `length(obj) -> int` | Length of a string (in Unicode code points), bytes, or any sequence. | +| `sum(iterable, start=0) -> number` | Sum of `start` plus the items of `iterable`, left to right. | +| `distinct(iterable) -> iterable` | Iterable with duplicates removed; shape follows the input type. 
| +| `hex(x) -> str` | Lowercase hexadecimal string of an int, prefixed `0x`. | +| `oct(x) -> str` | Octal string of an int, prefixed `0o`. | +| `bin(x) -> str` | Binary string of an int, prefixed `0b`. | +| `bytes_hex(bytes, sep="", bytes_per_sep=1) -> str` | Hex string of each byte, with an optional grouping separator. | +| `is_nil(x) -> bool` | Whether `x` is `nil`/`None` or a Go wrapper holding a nil value. | +| `sleep(secs)` | Block the current thread for `secs` seconds (cancellable). | +| `exit(code=0)` / `quit(code=0)` | Halt the program with an exit code; `quit` is an alias of `exit`. | +| `module(name, **kv) -> module` | Build a `starlarkstruct` module named `name`. | +| `struct(**kv) -> struct` | Build an anonymous comparable struct. | +| `make_struct(name, **kv) -> struct` | Build a struct whose constructor name is `name`. | +| `shared_dict() -> shared_dict` | Create an empty thread-safe shared dictionary. | +| `make_shared_dict(name="", data=None) -> shared_dict` | Create a shared dictionary with an optional type name and initial data. | +| `to_dict(v) -> dict` | Convert a dict, `module`, `struct`, `GoStruct`, or `shared_dict` into a plain dict. | +| `eprint(*args, sep=" ")` | `print`-style output to stderr. | +| `pprint(*args, sep=" ")` | `print`-style output formatted as indented JSON. | + +## Constants + +| constant | meaning | +|----------|---------| +| `true` | Alias for the Starlark boolean `True`. | +| `false` | Alias for the Starlark boolean `False`. | +| `nil` | Alias for `None`. | + +```python +load("go_idiomatic", "true", "false", "nil") +print(true, false, nil) +# Output: True False None ``` -### `distinct(iterable)` - -Returns an iterable with distinct elements from the given iterable, i.e., without duplicates. For a list and custom types, it returns a new list with distinct elements. For a tuple, it returns a new tuple with distinct elements. For a dict, it returns the keys in a list. For a set, it just returns the original set. 
- -#### Parameters +## Types -| name | type | description | -|------------|------------|------------------------------------------------| -| `iterable` | `iterable` | The iterable to process for distinct elements. | +### `shared_dict` -#### Examples +A thread-safe, mutable dictionary returned by `shared_dict()` and `make_shared_dict()`. Every read and write is locked, so multiple Starlark threads may share and mutate one instance without races. Its `type()` is `shared_dict` by default, or the custom name passed to `make_shared_dict`. It supports indexing (`sd["k"] = v`, `sd["k"]`), membership (`in`), the `.len()` method, and `==`/`!=` comparison with another `shared_dict`. It is **not** iterable, and the builtin `len()` does not apply — use the `.len()` method. -**List** +Beyond the standard dict methods it inherits (`clear`, `get`, `items`, `keys`, `pop`, `popitem`, `setdefault`, `update`, `values`), it adds the methods below. -Get distinct elements from a list. +| method | description | +|--------|-------------| +| `len() -> int` | Number of items in the dictionary. | +| `perform(fn)` | Call `fn(self)` while holding the lock, for atomic compound updates. | +| `to_dict() -> dict` | Shallow clone into a plain dict; mutating the clone never affects the original. | +| `to_json() -> str` | Serialize the contents to a JSON string. | +| `from_json(json_str)` | Decode a JSON object string and merge its pairs into the dictionary. | ```python -load("go_idiomatic", "distinct") -print(distinct([1, 2, 2, 3, 3, 3])) -# Output: [1, 2, 3] +load("go_idiomatic", "make_shared_dict") +sd = make_shared_dict("mydict", {"a": 1, "b": 2}) +print(type(sd), sd.len()) +# Output: mydict 2 ``` -**Tuple** - -Get distinct elements from a tuple. 
+`perform` runs the callback under the dict's lock so a read-modify-write stays atomic: ```python -load("go_idiomatic", "distinct") -print(distinct((1, 2, 2, 3, 3, 3))) -# Output: (1, 2, 3) +load("go_idiomatic", "make_shared_dict") +sd = make_shared_dict() +def bump(d): d["cnt"] = d.get("cnt", 0) + 1 +sd.perform(bump) +sd.perform(bump) +print(sd) +# Output: shared_dict({"cnt": 2}) ``` -**Dict** - -Get distinct keys from a dictionary. +`to_dict` clones; mutating the clone leaves the source untouched (here, still empty): ```python -load("go_idiomatic", "distinct") -print(distinct({1: 'a', 2: 'b', 3: 'c'})) -# Output: [1, 2, 3] +load("go_idiomatic", "make_shared_dict") +sd = make_shared_dict() +clone = sd.to_dict() +clone["k"] = "v" +print(sd, clone) +# Output: shared_dict({}) {"k": "v"} ``` -**Set** - -Return original set (already distinct). +`to_json` serializes and `from_json` merges: ```python -load("go_idiomatic", "distinct") -print(distinct(set([1, 2, 3, 3]))) -# Output: {1, 2, 3} +load("go_idiomatic", "make_shared_dict") +sd = make_shared_dict() +sd.from_json('{"new_key": "new_value"}') +print(sd.to_json()) +# Output: {"new_key":"new_value"} ``` -### `hex(x)` - -Convert an integer number to a lowercase hexadecimal string prefixed with `0x`. +## Details & examples -#### Examples +### `length(obj) -> int` -**Basic** - -Convert an integer to a hexadecimal string. +For a string, returns the number of Unicode code points (unlike the builtin `len()`, which counts UTF-8 bytes); for bytes, the byte count; otherwise the `Len()` of any `starlark.Sequence` (list, tuple, dict, set, and `starlight` slices/maps). Errors with `length() takes exactly one argument` if not given exactly one positional argument, and `length() function isn't supported for '<type>' type object` for a non-sized type (e.g. `bool`). 
```python -load("go_idiomatic", "hex") -print(hex(255)) -# Output: 0xff +load("go_idiomatic", "length") +print(length("水光肌"), len("水光肌")) +# Output: 3 9 ``` -**Negative** - -Convert a negative integer to a hexadecimal string. - ```python -load("go_idiomatic", "hex") -print(hex(-42)) -# Output: -0x2a +load("go_idiomatic", "length") +print(length([1, 2, 3]), length(set(["a", "b"])), length({"a": 1, "b": 2})) +# Output: 3 2 2 ``` -### `oct(x)` - -Convert an integer number to an octal string prefixed with `0o`. - -#### Examples - -**Basic** +### `sum(iterable, start=0) -> number` -Convert an integer to an octal string. +Adds every item of `iterable` to `start`. Items and `start` must be numbers; `None` items are skipped (treated as zero). Mixing int and float yields an int when the result is whole. Errors with `unsupported type: <type>, expected float or int` on a non-numeric item, and `got <type>, want iterable` if `iterable` is not iterable. ```python -load("go_idiomatic", "oct") -print(oct(255)) -# Output: 0o377 +load("go_idiomatic", "sum") +print(sum([1, 2, 3]), sum([1, 2, 4], start=8), sum([1, 2, None])) +# Output: 6 15 3 ``` -**Negative** +### `distinct(iterable) -> iterable` -Convert a negative integer to an octal string. +Removes duplicates, preserving first-seen order. Returns a **list** for a list or custom iterable, a **tuple** for a tuple, the result of `.keys()` (a list) for a dict, and the original **set** unchanged for a set. Errors with `unhashable type: <type>` if an element cannot be hashed, and `got <type>, want iterable` for a non-iterable argument. ```python -load("go_idiomatic", "oct") -print(oct(-56)) -# Output: -0o70 +load("go_idiomatic", "distinct") +print(distinct([1, 2, 2, 3, 3, 3]), distinct((1, 2, 2, 3))) +# Output: [1, 2, 3] (1, 2, 3) ``` -### `bin(x)` - -Convert an integer number to a binary string prefixed with `0b`. - -#### Examples +### `hex(x) -> str` / `oct(x) -> str` / `bin(x) -> str` -**Basic** - -Convert an integer to a binary string. 
+Convert an integer to a base-16 / base-8 / base-2 string with the `0x` / `0o` / `0b` prefix. A negative number keeps the sign before the prefix (`-0xf`); zero is `0x0` / `0o0` / `0b0`. Arbitrary-precision integers are supported. Each errors with `missing argument for x` if called with no argument. ```python -load("go_idiomatic", "bin") -print(bin(255)) -# Output: 0b11111111 +load("go_idiomatic", "hex", "oct", "bin") +print(hex(255), oct(255), bin(255)) +# Output: 0xff 0o377 0b11111111 ``` -**Negative** - -Convert a negative integer to a binary string. - ```python -load("go_idiomatic", "bin") -print(bin(-10)) -# Output: -0b1010 +load("go_idiomatic", "hex", "oct", "bin") +print(hex(-15), oct(-56), bin(-255)) +# Output: -0xf -0o70 -0b11111111 ``` -### `bytes_hex(bytes,sep="",bytes_per_sep=1)` - -Return a string containing two hexadecimal digits for each byte in the instance. -If you want to make the hex string easier to read, you can specify a single character separator sep parameter to include in the output. -By default, this separator will be included between each byte. -A second optional bytes_per_sep parameter controls the spacing. Positive values calculate the separator position from the right, negative values from the left. +### `bytes_hex(bytes, sep="", bytes_per_sep=1) -> str` -#### Parameters - -| name | type | description | -|-----------------|----------|------------------------------------| -| `bytes` | `bytes` | The bytes to convert. | -| `sep` | `string` | The separator to use. | -| `bytes_per_sep` | `int` | The number of bytes per separator. | - -#### Examples - -**Basic** - -Convert bytes to a hexadecimal string. +Two lowercase hex digits per byte. With a one-character `sep`, the separator is inserted between groups of `bytes_per_sep` bytes — a positive count groups from the right, a negative count from the left. Errors with `missing argument for bytes` if `bytes` is omitted. 
```python load("go_idiomatic", "bytes_hex") -print(bytes_hex(b"hello")) -# Output: 68656c6c6f +print(bytes_hex(b"123456")) +# Output: 313233343536 ``` -**Separator** - -Convert bytes to a hexadecimal string with a separator. - ```python load("go_idiomatic", "bytes_hex") -print(bytes_hex(b"hello", sep=":")) -# Output: 68:65:6c:6c:6f +print(bytes_hex(b"123456", "_", 4)) +print(bytes_hex(b"123456", "_", -4)) +# Output: 3132_33343536 +# 31323334_3536 ``` -**Bytes per separator** +### `is_nil(x) -> bool` -Convert bytes to a hexadecimal string with a separator and bytes per separator. +`True` for `None`/`nil`, or for a `starlight` Go wrapper (`GoSlice`, `GoMap`, `GoStruct`, `GoInterface`) whose underlying Go value is nil. Errors with `unsupported type: <type>` for any other Starlark value (e.g. an int) — it is intentionally not a general truthiness test. ```python -load("go_idiomatic", "bytes_hex") -print(bytes_hex(b"hello", sep=":", bytes_per_sep=2)) -# Output: 68:656c:6c6f +load("go_idiomatic", "is_nil") +print(is_nil(None)) +# Output: True ``` ### `sleep(secs)` -Sleeps for the given number of seconds. - -#### Examples - -**Basic** - -Sleep for 1 second. +Blocks the current thread for `secs` seconds (int or float). `secs` must be non-negative — otherwise `secs must be non-negative`. The sleep is cancelled if the thread's context is done, returning that context error. Errors with `missing argument for secs` if omitted, or `want float or int` for a non-number. ```python load("go_idiomatic", "sleep") -sleep(1) -``` - -### `exit(code=0)` - -Exits the program with the given exit code. - -#### Examples - -**Default** - -Exit with default code (0). - -```python -load("go_idiomatic", "exit") -exit() +sleep(0.01) +# Output: ``` -**Non-zero** +### `exit(code=0)` / `quit(code=0)` -Exit with code 1. +Stores `code` as the thread-local `exit_code` and returns the sentinel error `ErrSystemExit` to unwind the program: `starlet runtime system exit (Use Ctrl-D in REPL to exit)`. 
`code` must fit an unsigned 8-bit range (0–255) — `exit(-1)` errors with `out of range`. `quit` is an exact alias. ```python load("go_idiomatic", "exit") exit(1) +# Output: ``` -### `quit(code=0)` - -Alias for `exit()`. - -#### Examples - -**Default** - -Exit with default code (0). - -```python -load("go_idiomatic", "quit") -quit() -``` - -**Non-zero** - -Exit with code 1. - -```python -load("go_idiomatic", "quit") -quit(1) -``` - -### `module(name, **kv)` +### `module(name, **kv) -> module` -Returns the module with the given name and keyword arguments. -The main difference between the `module` and the `struct` is that the string representation of the `module` does not enumerate its fields. -The module can't be compared with `==` and `!=`, but the `struct` can. - -#### Parameters - -| name | type | description | -|--------|------------|----------------------------------------| -| `name` | `string` | The name of the module to return. | -| `kv` | `**kwargs` | Key-value pairs to provide attributes. | - -#### Examples - -**Basic** - -Get the `os` module with pid attribute. +Builds a `starlarkstruct` module. Unlike `struct`, its string form hides the fields (`<module "name">`) and it is **not** comparable with `==`/`!=`. Takes exactly one positional argument (the name); extra positionals error with `got N arguments, want 1`. ```python load("go_idiomatic", "module") -os = module("os", pid=1) -print(os) -# Output: -``` - -### `struct(**kv)` - -Returns a new struct with the given keyword arguments. - -#### Parameters - -| name | type | description | -|------|------------|----------------------------------------| -| `kv` | `**kwargs` | Key-value pairs to provide attributes. | - -#### Examples - -**Basic** - -Create a struct with name and age attributes. 
- -```python -load("go_idiomatic", "struct") -person = struct(name="Alice", age=30) -print(person) -# Output: struct(age = 30, name = "Alice") +m = module("rose", a=100, b="hello") +print(m, m.a, m.b) +# Output: <module "rose"> 100 hello ``` -### `make_struct(name, **kv)` - -Returns a new struct with the given name as constructor and keyword arguments. -Comparing two structs with `==` and `!=` will compare their constructors first and then their fields. +### `struct(**kv) -> struct` / `make_struct(name, **kv) -> struct` -#### Parameters - -| name | type | description | -|--------|------------|----------------------------------------| -| `name` | `string` | The name to use as constructor. | -| `kv` | `**kwargs` | Key-value pairs to provide attributes. | - -#### Examples - -**Basic** - -Create a struct with name and age attributes. +`struct` builds an anonymous, field-comparable struct (printed as `struct(field = value, ...)`, fields sorted). `make_struct` is the same but takes a leading positional `name` used as the constructor in its string form and in equality (two structs are equal only if constructor *and* fields match). `struct` rejects positional arguments (`unexpected positional arguments`); `make_struct` requires exactly one (`got N arguments, want 1`). ```python -load("go_idiomatic", "make_struct") -person = make_struct("Person", name="Alice", age=30) -print(person) -# Output: Person(age = 30, name = "Alice") +load("go_idiomatic", "struct", "make_struct") +print(struct(rose="red", lily="white")) +print(make_struct("rose", color="red", price=100)) +# Output: struct(lily = "white", rose = "red") +# rose(color = "red", price = 100) ``` -### `shared_dict()` +### `shared_dict() -> shared_dict` / `make_shared_dict(name="", data=None) -> shared_dict` -Creates a new instance of a thread-safe, mutable shared dictionary. -This allows for concurrent access and modification by multiple Starlark threads, ensuring data consistency and preventing race conditions. 
-The function initializes a SharedDict with default settings. - -#### Examples - -**Basic** - -Create a new shared dictionary. - -```python -load("go_idiomatic", "shared_dict") -sd = shared_dict() -print(sd) -# Output: shared_dict({}) -``` - -### `make_shared_dict(name="", data=None)` - -Creates a customized shared dictionary with an optional name and initial data. -The name parameter allows for more descriptive representations and debugging, while the data parameter lets you initialize the shared dictionary with pre-existing key-value pairs. - -#### Parameters - -| name | type | description | -|--------|----------|------------------------------------------------------------------------------------------------------------------------------------------| -| `name` | `string` | An optional name for the shared dictionary. Defaults to an empty string, which results in the default name "shared_dict". | -| `data` | `dict` | An optional Starlark dictionary to initialize the shared dictionary with. Defaults to None, which results in an empty shared dictionary. | - -#### Examples - -**Named Shared Dict** - -Create a named shared dictionary without initial data. - -```python -load("go_idiomatic", "make_shared_dict") -sd = make_shared_dict(name="my_dict") -print(sd) -# Output: my_dict({}) -``` - -**Named Shared Dict with Data** - -Create a named shared dictionary with initial data. +Both create a `shared_dict` (see the Types section). `shared_dict()` takes no arguments and yields an empty dict named `shared_dict`. `make_shared_dict` accepts an optional type `name` and an optional `data` dict to seed it. `shared_dict(123)` errors with `got 1 arguments, want 0`; passing a non-string `name` errors with `want string`. 
```python load("go_idiomatic", "make_shared_dict") -initial_data = {"key1": "value1", "key2": "value2"} -sd = make_shared_dict(name="custom_dict", data=initial_data) -print(sd) -# Output: custom_dict({"key1": "value1", "key2": "value2"}) -``` - -### `to_dict(v)` - -Converts various Starlark values into a Starlark dictionary. Works with native Starlark dict, module, struct, and GoStruct, SharedDict. -For GoStruct, it serializes the underlying Go struct to JSON and then deserializes it to a Starlark dict. - -#### Parameters - -| name | type | description | -|------|-------|----------------------------------------------| -| `v` | `any` | The value to be converted into a dictionary. | - -#### Examples - -**Module to Dict** - -Convert a Starlark module to a dict. - -```python -load("go_idiomatic", "to_dict") -m = module("example", a=1, b=2) -print(to_dict(m)) -# Output: {"a": 1, "b": 2} +print(make_shared_dict("manaʻo", {"abc": 123})) +# Output: manaʻo({"abc": 123}) ``` -**Struct to Dict** +### `to_dict(v) -> dict` -Convert a custom Starlark struct to a dict. +Converts a value into a plain Starlark `dict`. Accepts a native `dict` (cloned), a `module` or `struct` (members become entries), a `GoStruct` (marshalled to JSON via Go's encoder then decoded to a dict — Go field names are preserved, nil fields become `None`), and a `shared_dict` (clone of its contents). Any other type errors with `unsupported type: <type>`; a Go struct holding a JSON-unencodable field (e.g. a channel) surfaces the encoder error (`json: unsupported type: chan int`). ```python -load("go_idiomatic", "to_dict") -person = struct(name="Alice", age=30) -print(to_dict(person)) +load("go_idiomatic", "to_dict", "struct", "module") +print(to_dict(struct(name="Alice", age=30))) +print(to_dict(module("mod", foo="bar", num=42))) # Output: {"age": 30, "name": "Alice"} -``` - -**GoStruct to Dict** - -Convert a GoStruct to a dict. 
- -```python -load("go_idiomatic", "to_dict") -gs.Name = "Bob" -gs.Age = 25 -print(to_dict(gs)) -# Output: {"age": 25, "name": "Bob"} -``` - -**SharedDict to Dict** - -Convert a SharedDict to a dict. - -```python -load("go_idiomatic", "shared_dict", "to_dict") -sd = shared_dict() -sd["key"] = "value" -print(to_dict(sd)) -# Output: {"key": "value"} -``` - -**Dict to Dict** - -Clone an existing Starlark dict. - -```python -load("go_idiomatic", "to_dict") -original_dict = {"a": 1, "b": 2} -cloned_dict = to_dict(original_dict) -print(cloned_dict) -# Output: {"a": 1, "b": 2} +# {"foo": "bar", "num": 42} ``` ### `eprint(*args, sep=" ")` -Works like the standard `print()` function but prints the given arguments to `stderr` instead of `Print` handler defined in Go. -This is useful for logging errors or important warnings that should be separated from standard output. - -#### Parameters - -| name | type | description | -|--------|----------|-------------------------------------------------------------------------| -| `args` | `*args` | The values to be printed. | -| `sep` | `string` | An optional separator between values. Defaults to a single space (" "). | - -#### Examples - -**Basic** - -Print an error message to stderr. - -```python -load("go_idiomatic", "eprint") -eprint("Error:", "An unexpected error occurred") -``` - -**Custom Separator** - -Print multiple values to stderr with a custom separator. +Like the builtin `print()` but always writes to **stderr**, bypassing the Go `Print` handler — useful for diagnostics kept out of normal output. `sep` (a string) joins the arguments. A non-string `sep` errors with `got <type>, want string`. ```python load("go_idiomatic", "eprint") eprint("Path", "/home/user/docs", sep=" -> ") -# Output: Path -> /home/user/docs +# Output: ``` -### `pprint(*args, sep=" ")` - -Works like the standard `print()` function but formats the given arguments in pretty JSON format with indentation. 
-If an argument cannot be converted to JSON, it falls back to converting the value to a string. -This is particularly useful for printing complex data structures in a human-readable format. +(Output goes to stderr, so stdout shows nothing.) -#### Parameters - -| name | type | description | -|--------|----------|----------------------------------------------------------------------------------------------------------------| -| `args` | `*args` | The values to be printed. These can be any Starlark values, including lists, dictionaries, and custom structs. | -| `sep` | `string` | An optional separator between values. Defaults to a single space (" "). | - -#### Examples - -**Basic** +### `pprint(*args, sep=" ")` -Pretty print a dictionary. +Like `print()` but renders each argument as **indented JSON** (4-space indent) before writing through the thread's `Print` handler (falling back to stderr). Values that cannot be JSON-encoded — including self-referential structures — fall back to their string form, so `pprint` never fails on cyclic input. A non-string `sep` errors with `got <type>, want string`. ```python load("go_idiomatic", "pprint") @@ -563,145 +275,11 @@ pprint({"key": "value", "list": [1, 2, 3]}) # } ``` -**Multiple Values** - -Pretty print multiple values with a custom separator. - -```python -load("go_idiomatic", "pprint") -pprint({"key1": "value1"}, {"key2": "value2"}, sep="\n---\n") -# Output: { -# "key1": "value1" -# } -# --- -# { -# "key2": "value2" -# } -``` - -## Types - -### `nil` - -Value as an alias for `None`. +## Notes / boundaries -### `true` - -Value as an alias for `True`. - -### `false` - -Value as an alias for `False`. - -### `SharedDict` - -A thread-safe, mutable dictionary that can be concurrently accessed and modified by multiple Starlark threads. -It ensures data consistency and prevents race conditions in concurrent environments. - -**Methods** - -#### `len()` - -Returns the number of items in the shared dictionary. 
- -##### Examples - -**Basic** - -Get the length of a shared dictionary. - -```python -load("go_idiomatic", "make_shared_dict") -sd = make_shared_dict() -sd["key1"] = "value1" -print(sd.len()) -# Output: 1 -``` - -#### `perform(fn)` - -Calls the given function with the shared dictionary as its argument. The function must be callable. - -##### Parameters - -| name | type | description | -|------|------------|---------------------------------------------------------------------------------------------------------------| -| `fn` | `callable` | The function to be called with the shared dictionary, and accepts the shared dictionary as its only argument. | - -##### Examples - -**Basic** - -Perform a custom operation on the shared dictionary. - -```python -load("go_idiomatic", "make_shared_dict") -sd = make_shared_dict() -def my_operation(d): d["cnt"] = d.get("cnt", 0) + 1 -sd.perform(my_operation) -print(sd) -# Output: shared_dict({"new_key": "new_value"}) -``` - -#### `to_dict()` - -Returns a shadow-clone of the shared dictionary. Modifications to the clone do not affect the original shared dictionary. - -##### Examples - -**Clone and Modify** - -Clone a shared dictionary and add new data to the clone. - -```python -load("go_idiomatic", "make_shared_dict") -sd = make_shared_dict() -sd_clone = sd.to_dict() -sd_clone["clone_key"] = "clone_value" -print(sd) -print(sd_clone) -# Output: shared_dict({}) -# {"clone_key": "clone_value"} -``` - -#### `to_json()` - -Serializes the shared dictionary to a JSON string. - -##### Examples - -**Serialize** - -Convert a shared dictionary to a JSON string. - -```python -load("go_idiomatic", "make_shared_dict") -sd = make_shared_dict(data={"key": "value"}) -json_str = sd.to_json() -print(json_str) -# Output: {"key": "value"} -``` - -#### `from_json(json_str)` - -Deserializes a JSON string into the shared dictionary, updating it with the key-value pairs decoded from the string. 
- -##### Parameters - -| name | type | description | -|------------|----------|----------------------------------------------------------------------| -| `json_str` | `string` | The JSON string to deserialize and merge into the shared dictionary. | - -##### Examples - -**Deserialize** - -Update a shared dictionary with data from a JSON string. - -```python -load("go_idiomatic", "make_shared_dict") -sd = make_shared_dict() -sd.from_json('{"new_key": "new_value"}') -print(sd) -# Output: shared_dict({"new_key": "new_value"}) -``` +- **Not a Python module clone.** Names echo Python builtins, but this is a Star\* utility grab-bag, not a 1:1 API mirror — there is no module-level namespace; members load individually. +- **`length` vs `len`.** `length` counts Unicode code points for strings; the builtin `len` counts UTF-8 bytes. They agree for sequences. +- **`distinct` ordering.** First-seen order is preserved for list/tuple/custom inputs; dict-key order from `.keys()` is **not** guaranteed (sort it if you need stability). Sets are returned unchanged. +- **Process effects.** `sleep` blocks (and respects context cancellation); `exit`/`quit` do not stop the goroutine themselves — they return `ErrSystemExit` for the host runner to act on, and set the thread-local `exit_code`. +- **Determinism.** `struct`/`make_struct`/`module` print fields in sorted order; `to_dict` of a `GoStruct` keeps the Go field names verbatim. +- **`shared_dict` scope.** Locking protects a single instance across threads; it does not make the values it holds deeply immutable or persist anything beyond the process. diff --git a/lib/hashlib/README.md b/lib/hashlib/README.md index a08a685f..55f1b487 100644 --- a/lib/hashlib/README.md +++ b/lib/hashlib/README.md @@ -1,73 +1,94 @@ # hashlib -`hashlib` defines hash primitives for Starlark. +`hashlib` provides cryptographic hash primitives for Starlark — MD5, SHA-1, SHA-256, and SHA-512 digests of string or bytes input, returned as lowercase hex. 
Capability profile: **pure** (no filesystem, network, process, or log side effects). + +Migrated from [qri-io/starlib](https://github.com/qri-io/starlib/tree/master/hash). ## Functions -### `md5(data) string` +| function | description | +| --- | --- | +| `md5(data) -> string` | Lowercase hex MD5 digest of `data` (string or bytes). | +| `sha1(data) -> string` | Lowercase hex SHA-1 digest of `data` (string or bytes). | +| `sha256(data) -> string` | Lowercase hex SHA-256 digest of `data` (string or bytes). | +| `sha512(data) -> string` | Lowercase hex SHA-512 digest of `data` (string or bytes). | -Returns an MD5 hash for a string or bytes. +All four share the same signature and behavior; only the algorithm differs. `data` is a required positional argument (also accepted as the keyword `data`). It must be a Starlark `string` or `bytes`; the two encode identically, so a string and the equivalent `bytes` produce the same digest. The return is the digest hex-encoded as a lowercase `string`. -#### Examples +**Errors:** each function takes exactly one argument — passing more raises `got N arguments, want at most 1`. Passing a non-string/bytes value (e.g. an `int`) raises `for parameter data: got <type>, want string or bytes`. -**Basic** +## Examples -Calculate an MD5 checksum for "hello world". +### `md5` ```python load("hashlib", "md5") -sum = md5("hello world!") -print(sum) -# Output: fc3ff98e8c6a0d3087d515c0473f8677 +print(md5("")) +print(md5("Aloha!")) +print(md5(b"Aloha!")) +# Output: +# d41d8cd98f00b204e9800998ecf8427e +# de424bf3e7dcba091c27d652ada485fb +# de424bf3e7dcba091c27d652ada485fb ``` -### `sha1(data) string` - -Returns a SHA-1 hash for a string or bytes. - -#### Examples +The string and bytes forms of `"Aloha!"` hash to the same value. -**Basic** - -Calculate an SHA-1 checksum for "hello world". 
+### `sha1` ```python load("hashlib", "sha1") -sum = sha1("hello world!") -print(sum) -# Output: 430ce34d020724ed75a196dfc2ad67c77772d169 +print(sha1("")) +print(sha1("Aloha!")) +# Output: +# da39a3ee5e6b4b0d3255bfef95601890afd80709 +# c3dd37312ba987e1cc40ae021bc202c4a52d8afe ``` -### `sha256(data) string` - -Returns an SHA-256 hash for a string or bytes. - -#### Examples - -**Basic** - -Calculate an SHA-256 checksum for "hello world". +### `sha256` ```python load("hashlib", "sha256") -sum = sha256("hello world!") -print(sum) -# Output: 7509e5bda0c762d2bac7f90d758b5b2263fa01ccbc542ab5e3df163be08e6ca9 +print(sha256("")) +print(sha256("Aloha!")) +# Output: +# e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +# dea7e28aee505f2dd033de1427a517793e38b7605e8fc24da40151907e52cea3 ``` -### `sha512(data) string` +### `sha512` -Returns an SHA-512 hash for a string or bytes. +```python +load("hashlib", "sha512") +print(sha512("")) +print(sha512("Aloha!")) +# Output: +# cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e +# d9cb95ad9d916a0781b3339424d5eb11c476405dfba7af7fabf4981fdd3291c27e8006e4cca617beae70dd00ab86a0213c44ed461229b16b45db45f64691049e +``` -#### Examples +### Error: wrong argument count -**Basic** +```python +load("hashlib", "md5") +md5("Aloha!", "Hello!") +# Output: +# Error: hash.md5: got 2 arguments, want at most 1 +``` -Calculate an SHA-512 checksum for "hello world". 
+### Error: wrong input type ```python -load("hashlib", "sha512") -sum = sha512("hello world!") -print(sum) -# Output: db9b1cd3262dee37756a09b9064973589847caa8e53d31a9d142ea2701b1b28abd97838bb9a27068ba305dc8d04a45a1fcf079de54d607666996b3cc54f6b67c +load("hashlib", "md5") +md5(123) +# Output: +# Error: hash.md5: for parameter data: got int, want string or bytes ``` + +## Notes / boundaries + +- **Engine:** Go standard library `crypto/md5`, `crypto/sha1`, `crypto/sha256`, `crypto/sha512`; output is `encoding/hex` lowercase. +- **Deterministic:** identical input always yields identical output; no salt, no randomness. +- **Pure:** no host effects of any kind. +- **No streaming/incremental API:** each call hashes the full `data` value in one shot; there is no reusable hasher object or `update`/`hexdigest` protocol like CPython's `hashlib`. The module exposes only the four one-shot functions above — no constants and no custom types. +- **MD5 and SHA-1 are not collision-resistant.** They are provided for checksums and legacy interop; do not use them for security-sensitive integrity or signatures — prefer `sha256` or `sha512`. diff --git a/lib/http/README.md b/lib/http/README.md index f47f6a03..6ae962f4 100644 --- a/lib/http/README.md +++ b/lib/http/README.md @@ -1,286 +1,266 @@ # http -`http` defines an HTTP client implementation. It is a thin wrapper around the Go standard package `net/http` but in Python `requests` style. +`http` is an HTTP **client** for Starlark scripts: a thin wrapper around Go's `net/http`, shaped after Python's `requests`. **Capability profile: Network** — every request function performs a real outbound HTTP request, so this module has network side effects. -Every request function (and `call`) has a `try_`-prefixed variant (`try_get`, `try_post`, …, `try_call`) that never aborts the script: it returns a `(response, error)` pair where exactly one side is `None`, the same shape as the `json` module's `try_*` functions. 
All request functions also accept `raise_for_status=True` to turn any non-2xx response into an error. +Every request function (and `call`) has a `try_`-prefixed twin (`try_get`, `try_post`, …, `try_call`) that never aborts the script: it returns a `(response, error)` tuple where exactly one side is `None`, the same shape as the `json` module's `try_*` functions. All request functions also accept `raise_for_status=True` to turn any non-2xx response into an error. + +> Note: `postForm` / `try_postForm` are **not** snake_case. The name is kept for historical compatibility (it is `post` with the form encoding forced to `application/x-www-form-urlencoded`). + +The server-side helpers (`ExportedServerRequest`, `ServerResponse`) are Go types this package exposes for embedding scripts inside a Go HTTP handler; they are documented under [Types](#types) but are not part of the loadable `http` module surface. ## Functions -### `call(method, url, params=None, headers=None, auth=(), body=None, json_body=None, form_body=None, form_encoding="", timeout=30, allow_redirects=True, verify=True) response` - -Perform an HTTP request of the specified method, returning a response. -The `call` method allows for flexibility in making HTTP requests by specifying the HTTP method as an argument. -It supports all common HTTP methods. This method dynamically dispatches the request based on the provided method name. -It is a convenience wrapper that enables users to use any supported HTTP method without needing separate method calls for each type of request. - -#### Parameters - -| name | type | description | -|-------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `method` | `string` | The HTTP method to use for the request (e.g., GET, POST, PUT, DELETE). | -| `url` | `string` | URL to request. | -| `params` | `dict` | optional. 
dictionary of URL parameters to append to the request. | -| `headers` | `dict` | optional. dictionary of headers to add to request. | -| `body` | `string` | optional. raw string body to provide to the request. | -| `form_body` | `dict` | optional. dict of values that will be encoded as form data. the value can be a string or a list of two strings (filename, file content) for file attachments. | -| `form_encoding` | `string` | optional. `application/x-www-form-urlencoded` (default for form data) or `multipart/form-data`. | -| `json_body` | `any` | optional. JSON data to supply as a request. handy for working with JSON-API's. | -| `auth` | `tuple` | optional. (username,password) tuple for HTTP Basic authorization. | -| `timeout` | `float` | optional. how many seconds to wait for the server to send all the data before giving up. 0 means no timeout. | -| `allow_redirects` | `bool` | optional. whether to follow redirects. | -| `verify` | `bool` | optional. whether to verify the server's SSL certificate. | -| `raise_for_status` | `bool` | optional. if True, a response with a non-2xx status code is reported as an error. defaults to False. | - -### `get(url, params=None, headers=None, auth=(), body=None, json_body=None, form_body=None, form_encoding="", timeout=30, allow_redirects=True, verify=True) response` - -Perform an HTTP GET request, returning a response. - -#### Parameters - -| name | type | description | -|-------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `url` | `string` | URL to request. | -| `params` | `dict` | optional. dictionary of URL parameters to append to the request. | -| `headers` | `dict` | optional. dictionary of headers to add to request. | -| `body` | `string` | optional. raw string body to provide to the request. | -| `json_body` | `any` | optional. 
optional. JSON data to supply as a request; mutually exclusive with body and form_body. handy for working with JSON-API's. | -| `form_body` | `dict` | optional. optional. dict of values that will be encoded as form data; mutually exclusive with body and json_body. the value can be a string or a list of two strings (filename, file content) for file attachments. | -| `form_encoding` | `string` | optional. `application/x-www-form-urlencoded` (default for form data) or `multipart/form-data`. | -| `auth` | `tuple` | optional. (username,password) tuple for HTTP Basic authorization. | -| `timeout` | `float` | optional. how many seconds to wait for the server to send all the data before giving up. 0 means no timeout. | -| `allow_redirects` | `bool` | optional. whether to follow redirects. | -| `verify` | `bool` | optional. whether to verify the server's SSL certificate. | -| `raise_for_status` | `bool` | optional. if True, a response with a non-2xx status code is reported as an error. defaults to False. | - -### `put(url, params=None, headers=None, auth=(), body=None, json_body=None, form_body=None, form_encoding="", timeout=30, allow_redirects=True, verify=True) response` - -Perform an HTTP PUT request, returning a response. - -#### Parameters - -| name | type | description | -|-------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `url` | `string` | URL to request. | -| `params` | `dict` | optional. dictionary of URL parameters to append to the request. | -| `headers` | `dict` | optional. dictionary of headers to add to request. | -| `body` | `string` | optional. raw string body to provide to the request. | -| `json_body` | `any` | optional. optional. JSON data to supply as a request; mutually exclusive with body and form_body. handy for working with JSON-API's. 
| -| `form_body` | `dict` | optional. optional. dict of values that will be encoded as form data; mutually exclusive with body and json_body. the value can be a string or a list of two strings (filename, file content) for file attachments. | -| `form_encoding` | `string` | optional. `application/x-www-form-urlencoded` (default for form data) or `multipart/form-data`. | -| `auth` | `tuple` | optional. (username,password) tuple for HTTP Basic authorization. | -| `timeout` | `float` | optional. how many seconds to wait for the server to send all the data before giving up. 0 means no timeout. | -| `allow_redirects` | `bool` | optional. whether to follow redirects. | -| `verify` | `bool` | optional. whether to verify the server's SSL certificate. | -| `raise_for_status` | `bool` | optional. if True, a response with a non-2xx status code is reported as an error. defaults to False. | - -### `post(url, params=None, headers=None, auth=(), body=None, json_body=None, form_body=None, form_encoding="", timeout=30, allow_redirects=True, verify=True) response` - -Perform an HTTP POST request, returning a response. - -#### Parameters - -| name | type | description | -|-------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `url` | `string` | URL to request. | -| `params` | `dict` | optional. dictionary of URL parameters to append to the request. | -| `headers` | `dict` | optional. dictionary of headers to add to request. | -| `body` | `string` | optional. raw string body to provide to the request. | -| `json_body` | `any` | optional. optional. JSON data to supply as a request; mutually exclusive with body and form_body. handy for working with JSON-API's. | -| `form_body` | `dict` | optional. optional. 
dict of values that will be encoded as form data; mutually exclusive with body and json_body. the value can be a string or a list of two strings (filename, file content) for file attachments. | -| `form_encoding` | `string` | optional. `application/x-www-form-urlencoded` (default for form data) or `multipart/form-data`. | -| `auth` | `tuple` | optional. (username,password) tuple for HTTP Basic authorization. | -| `timeout` | `float` | optional. how many seconds to wait for the server to send all the data before giving up. 0 means no timeout. | -| `allow_redirects` | `bool` | optional. whether to follow redirects. | -| `verify` | `bool` | optional. whether to verify the server's SSL certificate. | -| `raise_for_status` | `bool` | optional. if True, a response with a non-2xx status code is reported as an error. defaults to False. | - -### `postForm(url, params=None, headers=None, auth=(), body=None, json_body=None, form_body=None, form_encoding="", timeout=30, allow_redirects=True, verify=True) response` - -Perform an HTTP POST request with form data, returning a response. - -#### Parameters - -| name | type | description | -|-------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `url` | `string` | URL to request. | -| `params` | `dict` | optional. dictionary of URL parameters to append to the request. | -| `headers` | `dict` | optional. dictionary of headers to add to request. | -| `body` | `string` | optional. raw string body to provide to the request. | -| `json_body` | `any` | optional. optional. JSON data to supply as a request; mutually exclusive with body and form_body. handy for working with JSON-API's. | -| `form_body` | `dict` | optional. optional. dict of values that will be encoded as form data; mutually exclusive with body and json_body. 
the value can be a string or a list of two strings (filename, file content) for file attachments. | -| `form_encoding` | `string` | optional. `application/x-www-form-urlencoded` (default for form data) or `multipart/form-data`. | -| `auth` | `tuple` | optional. (username,password) tuple for HTTP Basic authorization. | -| `timeout` | `float` | optional. how many seconds to wait for the server to send all the data before giving up. 0 means no timeout. | -| `allow_redirects` | `bool` | optional. whether to follow redirects. | -| `verify` | `bool` | optional. whether to verify the server's SSL certificate. | -| `raise_for_status` | `bool` | optional. if True, a response with a non-2xx status code is reported as an error. defaults to False. | - -### `delete(url, params=None, headers=None, auth=(), body=None, json_body=None, form_body=None, form_encoding="", timeout=30, allow_redirects=True, verify=True) response` - -Perform an HTTP DELETE request, returning a response. - -#### Parameters - -| name | type | description | -|-------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `url` | `string` | URL to request. | -| `params` | `dict` | optional. dictionary of URL parameters to append to the request. | -| `headers` | `dict` | optional. dictionary of headers to add to request. | -| `body` | `string` | optional. raw string body to provide to the request. | -| `json_body` | `any` | optional. optional. JSON data to supply as a request; mutually exclusive with body and form_body. handy for working with JSON-API's. | -| `form_body` | `dict` | optional. optional. dict of values that will be encoded as form data; mutually exclusive with body and json_body. the value can be a string or a list of two strings (filename, file content) for file attachments. 
| -| `form_encoding` | `string` | optional. `application/x-www-form-urlencoded` (default for form data) or `multipart/form-data`. | -| `auth` | `tuple` | optional. (username,password) tuple for HTTP Basic authorization. | -| `timeout` | `float` | optional. how many seconds to wait for the server to send all the data before giving up. 0 means no timeout. | -| `allow_redirects` | `bool` | optional. whether to follow redirects. | -| `verify` | `bool` | optional. whether to verify the server's SSL certificate. | -| `raise_for_status` | `bool` | optional. if True, a response with a non-2xx status code is reported as an error. defaults to False. | - -### `patch(url, params=None, headers=None, auth=(), body=None, json_body=None, form_body=None, form_encoding="", timeout=30, allow_redirects=True, verify=True) response` - -Perform an HTTP PATCH request, returning a response. - -#### Parameters - -| name | type | description | -|-------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `url` | `string` | URL to request. | -| `params` | `dict` | optional. dictionary of URL parameters to append to the request. | -| `headers` | `dict` | optional. dictionary of headers to add to request. | -| `body` | `string` | optional. raw string body to provide to the request. | -| `json_body` | `any` | optional. optional. JSON data to supply as a request; mutually exclusive with body and form_body. handy for working with JSON-API's. | -| `form_body` | `dict` | optional. optional. dict of values that will be encoded as form data; mutually exclusive with body and json_body. the value can be a string or a list of two strings (filename, file content) for file attachments. | -| `form_encoding` | `string` | optional. `application/x-www-form-urlencoded` (default for form data) or `multipart/form-data`. 
| -| `auth` | `tuple` | optional. (username,password) tuple for HTTP Basic authorization. | -| `timeout` | `float` | optional. how many seconds to wait for the server to send all the data before giving up. 0 means no timeout. | -| `allow_redirects` | `bool` | optional. whether to follow redirects. | -| `verify` | `bool` | optional. whether to verify the server's SSL certificate. | -| `raise_for_status` | `bool` | optional. if True, a response with a non-2xx status code is reported as an error. defaults to False. | - -### `options(url,params={},headers={},body="",form_body={},form_encoding="",json_body={},auth=(),timeout=30,allow_redirects=True,verify=True) response` - -Perform an HTTP OPTIONS request, returning a response. - -#### Parameters - -| name | type | description | -|-------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `url` | `string` | URL to request. | -| `params` | `dict` | optional. dictionary of URL parameters to append to the request. | -| `headers` | `dict` | optional. dictionary of headers to add to request. | -| `body` | `string` | optional. raw string body to provide to the request. | -| `json_body` | `any` | optional. optional. JSON data to supply as a request; mutually exclusive with body and form_body. handy for working with JSON-API's. | -| `form_body` | `dict` | optional. optional. dict of values that will be encoded as form data; mutually exclusive with body and json_body. the value can be a string or a list of two strings (filename, file content) for file attachments. | -| `form_encoding` | `string` | optional. `application/x-www-form-urlencoded` (default for form data) or `multipart/form-data`. | -| `auth` | `tuple` | optional. (username,password) tuple for HTTP Basic authorization. | -| `timeout` | `float` | optional. 
how many seconds to wait for the server to send all the data before giving up. 0 means no timeout. | -| `allow_redirects` | `bool` | optional. whether to follow redirects. | -| `verify` | `bool` | optional. whether to verify the server's SSL certificate. | -| `raise_for_status` | `bool` | optional. if True, a response with a non-2xx status code is reported as an error. defaults to False. | - -### `set_timeout(timeout)` - -Set the default timeout for HTTP requests made through this module instance (i.e. this machine). It does not affect other machines in the process; the package-level `TimeoutSecond` variable seeds new instances. With a host-injected client the value is ignored (the client's own timeout applies). - -#### Parameters - -| name | type | description | -|-----------|---------|------------------------------------------------------------------------------------------------------------------------------| -| `timeout` | `float` | The timeout in seconds. Must be non-negative. This timeout will be used for all subsequent HTTP requests made by the module. | - -### `get_timeout() float` - -Get the current default timeout of this module instance. -returns: -The current timeout in seconds used for HTTP requests. +| function | description | +|----------|-------------| +| `call(method, url, *, params=None, headers=None, body=None, json_body=None, form_body=None, form_encoding="", auth=(), timeout=30, allow_redirects=True, verify=True, raise_for_status=False) -> response` / `try_call(...) -> (response, error)` | Perform a request with the HTTP method named by `method` (case-insensitive), dispatching to one of the verb functions below. | +| `get(url, ...) -> response` / `try_get(...) -> (response, error)` | Perform an HTTP GET request. | +| `put(url, ...) -> response` / `try_put(...) -> (response, error)` | Perform an HTTP PUT request. | +| `post(url, ...) -> response` / `try_post(...) -> (response, error)` | Perform an HTTP POST request. | +| `postForm(url, ...) 
-> response` / `try_postForm(...) -> (response, error)` | POST with `form_encoding` forced to `application/x-www-form-urlencoded`. Non-snake_case name. | +| `delete(url, ...) -> response` / `try_delete(...) -> (response, error)` | Perform an HTTP DELETE request. | +| `head(url, ...) -> response` / `try_head(...) -> (response, error)` | Perform an HTTP HEAD request (response body is empty). | +| `patch(url, ...) -> response` / `try_patch(...) -> (response, error)` | Perform an HTTP PATCH request. | +| `options(url, ...) -> response` / `try_options(...) -> (response, error)` | Perform an HTTP OPTIONS request. | +| `set_timeout(timeout)` | Set the default request timeout (seconds) for this module instance. | +| `get_timeout() -> float` | Return the current default request timeout (seconds) of this module instance. | + +Every verb function (`get`, `put`, `post`, `postForm`, `delete`, `head`, `patch`, `options`) and `call` share the same keyword parameters; see [Request parameters](#request-parameters). ## Types ### `response` -The result of performing a HTTP request. +The result of performing an HTTP request (a struct). -**Fields** +**Attributes** -| name | type | description | -|---------------|----------|---------------------------------------------------------------------| -| `url` | `string` | the URL that was ultimately requested (may change after redirects). | -| `status_code` | `int` | response status code (for example: `200 == OK`). | -| `ok` | `bool` | True when the status code is in the 2xx range. | -| `headers` | `dict` | dictionary of response headers. | -| `encoding` | `string` | transfer encoding. example: "octet-stream" or "application/json". | +| attribute | type | description | +|-----------|------|-------------| +| `url` | `string` | the URL that was ultimately requested (may differ from the input after redirects). | +| `status_code` | `int` | response status code (e.g. `200`). | +| `ok` | `bool` | `True` when `status_code` is in the 2xx range. 
| +| `headers` | `dict` | response headers; each value is the header's values joined by `,`. | +| `encoding` | `string` | transfer encoding(s) joined by `,` (empty when none). | **Methods** -#### `body() string` - -output response body as a string. - -#### `json() object` - -attempt to parse response body as json, returning a JSON-decoded result, or None if the response body is empty or not valid JSON (a parse failure and a JSON `null` are indistinguishable here; use `try_json()` to tell them apart). - -#### `try_body() (string, error)` - -like `body()`, but returns a `(value, error)` pair instead of aborting the script (for example when the configured response size limit is exceeded). - -#### `try_json() (object, error)` - -like `json()`, but returns a `(value, error)` pair: a parse or read failure lands in the error slot instead of being folded into None. +| method | description | +|--------|-------------| +| `body() -> string` | Read and return the whole response body as a string. Re-readable. | +| `json() -> object` | Parse the body as JSON; returns `None` when the body is empty or not valid JSON (a parse failure and a JSON `null` are indistinguishable — use `try_json` to tell them apart). | +| `try_body() -> (string, error)` | Like `body()` but returns a `(value, error)` pair instead of aborting (e.g. when the response-size limit is exceeded). | +| `try_json() -> (object, error)` | Like `json()` but a parse or read failure lands in the error slot instead of folding into `None`. | ### `ExportedServerRequest` -Encapsulates HTTP request data in a format accessible to both Go code and Starlark scripts. +Go-side helper (constructed via `NewExportedServerRequest` / `ConvertServerRequest`) that exposes an incoming `http.Request` to a script as a **read-only** struct. Not part of the loadable `http` module; passed in by the host. 
-**Fields** +**Attributes** -| name | type | description | -|------------|------------|----------------------------------------------------------------------------------------| -| `method` | `string` | The HTTP method (e.g., GET, POST, PUT, DELETE) | -| `url` | `string` | The request URL. | -| `proto` | `string` | The protocol used for the request (e.g., HTTP/1.1). | -| `host` | `string` | The host specified in the request. | -| `remote` | `string` | The remote address of the client. | -| `headers` | `dict` | The HTTP headers included in the request. | -| `query` | `dict` | The query parameters included in the request. | -| `encoding` | `[]string` | The transfer encodings specified in the request. | -| `body` | `string` | The request body data | -| `json` | `any` | The request body data as JSON, or None if the request body is empty or not valid JSON. | +| attribute | type | description | +|-----------|------|-------------| +| `method` | `string` | the HTTP method (e.g. `GET`, `POST`). | +| `url` | `string` | the request URL. | +| `proto` | `string` | the protocol (e.g. `HTTP/1.1`). | +| `host` | `string` | the request host. | +| `remote` | `string` | the client's remote address. | +| `headers` | `dict` | request headers (each value a list of strings). | +| `query` | `dict` | parsed query parameters (each value a list of strings). | +| `encoding` | `list` | transfer encodings specified in the request. | +| `body` | `string` | the raw request body. | +| `json` | `object` | the body parsed as JSON, or `None` if empty or invalid. | ### `ServerResponse` -Enables HTTP response manipulation within Starlark scripts, facilitating dynamic preparation of HTTP responses in Go-based web servers. +Go-side helper (constructed via `NewServerResponse`) that lets a script build an HTTP response the host later writes to an `http.ResponseWriter`. Not part of the loadable `http` module; passed in by the host. 
**Methods** -#### `set_status(code int)` - -Sets the HTTP status code for the response. - -#### `set_code(code int)` - -Alias for set_status. - -#### `add_header(key string, value string)` - -Adds a header with the given key and value to the response. - -#### `set_content_type(content_type string)` - -Sets the Content-Type header for the response, it will overwrite any existing or implicit Content-Type header. - -#### `set_data(data string|bytes)` - -Sets the response data as binary data, and the Content-Type header to `application/octet-stream`. - -#### `set_json(data any)` - -Sets the response data as JSON, marshaling the given Starlark value to JSON, and the Content-Type header to `application/json`. - -#### `set_text(data string|bytes)` - -Sets the response data as plain text, and the Content-Type header to `text/plain`. - -#### `set_html(data string|bytes)` - -Sets the response data as HTML, and the Content-Type header to `text/html`. +| method | description | +|--------|-------------| +| `set_status(code)` | Set the HTTP status code (must be 100–599). | +| `set_code(code)` | Alias for `set_status`. | +| `add_header(key, value)` | Append a header value under `key`. | +| `set_content_type(content_type)` | Set the `Content-Type` header, overriding any implicit one. | +| `set_data(data)` | Set the body as binary; implies `Content-Type: application/octet-stream`. | +| `set_json(data)` | Marshal a Starlark value to JSON and set it as the body; implies `Content-Type: application/json`. | +| `set_text(data)` | Set the body as plain text; implies `Content-Type: text/plain`. | +| `set_html(data)` | Set the body as HTML; implies `Content-Type: text/html`. | + +## Details & examples + +### Request parameters + +All verb functions and `call` accept the same parameters (for `call`, `method` is an extra first positional argument). Only `url` is required (plus `method` for `call`); every other parameter is optional. + +| name | type | description | +|------|------|-------------| +| `url` | `string` | URL to request.
| +| `params` | `dict` | optional. URL query parameters to append; values must be strings. | +| `headers` | `dict` | optional. headers to add; values must be strings. | +| `body` | `string`/`bytes` | optional. raw request body. | +| `json_body` | `any` | optional. JSON-serializable value sent with `Content-Type: application/json`. | +| `form_body` | `dict` | optional. values encoded as form data; a value is either a string (a field) or a two-element list/tuple `[filename, content]` (a file). | +| `form_encoding` | `string` | optional. `application/x-www-form-urlencoded` or `multipart/form-data`; inferred when omitted (multipart if any file is present, otherwise urlencoded). | +| `auth` | `tuple` | optional. `(username, password)` for HTTP Basic auth. | +| `timeout` | `float` | optional. seconds to wait before giving up; `0` means no timeout. Defaults to the instance timeout (30). | +| `allow_redirects` | `bool` | optional. whether to follow redirects (default `True`). | +| `verify` | `bool` | optional. whether to verify the server's TLS certificate (default `True`). | +| `raise_for_status` | `bool` | optional. if `True`, a non-2xx response is reported as an error (default `False`). | + +**Errors on:** a non-string `url`; a non-string `params`/`headers` value; an `auth` tuple that is not length 2; a `form_body` value that is neither a string nor a `(filename, content)` pair (e.g. `got: "int"`); supplying more than one of `body`/`json_body`/`form_body` (`body, json_body and form_body are mutually exclusive`); a JSON-unserializable `json_body`; a transport failure (connection refused, DNS, TLS); `raise_for_status=True` with a non-2xx response; `verify=False` when the host forces TLS verification; or passing `timeout`/`allow_redirects`/`verify` when the host injected its own client. 
+ +```python +load('http', 'get') +res = get(test_server_url, params={"a": "b", "c": "d"}) +print(res.url) +print(res.status_code) +print(res.body()) +print(res.json()) +# Output: +# http://127.0.0.1:PORT?a=b&c=d +# 200 +# {"hello":"world"} +# {"hello": "world"} +``` + +(The server in the test returns `{"hello":"world"}`; `test_server_url` is the test server's base URL.) + +#### POST with a JSON body + +`json_body` is marshaled to JSON and sent with `Content-Type: application/json`. + +```python +load('http', 'post') +res = post(test_server_url, json_body={"a": "b", "c": "d"}) +b = res.body() # the echo server returns the raw request it received +print(res.status_code) +print('application/json' in b) +print('{"a":"b","c":"d"}' in b) +# Output: +# 200 +# True +# True +``` + +#### POST form data and files + +A string value becomes a form field; a `[filename, content]` pair becomes a file. With files present (or `form_encoding="multipart/form-data"`) the request is multipart; otherwise it is `application/x-www-form-urlencoded`. + +```python +load('http', 'post') +res = post(test_server_url, form_body={ + "a": ["better.txt", "123456"], + "b": ["dance.md", '"abcdef(@!'], +}) +rb = res.body() +print(res.status_code) +print('multipart/form-data; boundary=' in rb) +print('filename="better.txt"' in rb) +# Output: +# 200 +# True +# True +``` + +### `call` / `try_call` + +`call(method, url, ...)` dispatches to the verb function named by `method` (case-insensitive). The supported methods are `get`, `put`, `post`, `postForm`, `delete`, `head`, `patch`, `options`. + +**Errors on:** a missing method name (`http.call: missing method name`); a non-string method name; or an unsupported method (`unsupported method: <method>`).
+ +```python +load('http', 'call') +res = call('POST', test_server_url, params={"hello": "world"}, json_body={"a": "b", "c": "d"}) +b = res.body() +print(res.status_code) +print('/?hello=world' in b) +print('{"a":"b","c":"d"}' in b) +# Output: +# 200 +# True +# True +``` + +### `try_*` variants + +A `try_` function never aborts the script: it returns `(response, error)` and the underlying Go builtin always reports a `nil` Go error. On success the error slot is `None`; on failure the response slot is `None` and the error slot holds the message string. Argument-unpacking and dispatch errors are captured the same way. + +```python +load('http', 'try_get', 'try_call') +# transport failure is captured, not raised +res, err = try_get('http://127.0.0.1:1/') +print(res == None) +print('connect' in err or 'refused' in err) +# an unsupported method is captured too +res2, err2 = try_call('TRACE', test_server_url) +print(res2 == None) +print('unsupported method' in err2) +# Output: +# True +# True +# True +# True +``` + +### `raise_for_status` + +By default a non-2xx response is returned normally (`res.ok` is `False`); with `raise_for_status=True` it becomes an error. + +```python +load('http', 'get') +res = get(nf_url) # server replies 404 +print(res.ok) +print(res.status_code) +# Output: +# False +# 404 +``` + +```python +load('http', 'get') +get(nf_url, raise_for_status=True) +# Error: http.get: unexpected status: 404 Not Found +``` + +### `try_json` vs `json` + +`json()` folds a read/parse failure into `None`; `try_json()` surfaces it in the error slot, so a parse failure is distinguishable from a JSON `null`. + +```python +load('http', 'get') +res = get(ok_url) # server replies {"a": 1} +v, err = res.try_json() +print(err == None) +print(v) +# Output: +# True +# {"a": 1} +``` + +### `set_timeout` / `get_timeout` + +`set_timeout(timeout)` sets the default request timeout (seconds) for **this module instance** only — it does not leak into other machines in the process; the package-level `TimeoutSecond` seeds new instances.
`get_timeout()` returns the current value. With a host-injected client the value is ignored (the client's own timeout applies). + +**Errors on:** a non-numeric `timeout` (`got string, want float or int`); a negative `timeout` (`timeout must be non-negative`); or passing any argument to `get_timeout()` (`got 1 arguments, want 0`). + +```python +load('http', 'get_timeout', 'set_timeout') +print(get_timeout()) +set_timeout(10.5) +print(get_timeout()) +# Output: +# 30.0 +# 10.5 +``` + +## Notes / boundaries + +- **Engine.** A thin wrapper over Go `net/http`; request/response semantics follow that package. JSON is handled by starlet's `dataconv` (Starlark-aware), so structs, `module`, `time`, and starlight-wrapped Go values marshal correctly. +- **Instance vs package state.** `set_timeout` and the host-configurable knobs (`SetClient`, `SetGuard`, `SetMaxResponseBodyBytes`, `SetForceTLSVerify`) live on the module instance; the package-level `TimeoutSecond`, `UserAgent`, `SkipInsecureVerify`, `DisableRedirect`, `MaxResponseBodyBytes`, `ForceTLSVerify`, `Client`, and `Guard` only *seed* new instances at `LoadModule` time. +- **Security knobs.** A host may force TLS verification (`verify=False` is then rejected), cap the response body size (over-limit `body()`/`json()` error with `response body exceeds the N-byte limit`), and install a `RequestGuard` to allow/deny requests by URL. When the host injects its own `*http.Client`, the per-request `timeout`/`allow_redirects`/`verify` options are rejected rather than silently ignored. +- **Body kinds are mutually exclusive.** Pass at most one of `body`, `json_body`, `form_body`; supplying more than one is an error rather than a silent drop. +- **Determinism.** Response `headers` and `encoding` join multiple values with `,`. `body()`/`json()` reset the body reader so they may be called repeatedly. 
+- **Difference from `requests`.** `postForm` is a non-Pythonic convenience name; `params`/`headers` values must be strings; `json()` returns `None` (not raising) on parse failure — use `try_json()` for an explicit error. + + diff --git a/lib/json/README.md b/lib/json/README.md index eb35a435..567fee43 100644 --- a/lib/json/README.md +++ b/lib/json/README.md @@ -1,64 +1,76 @@ # json -`json` defines utilities for converting Starlark values to/from JSON strings. The most recent IETF standard for JSON is https://www.ietf.org/rfc/rfc7159.txt . +`json` converts Starlark values to and from JSON text and offers a small toolkit around it: pretty-printing, JSONPath query/evaluation, LLM-output repair, and JSON Schema validation. It extends Go Starlark's stdlib `json` (`encode`/`decode`/`indent`) with `dumps`-style and `try_*` helpers. **Capability profile: pure** — no filesystem, network, or process side effects (external schema `$ref` is deliberately blocked to keep it so). + +Every function that can abort the script with an error ships a `try_*` twin that, instead of aborting the script, returns a `(result, error)` tuple — `error` is `None` on success, and `result` is `None` on failure. `try_validate` is the one exception, distinguishing three outcomes (see below). ## Functions -### `encode(x) string` +| function | description | +| --- | --- | +| `encode(x) -> string` | Encode a Starlark value to compact JSON text (go.starlark.net stdlib). | +| `decode(x[, default]) -> value` | Decode a JSON string to a Starlark value; on bad input returns `default` if given, else errors. | +| `indent(str, *, prefix="", indent="\t") -> string` | Pretty-print valid JSON text with the given prefix/indent unit (stdlib). | +| `dumps(obj, indent=0) -> string` / `try_dumps(obj, indent=0) -> tuple` | Encode a Starlark value (incl. struct/module) to JSON text, optionally indented by `indent` spaces. `try_dumps` returns `(text, error)`.
| +| `encode(x)` / `try_encode(x) -> tuple` | `try_encode` is the tuple-returning variant of `encode`. | +| `decode(x)` / `try_decode(x) -> tuple` | `try_decode` is the tuple-returning variant of `decode` (no `default` param). | +| `indent(...)` / `try_indent(str, prefix="", indent="\t") -> tuple` | `try_indent` is the tuple-returning variant of `indent`. | +| `path(data, path) -> list` / `try_path(data, path) -> tuple` | Run a JSONPath query over `data`, returning the list of matches. `try_path` returns `(list, error)`. | +| `eval(data, expr) -> value` / `try_eval(data, expr) -> tuple` | Evaluate a JSONPath expression (aggregates, arithmetic, comparisons) over `data`. `try_eval` returns `(value, error)`. | +| `repair(text) -> string` / `try_repair(text) -> tuple` | Recover valid JSON *text* from messy/LLM output (fences, prose, single quotes, trailing commas, truncation). `try_repair` returns `(text, error)`. | +| `validate(data, schema) -> None` / `try_validate(data, schema) -> tuple` | Validate a JSON document against a JSON Schema. `validate` returns `None` or errors; `try_validate` returns one of three outcomes. | + +`data` and `schema` arguments to `path` / `eval` / `validate` accept a JSON `string`, `bytes`, or any encodable Starlark value (dict, list, struct, …). + +This module exposes no custom Starlark types — every result is a standard Starlark value (`dict`, `list`, `string`, `int`, `float`, `bool`, or `None`). -The encode function accepts one required positional argument, which it converts to JSON by cases: -- A Starlark value that implements Go's standard `json.Marshal` interface defines its own JSON encoding. -- `None`, `True`, and `False` are converted to `null`, `true`, and `false`, respectively. -- Starlark int values, no matter how large, are encoded as decimal integers. Some decoders may not be able to decode very large integers. -- Starlark float values are encoded using decimal point notation, even if the value is an integer. 
It is an error to encode a non-finite floating-point value. -- Starlark strings are encoded as JSON strings, using UTF-16 escapes. -- a Starlark IterableMapping (e.g. dict) is encoded as a JSON object. It is an error if any key is not a string. -- any other Starlark Iterable (e.g. list, tuple) is encoded as a JSON array. -- a Starlark HasAttrs (e.g. struct) is encoded as a JSON object. - It an application-defined type matches more than one the cases describe above, (e.g. it implements both `Iterable` and `HasFields`), the first case takes precedence. Encoding any other value yields an error. +## Details & examples -#### Examples +### `encode` / `try_encode` -**Basic** +`encode(x) -> string` converts a Starlark value to compact JSON using go.starlark.net's stdlib rules: -Encode a Starlark dict to a JSON string. +- `None`, `True`, `False` → `null`, `true`, `false`. +- Starlark ints (any size) → decimal integers; floats → decimal-point notation. Non-finite floats are an error. +- Strings → JSON strings (UTF-16 escapes); `dict`/IterableMapping → object (non-string keys error); other Iterable (`list`, `tuple`) → array; `HasAttrs` (`struct`) → object. + +It **errors** when a value cannot be encoded (e.g. a function: `cannot encode function as JSON`). `try_encode(x)` returns `(text, None)` on success or `(None, message)` on failure. ```python load('json', 'encode') -print(encode({'a': 1, 'b': 2})) -# Output: {"a":1,"b":2} +load("struct.star", "struct") +s = struct(a="Aloha", b=0x10, c=True, d=[1,2,3]) +print(encode(s)) +# Output: {"a":"Aloha","b":16,"c":true,"d":[1,2,3]} ``` -### `decode(x[, default]) string` - -The decode function has one required positional parameter, a JSON string. It returns the Starlark value that the string denotes. -- Numbers are parsed as int or float, depending on whether they contain a decimal point. -- JSON objects are parsed as new unfrozen Starlark dicts. -- JSON arrays are parsed as new unfrozen Starlark lists. 
- If x is not a valid JSON string, the behavior depends on the "default" parameter: if present, Decode returns its value; otherwise, Decode fails. - -#### Examples +```python +load('json', 'try_encode') +result, error = try_encode({'a': 10, 'b': 20}) +print(result, error) +# Output: {"a":10,"b":20} None +``` -**Basic** +### `decode` / `try_decode` -Decode a JSON string to a Starlark dict. +`decode(x[, default]) -> value` parses a JSON string into a Starlark value: numbers become `int` or `float` (by presence of a decimal point), objects become unfrozen `dict`s, arrays become unfrozen `list`s. On invalid input it returns `default` if supplied, otherwise it **errors**. `try_decode(x)` (no `default`) returns `(value, None)` or `(None, message)`. ```python load('json', 'decode') print(decode('{"a":10,"b":20}')) -# Output: {'a': 10, 'b': 20} +# Output: {"a": 10, "b": 20} ``` -### `indent(str, *, prefix="", indent="\t") string` - -The indent function pretty-prints a valid JSON encoding, and returns a string containing the indented form. -It accepts one required positional parameter, the JSON string, and two optional keyword-only string parameters, prefix and indent, that specify a prefix of each new line, and the unit of indentation. - -#### Examples +```python +load('json', 'try_decode') +result, error = try_decode('{"a": "b"}') +print(result, error) +# Output: {"a": "b"} None +``` -**Basic** +### `indent` / `try_indent` -Indent a JSON string. +`indent(str, *, prefix="", indent="\t") -> string` re-formats already-valid JSON text. `prefix` and `indent` are keyword-only on the stdlib `indent`; on `try_indent` they are ordinary optional params. It **errors** on invalid JSON text (e.g. `invalid character ...`). `try_indent(str, prefix="", indent="\t")` returns `(text, error)`. 
```python load('json', 'indent') @@ -70,16 +82,9 @@ print(indent('{"a":10,"b":20}', indent=" ")) # } ``` -### `dumps(obj, indent=0) string` - -The dumps function converts a Starlark value to a JSON string, and returns it. -It accepts one required positional parameter, the Starlark value, and one optional integer parameter, indent, that specifies the unit of indentation. - -#### Examples - -**Basic** +### `dumps` / `try_dumps` -Dump a Starlark dict to a JSON string with indentation. +`dumps(obj, indent=0) -> string` encodes any Starlark value via the internal marshaler (which, unlike `encode`, also handles host structs/modules). `indent` is the number of spaces per level; `0` or negative produces compact output. It **errors** when a value cannot be marshaled (e.g. a function: `unrecognized starlark type: *starlark.Function`). `try_dumps(obj, indent=0)` returns `(text, error)`. ```python load('json', 'dumps') @@ -91,213 +96,59 @@ print(dumps({'a': 10, 'b': 20}, indent=2)) # } ``` -### `try_dumps(obj, indent=0) tuple` - -The try_dumps function is a variant of dumps that handles errors gracefully. -It accepts the same parameters as dumps, but returns a tuple of (result, error). -If successful, error will be None. If an error occurs, result will be None and error will contain the error message. - -#### Examples - -**Basic** - -Try to dump a Starlark dict to a JSON string and handle potential errors. - ```python load('json', 'try_dumps') -result, error = try_dumps({'a': 10, 'b': 20}, indent=2) -print("Result:", result) -print("Error:", error) -# Output: -# Result: { -# "a": 10, -# "b": 20 -# } -# Error: None -``` - -### `try_encode(x) tuple` - -The try_encode function is a variant of encode that handles errors gracefully. -It accepts the same parameter as encode, but returns a tuple of (result, error). -If successful, error will be None. If an error occurs, result will be None and error will contain the error message. 
- -#### Examples - -**Basic** - -Try to encode a Starlark dict to a JSON string and handle potential errors. - -```python -load('json', 'try_encode') -result, error = try_encode({'a': 10, 'b': 20}) -print("Result:", result) -print("Error:", error) -# Output: -# Result: {"a":10,"b":20} -# Error: None -``` - -### `try_decode(x) tuple` - -The try_decode function is a variant of decode that handles errors gracefully. -It accepts the same parameter as decode, but returns a tuple of (result, error). -If successful, error will be None. If an error occurs, result will be None and error will contain the error message. - -#### Examples - -**Basic** - -Try to decode a JSON string to a Starlark dict and handle potential errors. - -```python -load('json', 'try_decode') -result, error = try_decode('{"a":10,"b":20}') -print("Result:", result) -print("Error:", error) -# Output: -# Result: {'a': 10, 'b': 20} -# Error: None +result, error = try_dumps(1, indent=-7) +print(result, error) +# Output: 1 None ``` -### `try_indent(str, prefix="", indent="\t") tuple` +Note: `dumps`/`encode` can differ for host structs. A struct carrying a `star` tag encodes by its Go field names under `encode` (`{"Message":...}`) but by its struct values under `dumps` — see the test suite for the exact shapes. -The try_indent function is a variant of indent that handles errors gracefully. -It accepts the same parameters as indent, but returns a tuple of (result, error). -If successful, error will be None. If an error occurs, result will be None and error will contain the error message. +### `path` / `try_path` -#### Examples - -**Basic** - -Try to indent a JSON string and handle potential errors. 
- -```python -load('json', 'try_indent') -result, error = try_indent('{"a":10,"b":20}', indent=" ") -print("Result:", result) -print("Error:", error) -# Output: -# Result: { -# "a": 10, -# "b": 20 -# } -# Error: None -``` - -### `path(data, path) list` - -The path function performs a JSONPath query on the given JSON data and returns the matching elements. -It accepts two positional arguments: -- data: JSON data as a string, bytes, or Starlark value (dict, list, etc.) -- path: A JSONPath expression string - It returns a list of matching elements. If no matches are found, an empty list is returned. - If the JSONPath expression is invalid, an error is raised. - -#### Examples - -**Basic** - -Query JSON data using JSONPath expressions. +`path(data, path) -> list` runs a JSONPath query and returns the list of matching elements (empty list if nothing matches). Numeric matches come back as `int` when integral, else `float`. It **errors** on a malformed JSONPath expression (`wrong symbol 'X' at N`) or on `data` that is neither valid JSON nor an encodable value (`unrecognized starlark type`). `try_path(data, path)` returns `(list, error)`. ```python load('json', 'path') -data = '''{"store":{"book":[{"title":"Moby Dick","price":8.99},{"title":"War and Peace","price":12.99}]}}''' -titles = path(data, '$.store.book[*].title') -print(titles) -# Output: ['Moby Dick', 'War and Peace'] -prices = path(data, '$..price') -print(prices) -# Output: [8.99, 12.99] +data = '''{"store":{"book":[{"title":"Sayings of the Century","price":8.95},{"title":"Sword of Honour","price":12.99}]}}''' +print(path(data, '$.store.book[*].title')) +# Output: ["Sayings of the Century", "Sword of Honour"] ``` -### `try_path(data, path) tuple` - -The try_path function is a variant of path that handles errors gracefully. -It accepts the same parameters as path, but returns a tuple of (result, error). -If successful, error will be None. 
If an error occurs, result will be None and error will contain the error message. - -#### Examples - -**Basic** - -Try to query JSON data using JSONPath and handle potential errors. - ```python load('json', 'try_path') -data = '''{"store":{"book":[{"title":"Moby Dick","price":8.99},{"title":"War and Peace","price":12.99}]}}''' -result, error = try_path(data, '$..price') -print("Result:", result) -print("Error:", error) -# Output: -# Result: [8.99, 12.99] -# Error: None +data = {'items': [{'value': 5}, {'value': 10}, {'value': 15}]} +result, error = try_path(data, '$.items[?(@.value > 7)].value') +print(result, error) +# Output: [10, 15] None ``` -### `eval(data, expr) value` - -The eval function evaluates a JSONPath expression on the given JSON data and returns the evaluation result. -It accepts two positional arguments: -- data: JSON data as a string, bytes, or Starlark value (dict, list, etc.) -- expr: A JSONPath expression string to evaluate - It returns the result of the evaluation, which can be a number, string, boolean, list, dict, or None. - If the expression is invalid, an error is raised. +### `eval` / `try_eval` -#### Examples - -**Basic** - -Evaluate JSONPath expressions on JSON data. +`eval(data, expr) -> value` evaluates a JSONPath *expression* — aggregates (`sum`, `avg`, `size`), arithmetic, comparisons, string concatenation, and built-in constants (`pi`) — and returns a single `value` (number, string, bool, list, dict, or `None`). It **errors** on an unknown function (`'invalid' is not a function`), bad syntax, division by zero, invalid `data`, or an unencodable value. `try_eval(data, expr)` returns `(value, error)`. 
```python load('json', 'eval') -data = '''{"store":{"book":[{"price":8.99},{"price":12.99},{"price":5.99}]}}''' -avg_price = eval(data, 'avg($..price)') -print(avg_price) -# Output: 9.323333333333334 -sum_price = eval(data, 'sum($..price)') -print(sum_price) -# Output: 27.97 +data = '''{"store":{"book":[{"price":8.95},{"price":12.99},{"price":8.99},{"price":22.99}],"bicycle":{"price":19.95}}}''' +print(eval(data, 'avg($..price)')) +# Output: 14.774000000000001 ``` -### `try_eval(data, expr) tuple` - -The try_eval function is a variant of eval that handles errors gracefully. -It accepts the same parameters as eval, but returns a tuple of (result, error). -If successful, error will be None. If an error occurs, result will be None and error will contain the error message. - -#### Examples - -**Basic** - -Try to evaluate JSONPath expressions on JSON data and handle potential errors. - ```python load('json', 'try_eval') -data = '''{"store":{"book":[{"price":8.99},{"price":12.99},{"price":5.99}]}}''' -result, error = try_eval(data, 'avg($..price)') -print("Result:", result) -print("Error:", error) -# Output: -# Result: 9.323333333333334 -# Error: None +result, error = try_eval({'value': 10}, '$.value > 5') +print(result, error) +# Output: True None ``` -### `repair(text) string` - -The repair function recovers valid JSON **text** from the messy output that language models often produce, so the result can then be passed to `decode`. It accepts one required positional parameter, the text, and returns a JSON string. +### `repair` / `try_repair` -It fixes the common breakages: code fences (` ```json … ``` `), surrounding prose, single-quoted keys/strings, trailing commas, comments, Python literals (`True`/`False`/`None`), unquoted keys, and truncated (cut-off) JSON. +`repair(text) -> string` recovers valid JSON **text** from messy model output so it can then be `decode`d — the idiom is `decode(repair(x))`. 
It strips code fences (```` ```json … ``` ````) and surrounding prose; fixes single quotes, trailing commas, comments, and Python literals (`True`/`False`/`None`); and completes truncated JSON. -It returns **text, not a value** — the idiom is `decode(repair(x))`, which keeps repair composable with `decode`'s `default` and the `try_*` variants. Because repair returns text, a recovered bare scalar (e.g. the input `The answer is 42` yields `42`) is honest output; scripts that require a structured result should check the type of the decoded value. - -**Already-valid JSON is returned byte-for-byte unchanged** (repair is idempotent on good input), so it is safe to call defensively. - -#### Examples - -**Basic** - -Repair a fenced, single-quoted, trailing-comma response and decode it. +- **Idempotent on good input**: already-valid JSON is returned byte-for-byte unchanged (so it never mangles valid escapes — calling it defensively is safe). +- Because it returns *text*, a recovered bare scalar is honest output (`repair('The answer is 42')` decodes to `42`, not a dict); scripts needing structure should check the decoded type. +- It **errors** on truly unrepairable input (e.g. `{,,,}`). `try_repair(text)` returns `(text, error)` — including on a non-string argument. ```python load('json', 'repair', 'decode') @@ -307,44 +158,27 @@ messy = '''Here is the result: ``` ''' print(decode(repair(messy))) -# Output: {'name': 'Ann', 'tags': ['a', 'b']} +# Output: {"name": "Ann", "tags": ["a", "b"]} ``` -### `try_repair(text) tuple` - -The try_repair function is a variant of repair that handles errors gracefully. -It accepts the same parameter as repair, but returns a tuple of (result, error). -If successful, error will be None. If an error occurs, result will be None and error will contain the error message.
- -#### Examples - -**Basic** - ```python load('json', 'try_repair') result, error = try_repair('{"a": 1,}') -print("Result:", result) -print("Error:", error) -# Output: -# Result: {"a": 1} -# Error: None +print(result, error) +# Output: {"a": 1} None ``` -### `validate(data, schema) None` +### `validate` / `try_validate` -The validate function checks a JSON document against a [JSON Schema](https://json-schema.org) (drafts 4, 6, 7, 2019-09 and 2020-12, detected from the `$schema` keyword; 2020-12 by default). It accepts two positional arguments — both may be a JSON string, bytes, or a Starlark value (dict, list, etc.): -- data: the document to check -- schema: the JSON Schema +`validate(data, schema) -> None` checks a JSON document against a [JSON Schema](https://json-schema.org) (drafts 4, 6, 7, 2019-09, 2020-12 — detected from `$schema`, default 2020-12). It returns `None` when the data conforms; otherwise it **errors** with one line per violation, each prefixed by its [JSON Pointer](https://datatracker.ietf.org/doc/html/rfc6901) location (e.g. `at /age: must be >= 0 but found -3`; long lists are capped with `... and N more`). -It returns None when the data conforms. When the data is invalid, it fails with a message listing each violation prefixed by its [JSON Pointer](https://datatracker.ietf.org/doc/html/rfc6901) location, e.g. `at /age: must be >= 0 but found -3`. +Schemas must be **self-contained**: an external `$ref` (`file://`, `http://`) is rejected (`not allowed`) — this is what keeps the module pure. A bad schema or malformed data text reports `invalid schema` / `invalid data`. Compiled schemas are cached (bounded), so repeated validation against the same schema text avoids recompilation. -Schemas must be **self-contained**: a `$ref` to an external resource (a file or the network) is an error. Compiled schemas are cached, so repeated validation against the same schema text has no recompilation cost. 
+`try_validate(data, schema)` distinguishes three outcomes: -#### Examples - -**Basic** - -Validate a decoded value against a schema written as a Starlark dict. +- `(True, None)` — the data conforms. +- `(False, details)` — the data was checked and is invalid; `details` lists the violations. +- `(None, error)` — validation could not run (invalid schema, malformed JSON text, or bad arguments). ```python load('json', 'validate') @@ -353,23 +187,18 @@ print(validate({'name': 'Ann', 'age': 3}, schema)) # Output: None ``` -### `try_validate(data, schema) tuple` - -The try_validate function is a variant of validate that distinguishes three outcomes instead of aborting: -- `(True, None)` — the data conforms to the schema. -- `(False, details)` — the data was checked and is invalid; details lists the violations with their JSON Pointer locations. -- `(None, error)` — validation could not run at all (invalid schema, malformed JSON text, or bad arguments). - -#### Examples - -**Basic** - ```python load('json', 'try_validate') -ok, err = try_validate('{"age": -3}', '{"type":"object","properties":{"age":{"type":"integer","minimum":0}}}') -print("OK:", ok) -print("Error:", err) -# Output: -# OK: False -# Error: at /age: must be >= 0 but found -3 +ok, err = try_validate('{"age":-3}', '{"type":"object","properties":{"age":{"type":"integer","minimum":0}}}') +print(ok, err) +# Output: False at /age: must be >= 0 but found -3 ``` + +## Notes & boundaries + +- **Engines.** `encode`/`decode`/`indent` are go.starlark.net's stdlib `json`; `dumps` uses starlet's internal marshaler (handles host structs/modules); `path`/`eval` use [ajson](https://github.com/spyzhov/ajson) JSONPath; `repair` uses a vendored, frozen [jsonrepair](https://github.com/RealAlexandreAI/json-repair) (golden-locked); `validate` uses [santhosh-tekuri/jsonschema](https://github.com/santhosh-tekuri/jsonschema). +- **Purity.** No file or network access. JSON Schema `$ref` to external resources is blocked by design. 
+- **Number shaping.** `path`/`eval` return integral numbers as `int` and non-integral as `float`; JSON `null` becomes `None`. +- **`repair` vs `validate`.** `repair` fixes *text* and is idempotent on valid input; `validate` never mutates — it only reports conformance. +- All function names are snake_case; `try_*` variants mirror their base function and never abort the script. +- There is **no** `encode_indent` function: indentation is a parameter, not a separate call — use `dumps(obj, indent=N)` for indented encoding, or `indent(...)` to re-format existing JSON text. diff --git a/lib/log/README.md b/lib/log/README.md index cc2c4768..96d97015 100644 --- a/lib/log/README.md +++ b/lib/log/README.md @@ -1,121 +1,130 @@ # log -`log` provides functionality for logging messages at various severity levels. +`log` writes log messages at five severity levels from Starlark, backed by a [`zap`](https://github.com/uber-go/zap) sugared logger on the host side. Capability profile: **Log** — it produces log output as a side effect but touches no filesystem, network, or process state directly. -## Functions - -### `debug(msg, *misc, **kv)` - -Logs a message at the debug log level. +By default the module logs through a `zap` development logger (console encoder, caller and stacktrace disabled). The host can swap the logger at runtime with `SetLog` (Go side), including a no-op logger that discards everything. -#### Parameters - -| name | type | description | -|--------|------------|-----------------------------------------------------------------------------------------------| -| `msg` | `string` | The message to log. | -| `misc` | `*args` | Additional message arguments will be concatenated to the message string separated by a space. | -| `kv` | `**kwargs` | Key-value pairs to provide additional debug information. | +## Functions -#### Examples +| function | description | +|----------|-------------| +| `debug(msg, *misc, **kv) -> None` | Log `msg` at DEBUG level; returns `None`. 
| +| `info(msg, *misc, **kv) -> None` | Log `msg` at INFO level; returns `None`. | +| `warn(msg, *misc, **kv) -> None` | Log `msg` at WARN level; returns `None`. | +| `error(msg, *misc, **kv) -> None` | Log `msg` at ERROR level; returns `None` (does **not** halt the script). | +| `fatal(msg, *misc, **kv) -> error` | Log `msg` at ERROR level, then raise the message as an error that halts the script. | -**basic** +All five functions share the same calling shape: a required string `msg`, optional extra positional arguments (`*misc`) appended to the message, and optional keyword arguments (`**kv`) attached as structured key-value fields. -Log a debug message with additional information. +This module exposes **no constants** and **no custom types** — its five members are all builtin callables on the `log` module struct. -```python -load("log", "debug") -debug("Fetching data at", "21:40", retry_attempt=1) -{"retry_attempt": 1} -``` +## Details & examples -### `info(msg, *misc, **kv)` +### Common argument handling -Logs a message at the info log level. +Every function takes the same arguments: -#### Parameters +- `msg` (required, `string`) — the log message. It must be the first positional argument and must be a string. +- `*misc` (optional) — any further positional arguments. Each is rendered to text and appended to `msg`, separated by single spaces. Booleans render as `True`/`False`, numbers and strings render naturally. +- `**kv` (optional) — keyword arguments become structured fields. Keys are interpreted as strings; values are unmarshaled to Go types where possible (so dicts/lists/numbers serialize as JSON-like structures, `None` becomes `null`), falling back to the value's `String()` form for self-referential or non-marshalable values. -| name | type | description | -|--------|------------|-----------------------------------------------------------------------------------------------| -| `msg` | `string` | The message to log. 
| -| `misc` | `*args` | Additional message arguments will be concatenated to the message string separated by a space. | -| `kv` | `**kwargs` | Key-value pairs to provide additional information. | +**Errors** (identical for all five functions): -#### Examples +- Calling with no arguments fails: `log.<name>: expected at least 1 argument, got 0`. +- A non-string first argument fails: `log.<name>: expected string as first argument, got <type>` (e.g. `got int`). -**basic** +Note the error prefix uses the qualified builtin name, e.g. `log.debug`, `log.fatal`. -Log an info message with additional information. +### `debug` ```python -load("log", "info") -info("Data fetched", response_time=42) +load('log', 'debug') +debug('this is a debug message only') +# Output: +# DEBUG this is a debug message only ``` -### `warn(msg, *misc, **kv)` - -Logs a message at the warn log level. - -#### Parameters +Extra positional arguments are concatenated onto the message: -| name | type | description | -|--------|------------|-----------------------------------------------------------------------------------------------| -| `msg` | `string` | The message to log. | -| `misc` | `*args` | Additional message arguments will be concatenated to the message string separated by a space. | -| `kv` | `**kwargs` | Key-value pairs to provide additional warning information. | +```python +load('log', 'debug') +debug('this is a broken message', "what", 123, True) +# Output: +# DEBUG this is a broken message what 123 True +``` -#### Examples +Keyword arguments are attached as structured fields: -**basic** +```python +load('log', 'debug') +m = {"mm": "this is more"} +l = [2, "LIST", 3.14, True] +debug('this is a data message', map=m, list=l) +# Output: +# DEBUG this is a data message {"map": {"mm":"this is more"}, "list": [2,"LIST",3.14,true]} +``` -Log a warning message with additional information.
+### `info` ```python -load("log", "warn") -warn("Fetching data took longer than expected", response_time=123) +load('log', 'info') +info('this is an info message', a1=2, hello="world") +# Output: +# INFO this is an info message {"a1": 2, "hello": "world"} ``` -### `error(msg, *misc, **kv)` - -Logs a message at the error log level and returns an error. +Self-referential values fall back to their string form rather than failing: -#### Parameters +```python +load('log', 'info') +d = {"hello": "world"} +d["a"] = d +l = [1, 2, 3] +l.append(l) +s = set([4, 5, 6]) +info('this is complex info message', self1=d, self2=l, self3=s) +# Output: +# INFO this is complex info message {"self1": "{\"hello\": \"world\", \"a\": {...}}", "self2": "[1, 2, 3, [...]]", "self3": [4,5,6]} +``` -| name | type | description | -|--------|------------|-----------------------------------------------------------------------------------------------| -| `msg` | `string` | The message to log. | -| `misc` | `*args` | Additional message arguments will be concatenated to the message string separated by a space. | -| `kv` | `**kwargs` | Key-value pairs to provide additional error information. | +### `warn` -#### Examples +```python +load('log', 'warn') +warn('this is a warning message only') +# Output: +# WARN this is a warning message only +``` -**basic** +### `error` -Log an error message with additional information. +`error` logs at ERROR level and returns `None` — it does not stop execution. Use `fatal` (or Starlark's `fail`) to halt. ```python -load("log", "error") -error("Failed to fetch data", response_time=240) +load('log', 'error') +error('this is an error message only', dsat=None) +# Output: +# ERROR this is an error message only {"dsat": null} ``` -### `fatal(msg, *misc, **kv)` +### `fatal` -Logs a message at the error log level, returns a `fail(msg)` to halt program execution. 
+`fatal` logs the message at ERROR level and then raises it as an error, halting the script (the message becomes the error string). It does **not** call `os.Exit`; the host receives a normal Starlark error. -#### Parameters - -| name | type | description | -|--------|------------|-----------------------------------------------------------------------------------------------| -| `msg` | `string` | The message to log. | -| `misc` | `*args` | Additional message arguments will be concatenated to the message string separated by a space. | -| `kv` | `**kwargs` | Key-value pairs to provide additional fatal error information. | - -#### Examples +```python +load('log', 'fatal') +fatal('this is a fatal message only') +# Output: +# this is a fatal message only +``` -**basic** +(The line above is the error raised to the host; an ERROR-level log entry with the same message is also written before the error returns.) -Log a fatal error message with additional information. +## Notes / boundaries -```python -load("log", "fatal") -fatal("Failed to fetch data and cannot recover", retry_attempts=3, response_time=360) -``` +- **Engine.** Logging is delegated to a `go.uber.org/zap` `SugaredLogger`. The exact output format (level token casing, field separators, JSON shape of structured values) depends on the configured zap encoder. The examples above show the console-encoder form used by the test suite; the default development logger emits a similar human-readable line. Timestamps and any caller/stacktrace fields are omitted here for brevity. +- **Levels.** Five levels are exposed: `debug`, `info`, `warn`, `error`, `fatal`. Internally `fatal` logs at zap's ERROR level (not zap's FatalLevel) and then returns an error — it never terminates the process. +- **`error` vs `fatal`.** `error` records and continues; `fatal` records and aborts the script. Both write at ERROR severity. 
+- **No-op mode.** When the host installs a nil/no-op logger via `SetLog`, all functions still validate arguments and return normally (and `fatal` still raises its error), but no output is produced. +- **Determinism.** Output content is deterministic for given inputs; structured-field ordering follows the order of keyword arguments. Whether a line is emitted at all depends on the configured logger's level threshold. +- All member names are snake_case-clean (single lowercase words); there are no irregular identifiers. diff --git a/lib/net/README.md b/lib/net/README.md index 0a69435c..e0b9e155 100644 --- a/lib/net/README.md +++ b/lib/net/README.md @@ -1,49 +1,101 @@ # net -`net` provides network diagnostics for Starlark: DNS lookup, TCP ping, and HTTP ping. It is inspired by Go's `net` package and Python's `socket` module. +`net` provides network diagnostics for Starlark: DNS lookup (`nslookup`), TCP connect ping (`tcping`), and HTTP connect ping (`httping`). It is inspired by Go's `net` package and Python's `socket` module. -All three functions honor the machine's context: a `RunWithTimeout`/`RunWithContext` deadline aborts a lookup or an in-flight ping loop promptly. +Capability profile: **Network** — every function performs real DNS resolution and/or outbound connections. All three honor the machine's context: a `RunWithTimeout`/`RunWithContext` deadline aborts a lookup or an in-flight ping loop promptly, between rounds. 
## Functions -### `nslookup(domain, dns_server=None, timeout=10) []string` +| function | description | +|----------|-------------| +| `nslookup(domain, dns_server=None, timeout=10) -> list[string]` | resolve a domain to its IP addresses | +| `tcping(hostname, port=80, count=4, timeout=10, interval=1) -> statistics` | measure TCP connect round-trip times to `hostname:port` | +| `httping(url, count=4, timeout=10, interval=1) -> statistics` | measure HTTP connect round-trip times by issuing GET requests to `url` | -Looks up the IP addresses of a domain name, returning a list of strings. +## Types -| name | type | description | -|--------------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `domain` | `string` | the domain name to resolve (an IP literal is returned as-is) | -| `dns_server` | `string` | optional DNS server as `host` or `host:port` (port defaults to 53). Uses the system resolver when omitted. NOTE: a custom server requires the pure-Go resolver, which is a no-op on Windows before Go 1.19 — the system resolver is used there instead. | -| `timeout` | `float/int` | lookup timeout in seconds; non-positive values fall back to 10 | +### `statistics` -### `tcping(hostname, port=80, count=4, timeout=10, interval=1) statistics` +A `starlarkstruct` returned by `tcping` and `httping`. Round-trip values are in milliseconds. Read-only; access by attribute (`s.avg`). -Measures TCP connect round-trip times to `hostname:port` and returns a `statistics` struct. 
+| attribute | type | description | +|-----------|------|-------------| +| `address` | `string` | the resolved `host:port` (`tcping`) or the URL (`httping`) that was pinged | +| `total` | `int` | rounds attempted (equals `count`) | +| `success` | `int` | rounds that succeeded | +| `loss` | `float` | failed percentage, `0`–`100` | +| `min` | `float` | minimum round-trip time, ms | +| `avg` | `float` | mean round-trip time, ms | +| `max` | `float` | maximum round-trip time, ms | +| `stddev` | `float` | standard deviation of round-trip times, ms (`0` for a single successful round) | -| name | type | description | -|------------|-------------|--------------------------------------------------------------------------| -| `hostname` | `string` | the host to ping (resolved first; an IP literal skips DNS) | -| `port` | `int` | TCP port, defaults to 80 | -| `count` | `int` | number of rounds, `1..1024` | -| `timeout` | `float/int` | per-connect timeout in seconds (sub-second values work); at most 3600 | -| `interval` | `float/int` | pause between rounds in seconds (sub-second values work); at most 3600 | +If no round succeeds, the function errors with `no successful connections` instead of returning a `statistics` struct. -### `httping(url, count=4, timeout=10, interval=1) statistics` +## Details & examples -Measures HTTP connect round-trip times by issuing GET requests to `url` (redirects are not followed; a status outside `200..399` counts as a failed round) and returns a `statistics` struct. +### `nslookup` -Takes the same `count`/`timeout`/`interval` parameters and bounds as `tcping`. The client honors the system proxy settings (`HTTP_PROXY`, `HTTPS_PROXY`, `NO_PROXY`), just like `lib/http`; behind a proxy, the measured connect duration is the TCP connection to the proxy (the first hop), not to the origin server. +`nslookup(domain, dns_server=None, timeout=10) -> list[string]` -## Types +Looks up the IP addresses of `domain`, returning a list of address strings. 
-### `statistics` +- `domain` (`string`/`bytes`) — the name to resolve. An IP literal is returned as-is. +- `dns_server` (`string`/`bytes`, optional) — a DNS server as `host` or `host:port`; the port defaults to `53` when omitted. Uses the system resolver when not given. A custom server requires the pure-Go resolver, which is a no-op on Windows before Go 1.19 — the system resolver is used there instead. +- `timeout` (`float`/`int`, optional) — lookup timeout in seconds; non-positive values fall back to `10`. + +Errors when the lookup fails — the error always names the domain, whether the failure is an offline timeout, a server timeout (a `timeout` error against an unreachable `dns_server`), or an online `NXDOMAIN` (e.g. `missing.invalid`). + +```python +load('net', 'nslookup') +# an IP literal resolves to itself without touching DNS +print(nslookup('8.8.8.8')) +# Output: +# ["8.8.8.8"] +``` + +### `tcping` + +`tcping(hostname, port=80, count=4, timeout=10, interval=1) -> statistics` + +Resolves `hostname` (an IP literal skips DNS), then opens and immediately closes a TCP connection to `host:port` `count` times, returning a `statistics` struct of the connect times. + +- `port` (`int`) — TCP port, default `80`. +- `count` (`int`) — number of rounds, `1`–`1024`. +- `timeout` (`float`/`int`) — per-connect timeout in seconds; sub-second values work; at most `3600`. Non-positive falls back to `10`. +- `interval` (`float`/`int`) — pause between rounds in seconds; sub-second values work; at most `3600`. Non-positive falls back to `1`. + +Errors when: `count <= 0` (`count must be greater than 0`); `count > 1024` (`count must be at most 1024`); `timeout`/`interval > 3600` (`... must be at most 3600 seconds`); the hostname cannot be resolved (the error names the host); or no round connects (`no successful connections`). A round that fails to connect is counted as a loss, not an error, as long as at least one round succeeds. 
+ +```python +load('net', 'tcping') +s = tcping('127.0.0.1', port=local_port, count=4, interval=0.1) +print(s.total, s.success > 0) +# Output: +# 4 True +``` + +### `httping` + +`httping(url, count=4, timeout=10, interval=1) -> statistics` + +Issues a GET request to `url` `count` times and returns a `statistics` struct of the connect (TCP-establish) times. Redirects are not followed; a status outside `200`–`399` makes the round a failure. + +Takes the same `count`/`timeout`/`interval` parameters and bounds as `tcping`. The client honors the system proxy settings (`HTTP_PROXY`, `HTTPS_PROXY`, `NO_PROXY`), just like `lib/http`. Behind a proxy, the measured connect duration is the TCP connection to the proxy (the first hop), not to the origin server. + +Errors when: `count <= 0`; `count > 1024`; `timeout`/`interval > 3600`; or no round succeeds (`no successful connections`) — this is also what an unresolvable URL or an all-`4xx`/`5xx` target produces. A `3xx` redirect counts as a *success* (the response is used directly rather than followed). + +```python +load('net', 'httping') +s = httping(server_url, count=4, interval=0.1) +print(s.total, s.success > 0) +# Output: +# 4 True +``` -| member | type | description | -|-----------|----------|----------------------------------------------| -| `address` | `string` | the resolved address or URL that was pinged | -| `total` | `int` | rounds attempted | -| `success` | `int` | rounds that succeeded | -| `loss` | `float` | failed percentage (0–100) | -| `min`/`avg`/`max`/`stddev` | `float` | round-trip statistics in milliseconds | +## Notes / boundaries -If no round succeeds, the function reports a `no successful connections` error instead of returning a struct. 
+- **Engine.** DNS uses Go's `net.Resolver` (a custom `dns_server` forces `PreferGo`, sending UDP queries to that server); `tcping` uses `net.Dialer.DialContext` over `tcp`; `httping` uses `net/http` with redirect-following disabled and keep-alives off, tracing the first `ConnectStart`/`ConnectDone` pair. +- **Cancellation.** A builtin cannot be stopped mid-flight, so the ping loop checks the context between rounds and uses an interruptible timer for the inter-round pause; a `RunWithTimeout` deadline aborts with a `context` error rather than running all rounds out. +- **Bounds rationale.** `count` is capped at `1024` and `timeout`/`interval` at `3600` seconds so a script (e.g. `count=10**9`) cannot park the host goroutine indefinitely. +- **Determinism.** Round-trip values depend on the live network and are not reproducible; only `address`, `total`, and (given a stable target) `success` are deterministic. Examples above print only those fields and are grounded in the hermetic test stubs (`local_port`/`server_url` are loopback targets). +- **`statistics` shape.** Statistics are computed only over the successful rounds; `min`/`avg`/`max`/`stddev` ignore losses. `loss` is `(total - success) / total * 100`. diff --git a/lib/path/README.md b/lib/path/README.md index ad2568da..56f0f251 100644 --- a/lib/path/README.md +++ b/lib/path/README.md @@ -1,9 +1,38 @@ # path -`path` provides functions to manipulate directories and file paths. It follows one rule: +`path` manipulates directories and file paths for Starlark. The **lexical** functions follow Python's `posixpath` semantics — pure string work on `/`-separated paths, no implicit cleaning, identical on every OS — while the **filesystem** functions operate on the real, OS-native filesystem. 
-- **Lexical functions** — `join`, `basename`, `dirname`, `normpath`, `split`, `splitext`, `isabs`, `relpath` — use **Python (`posixpath`) semantics**: pure string work on `/`-separated paths, no implicit cleaning, identical results on every OS. -- **Filesystem functions** — `abs`, `exists`, `is_file`, `is_dir`, `is_link`, `listdir`, `getcwd`, `chdir`, `mkdir` — operate on the real, OS-native filesystem. +Capability profile: **FileSystem + Process**. The filesystem functions touch the host disk; `chdir` changes the working directory of the **whole process** (every machine and goroutine in the host, persisting after the script ends), so this is a Process effect too. Never expose this module to untrusted scripts; host runtimes typically exclude `path` from restricted module sets. + +## Functions + +### Lexical (pure string, `posixpath` semantics) + +| function | description | +|---|---| +| `join(*paths) -> string` | Join path elements with `/`; an absolute component resets the result, an empty component adds a trailing separator, no cleaning. | +| `basename(path) -> string` | Final component (everything after the last `/`); empty when `path` ends in `/`. | +| `dirname(path) -> string` | Directory part (everything before the last `/`), trailing slashes stripped unless all-slashes. | +| `normpath(path) -> string` | Collapse redundant separators and `..`/`.` lexically. | +| `split(path) -> (head, tail)` | Split into `(dirname, basename)`. | +| `splitext(path) -> (root, ext)` | Split off the extension at the last dot of the final component. | +| `isabs(path) -> bool` | Whether `path` is absolute (starts with `/`). | +| `relpath(path, start=".") -> string`, `try_relpath(path, start=".") -> (string, error)` | Relative path from `start` to `path`, computed lexically. `try_relpath` returns a `(value, error)` pair instead of aborting. 
| + +### Filesystem (real OS-native disk) + +| function | description | +|---|---| +| `abs(path) -> string`, `try_abs(path) -> (string, error)` | Absolute representation of `path`. `try_abs` returns a `(value, error)` pair instead of aborting. | +| `expanduser(path) -> string` | Replace a leading `~` with the current user's home directory (reads the host environment). | +| `exists(path) -> bool` | Whether `path` exists (symlinks are followed). | +| `is_file(path) -> bool` | Whether `path` exists and is a regular file (symlinks followed). | +| `is_dir(path) -> bool` | Whether `path` exists and is a directory (symlinks followed). | +| `is_link(path) -> bool` | Whether `path` exists and is a symbolic link (not followed). | +| `listdir(path, recursive=False, filter=None) -> list[string]` | List directory contents, optionally recursively and filtered. | +| `getcwd() -> string` | Current working directory of the process. | +| `chdir(path) -> None` | Change the **process-wide** working directory (global side effect). | +| `mkdir(path, mode=0o755) -> None` | Create a directory (and parents); existing directories are not an error. | ## Migrating from v0.1.x @@ -18,312 +47,309 @@ Use `normpath(join(...))` where the old cleaned result is wanted. -## Functions - -### `abs(path) string` - -Returns an absolute representation of path. If the path is not absolute it will be joined with the current working directory to turn it into an absolute path. The absolute path name for a given file is not guaranteed to be unique. +## Lexical functions -#### Parameters +### `join(*paths) -> string` -| name | type | description | -|--------|----------|----------------------------------------------------| -| `path` | `string` | The file path to be converted to its absolute form | +Joins one or more string elements. Components are joined with `/`; an absolute component (one starting with `/`) resets the result; an empty component contributes a trailing separator; no lexical cleaning is applied. 
Requires at least one argument; every argument must be a string. -#### Examples - -**basic** - -Convert a relative path to an absolute path. +Errors on: zero arguments (`got 0 arguments, want at least 1`); a non-string argument (`for parameter path: got int, want string`). ```python -load("path", "abs") -p = abs('.') -print(p) -# Output: '/current/absolute/path' +load("path", "join") +print(join("a", "b", "c")) # 'a/b/c' +print(join("a", "/b", "c")) # '/b/c' (absolute resets) +print(join("a", "b", "")) # 'a/b/' (empty adds trailing /) +print(join("a/b", "../../xyz")) # 'a/b/../../xyz' (no cleaning) +# Output: a/b/c +# /b/c +# a/b/ +# a/b/../../xyz ``` -### `join(path, *paths) string` - -Joins one or more path elements with Python (`posixpath`) semantics: components are joined with `/`, an absolute component resets the result, an empty component contributes a trailing separator, and no cleaning is applied (see the migration notes above). - -#### Parameters +### `basename(path) -> string` -| name | type | description | -|------------|----------|--------------------------------| -| `paths...` | `string` | The path elements to be joined | +Everything after the last `/`. A path ending in `/` has an empty basename (unlike Go's `filepath.Base`). Errors on a non-string argument. -#### Examples +```python +load("path", "basename") +print(basename("a/b/c.txt")) # 'c.txt' +print(basename("a/b/")) # '' +print(basename("/")) # '' +print(basename("plain")) # 'plain' +# Output: c.txt +# +# +# plain +``` -**basic** +### `dirname(path) -> string` -Join multiple path parts. +Everything before the last `/`, with trailing slashes stripped unless the result is all slashes (so `dirname("//a")` keeps `//`). Errors on a non-string argument. 
```python -load("path", "join") -p = join('a', 'b', 'c') -print(p) -# Output: 'a/b/c' +load("path", "dirname") +print(dirname("a/b/c.txt")) # 'a/b' +print(dirname("a/b/")) # 'a/b' +print(dirname("plain")) # '' +print(dirname("/a")) # '/' +print(dirname("//a")) # '//' +# Output: a/b +# a/b +# +# / +# // ``` -### `exists(path) bool` +### `normpath(path) -> string` -Returns true if the path exists. +Collapses redundant separators and up-level references lexically: `a//b`, `a/./b` and `a/c/../b` all become `a/b`. Exactly two leading slashes are preserved (POSIX gives them implementation-defined meaning); three or more collapse to one; an empty path normalizes to `.`. Errors on a non-string argument. -#### Parameters - -| name | type | description | -|--------|----------|------------------------| -| `path` | `string` | The path to be checked | - -#### Examples +```python +load("path", "normpath") +print(normpath("a/c/../b")) # 'a/b' +print(normpath("a/../../b")) # '../b' +print(normpath("//a")) # '//a' +print(normpath("///a")) # '/a' +print(normpath("")) # '.' +# Output: a/b +# ../b +# //a +# /a +# . +``` -**basic** +### `split(path) -> (head, tail)` -Check if a path exists. +Returns the `(dirname, basename)` pair. `split("/a")` is `("/", "a")`; `split("a/b/")` is `("a/b", "")`. Errors on a non-string argument. ```python -load("path", "exists") -p = exists('path_test.go') -print(p) -# Output: True +load("path", "split") +print(split("a/b/c.txt")) # ('a/b', 'c.txt') +print(split("/a")) # ('/', 'a') +print(split("a/b/")) # ('a/b', '') +print(split("plain")) # ('', 'plain') +# Output: ("a/b", "c.txt") +# ("/", "a") +# ("a/b", "") +# ("", "plain") ``` -### `is_file(path) bool` +### `splitext(path) -> (root, ext)` -Returns true if the path exists and is a file. +Splits off the extension — the suffix beginning at the last dot of the final component. Leading dots do not count, so `splitext(".bashrc")` is `(".bashrc", "")`. A dot only in a directory component is ignored. 
Errors on a non-string argument. -#### Parameters - -| name | type | description | -|--------|----------|------------------------| -| `path` | `string` | The path to be checked | - -#### Examples +```python +load("path", "splitext") +print(splitext("a/b.tar.gz")) # ('a/b.tar', '.gz') +print(splitext(".bashrc")) # ('.bashrc', '') +print(splitext("a/.bashrc")) # ('a/.bashrc', '') +print(splitext("a.b/c")) # ('a.b/c', '') +# Output: ("a/b.tar", ".gz") +# (".bashrc", "") +# ("a/.bashrc", "") +# ("a.b/c", "") +``` -**basic** +### `isabs(path) -> bool` -Check if a path is a file. +Reports whether `path` is absolute in the POSIX sense (starts with `/`). The empty string is not absolute. Errors on a non-string argument. ```python -load("path", "is_file") -p = is_file('path_test.go') -print(p) +load("path", "isabs") +print(isabs("/a/b")) # True +print(isabs("a/b")) # False +print(isabs("")) # False # Output: True +# False +# False ``` -### `is_dir(path) bool` +### `relpath(path, start=".") -> string` / `try_relpath(path, start=".") -> (string, error)` + +Returns a relative path from `start` to `path`, computed lexically on the normalized inputs. `start` defaults to `"."` (an empty `start` is also treated as `"."`). -Returns true if the path exists and is a directory. +Errors on: an empty `path` (`no path specified`); mixing an absolute path with a relative `start` (`cannot mix an absolute path with a relative start`) — resolving the mix would silently depend on the process working directory, so call `abs()` first when that is intended; a non-string argument. `try_relpath` returns these as the error half of a `(value, error)` pair rather than aborting. -#### Parameters +```python +load("path", "relpath") +print(relpath("/a/b/c", "/a")) # 'b/c' +print(relpath("/a/b", "/a/b")) # '.' +print(relpath("/a/b", "/a/c/d")) # '../../b' +print(relpath("a/b", "a")) # 'b' +print(relpath("a/b")) # 'a/b' (start defaults to '.') +# Output: b/c +# . 
+# ../../b +# b +# a/b +``` -| name | type | description | -|--------|----------|------------------------| -| `path` | `string` | The path to be checked | +```python +load("path", "try_relpath") +v, err = try_relpath("/a/b/c", "/a") +print(v, err) # 'b/c' None +v2, err2 = try_relpath("/a", "c") +print(v2, "cannot mix" in err2) # None True +# Output: b/c None +# None True +``` -#### Examples +## Filesystem functions -**basic** +### `abs(path) -> string` / `try_abs(path) -> (string, error)` -Check if a path is a directory. +Returns an absolute representation of `path`. A relative path is joined with the current working directory; the result is not guaranteed to be unique. Errors on a missing argument (`missing argument for path`) or a non-string argument. `try_abs` returns a `(value, error)` pair instead of aborting. ```python -load("path", "is_dir") -p = is_dir('.') -print(p) +load("path", "abs") +p = abs("path_test.go") +print(p.endswith("lib/path/path_test.go")) # Output: True ``` -### `is_link(path) bool` - -Returns true if the path exists and is a symbolic link. +```python +load("path", "try_abs") +v, err = try_abs(".") +print(err, len(v) > 0) +# Output: None True +``` -#### Parameters +### `expanduser(path) -> string` -| name | type | description | -|--------|----------|------------------------| -| `path` | `string` | The path to be checked | +Replaces a leading `~` with the current user's home directory. Only the bare `~` or `~/...` form expands; `~user/...` is returned unchanged (matching Python when the user lookup is unavailable), and so is the path when the home directory cannot be determined. Reads the host environment. Errors on a non-string argument. 
-#### Examples +```python +load("path", "expanduser") +print(expanduser("~") != "~") # True (home resolved) +print(expanduser("~/x").endswith("/x")) # True +print(expanduser("~user/x")) # '~user/x' +print(expanduser("plain")) # 'plain' +# Output: True +# True +# ~user/x +# plain +``` -**basic** +### `exists(path) -> bool` -Check if a path is a symbolic link. +True if `path` exists; a symbolic link is followed. The empty string and non-existent paths return `False`. Errors only on a missing or non-string argument. ```python -load("path", "is_link") -p = is_link('link_to_path_test.go') -print(p) -# Output: False +load("path", "exists") +print(exists("path_test.go")) # True +print(exists(".")) # True +print(exists("nope")) # False +# Output: True +# True +# False ``` -### `listdir(path, recursive=False, filter=None) []string` - -Returns a list of directory contents. Optionally applies a filter function to each path to decide inclusion in the final list. - -#### Parameters +### `is_file(path) -> bool` -| name | type | description | -|-------------|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `path` | `string` | The directory path to be listed | -| `recursive` | `bool` | If true, list contents recursively | -| `filter` | `callable` | A callable object (e.g., lambda or function) that takes a single argument (a path) and returns a boolean value. Paths for which the filter function returns `False` are excluded from the result list. | +True if `path` exists and is a regular file (symlinks followed). Directories and non-existent paths return `False`. -#### Examples +```python +load("path", "is_file") +print(is_file("path_test.go")) # True +print(is_file(".")) # False +print(is_file("nope")) # False +# Output: True +# False +# False +``` -**basic** +### `is_dir(path) -> bool` -List directory contents. 
+True if `path` exists and is a directory (symlinks followed). ```python -load("path", "listdir") -p = listdir('.') -print(p) -# Output: ['file1', 'file2', ...] +load("path", "is_dir") +print(is_dir(".")) # True +print(is_dir("path_test.go")) # False +print(is_dir("nope")) # False +# Output: True +# False +# False ``` -**recursive** +### `is_link(path) -> bool` -List directory contents recursively. +True if `path` exists and is a symbolic link. The link itself is inspected (not followed), so plain files, directories, and non-existent paths return `False`. ```python -load("path", "listdir") -p = listdir('.', True) -print(p) -# Output: ['file1', 'file2', 'subdir/file3', ...] +load("path", "is_link") +print(is_link("path_test.go")) # False +print(is_link(".")) # False +# Output: False +# False ``` -**filtered** +### `listdir(path, recursive=False, filter=None) -> list[string]` + +Returns a list of directory contents. Listing a non-directory (e.g. a file) returns an empty list. With `recursive=True` the walk descends into subdirectories. `filter` is a callable taking one path argument and returning a bool; paths for which it returns `False` are excluded. -List directory contents with a filter function. +Errors on: a missing or non-string `path`; a non-existent path (`lstat ...`); an unreadable directory (`open ...`); a `filter` that is neither callable nor `None` (`expected or None, got int`); a filter returning a non-bool (`got int, want bool`); or a filter that itself fails (the inner error propagates). ```python load("path", "listdir") -is_not_go_file = lambda p: not p.endswith('.go') -p = listdir('.', filter=is_not_go_file) -print(p) -# Output: ['file1.py', 'file2.txt', ...] +p = listdir(".") +print("path_test.go" in p) +# Output: True ``` -**filtered_recursive** - -List directory contents recursively with a filter function. 
- ```python load("path", "listdir") -is_not_go_file = lambda p: not p.endswith('.go') -p = listdir('.', True, filter=is_not_go_file) -print(p) -# Output: ['file1.py', 'file2.txt', 'subdir/file3'] +p = listdir(".", filter=lambda x: not x.endswith(".go")) +print("path_test.go" not in p) +# Output: True ``` -### `getcwd() string` +### `getcwd() -> string` -Returns the current working directory. - -#### Examples - -**basic** - -Get the current working directory. +Returns the current working directory of the process. Takes no arguments; passing any is an error (`got 1 arguments, want 0`). ```python load("path", "getcwd") p = getcwd() -print(p) -# Output: '/current/directory' +print(p.endswith("path")) +# Output: True ``` -### `chdir(path)` +### `chdir(path) -> None` Changes the current working directory **of the whole process**. -> ⚠️ The effect is global: it applies to every machine and goroutine in the host, persists after the script ends, and concurrent machines calling it race with each other. Never expose this module to untrusted scripts; host runtimes typically exclude `path` from restricted module sets for this reason. - -#### Parameters - -| name | type | description | -|--------|----------|---------------------------------------| -| `path` | `string` | The path to the new current directory | +> WARNING: the effect is global — it applies to every machine and goroutine in the host, persists after the script ends, and concurrent machines calling it race with each other. Never expose this module to untrusted scripts. -#### Examples - -**basic** - -Change the current working directory. +Errors on a missing or non-string argument, or when the target cannot be entered (a non-existent path or a file both error with `chdir ...`). 
```python -load("path", "chdir") -chdir('/new/directory') -# Current directory is now '/new/directory' +load("path", "chdir", "abs") +a = abs(".") +chdir(".") +b = abs(".") +print(a == b) +# Output: True ``` -### `mkdir(path, mode=0o755)` - -Creates a directory with the given name. If the directory already exists, no error is thrown. It's capable of creating nested directories. - -#### Parameters - -| name | type | description | -|--------|----------|-----------------------------------------------------------------------------------------------------------------------| -| `path` | `string` | The directory path to be created | -| `mode` | `int` | The file mode (permissions) to use for the newly-created directory, represented as an octal number. Defaults to 0755. | +### `mkdir(path, mode=0o755) -> None` -#### Examples +Creates a directory at `path`, creating parent directories as needed (like `mkdir -p`). An already-existing directory is not an error. `mode` is the octal permission bits for newly-created directories (default `0o755`); `path` may be a string or bytes. -**default** - -Create a new directory. - -```python -load("path", "mkdir") -mkdir('new_directory') -# New directory named 'new_directory' is created with default permissions -``` - -**permission** - -Create a new directory with specific permissions. +Errors on a missing argument, or when a path component is an existing non-directory (`not a directory`). ```python load("path", "mkdir") -mkdir('secure_directory', 0o700) -# New directory named 'secure_directory' is created with permissions set to 0700 +mkdir("new_directory") # default 0o755 +mkdir("secure_directory", 0o700) # explicit mode +# Output: ``` -### `basename(path) string` - -Returns the final component of a path — everything after the last `/`; a path ending in `/` has an empty basename (`basename("a/b/")` is `""`, unlike Go's `filepath.Base`). 
- -### `dirname(path) string` - -Returns the directory part of a path — everything before the last `/`, with trailing slashes stripped unless the result is all slashes. - -### `normpath(path) string` - -Collapses redundant separators and up-level references lexically: `a//b`, `a/./b` and `a/c/../b` all become `a/b`. Exactly two leading slashes are preserved (POSIX gives them special meaning); an empty path normalizes to `.`. - -### `split(path) (string, string)` - -Splits a path into a `(head, tail)` pair: `split("/a")` is `("/", "a")`, `split("a/b/")` is `("a/b", "")`. - -### `splitext(path) (string, string)` - -Splits a path into a `(root, extension)` pair: `splitext("a/b.tar.gz")` is `("a/b.tar", ".gz")`; leading dots do not count, so `splitext(".bashrc")` is `(".bashrc", "")`. - -### `isabs(path) bool` - -Reports whether the path is absolute (starts with `/`). - -### `relpath(path, start=".") string` - -Returns a relative path to `path` from `start`, computed lexically on the normalized inputs. Mixing an absolute path with a relative one is an error here — resolving the mix would silently depend on the process working directory; call `abs()` first when that is intended. `try_relpath` returns a `(value, error)` pair instead of aborting. - -### `expanduser(path) string` - -Replaces a leading `~` with the current user's home directory. Only the bare `~`/`~/...` form expands; `~user/...` is returned unchanged. Note that this reads the host environment. - -### `try_abs(path) (string, error)` / `try_relpath(path, start=".") (string, error)` +## Notes / boundaries -The `(value, error)` pair forms of `abs` and `relpath`, never aborting the script — the same shape as the `json`/`csv`/`http` modules' `try_*` functions. 
+- **Lexical vs filesystem.** `join`, `basename`, `dirname`, `normpath`, `split`, `splitext`, `isabs`, and `relpath` are pure string operations on `/`-separated paths and never touch disk — they match CPython's `posixpath` (not Go's `path/filepath`, which cleans eagerly and uses the OS separator). The rest read or mutate the real filesystem. +- **`expanduser` and the filesystem functions are OS-native**, so paths use the host separator and `abs`/`getcwd` return host-absolute paths; examples above that assert OS-specific shapes are skipped on Windows in the test suite. +- **`chdir` is a process-global, persistent side effect** shared across all machines — the Process capability. The test suite saves and restores the working directory around each case for this reason. +- **`try_abs` / `try_relpath`** never abort the script: they return a `(value, error)` tuple with `None` error on success, the same shape as the `json`/`csv`/`http` modules' `try_*` functions. +- All names are lowercase snake_case — underscores appear only in `is_file`, `is_dir`, `is_link`, `try_abs`, and `try_relpath` — so there are no camelCase members to flag. diff --git a/lib/random/README.md b/lib/random/README.md index 48ad4ab6..da70afc3 100644 --- a/lib/random/README.md +++ b/lib/random/README.md @@ -1,254 +1,155 @@ # random -`random` defines functions that generate random values for various distributions, it's intended to be a drop-in subset of [Python's **random** module](https://docs.python.org/3/library/random.html) for Starlark. +`random` generates random values for various distributions — a drop-in subset of [Python's `random` module](https://docs.python.org/3/library/random.html) for Starlark. All randomness is drawn from the OS cryptographic source (`crypto/rand`), so it is suitable for security-sensitive use. **Capability profile: Pure** — no filesystem, network, process, or log side effects. 
## Functions -### `randbytes(n)` +| function | description | +|----------|-------------| +| `randbytes(n=10) -> bytes` | Random byte string of `n` bytes. | +| `randstr(chars, n=10) -> str` | Random string of `n` characters drawn from `chars`. | +| `randb32(n=10, sep=0) -> str` | Random base32 string of `n` characters, optionally dash-separated every `sep` characters. | +| `randint(a, b) -> int` | Random integer `N` with `a <= N <= b`. | +| `random() -> float` | Random float in `[0.0, 1.0)`. | +| `uniform(a, b) -> float` | Random float between `a` and `b`. | +| `choice(seq) -> value` | Random element from the non-empty sequence `seq`. | +| `choices(population, weights=None, cum_weights=None, k=1) -> list` | `k`-sized list chosen from `population` with replacement, optionally weighted. | +| `shuffle(seq) -> None` | Shuffle the mutable sequence `seq` in place. | +| `uuid() -> str` | Random RFC 4122 version 4 UUID string. | -Generate a random byte string containing n number of bytes. +This module exposes no constants and no custom types — every member is a function on the `random` module. -#### Parameters +## Details & examples -| name | type | description | -|------|-------|---------------------------------------------------------------------------| -| `n` | `int` | If n bytes is non-positive or not supplied, a reasonable default is used. | +### `randbytes(n=10) -> bytes` -#### Examples - -**basic** - -Generate a random byte string containing 10 bytes. +Returns a random byte string of length `n`. If `n` is non-positive or omitted, the default length `10` is used. Errors only if the underlying RNG read fails. ```python load("random", "randbytes") b = randbytes(10) -print(b) -# Output: b'K\xaa\xbb4\xbaEh0\x19\x9c' +print(len(b)) +# Output: 10 ``` -### `randstr(chars, n)` - -Generate a random string containing n number of unicode characters from the given unicode string. 
- -#### Parameters - -| name | type | description | -|---------|----------|-----------------------------------------------------------------------------------------------| -| `chars` | `string` | The characters to choose from. | -| `n` | `int` | The length of the string. If n is non-positive or not supplied, a reasonable default is used. | - -#### Examples - -**basic** +### `randstr(chars, n=10) -> str` -Generate a random string containing 10 characters from the given unicode string. +Returns a random string of `n` characters, each drawn uniformly from the characters in `chars`. `chars` is split into Unicode runes, so multi-byte characters are selected as whole code points; the result holds `n` code points, but `len()` counts UTF-8 bytes, so e.g. each Chinese character contributes 3 to the reported length. If `n` is non-positive or omitted, the default length `10` is used. Errors when `chars` is empty (`chars must not be empty`). ```python load("random", "randstr") -s = randstr("abcdefghijklmnopqrstuvwxyz", 10) -print(s) -# Output: "enfknqfbra" +x = randstr("AAA", 10) +print(x) +# Output: AAAAAAAAAA ``` -### `randb32(n, sep)` - -Generate a random base32 string containing n number of bytes with optional separator dash for every sep characters. - -#### Parameters - -| name | type | description | -|-------|-------|---------------------------------------------------------------------------------------------------------------| -| `n` | `int` | The number of bytes to generate. If n is non-positive or not supplied, a reasonable default is used. | -| `sep` | `int` | The number of characters to separate with a dash, if it's non-positive or not supplied, no separator is used. | +### `randb32(n=10, sep=0) -> str` -#### Examples - -**basic** - -Generate a random base32 string containing 10 bytes with no separator. +Returns a random base32 string of `n` characters using the standard RFC 4648 alphabet (`A-Z`, `2-7`). 
If `sep` is positive and smaller than the string length, a dash is inserted every `sep` characters (this adds separator characters to the total length). If `n` is non-positive or omitted, the default length `10` is used; if `sep` is non-positive or omitted, no separator is inserted. ```python load("random", "randb32") -s = randb32(10, 4) -print(s) -# Output: 2RXQ-H45H-WV +x = randb32(20, 5) +print(len(x), x[5], len(x.split("-"))) +# Output: 23 - 4 ``` -### `randint(a,b)` - -Return a random integer N such that a <= N <= b. +### `randint(a, b) -> int` -#### Parameters - -| name | type | description | -|------|-------|-------------------------------| -| `a` | `int` | The lower bound of the range. | -| `b` | `int` | The upper bound of the range. | - -#### Examples - -**basic** - -Return a random integer N such that 0 <= N <= 10. +Returns a random integer `N` such that `a <= N <= b` (inclusive on both ends). Both `a` and `b` must be integers; backed by arbitrary-precision big integers. Errors when `a > b` (`a must be less than or equal to b`). ```python load("random", "randint") -n = randint(0, 10) -print(n) -# Output: 7 +val = randint(1, 1) +print(val) +# Output: 1 ``` -### `random()` - -Return a random floating point number in the range 0.0 <= X < 1.0. - -#### Examples - -**basic** +### `random() -> float` -Return a random floating point number in the range [0.0, 1.0). +Returns a random float in the range `[0.0, 1.0)`. Takes no arguments; passing any argument errors (`random.random: got 1 arguments, want 0`). ```python load("random", "random") -n = random() -print(n) -# Output: 0.7309677873766576 +val = random() +print((0 <= val) and (val < 1)) +# Output: True ``` -### `uniform(a, b)` +### `uniform(a, b) -> float` -Return a random floating point number N such that a <= N <= b for a <= b and b <= N <= a for b < a. -The end-point value b may or may not be included in the range depending on floating-point rounding in the equation a + (b-a) * random(). 
- -#### Parameters - -| name | type | description | -|------|---------|-------------------------------| -| `a` | `float` | The lower bound of the range. | -| `b` | `float` | The upper bound of the range. | - -#### Examples - -**basic** - -Return a random floating point number N such that 5.0 <= N <= 10.0. +Returns a random float `N` between `a` and `b`, computed as `a + (b - a) * random()`. For `a <= b` the range is `a <= N <= b`; for `b < a` it is `b <= N <= a`. The end-point `b` may or may not be included depending on floating-point rounding. Both `a` and `b` accept `int` or `float`. ```python load("random", "uniform") -n = uniform(5, 10) -print(n) -# Output: 7.309677873766576 -``` - -### `uuid()` - -Generate a random UUID (RFC 4122 version 4). - -#### Examples - -**basic** - -Generate a random UUID. - -```python -load("random", "uuid") -u = uuid() -print(u) -# Output: 6e360b7a-f677-4f6c-9c57-8b09694d66b3 +val = uniform(1, 1) +print(val) +# Output: 1.0 ``` -### `choice(seq)` +### `choice(seq) -> value` -Return a random element from the non-empty sequence seq. - -#### Parameters - -| name | type | description | -|-------|--------|-----------------------| -| `seq` | `list` | A non-empty sequence. | - -#### Examples - -**basic** - -Return a random element from the non-empty sequence [1, 2, 3, 4, 5]. +Returns a single random element from the non-empty indexable sequence `seq` (e.g. list, tuple, range). Errors when `seq` is empty (`cannot choose from an empty sequence`) or not indexable. ```python load("random", "choice") -n = choice([1, 2, 3, 4, 5]) -print(n) +val = choice((3, 3, 3, 3, 3)) +print(val) # Output: 3 ``` -### `choices(population, weights=None, cum_weights=None, k=1)` - -Return a k-sized list of elements chosen from the population with replacement. If the population is empty, raises a ValueError. 
- -#### Parameters +### `choices(population, weights=None, cum_weights=None, k=1) -> list` -| name | type | description | -|---------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `population` | `list` | A non-empty sequence to choose from. | -| `weights` | `list` | A sequence of weights corresponding to the population, where weights[i] is the weight of population[i]. If not provided, all weights are considered equal. | -| `cum_weights` | `list` | A sequence of cumulative weights corresponding to the population. If provided, weights should not be provided. | -| `k` | `int` | The size of the result list. If not provided, defaults to 1. If k is non-positive, an empty list is returned. | +Returns a `k`-sized list of elements chosen from `population` **with replacement**. -#### Examples +- `population` — a non-empty indexable sequence. +- `weights` — relative weights per element; if omitted, selection is uniform. +- `cum_weights` — cumulative weights per element; cannot be combined with `weights`. +- `k` — result size (default `1`); if `k <= 0`, an empty list is returned. -**basic** - -Return a list of 2 random elements chosen from the population [1, 2, 3]. +Errors on: empty `population` (`population is empty`); both `weights` and `cum_weights` given (`cannot specify both weights and cumulative weights`); a weight list whose length differs from `population` (`the number of weights does not match the population`); non-numeric weights (`weights must be numeric`); decreasing cumulative weights (`cumulative weights must be non-decreasing`); a non-positive weight total (`total of weights must be greater than zero`); or a non-finite total (`total of weights must be finite`). 
```python load("random", "choices") -result = choices([1, 2, 3], k=2) -print(result) -# Output: [2, 3] +a = choices([1, 2, 3], weights=[0, 1, 0]) +print(a) +# Output: [2] ``` -**with_weights** - -Return a list of 3 random elements chosen from the population [1, 2, 3] with given weights. +A zero weight (or a flat cumulative segment) makes an element unreachable, so deterministic weight vectors yield deterministic results: ```python load("random", "choices") -result = choices([1, 2, 3], weights=[1, 2, 1], k=3) -print(result) -# Output: [2, 3, 1] +a = choices([1, 2, 3, 4, 5], cum_weights=[0, 0, 1, 1, 1]) +print(a) +# Output: [3] ``` -**with_cumulative_weights** +### `shuffle(seq) -> None` -Return a list of 2 random elements chosen from the population [1, 2, 3] with given cumulative weights. +Shuffles the mutable sequence `seq` in place using the Fisher-Yates algorithm and returns `None`. `seq` must support index assignment (a list); tuples and other immutable sequences error (`want starlark.HasSetIndex`), and a frozen list errors (`cannot assign to element of frozen list`). Sequences of length 0 or 1 are left unchanged. ```python -load("random", "choices") -result = choices([1, 2, 3], cum_weights=[1, 3, 4], k=2) -print(result) -# Output: [3, 2] +load("random", "shuffle") +val = [1] +shuffle(val) +print(val) +# Output: [1] ``` -### `shuffle(x)` - -Shuffle the sequence x in place. - -#### Parameters - -| name | type | description | -|------|--------|-----------------------| -| `x` | `list` | A non-empty sequence. | +### `uuid() -> str` -#### Examples - -**basic** - -Shuffle the sequence [1, 2, 3, 4, 5] in place. +Returns a random UUID (RFC 4122 version 4) as a 36-character string (32 hex digits plus 4 dashes). Takes no arguments; passing any argument errors (`random.uuid: got 1 arguments, want 0`). 
```python -load("random", "shuffle") -x = [1, 2, 3, 4, 5] -shuffle(x) -print(x) -# Output: [3, 1, 5, 4, 2] +load("random", "uuid") +val = uuid() +print(len(val), len(val.replace("-", ""))) +# Output: 36 32 ``` + +## Notes / boundaries + +- **Engine.** All values come from `crypto/rand` (the OS CSPRNG); integers use `math/big`, so `randint` is exact for arbitrarily large bounds. There is no seeding API and no `random.seed`/`getrandbits` equivalent — output is non-deterministic by design and cannot be made reproducible. +- **Float precision.** `random()` and `uniform()` quantize to `1/2^53` (53 bits of mantissa), matching CPython's effective precision. +- **Python parity.** This is a subset: `randstr`, `randb32`, and `uuid` are extensions not present in CPython's `random` module (`randbytes` has a CPython counterpart, `random.randbytes`, since Python 3.9); `randint`, `random`, `uniform`, `choice`, `choices`, and `shuffle` mirror their CPython signatures. `choices` returns a `list` (never a tuple). Not provided: `randrange`, `sample`, `seed`, `getstate`/`setstate`, and the distribution helpers (`gauss`, `betavariate`, …). diff --git a/lib/re/README.md b/lib/re/README.md index 9c59a238..0b8e5ec5 100644 --- a/lib/re/README.md +++ b/lib/re/README.md @@ -1,121 +1,138 @@ # re -`re` defines regular expression functions, it's intended to be a subset of [Python's **re** module](https://docs.python.org/3/library/re.html) for Starlark, built on Go's [RE2 syntax](https://golang.org/s/re2syntax). +`re` provides regular-expression functions for Starlark — a small subset of [Python's `re` module](https://docs.python.org/3/library/re.html), built on Go's [RE2 syntax](https://golang.org/s/re2syntax). It is **pure** (no filesystem, network, process, or log side effects). -Notable differences from Python's `re`: - -- **flags must be `0`**: numeric `re.*` flags are not supported and are rejected with an error. Use inline pattern flags like `(?i)`, `(?m)`, `(?s)` instead. 
-- **`sub` replacement templates use Go syntax**: `$1` or `${name}` refers to a capture group and `$$` is a literal dollar sign. Python backslash references like `\1` are **not** interpreted, and a function `repl` is not supported. -- **`split` does not include capture-group text** in its result. -- There is no Match object: `match` returns a list of tuples and `search` returns an index pair (see below). +Capability profile: **Pure** — and **legacy/frozen**: this module is superseded by the `regex` module, which returns Python-shaped `list` results, supports more of the API, and is the one new code should use. `re` is kept only for backward compatibility; no new features are added here. ## Functions -### `compile(pattern, flags=0) Pattern` - -Compile a regular expression pattern into a regular expression object, which -can be used for matching using its match(), search() and other methods. - -#### Parameters - -| name | type | description | -|-----------|----------|--------------------------------------------------------| -| `pattern` | `string` | regular expression pattern string | -| `flags` | `int` | must be 0; use inline flags like `(?i)` in the pattern | - -### `search(pattern, string, flags=0)` - -Scan through string looking for the first location where the regular expression pattern -produces a match, and return the `[start, end]` byte-index pair of that match as a list. -Return None if no position in the string matches the pattern; note that this is different -from finding a zero-length match at some point in the string. 
- -#### Parameters - -| name | type | description | -|-----------|----------|--------------------------------------------------------| -| `pattern` | `string` | regular expression pattern string | -| `string` | `string` | input string to search | -| `flags` | `int` | must be 0; use inline flags like `(?i)` in the pattern | - -### `findall(pattern, text, flags=0)` - -Returns all non-overlapping matches of pattern in string, as a tuple of strings. -The string is scanned left-to-right, and matches are returned in the order found. -If one group is present in the pattern, the group text is returned instead of the -full match; with several groups, each element is a tuple of the group texts. -Empty matches are included in the result. - -#### Parameters - -| name | type | description | -|-----------|----------|--------------------------------------------------------| -| `pattern` | `string` | regular expression pattern string | -| `text` | `string` | string to find within | -| `flags` | `int` | must be 0; use inline flags like `(?i)` in the pattern | - -### `split(pattern, text, maxsplit=0, flags=0)` - -Split text by the occurrences of pattern. If maxsplit is positive, at most maxsplit -splits occur, and the remainder of the string is returned as the final element of the -result; a negative maxsplit means no splits happen at all. Note that unlike Python, -the text of capture groups in the pattern is **not** included in the result. +| function | description | +|----------|-------------| +| `compile(pattern, flags=0) -> regexp` | Compile `pattern` into a reusable `regexp` object exposing `match`/`search`/`findall`/`split`/`sub` methods. | +| `search(pattern, string, flags=0) -> list \| None` | Find the first match anywhere in `string`; return its `[start, end]` byte-index pair, or `None` if there is no match. | +| `match(pattern, string, flags=0) -> list` | Match only at the **beginning** of `string`; return `[(full_match, group1, ...)]` on success, or `[]` on no match. 
| +| `split(pattern, string, maxsplit=0, flags=0) -> tuple` | Split `string` on `pattern`. Capture-group text is **not** included (unlike Python). | +| `findall(pattern, string, flags=0) -> tuple` | Return all non-overlapping matches; full match text with no groups, the single group's text with one group, or a tuple of group texts with several. | +| `sub(pattern, repl, string, count=0, flags=0) -> string` | Replace non-overlapping matches of `pattern` in `string` with the Go-template `repl`. | -#### Parameters +## Constants -| name | type | description | -|------------|----------|------------------------------------------------------------------------| -| `pattern` | `string` | regular expression pattern string | -| `text` | `string` | input string to split | -| `maxsplit` | `int` | maximum number of splits. 0 (default) splits everywhere, negative none | -| `flags` | `int` | must be 0; use inline flags like `(?i)` in the pattern | - -### `sub(pattern, repl, text, count=0, flags=0)` - -Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern -in string by the replacement repl. If the pattern isn't found, string is returned unchanged. -repl must be a string and uses Go's template syntax: `$1` or `${name}` refers to a capture -group and `$$` is a literal dollar sign; Python backslash references like `\1` are **not** -interpreted. - -#### Parameters - -| name | type | description | -|-----------|----------|----------------------------------------------------------------------------| -| `pattern` | `string` | regular expression pattern string | -| `repl` | `string` | replacement template (`$1`/`${name}` for groups, `$$` for a literal `$`) | -| `text` | `string` | input string to replace | -| `count` | `int` | number of replacements. 
0 (default) replaces all matches, negative none | -| `flags` | `int` | must be 0; use inline flags like `(?i)` in the pattern | - -### `match(pattern, string, flags=0)` - -If zero or more characters at the **beginning** of string match the regular expression -pattern, return a list with a single tuple holding the full match followed by the text of -every capture group. Return an empty list if the beginning of the string does not match -the pattern — a match elsewhere in the string does not count; use `search()` or -`findall()` for that. - -#### Parameters - -| name | type | description | -|-----------|----------|--------------------------------------------------------| -| `pattern` | `string` | regular expression pattern string | -| `string` | `string` | input string to match | -| `flags` | `int` | must be 0; use inline flags like `(?i)` in the pattern | +This module exposes no constants. In particular, no `re.*` flag constants exist — the `flags` parameter must be `0` (see Notes). ## Types -### `Pattern` - -**Methods** - -#### `search(text, flags=0)` - -#### `match(text, flags=0)` - -#### `findall(text, flags=0)` - -#### `split(text, maxsplit=0, flags=0)` - -#### `sub(repl, text, count=0, flags=0)` +### `regexp` + +A compiled pattern returned by `compile`. Its `Type()` string is `regexp`. It is immutable (frozen) and hashable. Each method mirrors the module-level function of the same name but drops the leading `pattern` argument. + +| method | signature | description | +|--------|-----------|-------------| +| `search` | `r.search(string, flags=0) -> list \| None` | As `search`, against the compiled pattern. | +| `match` | `r.match(string, flags=0) -> list` | As `match`, anchored at the start. | +| `findall` | `r.findall(string, flags=0) -> tuple` | As `findall`. | +| `split` | `r.split(string, maxsplit=0, flags=0) -> tuple` | As `split`. | +| `sub` | `r.sub(repl, string, count=0, flags=0) -> string` | As `sub`. 
| + +`dir()` on a compiled object lists exactly `["findall", "match", "search", "split", "sub"]`. + +## Details & examples + +### `compile(pattern, flags=0) -> regexp` + +Compile a pattern once and reuse it. Errors if `pattern` is not a string, if it is not valid RE2, or if `flags` is non-zero. + +```python +load('re', 'compile') +foo_r = compile("foo") +print(foo_r.findall("foo bar baz")) +# Output: +# ("foo",) +``` + +### `search(pattern, string, flags=0) -> list | None` + +Scans the whole string for the first match and returns its `[start, end]` byte indices, or `None` when nothing matches. Errors if `pattern`/`string` is not a string, if `pattern` is invalid RE2, or if `flags` is non-zero. Note this returns an index pair, not match text (use `findall` for text). + +```python +load('re', 'compile') +b = compile('b') +print(b.search('abc')) +print(b.search('xyz')) +# Output: +# [1, 2] +# None +``` + +### `match(pattern, string, flags=0) -> list` + +Matches only at the **beginning** of the string (Python `re.match` semantics — a match elsewhere does not count). On success returns a one-element list holding a tuple of the full match followed by each capture group; on failure returns an empty list `[]` (which is falsy, so `if match(...)` works as ported from Python). Same error conditions as `search`. + +```python +load('re', 'match') +print(match('world', 'hello world')) +print(match('hello', 'hello world')) +print(match('(h)(e)', 'hello')) +# Output: +# [] +# [("hello",)] +# [("he", "h", "e")] +``` + +### `split(pattern, string, maxsplit=0, flags=0) -> tuple` + +Splits on `pattern`. `maxsplit=0` (default) splits everywhere; a positive `maxsplit` keeps the remainder as the final element; a negative `maxsplit` performs no split at all. An astronomically large `maxsplit` is treated as "no limit". Capture-group text is **not** inserted into the result (a deliberate difference from Python). Same error conditions as `search`. 
+ +```python +load('re', 'split') +print(split(',', 'a,b,c', 1)) +print(split(',', 'a,b,c', maxsplit=2)) +print(split(',', 'a,b,c', -1)) +# Output: +# ("a", "b,c") +# ("a", "b", "c") +# ("a,b,c",) +``` + +### `findall(pattern, string, flags=0) -> tuple` + +Returns all non-overlapping matches, left to right. With no capture groups each element is the full match text; with one group it is that group's text; with several groups it is a tuple of the group texts. Empty matches are included. Returns an empty tuple when nothing matches. Same error conditions as `search`. + +```python +load('re', 'findall') +print(findall(r'(\w)(\d)', 'a1 b2')) +print(findall(r'\w(\d)', 'a1 b2')) +print(findall('foo', 'bar baz')) +# Output: +# (("a", "1"), ("b", "2")) +# ("1", "2") +# () +``` + +### `sub(pattern, repl, string, count=0, flags=0) -> string` + +Replaces non-overlapping matches with `repl`. `count=0` (default) replaces all; a positive `count` replaces only the first `count`; a negative `count` replaces nothing (returns the input unchanged). `repl` uses **Go template syntax**: `$1` or `${name}` refers to a capture group and `$$` is a literal `$` — Python backslash references like `\1` are **not** interpreted, and a function `repl` is not supported. Errors if any of `pattern`/`repl`/`string` is not a string, if `pattern` is invalid RE2, or if `flags` is non-zero. + +```python +load('re', 'sub') +print(sub('a', 'X', 'aaa', 1)) +print(sub('a', 'X', 'aaa', count=2)) +print(sub('a', 'X', 'aaa', -1)) +print(sub('(a)(b)', '${2}${1}', 'ab ab')) +print(sub('a', '$$5', 'a')) +# Output: +# Xaa +# XXa +# aaa +# ba ba +# $5 +``` + +## Notes / boundaries + +- **Engine:** Go's `regexp` (RE2). RE2 guarantees linear-time matching but has no backreferences or lookaround; patterns needing those fail to compile with an error rather than being silently approximated. +- **`flags` must be `0`.** This module exports no flag constants. Any non-zero `flags` (e.g. 
a ported `re.IGNORECASE == 2`) is rejected with an error — historically it was silently ignored, so the pattern matched with default (flag-less) behavior, which was wrong. Use **inline pattern flags** like `(?i)`, `(?m)`, `(?s)` instead. +- **Return shapes differ from CPython** by design: there is no `Match` object; `match` returns a list of tuples, `search` returns a `[start, end]` index pair, and `findall`/`split` return **tuples**. The successor `regex` module returns Python-shaped lists — prefer it for new code. +- **`split` drops capture-group text**, unlike Python's `re.split`. +- **`sub` templates are Go-style** (`$1`, `${name}`, `$$`), not Python-style (`\1`); `repl` must be a string. +- **Determinism:** matching is deterministic; results depend only on the pattern and input string. +- **Not implemented** from Python's `re`: `fullmatch`, `finditer`, `subn`, `escape`, and numeric flag constants. diff --git a/lib/regex/README.md b/lib/regex/README.md index 150ea3a6..f0cd2efe 100644 --- a/lib/regex/README.md +++ b/lib/regex/README.md @@ -1,69 +1,264 @@ # regex -`regex` provides regular expression functions for Starlark, a subset of [Python's **re** module](https://docs.python.org/3/library/re.html) backed by Go's [RE2 engine](https://golang.org/s/re2syntax). +`regex` provides regular-expression functions for Starlark — a subset of [Python's **re** module](https://docs.python.org/3/library/re.html) backed by Go's [RE2 engine](https://golang.org/s/re2syntax). **Capability profile: Pure** (no filesystem, network, process, or log side effects; deterministic for a given pattern and input). -It is the successor to the legacy `re` module (which is frozen): `regex` adds **Match objects**, named-group extraction, the `IGNORECASE`/`MULTILINE`/`DOTALL` flags, `\1`/`\g<name>` replacement and function replacements, and the full `compile`/`fullmatch`/`finditer`/`subn`/`escape` surface. 
- -**Python re semantics subset (RE2 engine, no lookaround / no backreferences).** RE2 matches in linear time — no catastrophic backtracking / ReDoS — which suits running untrusted or LLM-generated scripts in a sandbox. Where RE2 genuinely differs from Python — lookahead/lookbehind (`(?=...)`, `(?<=...)`) and in-pattern backreferences (`\1`) — the pattern fails to **compile** with a clear error rather than silently misbehaving. +It succeeds the legacy `re` module (which is frozen): `regex` adds `Match` objects, named-group extraction, the `IGNORECASE`/`MULTILINE`/`DOTALL` flags, `\1`/`\g<name>` replacement templates and function replacements, and the full `compile`/`fullmatch`/`finditer`/`subn`/`escape` surface. RE2 matches in linear time — no catastrophic backtracking / ReDoS — which suits running untrusted or LLM-generated scripts in a sandbox. Where RE2 genuinely differs from Python — lookahead/lookbehind (`(?=...)`, `(?<=...)`) and in-pattern backreferences (`\1`) — the pattern fails to **compile** with a clear error rather than silently misbehaving. 
## Functions | function | description | |---|---| -| `compile(pattern, flags=0)` | compile a pattern into a `Pattern` object | -| `search(pattern, string, flags=0)` | first match anywhere → `Match` or `None` | -| `match(pattern, string, flags=0)` | match anchored at the **start** → `Match` or `None` | -| `fullmatch(pattern, string, flags=0)` | match the **whole** string → `Match` or `None` | -| `findall(pattern, string, flags=0)` | all matches as a list (Python group shaping, see below) | -| `finditer(pattern, string, flags=0)` | a tuple of `Match` objects | -| `sub(pattern, repl, string, count=0, flags=0)` | replace matches → string | -| `subn(pattern, repl, string, count=0, flags=0)` | replace → `(string, count)` | -| `split(pattern, string, maxsplit=0, flags=0)` | split into a list, **including capture-group text** (Python semantics) | -| `escape(pattern)` | escape regex metacharacters | - -`try_compile` / `try_search` are also provided, returning a `(value, error)` pair instead of aborting the script — the same shape as the `json`/`csv`/`http` modules. 
+| `compile(pattern, flags=0) -> Pattern` | compile a pattern into a reusable `Pattern` object | +| `search(pattern, string, flags=0) -> Match` | first match anywhere → `Match`, or `None` | +| `match(pattern, string, flags=0) -> Match` | match anchored at the **start** → `Match`, or `None` | +| `fullmatch(pattern, string, flags=0) -> Match` | match the **whole** string → `Match`, or `None` | +| `findall(pattern, string, flags=0) -> list` | all matches as a list (Python group shaping, see below) | +| `finditer(pattern, string, flags=0) -> tuple` | a tuple of `Match` objects | +| `sub(pattern, repl, string, count=0, flags=0) -> str` | replace matches → the result string | +| `subn(pattern, repl, string, count=0, flags=0) -> (str, int)` | replace → `(result, num_replacements)` | +| `split(pattern, string, maxsplit=0, flags=0) -> list` | split into a list, **including capture-group text** (Python semantics) | +| `escape(pattern) -> str` | escape regex metacharacters in `pattern` | +| `try_compile(...) -> (value, error)`, `try_search(...) -> (value, error)` | non-raising variants of `compile` / `search`: return a `(value, error)` pair (error is `None` on success) instead of aborting the script — the same shape as the `json`/`csv`/`http` modules | -**findall shaping** (matches Python): the result is a **list**; no capture group → the full match text; one group → that group's text; two or more groups → a **tuple** of the group texts per match. (`split` likewise returns a list — the same as Python's `re` and starlet's legacy `re` module.) +**findall shaping** (matches Python): the result is always a **list**; no capture group → the full match text; one group → that group's text; two or more groups → a **tuple** of the group texts per match. `split` likewise returns a **list** (same as Python's `re`; starlet's legacy `re` module returns a tuple instead). 
-**sub replacement**: `repl` is either a string template — `\1`/`\g` reference capture groups, `\n`/`\t`/`\r` are the usual escapes, a literal `$` is preserved — or a function called with each `Match`, returning the replacement string. `count` limits the number of replacements (`0` = all, negative = none). +## Constants -## Flags +Integer flag constants (Python `re` values), OR-able with `|` and passed as the `flags` argument; they translate to RE2 inline flags (`(?i)`, `(?m)`, `(?s)`). Each flag has a short and a long spelling that are equal. -`I`/`IGNORECASE`, `M`/`MULTILINE`, `S`/`DOTALL` — integer constants (Python's `re` values) that may be combined with `|` and translate to RE2 inline flags. +| constant | meaning | +|---|---| +| `I` / `IGNORECASE` | case-insensitive matching (`(?i)`, value `2`) | +| `M` / `MULTILINE` | `^`/`$` match at line boundaries (`(?m)`, value `8`) | +| `S` / `DOTALL` | `.` matches newlines too (`(?s)`, value `16`) | -```python -load("regex", "search", "I") -print(search("hello", "HELLO WORLD", I).group(0)) -# Output: "HELLO" -``` +A `flags` value outside the supported set (e.g. `1024`) errors with `unsupported flags value`. ## Types ### `Pattern` -A compiled regular expression. Methods mirror the module functions without the `pattern` argument: `search`, `match`, `fullmatch`, `findall`, `finditer`, `sub`, `subn`, `split`. Attributes: `pattern` (the source), `flags`, `groups` (number of capture groups). +A compiled regular expression — the value returned by `compile` (`type` is `regex.Pattern`). Its methods mirror the module-level functions **without** the leading `pattern` argument. `Pattern` is hashable, so it may be used as a dict key. 
+ +| member | signature | description | +|---|---|---| +| `search` | `p.search(string) -> Match` | first match anywhere → `Match`, or `None` | +| `match` | `p.match(string) -> Match` | match anchored at the start → `Match`, or `None` | +| `fullmatch` | `p.fullmatch(string) -> Match` | match the whole string → `Match`, or `None` | +| `findall` | `p.findall(string) -> list` | all matches (Python group shaping) | +| `finditer` | `p.finditer(string) -> tuple` | a tuple of `Match` objects | +| `split` | `p.split(string, maxsplit=0) -> list` | split, including capture-group text | +| `sub` | `p.sub(repl, string, count=0) -> str` | replace matches → string | +| `subn` | `p.subn(repl, string, count=0) -> (str, int)` | replace → `(result, count)` | +| `pattern` | attribute → `str` | the source pattern string | +| `flags` | attribute → `int` | the flags passed at compile time | +| `groups` | attribute → `int` | number of capture groups | ### `Match` -The result of a successful `search`/`match`/`fullmatch`. +The result of a successful `search` / `match` / `fullmatch` (`type` is `regex.Match`). `Match` objects are **unhashable** — using one as a dict key errors with `unhashable type: regex.Match`. A non-participating optional group reports `None` (or the supplied default). -| member | description | -|---|---| -| `group(n=0, ...)` | group text by index or name; several args → a tuple; group 0 is the whole match | -| `groups(default=None)` | a tuple of all capture groups (`default` for non-participating groups) | -| `groupdict(default=None)` | a dict of named groups to their text | -| `start(n=0)` / `end(n=0)` | start/end byte index of a group | -| `span(n=0)` | `(start, end)` of a group | -| `expand(template)` | apply a `\1`/`\g` template against the match | -| `string` | the subject string that was matched | -| `re` | the `Pattern` that produced the match | +| member | signature | description | +|---|---|---| +| `group` | `m.group(n=0, ...) 
-> str` | group text by index or name; no arg → group 0 (the whole match); several args → a tuple | +| `groups` | `m.groups(default=None) -> tuple` | a tuple of all capture groups; `default` substitutes for non-participating groups | +| `groupdict` | `m.groupdict(default=None) -> dict` | a dict of named groups → their text | +| `start` | `m.start(group=0) -> int` | start byte index of a group (index or name) | +| `end` | `m.end(group=0) -> int` | end byte index of a group | +| `span` | `m.span(group=0) -> (int, int)` | `(start, end)` of a group | +| `expand` | `m.expand(template) -> str` | apply a `\1`/`\g<name>` template against the match | +| `string` | attribute → `str` | the subject string that was matched | +| `re` | attribute → `Pattern` | the `Pattern` that produced the match | + +## Details & examples + +### Matching: `search`, `match`, `fullmatch` + +`search` finds the first match anywhere; `match` anchors at the start of the string; `fullmatch` requires the whole string to match. Each returns a `Match` on success or `None` on no match. All three error with `cannot compile pattern` if `pattern` is invalid or uses an RE2-unsupported construct (lookaround, backreferences). + +```python +load("regex", "search", "match", "fullmatch") +m = search(r'(\w+)@(\w+)', 'reach ann@host now') +print(m.group(0), m.group(1), m.group(2), m.span(0)) +print(match('world', 'hello world')) +print(fullmatch('a+', 'aaa').group(0)) +# Output: +# ann@host ann host (6, 14) +# None +# aaa +``` + +### Named groups + +Groups can be addressed by name (`(?P<name>...)`) as well as index, in `group`, `start`, `end`, and `span`; `groupdict` returns just the named groups. + +```python +load("regex", "search") +m = search(r'(?P<user>\w+)@(?P<host>\w+)', 'ann@example') +print(m.group('user'), m.group('host')) +print(m.groupdict()) +print(m.group('user', 'host')) +# Output: +# ann example +# {"user": "ann", "host": "example"} +# ("ann", "example") +``` -A non-participating optional group reports `None`. 
`Match` objects are unhashable (they cannot be used as dict keys). +`m.group` / `m.start` / `m.span` error with `no such group` for an out-of-range index or unknown name; passing a non-int/non-string selector errors with `group index must be an int or string`. ```python load("regex", "search") -m = search(r'(?P<user>\w+)@(?P<host>\w+)', 'ann@example.com') -print(m.group('user'), m.group('host')) # ann example -print(m.groupdict()) # {"user": "ann", "host": "example"} -print(m.span(0)) # (0, 11) +m = search(r'(a)(b)?', 'a') +print(m.groups()) +print(m.groups('X')) +# Output: +# ("a", None) +# ("a", "X") +``` + +### `findall` and `finditer` + +`findall` returns a list shaped by capture-group count (see *findall shaping* above); `finditer` returns a tuple of `Match` objects. Both error with `cannot compile pattern` on a bad pattern. + +```python +load("regex", "findall", "finditer") +print(findall(r'\d+', 'a1 b22 c333')) +print(findall(r'(\w)(\d)', 'a1 b2')) +print(findall('z', 'abc')) +ms = finditer(r'\d+', 'a1 b22') +print([m.group(0) for m in ms], [m.span(0) for m in ms]) +# Output: +# ["1", "22", "333"] +# [("a", "1"), ("b", "2")] +# [] +# ["1", "22"] [(1, 2), (4, 6)] ``` + +### `sub` and `subn` + +`repl` is either a string template or a function called with each `Match`. In a string template, `\1`/`\g<name>` reference capture groups, `\n`/`\t`/`\r`/`\\` are the usual escapes, an unrecognized `\x` or unclosed `\g<` is left literal, and a literal `$` is preserved. A function `repl` is called with the `Match` and must return a string. `count` limits the number of replacements: `0` = all, a positive `n` = at most `n`, a negative value = none. `subn` additionally returns the replacement count. + +`sub`/`subn` error with `cannot compile pattern` on a bad pattern, `repl must be a string or a function` for a wrong `repl` type, and `repl function must return a string` if a function `repl` returns a non-string. 
+ +```python +load("regex", "sub", "subn") +print(sub(r'(\w+)@(\w+)', r'\2.\1', 'ann@host')) +print(sub(r'(?P<d>\d)', r'[\g<d>]', 'a1b2')) +print(sub('a', 'X', 'aaa', 2)) +print(sub('a', 'X', 'aaa', -1)) +print(subn('a', 'X', 'aaa')) +# Output: +# host.ann +# a[1]b[2] +# XXa +# aaa +# ("XXX", 3) +``` + +```python +load("regex", "sub") +def up(m): + return m.group(0).upper() +print(sub(r'[a-z]+', up, 'aa bb')) +# Output: AA BB +``` + +### `split` + +Splits `string` on matches of `pattern`, returning a list. The text of capture groups is included between the pieces (Python semantics). A non-positive `maxsplit` means no limit; otherwise at most `maxsplit` splits are made. Errors with `cannot compile pattern` on a bad pattern. + +```python +load("regex", "split") +print(split(r'\s+', 'a b c')) +print(split(r'(\s+)', 'a b')) +print(split(',', 'a,b,c', 1)) +# Output: +# ["a", "b", "c"] +# ["a", " ", "b"] +# ["a", "b,c"] +``` + +### `escape` + +Escapes all regex metacharacters in `pattern` so the result matches the input literally. + +```python +load("regex", "escape", "search") +p = escape('a.b*c') +print(search(p, 'a.b*c').group(0)) +print(search(p, 'axbyc')) +# Output: +# a.b*c +# None +``` + +### Flags + +```python +load("regex", "search", "findall", "I", "M", "S") +print(search('hello', 'HELLO', I).group(0)) +print(findall('^x', 'x\nx\ny', M)) +print(search('a.b', 'a\nb', S).group(0)) +print(search('a.b', 'a\nb')) +# Output: +# HELLO +# ["x", "x"] +# a +# b +# None +``` + +### `compile` and `Pattern` + +`compile` returns a reusable `Pattern`; its methods drop the leading `pattern` argument. `expand` applies a replacement template against an existing `Match`. `compile` errors with `cannot compile pattern` on an invalid or RE2-unsupported pattern. 
+ +```python +load("regex", "compile") +p = compile(r'(?P<n>\d+)') +print(p.pattern, p.groups) +print(p.search('x42').group('n')) +print(p.findall('1 2 3')) +print(p.sub('#', 'a1b2')) +print(p.match('x5')) +# Output: +# (?P<n>\d+) 1 +# 42 +# ["1", "2", "3"] +# a#b# +# None +``` + +```python +load("regex", "search") +m = search(r'(\w+) (\w+)', 'hello world') +print(m.expand(r'\2 \1')) +print(m.string, m.re.pattern) +# Output: +# world hello +# hello world (\w+) (\w+) +``` + +### `try_compile` and `try_search` + +The `try_*` variants return a `(value, error)` tuple with a `None` error on success, instead of aborting the script. On a compile failure the value is `None` and the error is a string containing `cannot compile`. `try_search` returns `(None, None)` when the pattern is valid but does not match. + +```python +load("regex", "try_compile", "try_search") +p, err = try_compile('a+') +print(err, p != None) +bad, err2 = try_compile('(') +print(bad, 'cannot compile' in err2) +res, err3 = try_search('z', 'abc') +print(res, err3) +# Output: +# None True +# None True +# None None +``` + +## Notes / boundaries + +- **Engine: Go RE2** — linear-time matching, no catastrophic backtracking / ReDoS. RE2-unsupported Python constructs (lookahead/lookbehind, in-pattern backreferences) **fail to compile** with `cannot compile pattern`; they are never silently approximated. +- **Indices are byte offsets**, not rune offsets — `start`/`end`/`span` report positions into the UTF-8 bytes of the subject string. +- **Determinism & purity** — no host effects; the same pattern and input always yield the same result. Suitable for sandboxed/untrusted scripts. +- **Python-parity carve-outs** — `findall` and `split` return **lists** (not tuples), matching CPython; `Match` is unhashable like CPython, while `Pattern` is hashable here so it can serve as a dict key. A negative `count` to `sub`/`subn` replaces nothing. 
diff --git a/lib/runtime/README.md b/lib/runtime/README.md index 97e7a84e..d4ec92a3 100644 --- a/lib/runtime/README.md +++ b/lib/runtime/README.md @@ -1,33 +1,47 @@ # runtime -`runtime` is a Starlark module provides Go and app runtime information. +`runtime` is a Starlark module that exposes Go and application runtime information — host, paths, OS/architecture, Go version, process IDs, app start time/uptime, and read/write access to environment variables. + +Capability profile: **Process**. The module reports host/process facts captured at load time and can read and mutate the process's environment (`getenv` / `putenv` / `setenv` / `unsetenv`); it does not touch the filesystem or network. + +## Functions + +| function | description | +| --- | --- | +| `uptime() -> time.duration` | Time elapsed since the application started. | +| `getenv(key, default=None) -> string` | Value of environment variable `key`, or `default` if it is not set. | +| `putenv(key, value) -> None` | Set environment variable `key` to `value` (coerced to a string). `setenv` is an alias. | +| `setenv(key, value) -> None` | Alias of `putenv`. | +| `unsetenv(key) -> None` | Unset a single environment variable. | ## Constants -- `hostname`: A string representing the hostname of the system where the script is being executed. -- `workdir`: A string representing the current working directory of the process. -- `homedir`: A string representing the home directory of the user running the process, it's `$HOME` on Unix/Linux, `%USERPROFILE%` on Windows. -- `tempdir`: A string representing the default directory to use for temporary files. This is similar to Python's `tempfile.gettempdir()`. -- `os`: A string representing the operating system of the runtime. This value comes from Go's `runtime.GOOS`. -- `arch`: A string representing the architecture of the machine. This value is derived from Go's `runtime.GOARCH`. -- `gover`: A string representing the Go runtime version. 
This is obtained using `runtime.Version()` from the Go standard library. -- `pid`: An integer representing the process ID of the current process. -- `ppid`: An integer representing the parent process ID of the current process. -- `uid`: An integer representing the user ID of the process owner. -- `gid`: An integer representing the group ID of the process owner. -- `app_start`: A time value representing the moment when the application started. This is used to calculate uptime. +Constants are captured once when the module is first loaded. -## Functions +| constant | meaning | +| --- | --- | +| `hostname` | `string` — the host name of the system (Go `os.Hostname()`). | +| `workdir` | `string` — the current working directory of the process (Go `os.Getwd()`). | +| `homedir` | `string` — the home directory of the user: `$HOME` on Unix/Linux, `%USERPROFILE%` on Windows (Go `os.UserHomeDir()`). | +| `tempdir` | `string` — the default directory for temporary files, like Python's `tempfile.gettempdir()` (Go `os.TempDir()`). | +| `os` | `string` — the operating system, from Go `runtime.GOOS` (e.g. `linux`, `darwin`, `windows`). | +| `arch` | `string` — the machine architecture, from Go `runtime.GOARCH` (e.g. `amd64`, `arm64`). | +| `gover` | `string` — the Go runtime version, from Go `runtime.Version()` (e.g. `go1.19`). | +| `pid` | `int` — the process ID of the current process. | +| `ppid` | `int` — the parent process ID. | +| `uid` | `int` — the user ID of the process owner. | +| `gid` | `int` — the group ID of the process owner. | +| `app_start` | `time.time` — the moment the application started; used to compute `uptime`. | -### `uptime()` +## Details & examples -Returns the uptime of the current process in `time.duration`. +### `uptime` -#### Examples +`uptime() -> time.duration` -**basic** +Returns the elapsed time since the application started (relative to `app_start`) as a `time.duration`. 
Takes no arguments; passing any argument errors with `runtime.uptime: got 1 arguments, want 0`. -Returns the uptime of the current process immediately. +The exact value depends on how long the process has been running, so the output below is illustrative. ```python load("runtime", "uptime") @@ -35,64 +49,62 @@ print(uptime()) # Output: 883.583µs ``` -### `getenv(key, default=None)` +### `getenv` -Returns the value of the environment variable key as a string if it exists, or default if it doesn't. +`getenv(key, default=None) -> string` -#### Examples - -**basic** - -Returns the value of the environment variable PATH if it exists, or None if it doesn't. +Returns the value of environment variable `key` as a `string` if it is set, otherwise returns `default` (which defaults to `None` and may be any value). `key` must be a string; a missing or non-string `key` errors (`runtime.getenv: missing argument for key`, `runtime.getenv: for parameter key: got int, want string`). ```python load("runtime", "getenv") -print(getenv("PATH")) -# Output: /usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin +x = getenv("very-long-long-non-existent") +print(x) +y = getenv("very-long-long-non-existent", 1000) +print(y) +# Output: None +# 1000 ``` -### `putenv(key, value)` - -Sets the value of the environment variable named by the key, returning an error if any. - -#### Examples +### `putenv` / `setenv` -**basic** +`putenv(key, value) -> None` — `setenv(key, value) -> None` is an identical alias. -Sets the environment variable `STARLET_TEST` to the value `123456`. +Sets environment variable `key` to `value`. `value` is coerced to a string before being stored, so non-string values become their string form (e.g. the int `123456` is stored as `"123456"`). Returns `None`. `key` must be a string and both arguments are required; otherwise it errors (`runtime.putenv: missing argument for key`, `runtime.putenv: missing argument for value`, `runtime.putenv: for parameter key: got int, want string`). 
```python -load("runtime", "putenv") +load("runtime", "putenv", "getenv") putenv("STARLET_TEST", 123456) +print(getenv("STARLET_TEST")) +# Output: 123456 ``` -### `setenv(key, value)` - -Sets the value of the environment variable named by the key, returning an error if any. -Alias of `putenv`. - -#### Examples - -**basic** - -Sets the environment variable `STARLET_TEST` to the value `ABC`. +`setenv` behaves the same way: ```python -load("runtime", "setenv") -setenv("STARLET_TEST", "ABC") +load("runtime", "setenv", "getenv") +setenv("STARLET_TEST", 123456) +print(getenv("STARLET_TEST")) +# Output: 123456 ``` -### `unsetenv(key)` - -Unsets a single environment variable. +### `unsetenv` -#### Examples +`unsetenv(key) -> None` -**basic** - -Unsets the environment variable STARLET_TEST. +Unsets a single environment variable. Returns `None`. Unsetting a variable that does not exist is a no-op (not an error). `key` must be a string and is required; otherwise it errors (`runtime.unsetenv: missing argument for key`, `runtime.unsetenv: for parameter key: got int, want string`). ```python -load("runtime", "unsetenv") +load("runtime", "putenv", "unsetenv", "getenv") +putenv("STARLET_TEST", 123456) unsetenv("STARLET_TEST") +print(getenv("STARLET_TEST")) +# Output: None ``` + +## Notes / boundaries + +- **Capture timing.** The constants (`hostname`, `workdir`, `homedir`, `tempdir`, `os`, `arch`, `gover`, `pid`, `ppid`, `uid`, `gid`, `app_start`) are read when the module is first loaded and do not refresh afterwards. `uptime` is computed live at call time against `app_start`. +- **No custom types.** All members are native Starlark values: strings, ints, a `time.time` (`app_start`), and a `time.duration` returned by `uptime`. The `time.*` values come from `go.starlark.net/lib/time`. +- **Environment writes are global.** `putenv`/`setenv`/`unsetenv` mutate the host process environment, not a sandboxed copy; effects are visible to the rest of the process and to child processes. 
+- **Platform differences.** `os`, `arch`, `gover`, `homedir`, and the numeric IDs reflect the underlying platform; on Windows, `uid`/`gid` follow Go's `os.Getuid()`/`os.Getgid()` semantics (which may be `-1`). +- All exported names are snake_case. diff --git a/lib/serial/README.md b/lib/serial/README.md index ffd6b3c5..7e8d4a7c 100644 --- a/lib/serial/README.md +++ b/lib/serial/README.md @@ -2,6 +2,19 @@ `serial` serializes Starlark **data values** to and from a compact JSON envelope, round-tripping the types plain JSON cannot: `bytes`, `set`, `tuple`, arbitrary-precision `int`, `time`, and dicts with **non-string keys**. It is the persistence companion to the `json` module — where `json.encode`/`decode` speak the JSON subset, `serial.dumps`/`loads` preserve the full Starlark data shape so a value written by one script reads back identically in another (caches, content-addressed keys, cross-run state). +**Capability profile: Pure.** No filesystem, network, process, or logging side effects — it only transforms a value to a string and back. + +## Functions + +| function | description | +|---|---| +| `dumps(value) -> string` | serialize a data value to a JSON-envelope string (deterministic; usable as a cache key) | +| `loads(s) -> value` | reconstruct the value from a `dumps` string; returns a fresh, unfrozen value | +| `try_dumps(value) -> tuple` | `dumps` variant returning `(result, error)` instead of aborting | +| `try_loads(s) -> tuple` | `loads` variant returning `(result, error)` instead of aborting | + +The `try_*` variants never abort the script: on success they return `(result, None)`; on failure they return `(None, message)` where `message` is the error string. The non-`try` `dumps`/`loads` raise the same error instead. + ## The contract: lossless, or a clear error — never silently lossy A value either round-trips **losslessly** or `dumps` fails with an **actionable error**. 
There is no quietly-lossy middle ground, because flattening an object would drop its type identity, methods, or live host binding without telling you. @@ -20,32 +33,29 @@ A value either round-trips **losslessly** or `dumps` fails with an **actionable This directly answers "will serializing a complex object lose information?" — serial won't let you serialize an object at all; it tells you to flatten it first, so nothing is dropped behind your back. -## Functions - -| function | description | -|---|---| -| `dumps(value) string` | serialize a data value to a JSON-envelope string (deterministic; usable as a cache key) | -| `loads(s) value` | reconstruct the value from a `dumps` string | -| `try_dumps(value) tuple` | `dumps` variant returning `(result, error)` instead of aborting | -| `try_loads(s) tuple` | `loads` variant returning `(result, error)` | +## Encoding -`loads` returns a fresh, **unfrozen** value (the same as `json.decode`), so scripts can read or mutate the result. +Non-JSON-native values are wrapped in a `{"$t": <tag>, "v": <payload>}` envelope. The tags are: -## Encoding +| tag | for | payload | +|---|---|---| +| `bytes` | `bytes` | base64 string | +| `bigint` | `int` outside int64 | decimal string | +| `tuple` | `tuple` | JSON array of encoded elements | +| `set` | `set` | JSON array, sorted by encoded form | +| `time` | `time` | RFC 3339 (nanosecond) string | +| `mapkv` | dict with non-string keys | array of `[key, value]` pairs, sorted by encoded key | +| `object` | a real dict that itself contains a `"$t"` key | the dict, so it is never mistaken for an envelope on the way back | -Non-JSON-native values are wrapped in a `{"$t": <tag>, "v": <payload>}` envelope: `bytes` (base64), `set`, `tuple`, `bigint` (decimal string), `time` (RFC 3339), and `mapkv` for a dict with non-string keys (a list of `[key, value]` pairs). A real dict that itself contains a `"$t"` key is wrapped in an `object` envelope so it is never mistaken for a tagged value on the way back. 
Output is **deterministic**: object keys are sorted and set elements are ordered by their encoded form, so the same value always dumps to the same bytes. +An int that fits `int64` is a plain JSON number; an all-string-keyed dict is a plain JSON object. Output is **deterministic**: object keys are sorted (by `json.Marshal`), and set elements and `mapkv` pairs are ordered by their encoded bytes, so the same value always dumps to the same string — safe to use directly as a cache key. -## Examples +## Details & examples -**Round-trip the types JSON drops** +### `dumps(value) -> string` -```python -load('serial', 'dumps', 'loads') -v = {'id': 2**80, 'tags': set(['a', 'b']), 'raw': b'\x00\x01', 'pair': (1, 2)} -assert loads(dumps(v)) == v -``` +Serialize `value` to a JSON-envelope string. Walks the value directly (never via re-parse, which would collapse type information). Returns the deterministic JSON text. -**Deterministic, usable as a cache key** +**Errors on**: a function/lambda/builtin (`cannot serialize function: it is code …`), a `struct` (`convert it to a dict first`), a host Go object (`serial round-trips data, not host objects`), a non-finite float (`cannot serialize non-finite float …`), or a reference cycle (`cannot serialize a value that refers to itself (cycle …)`). The error propagates from any depth — an unserializable element inside a list, tuple, set, dict value, or non-string dict key fails the whole `dumps`. ```python load('serial', 'dumps') @@ -53,11 +63,54 @@ print(dumps({'b': 2, 'a': 1})) # Output: {"a":1,"b":2} ``` -**Handle failure without aborting** +### `loads(s) -> value` + +Reconstruct the value from a `dumps` string, interpreting the type tags. The result is a fresh, **unfrozen** value (the same as `json.decode`), so scripts can read or mutate it. 
+ +**Errors on**: invalid JSON (`serial.loads: …`), an unknown type tag (`unknown type tag "…"`), or a malformed envelope payload (`invalid bytes payload`, `invalid bigint payload`, `invalid time payload`, `invalid mapkv entry`, `invalid object payload`). A `set` with an unhashable element or a `mapkv` with an unhashable key errors (`unhashable`). Bare JSON numbers decode without a tag: an integer (any precision) to `int`, a number with `.`/`e`/`E` to `float`. ```python -load('serial', 'try_dumps') -out, err = try_dumps(lambda x: x) +load('serial', 'loads') +print(loads('{"a":1,"b":[2,3]}')) +# Output: {"a": 1, "b": [2, 3]} +``` + +### Round-trip the types JSON drops + +`loads(dumps(x))` reproduces `x` exactly, preserving the type — a `tuple` stays a `tuple`, `bytes` stay `bytes`, a `set` stays a `set`, and a big integer keeps full precision. + +```python +load('serial', 'dumps', 'loads') +def rt(x): return loads(dumps(x)) +v = {'id': 1267650600228229401496703205376, 'tags': set(['a', 'b']), 'raw': b'abc', 'pair': (1, 2), 'm': {1: 'x'}} +print(rt(v) == v, type(rt((1, 2))), type(rt(b'x'))) +# Output: True tuple bytes +``` + +### `try_dumps(value) -> tuple` / `try_loads(s) -> tuple` + +Same as `dumps`/`loads` but report failure as a `(result, error)` tuple instead of aborting the script. On success `error` is `None`; on failure `result` is `None` and `error` is the message string. 
+ +```python +load('serial', 'try_dumps', 'try_loads') +out, err = try_dumps(42) print(out, err) -# Output: None serial.dumps: cannot serialize function: it is code — store the .star script and load() it instead +# Output: 42 None +``` + +```python +load('serial', 'try_loads') +val, e1 = try_loads('[1,2,3]') +bad, e2 = try_loads('not json') +print(val, e1, bad, e2 != None) +# Output: [1, 2, 3] None None True ``` + +## Notes / boundaries + +- **Engine** — encoding/decoding uses the Go standard `encoding/json`; decoding preserves number precision via `json.Number` (`UseNumber`), so large integers do not lose precision through a float round-trip. +- **Determinism** — output is byte-stable for a given value: object keys sorted, set elements and `mapkv` pairs ordered by encoded form. This is what makes a `dumps` string usable as a cache key. +- **`$t` reserved key** — `"$t"` is the envelope discriminator. A real string-keyed dict that contains a `"$t"` key is automatically wrapped in an `object` envelope and unwrapped on `loads`, so such dicts still round-trip; you do not need to avoid the key. +- **Difference from `json`** — `json.encode`/`decode` cover only JSON-native types and silently lose `tuple`/`set`/`bytes`/big-int/`time`/non-string keys; `serial` preserves all of them and instead refuses (with an actionable error) the things that genuinely cannot be persisted as data (code, structs, host objects, non-finite floats, cycles). +- **`module name`** — load as `load('serial', 'dumps', 'loads')`; the module constant `ModuleName` is `"serial"`. +- All exported names are snake_case; no non-conforming identifiers. diff --git a/lib/stats/README.md b/lib/stats/README.md index a1075d82..2a0e60f5 100644 --- a/lib/stats/README.md +++ b/lib/stats/README.md @@ -1,603 +1,192 @@ # stats -`stats` provides a Starlark module for comprehensive statistics functions. 
It is a wrapper around the Go package: https://github.com/montanaflynn/stats +`stats` provides a comprehensive set of statistics functions for Starlark, a thin wrapper around the Go package [`github.com/montanaflynn/stats`](https://github.com/montanaflynn/stats). It is **pure** (no filesystem, network, process, or log side effects) — every function is a deterministic computation over its arguments, with the sole exception of `sample`, which draws on a random number generator. -## Functions - -### `euclidean_distance(data1, data2) float` - -Calculates the straight line distance between two points in Euclidean space. - -#### Parameters - -| name | type | description | -|---------|--------|-------------------------------------| -| `data1` | `list` | First dataset of numerical values. | -| `data2` | `list` | Second dataset of numerical values. | - -#### Examples - -**basic** - -Calculate Euclidean distance between two points. - -```python -load("stats", "euclidean_distance") -print(euclidean_distance([3, 4], [0, 0])) # Output: 5.0 -``` - -### `manhattan_distance(data1, data2) float` - -Computes the sum of the absolute differences of their coordinates. - -#### Parameters - -| name | type | description | -|---------|--------|-------------------------------------| -| `data1` | `list` | First dataset of numerical values. | -| `data2` | `list` | Second dataset of numerical values. | - -#### Examples - -**basic** - -Calculate Manhattan distance between two points. - -```python -load("stats", "manhattan_distance") -print(manhattan_distance([3, 4], [0, 0])) # Output: 7.0 -``` - -### `softmax(data) list` - -Applies the Softmax function, useful for converting scores to probabilities. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Apply Softmax function to a dataset. 
- -```python -load("stats", "softmax") -print(softmax([1, 1, 1])) # Output: [0.3333333333333333, 0.3333333333333333, 0.3333333333333333] -``` - -### `sigmoid(data) list` - -Applies the Sigmoid function, often used for binary classification. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Apply Sigmoid function to a dataset. - -```python -load("stats", "sigmoid") -print(sigmoid([0, 2, 4])) # Output: [0.5, 0.8807970779778823, 0.9820137900379085] -``` - -### `mode(data) list` - -Determines the most frequently occurring data points in a dataset. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate mode of a dataset. - -```python -load("stats", "mode") -print(mode([1, 2, 2, 3, 4])) # Output: [2.0] -``` - -### `sum(data) float` - -Computes the sum of a series of numbers. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate sum of a dataset. - -```python -load("stats", "sum") -print(sum([1, 2, 3, 4])) # Output: 10.0 -``` - -### `max(data) float` - -Finds the maximum value in a dataset. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Find maximum value in a dataset. - -```python -load("stats", "max") -print(max([1, 2, 3, 4])) # Output: 4.0 -``` - -### `min(data) float` - -Finds the minimum value in a dataset. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. 
| - -#### Examples - -**basic** - -Find minimum value in a dataset. - -```python -load("stats", "min") -print(min([1, 2, 3, 4])) # Output: 1.0 -``` - -### `midrange(data) float` - -Calculates the midrange, the average of the maximum and minimum values. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate midrange of a dataset. - -```python -load("stats", "midrange") -print(midrange([1, 2, 3, 4])) # Output: 2.5 -``` - -### `average(data) float` - -Alias for mean. Calculates the arithmetic mean of a dataset. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate average of a dataset. - -```python -load("stats", "average") -print(average([1, 2, 2, 3])) # Output: 2.0 -``` - -### `mean(data) float` - -Calculates the arithmetic mean of a dataset. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate mean of a dataset. - -```python -load("stats", "mean") -print(mean([1, 2, 3, 4])) # Output: 2.5 -``` - -### `geometric_mean(data) float` - -Computes the geometric mean, useful for datasets with exponential growth. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate geometric mean of a dataset. +Every data argument accepts any Starlark **iterable of `int` or `float`** (a `list`, `tuple`, …); ints are promoted to floats. Scalar results are returned as `float`, vector results as a `list` of `float`. 
-```python -load("stats", "geometric_mean") -print(geometric_mean([1, 2, 3, 4])) # Output: 2.213363839400643 -``` - -### `harmonic_mean(data) float` - -Computes the harmonic mean, effective for rates and ratios. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate harmonic mean of a dataset. - -```python -load("stats", "harmonic_mean") -print(harmonic_mean([1, 2, 3, 6])) # Output: 2.0 -``` - -### `trimean(data) float` - -Calculates the trimean, a measure of a dataset's tendency. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate trimean of a dataset. - -```python -load("stats", "trimean") -print(trimean([1, 2, 3, 4, 5])) # Output: 3.0 -``` - -### `median(data) float` - -Finds the middle value of a dataset. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate median of a dataset. - -```python -load("stats", "median") -print(median([1, 2, 3, 4, 5])) # Output: 3.0 -``` - -### `percentile(data, p) float` - -Determines the value below which a given percentage of observations in a group of observations falls. - -#### Parameters - -| name | type | description | -|--------|---------|------------------------------| -| `data` | `list` | Dataset of numerical values. | -| `p` | `float` | The percentile to compute. | - -#### Examples - -**basic** - -Calculate 50th percentile of a dataset. 
- -```python -load("stats", "percentile") -print(percentile([1, 2, 3, 4, 5], 50)) # Output: 2.5 -``` - -### `percentile_nearest_rank(data, p) float` - -Determines the value below which a given percentage of observations in a group of observations falls using the nearest rank method. - -#### Parameters - -| name | type | description | -|--------|---------|------------------------------| -| `data` | `list` | Dataset of numerical values. | -| `p` | `float` | The percentile to compute. | - -#### Examples - -**basic** - -Calculate 50th percentile using nearest rank method. - -```python -load("stats", "percentile_nearest_rank") -print(percentile_nearest_rank([1, 2, 3, 4, 5], 50)) # Output: 3.0 -``` - -### `variance(data) float` - -Calculates the variance of a dataset, a measure of dispersion. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate variance of a dataset. - -```python -load("stats", "variance") -print(variance([1, 2, 3, 4, 5])) # Output: 2.0 -``` - -### `covariance(data1, data2) float` - -Measures how changes in one variable are associated with changes in another variable. - -#### Parameters - -| name | type | description | -|---------|--------|-------------------------------------| -| `data1` | `list` | First dataset of numerical values. | -| `data2` | `list` | Second dataset of numerical values. | - -#### Examples - -**basic** - -Calculate covariance between two datasets. - -```python -load("stats", "covariance") -print(covariance([1, 2, 3], [4, 5, 6])) # Output: 1.0 -``` - -### `covariance_population(data1, data2) float` - -Calculates the covariance for an entire population. - -#### Parameters - -| name | type | description | -|---------|--------|-------------------------------------| -| `data1` | `list` | First dataset of numerical values. | -| `data2` | `list` | Second dataset of numerical values. 
| - -#### Examples - -**basic** - -Calculate population covariance between two datasets. +## Functions -```python -load("stats", "covariance_population") -print(covariance_population([1, 2, 3], [4, 5, 6])) # Output: 1.0 -``` +Each function takes one or two data arguments (an iterable of numbers) unless noted. `data`, `data1`, `data2` are iterables of `int`/`float`; `p` is a number; `take` is an `int`; `replace` is a `bool`. -### `population_variance(data) float` +| function | description | +|----------|-------------| +| `euclidean_distance(data1, data2) -> float` | Straight-line (L2) distance between two points. | +| `manhattan_distance(data1, data2) -> float` | Sum of absolute coordinate differences (L1 distance). | +| `softmax(data) -> list` | Softmax transform; converts scores to probabilities summing to 1. | +| `sigmoid(data) -> list` | Element-wise sigmoid (logistic) transform. | +| `mode(data) -> list` | Most frequently occurring value(s); may return several. | +| `sum(data) -> float` | Sum of the values. | +| `max(data) -> float` | Maximum value. | +| `min(data) -> float` | Minimum value. | +| `midrange(data) -> float` | Average of the maximum and minimum values. | +| `average(data) -> float` | Arithmetic mean (alias of `mean`). | +| `mean(data) -> float` | Arithmetic mean. | +| `geometric_mean(data) -> float` | Geometric mean. | +| `harmonic_mean(data) -> float` | Harmonic mean. | +| `trimean(data) -> float` | Trimean, a robust measure of central tendency. | +| `median(data) -> float` | Middle value. | +| `percentile(data, p) -> float` | Value below which `p`% of observations fall (interpolated). | +| `percentile_nearest_rank(data, p) -> float` | Percentile via the nearest-rank method. | +| `variance(data) -> float` | Variance (population variance; alias of `population_variance`). | +| `population_variance(data) -> float` | Variance of an entire population. | +| `sample_variance(data) -> float` | Variance of a sample (Bessel-corrected, `n-1`). 
| +| `covariance(data1, data2) -> float` | Sample covariance between two datasets. | +| `covariance_population(data1, data2) -> float` | Population covariance between two datasets. | +| `correlation(data1, data2) -> float` | Correlation coefficient between two datasets. | +| `pearson(data1, data2) -> float` | Pearson product-moment correlation coefficient. | +| `standard_deviation(data) -> float` | Population standard deviation. | +| `stddev(data) -> float` | Alias of `standard_deviation`. | +| `stddev_sample(data) -> float` | Sample standard deviation (Bessel-corrected, `n-1`). | +| `sample(data, take, replace=False) -> list` | Randomly draw `take` elements, with or without replacement. | -Computes the variance of an entire population. +There are no exported constants and no custom value types; inputs are plain Starlark iterables and outputs are plain `float`/`list` values. -#### Parameters +## Details & examples -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | +All functions delegate validation to the underlying engine. Common errors propagate verbatim: -#### Examples +- Empty input where a value is required → `Input must not be empty.` +- Mismatched lengths for a two-dataset function → `Must be the same length.` +- A percentile / sample size outside the valid range → `Input is outside of range.` +- A non-iterable data argument → `<function>: for parameter 1: got <type>, want iterable` +- Wrong argument count → `<function>: got N arguments, want M` -**basic** +### Distance metrics — `euclidean_distance`, `manhattan_distance` -Calculate population variance of a dataset. +`euclidean_distance(data1, data2)` returns the L2 distance; `manhattan_distance(data1, data2)` returns the L1 distance. Both treat the two iterables as coordinate vectors.
```python -load("stats", "population_variance") -print(population_variance([1, 2, 3, 4, 5])) # Output: 2.0 +load("stats", "euclidean_distance", "manhattan_distance") +print(euclidean_distance([3, 4], [0, 0])) +print(manhattan_distance([3, 4], [0, 0])) +# Output: +# 5.0 +# 7.0 ``` -### `sample_variance(data) float` - -Computes the variance of a sample from the population. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | +### Transforms — `softmax`, `sigmoid` -#### Examples - -**basic** - -Calculate sample variance of a dataset. +`softmax(data)` returns a list whose entries are non-negative and sum to 1. `sigmoid(data)` applies the logistic function element-wise. Both return a `list` of `float` the same length as the input, and error on empty input. ```python -load("stats", "sample_variance") -print(sample_variance([1, 2, 3, 4, 5])) # Output: 2.5 +load("stats", "softmax", "sigmoid") +print(softmax([1, 2, 3])) +print(sigmoid([0, 2, 4])) +# Output: +# [0.09003057317038046, 0.24472847105479764, 0.6652409557748218] +# [0.5, 0.8807970779778823, 0.9820137900379085] ``` -### `correlation(data1, data2) float` - -Computes the correlation coefficient between two datasets. +### Basic measures — `sum`, `max`, `min`, `midrange`, `mode` -#### Parameters - -| name | type | description | -|---------|--------|-------------------------------------| -| `data1` | `list` | First dataset of numerical values. | -| `data2` | `list` | Second dataset of numerical values. | - -#### Examples - -**basic** - -Calculate correlation between two datasets. +`sum`, `max`, `min`, and `midrange` return a single `float`. `mode(data)` returns a `list` because a dataset can have more than one most-frequent value. `midrange` is `(min + max) / 2`. All error on empty input. 
```python -load("stats", "correlation") -print(correlation([1, 2, 3], [1, 2, 3])) # Output: 1.0 +load("stats", "sum", "max", "min", "midrange", "mode") +print(sum([1, 2, 3, 4])) +print(max([1, 2, 3, 4])) +print(min([1, 2, 3, 4])) +print(midrange([1, 2, 3, 4])) +print(mode([1, 1, 2, 3, 3])) +# Output: +# 10.0 +# 4.0 +# 1.0 +# 2.5 +# [1.0, 3.0] ``` -### `pearson(data1, data2) float` - -Computes Pearson's correlation coefficient. - -#### Parameters - -| name | type | description | -|---------|--------|-------------------------------------| -| `data1` | `list` | First dataset of numerical values. | -| `data2` | `list` | Second dataset of numerical values. | - -#### Examples +### Central tendency — `average`, `mean`, `geometric_mean`, `harmonic_mean`, `trimean`, `median` -**basic** - -Calculate Pearson's correlation coefficient. +`average` is an alias of `mean`. All return a single `float`. ```python -load("stats", "pearson") -print(pearson([1, 2, 3], [1, 2, 3])) # Output: 1.0 +load("stats", "mean", "average", "geometric_mean", "harmonic_mean", "trimean", "median") +print(mean([1, 2, 3, 4])) +print(average([1, 2, 2, 3])) +print(geometric_mean([1, 2, 3, 4])) +print(harmonic_mean([1, 2, 3, 6])) +print(trimean([1, 2, 3, 4, 5])) +print(median([1, 2, 3, 4, 5])) +# Output: +# 2.5 +# 2.0 +# 2.2133638394006434 +# 2.0 +# 3.0 +# 3.0 ``` -### `standard_deviation(data) float` - -Calculates the standard deviation of a dataset. - -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | +### Variability — `percentile`, `percentile_nearest_rank`, `variance`, `population_variance`, `sample_variance` -#### Examples - -**basic** - -Calculate standard deviation of a dataset. 
+`percentile(data, p)` and `percentile_nearest_rank(data, p)` take a second positional argument `p` (the percentile, 0–100); both require exactly two arguments and error with `Input is outside of range.` if `p` is out of bounds. `variance` is population variance (same as `population_variance`); `sample_variance` uses the Bessel `n-1` correction. ```python -load("stats", "standard_deviation") -print(standard_deviation([1, 2, 3, 4, 5])) # Output: 1.4142135623730951 +load("stats", "percentile", "percentile_nearest_rank", "variance", "sample_variance") +print(percentile([1, 2, 3, 4, 5], 50)) +print(percentile_nearest_rank([1, 2, 3, 4, 5], 50)) +print(variance([1, 2, 3, 4, 5])) +print(sample_variance([1, 2, 3, 4, 5])) +# Output: +# 2.5 +# 3.0 +# 2.0 +# 2.5 ``` -### `stddev(data) float` - -Alias for standard_deviation. Calculates the standard deviation of a dataset. +### Covariance & correlation — `covariance`, `covariance_population`, `correlation`, `pearson` -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | - -#### Examples - -**basic** - -Calculate stddev of a dataset. +Each takes two equal-length datasets and returns a single `float`; unequal lengths error with `Must be the same length.`. `pearson` is the Pearson product-moment coefficient; `correlation` uses the engine's correlation routine. ```python -load("stats", "stddev") -print(stddev([1, 2, 3, 4, 5])) # Output: 1.4142135623730951 +load("stats", "covariance", "covariance_population", "correlation", "pearson") +print(covariance([1, 2, 3], [4, 5, 6])) +print(covariance_population([1, 2, 3], [4, 5, 6])) +print(correlation([1, 2, 3], [6, 5, 4])) +print(pearson([1, 2, 3], [1, 2, 3])) +# Output: +# 1.0 +# 0.6666666666666666 +# -1.0 +# 1.0 ``` -### `stddev_sample(data) float` - -Calculates the standard deviation of a sample. 
- -#### Parameters - -| name | type | description | -|--------|--------|------------------------------| -| `data` | `list` | Dataset of numerical values. | +### Standard deviation — `standard_deviation`, `stddev`, `stddev_sample` -#### Examples - -**basic** - -Calculate standard deviation of a sample. +`stddev` is an alias of `standard_deviation` (population standard deviation). `stddev_sample` uses the sample (`n-1`) correction. ```python -load("stats", "stddev_sample") -print(stddev_sample([1, 2, 3, 4, 5])) # Output: 1.5811388300841898 +load("stats", "standard_deviation", "stddev", "stddev_sample") +print(standard_deviation([1, 2, 3, 4, 5])) +print(stddev([1, 2, 3, 4, 5])) +print(stddev_sample([1, 2, 3, 4, 5])) +# Output: +# 1.4142135623730951 +# 1.4142135623730951 +# 1.5811388300841898 ``` -### `sample(data, take, replace=False) list` - -Randomly samples elements from the dataset. - -#### Parameters +### `sample(data, take, replace=False) -> list` -| name | type | description | -|-----------|--------|-------------------------------------------------------| -| `data` | `list` | Dataset of numerical values. | -| `take` | `int` | Number of elements to sample. | -| `replace` | `bool` | Optional. If True, sampling is done with replacement. | +Randomly draws `take` elements from `data` and returns them as a `list` of `float`. Arguments are keyword-capable: `data`, `take`, and the optional `replace`. With `replace=False` the result is a subset (so `take` must not exceed `len(data)`, else `Input is outside of range.`); with `replace=True` the same element may be drawn more than once and `take` may exceed the input length. `take` is required — omitting it errors with `sample: missing argument for take`. -#### Examples - -**basic** - -Sample elements from a dataset. 
+Because it draws on a random number generator, `sample` is the one **non-deterministic** function here; its element order and selection vary per call, so only the length is asserted in tests: ```python load("stats", "sample") -print(sample(data=[1, 2, 3, 4], take=3, replace=False)) -``` +r1 = sample(data=[1, 2, 3, 4], take=3, replace=False) +print(len(r1)) +r2 = sample(data=[1, 2, 3, 4], take=5, replace=True) +print(len(r2)) +# Output: +# 3 +# 5 +``` + +## Notes & boundaries + +- **Engine.** Computation is delegated to `github.com/montanaflynn/stats`; numeric results match that library exactly (full IEEE-754 `float` precision, as shown in the examples). +- **Input domain.** Any iterable of `int`/`float` is accepted; ints are promoted to `float`. A non-iterable argument errors with `want iterable`. Validation messages (empty input, length mismatch, out-of-range) come straight from the engine. +- **Aliases.** `average` ≡ `mean`; `stddev` ≡ `standard_deviation`; `variance` ≡ `population_variance`. The sample-corrected (`n-1`) variants are `sample_variance` and `stddev_sample`. +- **Determinism.** All functions are pure and deterministic except `sample`, which is random. +- **Naming.** All exported members use `snake_case`. diff --git a/lib/string/README.md b/lib/string/README.md index 18bb0190..6ca502a0 100644 --- a/lib/string/README.md +++ b/lib/string/README.md @@ -1,345 +1,179 @@ # string -`string` provides constants and functions to manipulate strings, it's intended to be a drop-in subset of Python's string module for Starlark. - -## Constants - -- `ascii_lowercase`: A string containing all the characters that are considered lowercase letters. -- `ascii_uppercase`: A string containing all the characters that are considered uppercase letters. -- `ascii_letters`: A string containing all the characters that are considered letters. -- `digits`: A string containing all characters considered decimal digits: `0123456789`. 
-- `hexdigits`: A string containing all characters considered hexadecimal digits: `0123456789abcdefABCDEF`. -- `octdigits`: A string containing all characters considered octal digits: `01234567`. -- `punctuation`: A string containing all characters which are considered punctuation characters. -- `whitespace`: A string containing all characters that are considered whitespace. -- `printable`: A string containing all characters that are considered printable. This is a combination of digits, ascii_letters, punctuation, and whitespace +`string` provides constants and functions for manipulating strings — length, reversal, searching, slicing, HTML escape/unescape, Go-syntax quote/unquote, and rune-aware head/tail/truncate helpers. It is intended to be a drop-in subset of [Python's `string` module](https://docs.python.org/3/library/string.html) for Starlark, extended with a few utilities of its own. Capability profile: **Pure** — no filesystem, network, process, or log side effects. ## Functions -### `length(obj) int` +| function | description | +|----------|-------------| +| `length(obj) -> int` | Number of Unicode code points in a string (bytes for `bytes`, element count for any sequence) | +| `reverse(s) -> string` | The value reversed (by rune for strings, by byte for `bytes`) | +| `index(s, sub) -> int` | Rune index of the first occurrence of `sub`; errors if not found | +| `rindex(s, sub) -> int` | Rune index of the last occurrence of `sub`; errors if not found | +| `find(s, sub) -> int` | Rune index of the first occurrence of `sub`, or `-1` if not found | +| `rfind(s, sub) -> int` | Rune index of the last occurrence of `sub`, or `-1` if not found | +| `substring(s, start, end=None) -> string` | Rune slice `[start:end)`; `end` defaults to the end of `s`; supports negative indices | +| `codepoint(s, index) -> string` | The single character (code point) at rune `index`; supports a negative index | +| `head(s, n) -> string` | First `n` runes of `s`, clamped to its 
length | +| `tail(s, n) -> string` | Last `n` runes of `s`, clamped to its length | +| `head_lines(s, n) -> string` | First `n` lines of `s` (split on `\n`), clamped to the line count | +| `tail_lines(s, n) -> string` | Last `n` lines of `s` (split on `\n`), clamped to the line count | +| `truncate(s, length, suffix="...") -> string` | Shorten `s` to at most `length` runes, appending `suffix` when cut | +| `escape(s) -> string` | HTML-escape `&`, `<`, `>`, `"`, `'` | +| `unescape(s) -> string` | Reverse of `escape`: HTML entities back to characters | +| `quote(s) -> string` | Go-syntax double-quoted string literal (like `strconv.Quote`) | +| `unquote(s) -> string` | Reverse of `quote`; returns the input unchanged if it is not a valid quoted literal | -Returns the length of the object; for string, it returns the number of Unicode code points, instead of bytes like `len()`. +## Constants -#### Parameters +Each constant is a `string` (matching Python's `string` module names). -| name | type | description | -|-------|----------|---------------------------------------------| -| `obj` | `string` | The object whose length is to be calculated | +| constant | meaning | +|----------|---------| +| `ascii_lowercase` | `abcdefghijklmnopqrstuvwxyz` | +| `ascii_uppercase` | `ABCDEFGHIJKLMNOPQRSTUVWXYZ` | +| `ascii_letters` | `ascii_lowercase` + `ascii_uppercase` | +| `digits` | decimal digits `0123456789` | +| `hexdigits` | hexadecimal digits `0123456789abcdefABCDEF` | +| `octdigits` | octal digits `01234567` | +| `punctuation` | ASCII punctuation: `` !"#$%&'()*+,-./:;<=>?@[\]^_{|}~` `` | +| `whitespace` | whitespace characters: space, `\t`, `\n`, `\r`, `\v`, `\f` | +| `printable` | `digits` + `ascii_letters` + `punctuation` + `whitespace` | -#### Examples +This module exposes no custom Starlark types — only the functions and constants above. -**String** +## Details & examples -Calculate the length of a CJK string. 
+### `length(obj) -> int` -```python -load("string", "length") -s = "你好" -print(length(s), len(s)) -# Output: 2 6 -``` - -**Misc** - -Calculate the length of a list, set and map. +Returns the number of Unicode code points in a `string` (not bytes, unlike the built-in `len()`), the number of bytes in a `bytes` value, or the element count of any `starlark.Sequence` (list, tuple, set). Errors with `length() takes exactly one argument` when not given exactly one argument, and `length() function isn't supported for '<type>' type object` for an unsupported type (e.g. `int`). ```python load("string", "length") -print(length([1, 2, 3]), length(set([1, 2])), length({1: 2})) -# Output: 3 2 1 +print(length("我爱你"), length(b"☕"), length([1, 2, "#", True, None])) +# Output: 3 3 5 ``` -### `reverse(str) string` - -Returns the reversed string of the given value. - -#### Parameters +### `reverse(s) -> string` -| name | type | description | -|-------|----------|---------------------------------| -| `str` | `string` | A string that is to be reversed | - -#### Examples - -**Basic** - -Reverse a string. +Reverses a `string` by rune (so multi-byte characters stay intact) or a `bytes` value by byte. Same single-argument and type errors as `length`. ```python load("string", "reverse") -s = "123我爱你" -print(reverse(s)) +print(reverse("123我爱你")) # Output: 你爱我321 ``` -### `index(s, sub) int` - -Returns the index of the first occurrence of the substring `sub` in `s`. If the substring is not found, an error is raised. - -#### Parameters - -| name | type | description | -|-------|----------|-----------------------------| -| `s` | `string` | The string to be searched | -| `sub` | `string` | The substring to search for | +### `index(s, sub) -> int` / `rindex(s, sub) -> int` -#### Examples - -**Basic** - -Find the first occurrence of a substring in a string. +Return the rune index of the first (`index`) or last (`rindex`) occurrence of `sub` in `s`.
They error with `<function>: substring not found` when `sub` is absent. Indices are counted in runes, so they are correct for multi-byte text. ```python -load("string", "index") -s = "hello world" -print(index(s, "o")) -# Output: 4 +load("string", "index", "rindex") +print(index("你好世界", "好"), rindex("你好世界你好", "好")) +# Output: 1 5 ``` -### `rindex(s, sub) int` - -Returns the index of the last occurrence of the substring `sub` in `s`. If the substring is not found, an error is raised. - -#### Parameters - -| name | type | description | -|-------|----------|-----------------------------| -| `s` | `string` | The string to be searched | -| `sub` | `string` | The substring to search for | +### `find(s, sub) -> int` / `rfind(s, sub) -> int` -#### Examples - -**Basic** - -Find the last occurrence of a substring in a string. +Like `index` / `rindex`, but return `-1` instead of erroring when `sub` is not found. ```python -load("string", "rindex") -s = "hello world" -print(rindex(s, "o")) -# Output: 7 +load("string", "find", "rfind") +print(find("hello", "o"), find("hello", "x"), rfind("hello hello", "o")) +# Output: 4 -1 10 ``` -### `find(s, sub) int` - -Returns the index of the first occurrence of the substring `sub` in `s`. If the substring is not found, returns -1. - -#### Parameters - -| name | type | description | -|-------|----------|-----------------------------| -| `s` | `string` | The string to be searched | -| `sub` | `string` | The substring to search for | - -#### Examples +### `substring(s, start, end=None) -> string` -**Basic** - -Find the first occurrence of a substring in a string, returning -1 if not found. - -```python -load("string", "find") -s = "hello world" -print(find(s, "o")) -print(find(s, "x")) -# Output: 4 -# Output: -1 -``` - -### `rfind(s, sub) int` - -Returns the index of the last occurrence of the substring `sub` in `s`. If the substring is not found, returns -1.
- -#### Parameters - -| name | type | description | -|-------|----------|-----------------------------| -| `s` | `string` | The string to be searched | -| `sub` | `string` | The substring to search for | - -#### Examples - -**Basic** - -Find the last occurrence of a substring in a string, returning -1 if not found. - -```python -load("string", "rfind") -s = "hello world" -print(rfind(s, "o")) -print(rfind(s, "x")) -# Output: 7 -# Output: -1 -``` - -### `substring(s, start, end=None) string` - -Returns a substring of `s` from index `start` to `end` (exclusive). If `end` is omitted or `None`, the substring extends to the end of the string. An explicit `end` of `0` (or `-len(s)`) means an empty range, as in Python slicing; out-of-range indices are reported as errors. - -#### Parameters - -| name | type | description | -|---------|----------|--------------------------------------| -| `s` | `string` | The string to be sliced | -| `start` | `int` | The starting index for the substring | -| `end` | `int` or `None` | Optional ending index (exclusive); defaults to the end of the string | - -#### Examples - -**Basic** - -Get a substring of a string. +Returns the rune slice `s[start:end)`. `end` defaults to the end of `s` when omitted or `None`; an explicit `end` of `0` (or `-len(s)`) means an empty range, as in Python slicing. Negative `start`/`end` count from the end. Errors with `substring: indices are out of range` if, after normalization, an index falls outside `[0, len(s)]` or `start > end`. ```python load("string", "substring") -s = "hello world" -print(substring(s, 1, 5)) -# Output: "ello" +print(substring("hello", 1, 4), substring("你好世界", 2, -1), substring("hello", 1)) +# Output: ell 世 ello ``` -**Negative Indices** +### `codepoint(s, index) -> string` -Get a substring of a string using negative indices. +Returns the single character at rune `index` (negative indices count from the end). Errors with `codepoint: index out of range` when `index` is outside the string. 
```python -load("string", "substring") -s = "hello world" -print(substring(s, -5, -1)) -# Output: "worl" +load("string", "codepoint") +print(codepoint("a☕c", 1), codepoint("a☕c", -1)) +# Output: ☕ c ``` -### `codepoint(s, index) string` - -Returns the Unicode codepoint of the character at the given `index` in `s`. - -#### Parameters - -| name | type | description | -|---------|----------|--------------------------------------------| -| `s` | `string` | The string from which to get the codepoint | -| `index` | `int` | The index of the character | +### `head(s, n) -> string` / `tail(s, n) -> string` -#### Examples - -**Basic** - -Get the Unicode codepoint of a character at a specific index. +Return the first (`head`) or last (`tail`) `n` runes of `s`. `n` must be non-negative (else `<function>: n must be non-negative`) and is clamped to the string length, so a short input never errors. The cut is rune-aware and never splits a multi-byte character. ```python -load("string", "codepoint") -s = "hello world" -print(codepoint(s, 4)) -# Output: "o" +load("string", "head", "tail") +print(head("你好世界", 2), tail("a☕c", 2), head("hello", 99)) +# Output: 你好 ☕c hello ``` -### `escape(str) string` - -Converts the characters "&", "<", ">", '"' and "'" in string to their corresponding HTML entities. - -#### Parameters +### `head_lines(s, n) -> string` / `tail_lines(s, n) -> string` -| name | type | description | -|-------|----------|--------------------------------------| -| `str` | `string` | A string which is to be HTML escaped | - -#### Examples - -**Basic** - -Escape a string. +Return the first (`head_lines`) or last (`tail_lines`) `n` lines of `s`, splitting on `\n` and clamping `n` to the line count. `n` must be non-negative.
```python -load("string", "escape") -s = "Hello" -print(escape(s)) -# Output: Hello<World> +load("string", "head_lines", "tail_lines") +s = "a\nb\nc" +print(head_lines(s, 2)) +print(tail_lines(s, 2)) +# Output: +# a +# b +# b +# c ``` -### `unescape(str) string` - -Converts the HTML entities in a string back to their corresponding characters. - -#### Parameters +### `truncate(s, length, suffix="...") -> string` -| name | type | description | -|-------|----------|-----------------------| -| `str` | `string` | A HTML escaped string | - -#### Examples - -**Basic** - -Unescape a string. +Shortens `s` to at most `length` runes. A string already within the limit is returned unchanged; otherwise `suffix` is appended and the result — including the suffix — never exceeds `length` runes (if `length` is shorter than `suffix`, the suffix itself is truncated). `length` must be non-negative (else `truncate: length must be non-negative`). ```python -load("string", "unescape") -s = "You&Me" -print(unescape(s)) -# Output: "You&Me" +load("string", "truncate") +print(truncate("hello world", 8)) +print(truncate("hello world", 8, suffix="~")) +print(truncate("hello", 2)) +# Output: +# hello... +# hello w~ +# .. ``` -### `quote(str) string` - -Returns a double-quoted string literal in Go syntax representing str, with control characters and non-printable runes escaped (like Go's `strconv.Quote`). NOTE: this is **not** shell escaping — do not use it to build shell command lines. - -#### Parameters - -| name | type | description | -|-------|----------|--------------------------------| -| `str` | `string` | A string which is to be quoted | +### `escape(s) -> string` / `unescape(s) -> string` -#### Examples - -**Basic** - -Quote a string. +`escape` converts `&`, `<`, `>`, `"`, and `'` to their HTML entities; `unescape` is the inverse. Both accept a `string` or `bytes` (exactly one argument) and preserve the input type. Same single-argument and type errors as `length`. 
```python -load("string", "quote") -s = "Hello World" -print(quote(s)) -# Output: "Hello World" +load("string", "escape", "unescape") +print(escape("<&>")) +print(unescape("我&amp;你")) +# Output: +# &lt;&amp;&gt; +# 我&你 ``` -### `unquote(str) string` - -Returns a shell-unescaped version of the string str. This returns a string that was used as one token in a shell command line. - -#### Parameters - -| name | type | description | -|-------|----------|----------------------------------| -| `str` | `string` | A string which is to be unquoted | +### `quote(s) -> string` / `unquote(s) -> string` -#### Examples - -**Basic** - -Unquote a string. +`quote` returns a Go-syntax double-quoted string literal for `s`, escaping control characters and non-printable runes (like Go's `strconv.Quote`). NOTE: this is **not** shell escaping — do not use it to build shell command lines. `unquote` reverses it: it unquotes a Go double-quoted literal, and robustly returns the input unchanged if it is not a valid quoted literal (e.g. shorter than 2 characters, or only one side quoted). Both accept a `string` or `bytes`. ```python -load("string", "unquote") -s = '"Hello\tWorld"' -print(unquote(s)) -World +load("string", "quote", "unquote") +print(quote("\n1")) +print(unquote('"我爱你"'), unquote('"我爱你')) +# Output: +# "\n1" +# 我爱你 "我爱你 ``` -### `head(s, n) string` - -Returns the first `n` characters (runes) of `s`. `n` must be non-negative and is clamped at the length of the string, so a short input never errors. Unlike the language-level `s[:n]`, the cut is rune-aware and never splits a multi-byte character. - -### `tail(s, n) string` - -Returns the last `n` characters (runes) of `s`, with the same clamping and rune-awareness as `head`. - -### `head_lines(s, n) string` - -Returns the first `n` lines of `s` (split on `\n`), clamped at the number of lines. - -### `tail_lines(s, n) string` - -Returns the last `n` lines of `s` (split on `\n`), clamped at the number of lines.
- -### `truncate(s, length, suffix="...") string` +## Notes / boundaries -Shortens `s` to at most `length` characters (runes). If a cut happens, `suffix` is appended and the result — including the suffix — never exceeds `length` runes; a string already within the limit is returned unchanged. +- **Rune-based indexing.** `length`, `index`/`rindex`/`find`/`rfind`, `substring`, `codepoint`, `head`/`tail`, and `truncate` all operate on Unicode code points, not bytes — counts and slices never split a multi-byte character. This differs from the built-in `len()` and the `s[a:b]` slice operator, which work on bytes. +- **`bytes` support.** Only `length`, `reverse`, `escape`, `unescape`, `quote`, and `unquote` accept `bytes` (and preserve the type). The index/slice/line helpers take `string` only. +- **Pure module.** No I/O, no global state, deterministic output for a given input. +- **Python parity.** The constants mirror Python's `string` module; `index`/`rindex`/`find`/`rfind` mirror the `str` methods of the same names. `quote`/`unquote`/`escape`/`unescape` and the `head`/`tail`/`*_lines`/`truncate`/`substring`/`codepoint` helpers are starlet extensions, not part of Python's `string` module. diff --git a/tools/doccov/coverage.star b/tools/doccov/coverage.star new file mode 100644 index 00000000..870a8e3a --- /dev/null +++ b/tools/doccov/coverage.star @@ -0,0 +1,37 @@ +# Documentation coverage check, run by doc_coverage_test.go (dogfoods the regex +# module — starlet checking its own docs with its own tools). +# +# Injected globals: +# surface: {module_name: [exported member names]} — the authoritative code +# surface, enumerated in Go from each module's registered members. +# docs: {module_name: README text} +# +# A member counts as documented when its name appears immediately after a +# backtick and as a whole identifier — `read_all(`, `ascii_letters`, `I` — which +# is the doc standard's backtick-quoted convention. 
The next-character guard
+# (not an identifier rune) keeps `head` from matching inside `head_lines`.
+# RE2 has no lookahead, so the guard character is consumed by the class.
+#
+# Results read back by the harness: `missing` (list of "module.name") and
+# `report` (human summary).
+load('regex', 'search', 'escape')
+
+def check():
+    # Tally every surface member; collect those with no backtick-quoted README mention.
+    undocumented = []
+    seen = 0
+    for module in sorted(surface):
+        readme = docs[module]
+        for member in surface[module]:
+            seen += 1
+            pattern = '`' + escape(member) + '[^A-Za-z0-9_]'
+            if search(pattern, readme) == None:
+                undocumented.append(module + '.' + member)
+    found = seen - len(undocumented)
+    summary = str(found) + '/' + str(seen) + ' module members documented across ' + str(len(surface)) + ' modules'
+    if undocumented:
+        summary += '\nUNDOCUMENTED (' + str(len(undocumented)) + '): ' + ', '.join(sorted(undocumented))
+    return undocumented, summary
+
+missing, report = check()
+print(report)