diff --git a/.gitignore b/.gitignore index e00448d9..1b286750 100644 --- a/.gitignore +++ b/.gitignore @@ -89,3 +89,5 @@ gitgalaxy/tools/supply_chain_security/README supply_chain.md gitgalaxy/tools/terabyte_log_scanning/README terabyte.md *.sqlite *.db +LM-Notebook files.md +terminology_updates.md \ No newline at end of file diff --git a/README.md b/README.md index 5db1dc42..7a101c83 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Engine](https://img.shields.io/badge/Engine-blAST-8A2BE2.svg)](#) [![Velocity](https://img.shields.io/badge/Velocity-100k+_LOC%2Fs-00C957.svg)](#) -[![Analysis](https://img.shields.io/badge/Analysis-Code_Bioinformatics-00BFFF.svg)](#) +[![Analysis](https://img.shields.io/badge/Analysis-Static_Analysis-00BFFF.svg)](#) [![Threat Hunting](https://img.shields.io/badge/Threat_Hunting-Behavioral-FF4500.svg)](#) [![Architecture](https://img.shields.io/badge/Architecture-Zero__Trust-teal.svg)](#) [![Coverage](https://img.shields.io/badge/Coverage-50%2B_Languages-00C957.svg)](#) @@ -15,29 +15,32 @@ [![Airgap Ready](https://img.shields.io/badge/Security-Airgap_Ready-teal.svg)](#) [![Downloads](https://static.pepy.tech/badge/gitgalaxy)](https://pepy.tech/project/gitgalaxy) -### **Whole-Repository Understanding & DevSecOps Physics** +### **Whole-Repository Understanding & DevSecOps Topology** -Most tools analyze code line-by-line. GitGalaxy maps the entire architectural ecosystem. By tracking the exact flow of information across network dependencies, identifying local folder constraints, and natively recognizing 50+ languages—even mid-file—GitGalaxy provides a deterministic, 10,000-foot view of your software's physical architecture. +Most tools analyze code line-by-line. GitGalaxy maps the entire architectural ecosystem. 
By tracking the exact flow of information across network dependencies, identifying local folder constraints, and natively recognizing 50+ languages—even mid-file—GitGalaxy provides a deterministic, macro-level view of your software's structural architecture. ### Scanning Apollo-11 with the blAST Engine -![GitGalaxy CLI Scan](./docs/wiki/assets/apollo11_scan.gif) +![GitGalaxy CLI Scan](https://raw.githubusercontent.com/squid-protocol/gitgalaxy/main/docs/wiki/assets/apollo11_scan.gif) -**Why we built a custom physics engine:** -Standard Abstract Syntax Trees (ASTs) are great for finding syntax errors, but they require compilable code and miss the forest for the trees when mapping massive-scale information flow. LLMs, on the other hand, suffer from severe hallucination when analyzing large context windows and yield probabilistic, fluctuating answers. +**Why we built a custom parsing engine:** +Standard Abstract Syntax Trees (ASTs) are excellent for finding syntax errors, but they require fully compilable code and struggle to map massive-scale information flow efficiently. LLMs suffer from context window saturation and yield probabilistic, fluctuating answers. -The **blAST (Bypassing LLMs and ASTs) engine** solves this. It reads code as raw structural text, scanning for semantic anchors to build a deterministic 3D knowledge graph of your entire repository. It instantly calculates the ratio of test boundaries to core logic, maps network blast radiuses, and extracts the vital project structure data that rigid linters ignore. +The **blAST (Bypassing LLMs and ASTs) engine** solves this by adopting the search philosophy of computational biology. Just as genomic BLAST sequencing scans billions of DNA base pairs to identify protein domains without "executing" the organism, GitGalaxy blAST sequences raw source code to identify **Structural Signatures**. -Think of GitGalaxy as a highly calibrated telescope for codebase risk. 
Every assumption the system makes is abstracted into over 300 tunable variables. You can query active API nodes, isolate supply chain threats, or highlight functions exhibiting extreme cognitive load—all adjusted via custom shields to eliminate false-positive fatigue. Field-tested on over 1,000 repositories, the engine comes equipped with enterprise-grade defaults ready for immediate CI/CD deployment. +Instead of mapping gene starts and genetic mutations, the engine sequences coding intent, execution boundaries, and architectural risk exposures. This lets it build a deterministic 3D knowledge graph of your entire repository in linear O(N) time without ever requiring the code to compile. It instantly calculates the ratio of test boundaries to core logic, maps network blast radiuses, and extracts the vital project structure data that rigid linters ignore. -**Core Codebase Mapping Technology** -* Bypasses LLMs and rigid ASTs. -* Doesn't require code to compile (AST-free). -* Produces full network mapping via imports with generlized function calls per file. -* Deterministically maps code by 60+ keyword regex profiles (Structural markers, I/O intents, state mutations). -* Regex keyword profiles allow us to classify functions, files, classes, folders and repos. -* Eliminates LLM architectural hallucinations and context window limits. -* Scans 50+ languages, 250+ extensions, fully folder-aware. **([How to add a language in 1 minute and 1 prompt](gitgalaxy/standards/HOW_TO_ADD_LANGUAGE.md))** +*(Note: Raw structural signatures are simply data points. True risk exposures in GitGalaxy are calculated metrics derived from these structural alignments combined with topological network gravity.)* + +Think of GitGalaxy as a highly calibrated macro-analyzer for codebase risk. Every assumption the system makes is abstracted into over 300 tunable variables. 
You can query active API nodes, isolate supply chain threats, or highlight functions exhibiting extreme cognitive load—all adjusted via custom thresholds to eliminate false-positive fatigue. Field-tested on over 1,000 repositories, the engine comes equipped with enterprise-grade defaults ready for immediate CI/CD deployment. + +**Core Repository Mapping Technology** +* **Heuristic Structural Alignment:** Bypasses LLMs and rigid ASTs. Sequences code in raw text without requiring it to compile. +* **Deterministic Sequencing:** Maps code using 60+ bounded structural signatures (I/O intents, state mutations, execution wrappers) exactly like sequencing genetic markers. +* **Taxonomical Classification:** These structural profiles allow us to classify functions, files, classes, and entire repositories into distinct architectural archetypes. +* **Topological Cartography:** Produces full network mapping via imports and dynamic execution markers. +* **Zero-Hallucination:** Eliminates LLM architectural hallucinations and context window limits by relying strictly on mathematical constraints. +* **Polyglot Sequencing:** Scans 50+ languages, 250+ extensions, fully folder-aware. **([How to add a language in 1 minute and 1 prompt](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md))** **Enterprise Scale & Performance Metrics** * **Active Pipeline Integration:** Over 11,000 PyPI downloads, driven heavily by automated CI/CD security sweeps and zero-trust DevSecOps workflows. @@ -54,7 +57,7 @@ GitGalaxy is backed by an academic-grade thesis detailing the equations powering * **[Languages are getting easier to regex for meaning and intent](https://squid-protocol.github.io/gitgalaxy/03-02-claim-2-explicitness/):** Quantifies linguistic opacity. Assigns mathematical "trust dampeners" to implicit languages. 
* **[All languages have keywords that roughly do the same thing, these can be grouped to make cross-language keyword maps](https://squid-protocol.github.io/gitgalaxy/03-03-claim-3-taxonomy-map/):** Standardizes 50+ languages into a single universal physical framework. * **[Cross-Language Comparisons of over 1000 repos](https://squid-protocol.github.io/gitgalaxy/03-04-claim-4-comparing-languages/):** Deterministic 1:1 benchmarking of distinct syntax architectures. -* **[Universal File Archetypes by k-means clustering](https://squid-protocol.github.io/gitgalaxy/03-05-claim-5-file-archetypes/):** ML isolation of files into K-means clusters (e.g., "The God Nodes," "Declarative Glue"). +* **[Universal File Archetypes by k-means clustering](https://squid-protocol.github.io/gitgalaxy/03-05-claim-5-file-archetypes/):** ML isolation of files into K-means clusters. * **[Mainframe Proven: 100% CI/CD Translation Success Rate](https://github.com/squid-protocol/gitgalaxy/tree/main/examples/ibm_cics_translation):** Flawless architectural translation of 27 distinct legacy COBOL repositories (including IBM CICS benchmark apps) into compiling Java Spring Boot environments. **Data Privacy & On-Premise Deployment** @@ -65,7 +68,7 @@ GitGalaxy is backed by an academic-grade thesis detailing the equations powering **Installation & Usage** * Python-based: `pip install gitgalaxy` * CLI execution -* CI/CD Integration: Native **[GitHub Action](github-action-read-me.md)** available for zero-trust DevSecOps pipelines. +* CI/CD Integration: Native **[GitHub Action](https://github.com/squid-protocol/gitgalaxy/blob/main/github-action-readme.md)** available for zero-trust DevSecOps pipelines. * Outputs forensic JSONs (optimized for AI-agent summary reports) and a native SQLite3 database for robust querying and storage. 
> **📖 Official Documentation:** Read the full technical specifications, architecture blueprints, and the Taxonomical Equivalence Map at **[squid-protocol.github.io/gitgalaxy](https://squid-protocol.github.io/gitgalaxy/)**. @@ -93,11 +96,11 @@ galaxyscope /path/to/your/local/repo GitGalaxy can be integrated directly into your GitHub Actions pipeline for automated DevSecOps auditing, Zero-Trust SBOM generation, or Pre-Commit firewalls. -**🚀 [View the Full GitHub Action Integration Guide](github-action-read-me.md)** +**🚀 [View the Full GitHub Action Integration Guide](https://github.com/squid-protocol/gitgalaxy/blob/main/github-action-readme.md)** -Check out our comprehensive guide to set up the **"Golden Path" Pipeline** (Parallel Enforcement & Autonomous Reporting). It covers all available Sentinel tools, AI guardrails, and advanced configuration options like our hyper-sensitive `--paranoid` threat-hunting mode. +Check out our comprehensive guide to set up the **Pipeline Architecture** (Parallel Enforcement & Autonomous Reporting). It covers all available inspection tools, AI guardrails, and advanced configuration options like our hyper-sensitive `--paranoid` threat-hunting mode. -*Minimal Example (Running a single Sentinel):* +*Minimal Example (Running a single inspection tool):* ```yaml name: GitGalaxy Security Audit @@ -121,45 +124,45 @@ jobs: --- -### [GitGalaxy Core Analysis Engine](docs/wiki/01-project-overview.md) +### [GitGalaxy Core Analysis Engine](https://github.com/squid-protocol/gitgalaxy/blob/main/docs/wiki/01-project-overview.md) The central blAST engine. It bypasses rigid ASTs using mathematical heuristics to map O(N) multi-dimensional relationships across 50+ languages, managing signal processing, spatial layout, and high-speed SQLite telemetry recording. ## Enterprise Codebase Tools & Use Cases -GitGalaxy operates on a modular Hub-and-Spoke architecture. 
While the core engine provides the overarching physics and cartography, our specialized toolsets leverage that deterministic graph to execute enterprise-grade operations. +GitGalaxy operates on a Decoupled Architecture. While the core engine provides the overarching structural mechanics and topological mapping, our specialized Decoupled Execution Controllers leverage that deterministic graph to execute enterprise-grade operations. -### [Automated Legacy Migration: COBOL to Java Spring Boot](gitgalaxy/tools/cobol_to_java/) +### [Automated Legacy Migration: COBOL to Java Spring Boot](https://github.com/squid-protocol/gitgalaxy/tree/main/gitgalaxy/tools/cobol_to_java/) A deterministic, high-fidelity translation pipeline. It converts legacy COBOL into fully compiling, modern Spring Boot architectures, mapping memory exactly and scaffolding JPA entities, REST controllers, and Maven builds before utilizing AI to translate isolated business logic. * **Proven Metric:** Achieved a perfect 27/27 Maven compile success rate across a batch test of distinct legacy repos. -* **Verify for Yourself:** [Inspect the raw outputs of the IBM CICS Application Translation here.](examples/ibm_cics_translation/) +* **Verify for Yourself:** [Inspect the raw outputs of the IBM CICS Application Translation here.](https://github.com/squid-protocol/gitgalaxy/tree/main/examples/ibm_cics_translation/) -![Java Forge & Batch Test](./docs/wiki/assets/java_forge_and_batch_test.gif) +![Java Forge & Batch Test](https://raw.githubusercontent.com/squid-protocol/gitgalaxy/main/docs/wiki/assets/java_forge_and_batch_test.gif) -### [Mainframe Refactoring: COBOL & JCL Optimization](gitgalaxy/tools/cobol_to_cobol/) -A mathematical x-ray suite for sanitizing mainframe monoliths. It safely neutralizes legacy lexical traps, extracts dead "Graveyard" memory, maps topological DAG execution orders, and generates Zero-Trust JCL configurations for modern cloud deployments. 
-* **Proven Metric:** The Graveyard Reaper engine extracted over 6,700 lines of dead execution blocks and orphaned variables from the standard IBM CICS benchmark app in seconds. +### [Mainframe Refactoring: COBOL & JCL Optimization](https://github.com/squid-protocol/gitgalaxy/tree/main/gitgalaxy/tools/cobol_to_cobol/) +An analytical suite for sanitizing mainframe monoliths. It safely neutralizes legacy lexical traps, extracts dead execution memory, maps topological DAG execution orders, and generates Zero-Trust JCL configurations for modern cloud deployments. +* **Proven Metric:** The dead-code extraction engine removed over 6,700 lines of dead execution blocks and orphaned variables from the standard IBM CICS benchmark app in seconds. -### [Software Supply Chain Security & Pre-Commit Firewalls](gitgalaxy/tools/supply_chain_security/) -Extreme-velocity pre-commit firewalls. Instead of trusting manifest files, it scans physical internals to block steganography, sub-atomic XOR decryption loops, homoglyph typosquatting, and exposed cryptographic vaults before they ever enter your CI/CD pipeline. **[Deploy directly via our GitHub Action](github-action-read-me.md).** +### [Software Supply Chain Security & Pre-Commit Firewalls](https://github.com/squid-protocol/gitgalaxy/tree/main/gitgalaxy/tools/supply_chain_security/) +Extreme-velocity pre-commit firewalls. Instead of trusting manifest files, it scans physical internals to block steganography, byte-level XOR decryption loops, homoglyph typosquatting, and exposed cryptographic vaults before they ever enter your CI/CD pipeline. 
**[Deploy directly via our GitHub Action](https://github.com/squid-protocol/gitgalaxy/blob/main/github-action-readme.md).** -### [Zero-Trust SBOM Generation & Dependency Auditing](gitgalaxy/tools/compliance/) +### [Zero-Trust SBOM Generation & Dependency Auditing](https://github.com/squid-protocol/gitgalaxy/tree/main/gitgalaxy/tools/compliance/) A Zero-Trust Software Bill of Materials (SBOM) generator. It refuses to blindly trust `package.json` or `requirements.txt` files, instead locating the physical dependencies on disk, mathematically verifying their entropy and linguistic identity, and generating strict CycloneDX 1.4 JSON reports. * **Proven Metric:** Successfully mapped and mathematically verified the physical internals of 170 unique Go modules inside the local Kubernetes repository. -### [API Security & Shadow API Detection](gitgalaxy/tools/network_auditing/) +### [API Security & Shadow API Detection](https://github.com/squid-protocol/gitgalaxy/tree/main/gitgalaxy/tools/network_auditing/) A deterministic mapping tool that hunts undocumented vulnerabilities. It uses structural regex to find active physical routing logic (Express, Spring Boot, FastAPI) and applies set theory against official OpenAPI/Swagger documentation to isolate critical Shadow APIs and outdated Ghost APIs. -### [High-Speed PII Detection & Log Analysis](gitgalaxy/tools/terabyte_log_scanning/) +### [High-Speed PII Detection & Log Analysis](https://github.com/squid-protocol/gitgalaxy/tree/main/gitgalaxy/tools/terabyte_log_scanning/) Unindexed, tactical log analysis operating at 0.07 GB/sec. It streams massive database dumps to deterministically hunt and mask PII (Credit Cards, SSNs, AWS Keys) and uses static architecture maps to prove exact runtime execution frequencies with ASCII time-series histograms. -### [AI Agent Guardrails & Codebase Protection](gitgalaxy/tools/ai_guardrails/) -Specialized keyword sensors protecting both your application and your codebase. 
The AppSec Sensor detects weaponized LLM features (RCE funnels, exfiltration risks), while the Dev Agent Firewall evaluates token mass and blast radius to restrict autonomous coding agents from modifying dangerous over context token-draining files. Helps identify which files need to be chunked to reduce context overload. +### [AI Agent Guardrails & Codebase Protection](https://github.com/squid-protocol/gitgalaxy/tree/main/gitgalaxy/tools/ai_guardrails/) +Specialized keyword sensors protecting both your application and your codebase. The AppSec Sensor detects weaponized LLM features (RCE funnels, exfiltration risks), while the Dev Agent Firewall evaluates token mass and blast radius to restrict autonomous coding agents from modifying dangerous or context-token-draining files. Helps identify which files need to be chunked to reduce context overload. ## Local Browser-Based 3D Codebase Visualization -If you prefer visual analytics, we've built a non-numerical dashboard where each file represents a star, sized and colored according to specific risk metrics. +If you prefer visual analytics, we've built a topological dashboard where each file represents a node, sized and colored according to specific risk metrics. Simply drag and drop your generated `your_repo_GPU_galaxy.json` file (or a `.zip` of your raw repository) directly into [GitGalaxy.io](https://gitgalaxy.io/). All rendering and scanning happens entirely in your browser's local memory. 
@@ -167,10 +170,7 @@ Simply drag and drop your generated `your_repo_GPU_galaxy.json` file (or a `.zip **Mapping 3.2 Million Lines of C++ in 11 Seconds | OpenCV** [![OpenCV Demo](https://img.youtube.com/vi/3ScQCSUBdZw/maxresdefault.jpg)](https://youtu.be/3ScQCSUBdZw) -**Visualizing Architectural Risk | Ruby on Rails** [![Ruby on Rails Demo](https://img.youtube.com/vi/3ScQCSUBdZw/maxresdefault.jpg)](https://youtu.be/3ScQCSUBdZw) -*(Note: Replace the video IDs in this link with your actual Rails video ID)* - -![GitGalaxy Meta Visualizer 3D star map rendering complex software repository structures and K-means clustering archetypes in the browser](https://raw.githubusercontent.com/squid-protocol/gitgalaxy/main/docs/wiki/assets/metavisualizer.png) +![GitGalaxy Topological Visualizer 3D graph rendering complex software repository structures and K-means clustering archetypes in the browser](https://raw.githubusercontent.com/squid-protocol/gitgalaxy/main/docs/wiki/assets/metavisualizer.png) ## Zero-Trust Data Security @@ -198,4 +198,4 @@ export GITGALAXY_LICENSE_KEY="COMMUNITY_FREE_TIER" ### 🏢 Commercial & Enterprise Use Running GitGalaxy in corporate environments, proprietary codebases, or commercial CI/CD pipelines requires an enterprise license. Unlicensed corporate pipelines will experience intentional execution friction, and attempting to use the Community Free Tier key in a corporate environment will trigger explicit non-compliance warnings in your audit logs. 
-To acquire a zero-trust commercial key for your organization and ensure clean compliance logs, please contact: **joe@gitgalaxy.io** +To acquire a zero-trust commercial key for your organization and ensure clean compliance logs, please contact: **joe@gitgalaxy.io** \ No newline at end of file diff --git a/gitgalaxy/README.md b/gitgalaxy/README.md index 34010381..c643888d 100755 --- a/gitgalaxy/README.md +++ b/gitgalaxy/README.md @@ -6,66 +6,69 @@ Welcome to the internal source code for the **GitGalaxy Core Engine**. -This directory contains the central orchestrator—**GalaxyScope**—alongside the core physics, optical routing, and mathematical heuristics that power the entire system. If you are a developer looking to contribute, understand the pipeline, or run the primary CLI, here is your architectural map. +This directory contains the central orchestrator—**GalaxyScope**—alongside the core structural mechanics, lexical routing, and mathematical heuristics that power the entire DevSecOps ecosystem. If you are a developer looking to contribute, understand the data pipeline, or run the primary CLI, here is your architectural map. ### 🗺️ The Developer Map (How the Pipeline Flows) -When you trigger the `galaxyscope` command, the data flows through these five physical directories: +When you trigger the `galaxyscope` command, the data flows through these physical directories: -* **`/core/` (The Frontline):** The optical routing layer. Contains the [Aperture Filter](https://squid-protocol.github.io/gitgalaxy/02-03-aperture-filter/) and [The Prism](https://squid-protocol.github.io/gitgalaxy/02-07-the-prism/), which break down source code into structural signals, separating executable logic from ghost mass (comments) and inert binaries. -* **`/physics/` (The Math):** The heuristics engine. 
Contains the [Signal Processor](https://squid-protocol.github.io/gitgalaxy/02-09-signal-processing/) and [Neural Auditor](https://squid-protocol.github.io/gitgalaxy/02-19-neural-auditor/), which apply GitGalaxy mathematics to score O(N) complexity, topological blast radius, and state flux without using ASTs. -* **`/recorders/` (The Exporters):** The translation layer. Converts the internal state maps into highly relational [SQLite Databases](https://squid-protocol.github.io/gitgalaxy/02-21-record-keeper/), AI-agent JSON tickets, and the final 3D WebGPU payload. -* **`/security/` (The Sentinel):** The zero-trust validation layer. Contains the [Security Lens](https://squid-protocol.github.io/gitgalaxy/02-06-security-lens/) responsible for intercepting embedded malware, hardcoded secrets, and logic bombs on the fly. -* **`/tools/` (The Spokes):** The enterprise automation layer. Contains specialized controllers for CI/CD pipelines—like the [Supply Chain Firewall](https://squid-protocol.github.io/gitgalaxy/04-03-supply-chain-firewall/) and [PII Leak Hunter](https://squid-protocol.github.io/gitgalaxy/04-06-pii-leak-hunter/)—that consume the core engine's telemetry. **These specialized tools power our official [Zero-Trust DevSecOps GitHub Action](../github-action-read-me.md).** +* **`/core/` (The Frontline):** The lexical routing layer. Contains the [Aperture Filter](https://squid-protocol.github.io/gitgalaxy/02-03-aperture-filter/) and [The Prism](https://squid-protocol.github.io/gitgalaxy/02-07-the-prism/), which break down source code into structural signals, stripping away inert binaries and separating executable logic from documentation. +* **`/metrics/` (The Math):** The heuristic and statistical engine. 
Contains the [Signal Processor](https://squid-protocol.github.io/gitgalaxy/02-09-signal-processing/) and [Statistical Auditor](https://squid-protocol.github.io/gitgalaxy/02-19-neural-auditor/), which apply GitGalaxy mathematics to calculate O(N) complexity, topological blast radius, and architectural drift without requiring ASTs. +* **`/security/` (The Threat Validator):** The security inference layer. Contains the [Security Lens](https://squid-protocol.github.io/gitgalaxy/02-06-security-lens/) responsible for identifying embedded malware signatures, autonomous AI execution vectors, and destructive execution patterns during Phase 1 ingestion. +* **`/recorders/` (The Exporters):** The translation layer. Converts the in-memory state maps into highly relational [SQLite Databases](https://squid-protocol.github.io/gitgalaxy/02-21-record-keeper/), an intermediate JSON representation for AI agents, and the final WebGPU visualization payload. +* **`/standards/` (The Calibration Layer):** The source of truth for the engine. Contains the polyglot lexical taxonomies and the global configuration profiles that tune the engine's strictness levels. +* **`/tools/` (The Execution Controllers):** The enterprise automation layer. Contains specialized controllers for CI/CD pipelines—like the [Supply Chain Firewall](https://squid-protocol.github.io/gitgalaxy/04-03-supply-chain-firewall/) and [PII Leak Hunter](https://squid-protocol.github.io/gitgalaxy/04-06-pii-leak-hunter/)—that independently consume the core engine's telemetry. + +*(Note: The `cobol_*_controller.py` scripts at the root level act as dedicated entry points for the Mainframe Legacy Modernization suite, bypassing standard Git orchestration to process flat mainframe directories.)* --- ### ⚡ Performance Showcase: NVDA (NonVisual Desktop Access) -To demonstrate the GalaxyScope orchestrator's capability on complex, cross-language system architecture, we unleashed it on **NVDA**, the open-source Windows screen reader. 
+To demonstrate the GalaxyScope orchestrator's capability on complex, cross-language system architecture, we executed it against **NVDA**, the open-source Windows screen reader. Because NVDA relies heavily on bridging Python application logic with low-level C++ system hooks, it requires advanced polyglot dependency mapping. The blAST engine successfully parsed the mixed-language architecture, analyzing **236,754 lines of code** in just **5.59 seconds** (a velocity of 42,357 LOC/sec). -Crucially, during the import resolution phase, the Air-Gapped Dependency Radar successfully intercepted a structural naming collision (`fstream` vs `sstream`), proving the real-time typosquatting defenses are fully operational without relying on cloud APIs. +Crucially, during the import resolution phase, the local dependency scanner successfully intercepted a structural naming collision (`fstream` vs `sstream`), proving the real-time typosquatting defenses are fully operational without relying on cloud-based CVE APIs. -> **Note on False Positives:** Because `fstream` and `sstream` are both standard C++ libraries, this specific flag is a false positive. To prevent the engine from halting on trusted internal libraries, contributors can whitelist them by adding them to the global `approved_imports.json` registry (see [GitGalaxy Config](https://squid-protocol.github.io/gitgalaxy/06-01-gitgalaxy-config/)). +> **Enterprise Calibration (Zero-Trust Enforcement):** Because `fstream` and `sstream` are both standard C++ libraries, flagging this collision demonstrates the engine's default Zero-Trust strictness. To prevent the pipeline from failing on trusted internal or standard libraries, DevSecOps teams simply add them to the `APPROVED_IMPORTS` allowlist in the [GitGalaxy Config](https://squid-protocol.github.io/gitgalaxy/06-01-gitgalaxy-config/). 
-![NVDA Processing Demo](../../docs/wiki/assets/nvda_processing.gif) +![NVDA Processing Demo](https://raw.githubusercontent.com/squid-protocol/gitgalaxy/main/docs/wiki/assets/nvda_processing.gif) -```text +~~~text [INFO] PASS_1.5: Running Air-Gapped Typosquatting & Dependency Confusion Radar... [CRITICAL] 🚨 TYPOSQUATTING DETECTED: 'fstream' in nvdaHelper/vbufBase/storage.cpp closely matches anchor 'sstream'! [WARNING] Intercepted 1 typosquatting attempts via repository baseline analysis. ... [INFO] --- MISSION_SUCCESS: 849 files mapped in 5.59s --- [INFO] --- ENGINE_TELEMETRY: Processed 236,754 lines of code at 42,357 LOC/s --- -``` +~~~ --- ### 🛠️ Local Development & GalaxyScope Execution -If you are modifying the internal physics or optical routing, it is highly recommended to install the package in editable mode so your CLI commands instantly reflect your local code changes. +If you are modifying the internal analysis logic or lexical routing, it is highly recommended to install the package in editable mode so your CLI commands instantly reflect your local code changes. From the **root directory** of the repository, run: -```bash +~~~bash pip install -e . -``` +~~~ **Important:** GitGalaxy contains an embedded commercial licensing guardrail. To prevent a 5-second execution delay while testing your code locally, you must export the Community Free Tier key into your development environment before running the orchestrator: -```bash +~~~bash export GITGALAXY_LICENSE_KEY="COMMUNITY_FREE_TIER" -``` +~~~ Once installed and the key is set, you can trigger the main orchestrator globally from your terminal. This command runs the full [Data Pipeline](https://squid-protocol.github.io/gitgalaxy/02-01-pipeline-overview/) and outputs the final artifact. 
-```bash +~~~bash galaxyscope /path/to/test/repo --debug -``` +~~~ Before submitting a Pull Request, ensure your changes do not skew the core baseline risk equations by running the test suite: -```bash +~~~bash python3 -m unittest discover tests/ -``` +~~~ --- ### 🌌 Deep Dive into the Pipeline Architecture @@ -74,4 +77,4 @@ To fully understand how GalaxyScope processes data, maps files, and applies risk * 📖 **[GalaxyScope CLI Reference](https://squid-protocol.github.io/gitgalaxy/01-02-galaxyscope-cli-reference/)** (Flags, outputs, and behaviors) * 📖 **[The Data Pipeline Overview](https://squid-protocol.github.io/gitgalaxy/02-01-pipeline-overview/)** (Step-by-step breakdown of the runtime) * 📖 **[Risk Exposures & Methodology](https://squid-protocol.github.io/gitgalaxy/08-01-methodology/)** (The math behind the heuristics) -* 🪐 **[Return to the Main GitGalaxy Hub](https://github.com/squid-protocol/gitgalaxy)** +* 🪐 **[Return to the Main GitGalaxy Hub](https://github.com/squid-protocol/gitgalaxy)** \ No newline at end of file diff --git a/gitgalaxy/cobol_refractor_controller.py b/gitgalaxy/cobol_refractor_controller.py index 04e1ed6f..bbff9e45 100644 --- a/gitgalaxy/cobol_refractor_controller.py +++ b/gitgalaxy/cobol_refractor_controller.py @@ -69,7 +69,7 @@ def __init__(self, mode: str, db_path: Path): self._init_sql_schema() def _init_sql_schema(self): - """Sets up the relational structure for the massive codebase.""" + """Sets up the relational structure for the massive codebase. 
(Maintains legacy schema names for downstream compatibility)""" cursor = self.conn.cursor() cursor.execute(""" CREATE TABLE IF NOT EXISTS Graveyard ( @@ -133,9 +133,7 @@ def close(self): # ============================================================================== -def process_payload( - filepath: Path, state_manager: IRStateManager, target_var: str = None -) -> dict: +def process_payload(filepath: Path, state_manager: IRStateManager, target_var: str = None) -> dict: """Processes a single COBOL payload through the enriched, shared-state pipeline.""" print(f" ⚙️ Analyzing {filepath.name}...") program_id = filepath.stem @@ -154,29 +152,23 @@ def process_payload( # Check for the Corporate Header stamp header_file = filepath.parent / "corporate_header.txt" if header_file.exists(): - ir["metadata"]["corporate_header"] = header_file.read_text( - encoding="utf-8", errors="ignore" - ) + ir["metadata"]["corporate_header"] = header_file.read_text(encoding="utf-8", errors="ignore") try: - ir["metadata"]["loc"] = len( - filepath.read_text(encoding="utf-8", errors="ignore").splitlines() - ) + ir["metadata"]["loc"] = len(filepath.read_text(encoding="utf-8", errors="ignore").splitlines()) except Exception: return ir # --- PHASE 0: PRE-PROCESSING (Sanitizing the code) --- was_patched = patch_lexical_traps(filepath) if was_patched: - print( - f" ↳ [!] Lexical Patcher applied to {filepath.name} (NEXT SENTENCE neutralized)" - ) + print(f" ↳ [!] Lexical Patcher applied to {filepath.name} (NEXT SENTENCE neutralized)") # --- PHASE 1: RECONNAISSANCE & ANALYSIS --- - # A. Graveyard Reaper (Identifies Dead Memory & Phantom Logic) + # A. 
Deprecated Trails Analyzer (Identifies Dead Memory & Unreachable Logic) graveyard_data = x_ray_dead_code(filepath) - ir["analysis"]["graveyard"] = graveyard_data + ir["analysis"]["dead_code"] = graveyard_data if graveyard_data: # Save to the abstracted State Manager (RAM or SQL) @@ -190,7 +182,7 @@ def process_payload( dead_paras = state_manager.get_dead_paras(program_id) orphans = state_manager.get_orphaned_vars(program_id) - # B. DAG Architect (Maps I/O Intent - Utilizing Graveyard RAM to deflect Ghost Dependencies!) + # B. DAG Architect (Maps I/O Intent - Utilizing Deprecated Trails RAM to deflect Hallucinated Dependencies!) ir["analysis"]["lineage"] = extract_lineage(filepath, dead_paras=dead_paras) # C. JCL Forge (Extracts Program ID and Subsystems) @@ -200,7 +192,7 @@ def process_payload( # --- PHASE 2: CONTEXT-AWARE GENERATION --- - # A. Schema Forge (Injecting Graveyard RAM to prevent Cloud Schema Bloat) + # A. Schema Forge (Injecting Deprecated Trails RAM to prevent Schema Bloat) ir["generation"]["schemas"] = forge_schemas( filepath, ignore_vars=orphans, @@ -218,7 +210,7 @@ def process_payload( corporate_header=ir["metadata"]["corporate_header"], ) - # C. Microservice Slicer (Injecting Graveyard RAM to bypass dead execution blocks) + # C. 
Microservice Slicer (Injecting Deprecated Trails RAM to bypass dead execution blocks) if target_var: slice_result = slice_business_logic( filepath, @@ -247,9 +239,7 @@ def main(): enforce_licensing_guard("COBOL Refractor (The Legacy Forge)") - parser = argparse.ArgumentParser( - description="GitGalaxy COBOL Refractor Controller (v4)" - ) + parser = argparse.ArgumentParser(description="GitGalaxy COBOL Refractor Controller (v4)") parser.add_argument("target", help="The legacy repository or directory to scan") parser.add_argument( "--var", @@ -315,9 +305,7 @@ def main(): # Write JSON IR Dump for downstream visualizers ir_dump_file = ir_dir / f"{file_path.stem}_ir.json" - safe_ir = json.loads( - json.dumps(ir_state, default=lambda o: list(o) if isinstance(o, set) else o) - ) + safe_ir = json.loads(json.dumps(ir_state, default=lambda o: list(o) if isinstance(o, set) else o)) ir_dump_file.write_text(json.dumps(safe_ir, indent=2)) # Write JCL Artifacts @@ -329,9 +317,7 @@ def main(): # Write Schema Artifacts if ir_state["generation"].get("schemas"): schema_output = schema_dir / f"{file_path.stem}_schema.sql" - schema_output.write_text( - ir_state["generation"]["schemas"]["sql"], encoding="utf-8" - ) + schema_output.write_text(ir_state["generation"]["schemas"]["sql"], encoding="utf-8") json_output = schema_dir / f"{file_path.stem}_schema.json" json_output.write_text( json.dumps(ir_state["generation"]["schemas"]["json"], indent=2), @@ -344,25 +330,21 @@ def main(): slice_data = ir_state["generation"]["microservice"] if slice_data.get("business_rules"): slice_output = slice_dir / f"{file_path.stem}_slice.json" - slice_output.write_text( - json.dumps(slice_data, indent=2), encoding="utf-8" - ) + slice_output.write_text(json.dumps(slice_data, indent=2), encoding="utf-8") master_scaffold_stats["slices_extracted"] += 1 - # Aggregate Graveyard Stats - gy = ir_state["analysis"].get("graveyard") + # Aggregate Deprecated Trails Stats + gy = ir_state["analysis"].get("dead_code") if 
gy: master_graveyard_stats["loc_saved"] += gy.get("loc_saved", 0) master_graveyard_stats["orphaned_vars"] += len(gy.get("orphaned_vars", [])) master_graveyard_stats["dead_paras"] += len(gy.get("dead_paras", [])) - # Aggregate Honesty Protocol Flags + # Aggregate Architectural Anomalies lineage = ir_state["analysis"].get("lineage") if lineage and lineage.get("unresolved_calls"): for call in lineage["unresolved_calls"]: - master_honesty_flags.append( - f"[{file_path.name}] Unresolved Dynamic CALL to: {call}" - ) + master_honesty_flags.append(f"[{file_path.name}] Unresolved Dynamic CALL to: {call}") system_limits = ir_state["analysis"].get("honesty_flags") if system_limits: @@ -384,60 +366,36 @@ def main(): f.write(" GITGALAXY MODERNIZATION REPORT\n") f.write("==========================================================\n\n") - f.write("[1] EXECUTIVE METRICS & NECROSIS REDUCTION\n") + f.write("[1] EXECUTIVE METRICS & DEPRECATED TRAILS REDUCTION\n") f.write("----------------------------------------------------------\n") f.write(f" • Files Scanned : {len(cobol_files)}\n") f.write(f" • State Manager Mode : {ir_mode}\n") - f.write( - f" • Unused Memory Addresses : {master_graveyard_stats['orphaned_vars']} orphaned variables\n" - ) - f.write( - f" • Unreachable Logic Blocks: {master_graveyard_stats['dead_paras']} phantom paragraphs\n" - ) - f.write( - f" ✂️ Estimated Bloat Removed: ~{master_graveyard_stats['loc_saved']} Lines of Code\n\n" - ) + f.write(f" • Unused Memory Addresses : {master_graveyard_stats['orphaned_vars']} orphaned variables\n") + f.write(f" • Unreachable Logic Blocks: {master_graveyard_stats['dead_paras']} unreachable blocks\n") + f.write(f" ✂️ Estimated Bloat Removed: ~{master_graveyard_stats['loc_saved']} Lines of Code\n\n") f.write("[2] ZERO-TRUST JCL ARCHITECTURE\n") f.write("----------------------------------------------------------\n") f.write(f" • Programs Audited : {audit_metrics['audited']}\n") - f.write( - f" • Original Legacy LOC : 
{audit_metrics['original_loc']} lines\n" - ) - f.write( - f" • GitGalaxy Zero-Trust LOC : {audit_metrics['forged_loc']} lines\n" - ) - f.write( - f" 📉 Total Code Bloat Removed : {audit_metrics.get('bloat_reduction_pct', 0)}%\n" - ) - f.write( - f" 🛡️ Over-Permissioned I/O : {audit_metrics['excess_dds_blocked']} physical files secured\n\n" - ) + f.write(f" • Original Legacy LOC : {audit_metrics['original_loc']} lines\n") + f.write(f" • GitGalaxy Zero-Trust LOC : {audit_metrics['forged_loc']} lines\n") + f.write(f" 📉 Total Code Bloat Removed : {audit_metrics.get('bloat_reduction_pct', 0)}%\n") + f.write(f" 🛡️ Over-Permissioned I/O : {audit_metrics['excess_dds_blocked']} physical files secured\n\n") f.write("[3] GENERATED CLOUD SCAFFOLDING\n") f.write("----------------------------------------------------------\n") - f.write( - f" • PostgreSQL DDLs & JSON Schemas Forged : {master_scaffold_stats['schemas_forged']}\n" - ) - f.write( - f" • Zero-Trust Emulator JCLs Generated : {master_scaffold_stats['jcls_forged']}\n" - ) - f.write( - f" • Isolated Microservice Slices Extracted: {master_scaffold_stats['slices_extracted']}\n\n" - ) + f.write(f" • PostgreSQL DDLs & JSON Schemas Forged : {master_scaffold_stats['schemas_forged']}\n") + f.write(f" • Zero-Trust Emulator JCLs Generated : {master_scaffold_stats['jcls_forged']}\n") + f.write(f" • Isolated Microservice Slices Extracted: {master_scaffold_stats['slices_extracted']}\n\n") - f.write("[4] ⚠️ MANUAL INTERVENTION AUDIT (HONESTY PROTOCOL)\n") + f.write("[4] ⚠️ MANUAL INTERVENTION AUDIT (ARCHITECTURAL ANOMALIES)\n") f.write("----------------------------------------------------------\n") f.write(f" • AI Agent Job Tickets Generated : {agent_jobs_created}\n\n") if not master_honesty_flags: - f.write( - " ✅ No structural anomalies detected. DAG is highly deterministic.\n" - ) + f.write(" ✅ No structural anomalies detected. 
DAG is highly deterministic.\n") else: - f.write( - " The following files contain structural anomalies that require architectural review:\n" - ) + f.write(" The following files contain structural anomalies that require architectural review:\n") for flag in master_honesty_flags: f.write(f" [!] {flag}\n") f.write("\n==========================================================\n") @@ -449,4 +407,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/cobol_to_java_controller.py b/gitgalaxy/cobol_to_java_controller.py index f553876f..a223144f 100644 --- a/gitgalaxy/cobol_to_java_controller.py +++ b/gitgalaxy/cobol_to_java_controller.py @@ -1,10 +1,21 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy: COBOL to Java Translation Controller (v3 - Full Compilation Scaffolding) -# Purpose: Orchestrates the Cloud Escape Hatch. Ingests the JSON Intermediate -# Representation (IR) from the clean room and generates a Spring Boot -# microservice scaffolding ready for an AI agent to complete. -# Includes Corporate Header injection, CI/CD Audit Reporting, and Maven Build generation. +# GitGalaxy Tool: COBOL to Java Translation Controller +# +# PURPOSE: +# Orchestrates the Cloud Modernization Pathway. Ingests the JSON Intermediate +# Representation (IR) from the isolated staging environment and generates a +# Spring Boot microservice scaffolding ready for an autonomous agent to complete. +# Includes Corporate Header injection, CI/CD Audit Reporting, and Maven Build generation. +# +# ARCHITECTURAL DECISION: +# Autonomous AI agents struggle to generate entire enterprise architectures from +# scratch without hallucinating external dependencies or breaking Dependency +# Injection (DI) chains. 
This controller deterministically generates the 100% +# compilable boilerplate (POM, YML, JPA Entities, REST Controllers, and Mock +# Services) based on the strict COBOL structural extraction. It delegates ONLY +# the internal business logic to the AI agent, ensuring architectural integrity +# and guaranteed compilability out-of-the-box. # ============================================================================== import argparse import sys @@ -73,10 +84,8 @@ def format_java_header(header_text: str) -> str: def generate_mock_service(subroutine_name: str, package_name: str) -> str: - """Forges a dummy @Service interface to satisfy Spring DI for missing external dependencies.""" - camel_name = "".join( - word.capitalize() for word in subroutine_name.replace("-", "_").split("_") - ) + """Generates a mock @Service interface to satisfy Spring DI for missing external dependencies.""" + camel_name = "".join(word.capitalize() for word in subroutine_name.replace("-", "_").split("_")) return f"""package {package_name}.service; import org.springframework.stereotype.Service; @@ -103,38 +112,29 @@ def generate_mock_service(subroutine_name: str, package_name: str) -> str: def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("COBOL-to-Java Translator (The Legacy Forge)") + enforce_licensing_guard("COBOL-to-Java Translator") parser = argparse.ArgumentParser(description="GitGalaxy COBOL to Java Controller") - parser.add_argument( - "clean_room", help="Path to the generated gitgalaxy_clean_[TIMESTAMP] directory" - ) - parser.add_argument( - "--pkg", default="com.gitgalaxy.modernized", help="Base Java package name" - ) - parser.add_argument( - "--header", default="header.txt", help="Path to the custom header text file" - ) + parser.add_argument("clean_room", help="Path to the isolated staging directory (gitgalaxy_clean_[TIMESTAMP])") + parser.add_argument("--pkg", default="com.gitgalaxy.modernized", help="Base Java package name") + 
parser.add_argument("--header", default="header.txt", help="Path to the custom header text file") args = parser.parse_args() clean_room_path = Path(args.clean_room).resolve() if not clean_room_path.exists(): - print(f"Error: Target clean room {clean_room_path} does not exist.") + print(f"Error: Target staging directory {clean_room_path} does not exist.") sys.exit(1) - java_out_dir = ( - clean_room_path.parent - / f"{clean_room_path.name.replace('clean', 'java_spring')}" - ) + java_out_dir = clean_room_path.parent / f"{clean_room_path.name.replace('clean', 'java_spring')}" if java_out_dir.exists(): shutil.rmtree(java_out_dir) - # Determine Artifact ID from clean room name + # Determine Artifact ID from the isolated staging directory name artifact_id = clean_room_path.name.split("_gitgalaxy_clean")[0].lower() app_class_name = "".join(word.capitalize() for word in artifact_id.split("-")) print("\n" + "=" * 70) - print(" ☕ GITGALAXY JAVA SPRING BOOT FORGE ENGAGED") + print(" ☕ GITGALAXY JAVA SPRING BOOT GENERATOR ENGAGED") print(f" Ingesting : {clean_room_path.name}") print(f" Artifact : {artifact_id}") print(f" Package : {args.pkg}") @@ -161,35 +161,27 @@ def main(): # Generate application.yml yml_content = generate_application_yml(artifact_id=artifact_id) - (java_dirs["resources"] / "application.yml").write_text( - yml_content, encoding="utf-8" - ) + (java_dirs["resources"] / "application.yml").write_text(yml_content, encoding="utf-8") stats["config_files"] += 1 # Generate Application Main Class main_class_content = generate_main_class(args.pkg, app_class_name) if java_header: main_class_content = java_header + main_class_content - (java_dirs["base_pkg"] / f"{app_class_name}Application.java").write_text( - main_class_content, encoding="utf-8" - ) + (java_dirs["base_pkg"] / f"{app_class_name}Application.java").write_text(main_class_content, encoding="utf-8") stats["config_files"] += 1 - # --- NEW: Generate EBCDIC Decoder Utility --- + # --- Generate EBCDIC Decoder 
Utility --- decoder_content = generate_decoder_util(args.pkg) if java_header: decoder_content = java_header + decoder_content - (java_dirs["util"] / "EbcdicDecoderUtil.java").write_text( - decoder_content, encoding="utf-8" - ) + (java_dirs["util"] / "EbcdicDecoderUtil.java").write_text(decoder_content, encoding="utf-8") stats["config_files"] += 1 # ------------------------------------------- - print( - " [+] Forged Build System: pom.xml, application.yml, Main Class, DecoderUtil" - ) + print(" [+] Generated Build System: pom.xml, application.yml, Main Class, DecoderUtil") - # 2. Forge JPA Entities from Schemas + # 2. Generate JPA Entities from Schemas schema_dir = clean_room_path / "02_cloud_schemas" if schema_dir.exists(): for schema_file in schema_dir.glob("*_schema.json"): @@ -198,10 +190,7 @@ def main(): java_code = generate_java_entity(schema, args.pkg) if java_header: java_code = java_header + java_code - class_name = "".join( - word.capitalize() - for word in schema.get("title", "Entity").split("_") - ) + class_name = "".join(word.capitalize() for word in schema.get("title", "Entity").split("_")) # Apply the exact same reserved word sanitization to the file name reserved_classes = { @@ -220,21 +209,17 @@ def main(): out_path = java_dirs["entity"] / f"{class_name}.java" out_path.write_text(java_code, encoding="utf-8") stats["entities"] += 1 - print(f" [+] Forged Entity: {class_name}.java") + print(f" [+] Generated Entity: {class_name}.java") except Exception as e: - print(f" [!] Failed to forge entity from {schema_file.name}: {e}") + print(f" [!] Failed to generate entity from {schema_file.name}: {e}") - # 3. Forge REST Controllers & Service Layers from IR Dumps + # 3. 
Generate REST Controllers & Service Layers from IR State Files ir_dir = clean_room_path / "04_ir_state_dumps" if ir_dir.exists(): for ir_file in ir_dir.glob("*_ir.json"): try: ir_state = json.loads(ir_file.read_text(encoding="utf-8")) - raw_prog_id = ( - ir_state.get("metadata", {}) - .get("file_name", "Unknown") - .split(".")[0] - ) + raw_prog_id = ir_state.get("metadata", {}).get("file_name", "Unknown").split(".")[0] # 🛡️ Prevent collision with Spring Boot's @Service annotation AND handle empty names if not raw_prog_id or raw_prog_id.strip() == "": @@ -242,13 +227,11 @@ def main(): elif raw_prog_id.lower() == "service": raw_prog_id = "legacy-service" - # ⚠️ CRITICAL: Inject the safe raw name back into IR State so the forges process it correctly + # ⚠️ CRITICAL: Inject the safe raw name back into IR State so the generators process it correctly ir_state.setdefault("metadata", {})["file_name"] = raw_prog_id + ".cbl" # Ensure file names are perfectly camel-cased with no hyphens for this controller - safe_file_name = "".join( - word.capitalize() for word in raw_prog_id.split("-") - ) + safe_file_name = "".join(word.capitalize() for word in raw_prog_id.split("-")) # 3A. Generate the @Service Skeleton service_code = generate_service_skeleton(ir_state, args.pkg) @@ -256,55 +239,45 @@ def main(): service_code = java_header + service_code out_path_svc = java_dirs["service"] / f"{safe_file_name}Service.java" out_path_svc.write_text(service_code, encoding="utf-8") - print(f" [+] Forged Service: {safe_file_name}Service.java") + print(f" [+] Generated Service: {safe_file_name}Service.java") # 3B. 
Generate the @RestController lineage = ir_state.get("analysis", {}).get("lineage", {}) - if lineage and ( - lineage.get("inputs") - or lineage.get("outputs") - or lineage.get("unresolved_calls") - ): + if lineage and (lineage.get("inputs") or lineage.get("outputs") or lineage.get("unresolved_calls")): java_code = generate_rest_controller(ir_state, args.pkg) if java_header: java_code = java_header + java_code - out_path_ctrl = ( - java_dirs["controller"] / f"{safe_file_name}Controller.java" - ) + out_path_ctrl = java_dirs["controller"] / f"{safe_file_name}Controller.java" out_path_ctrl.write_text(java_code, encoding="utf-8") stats["controllers"] += 1 - print(f" [+] Forged API : {safe_file_name}Controller.java") + print(f" [+] Generated API : {safe_file_name}Controller.java") - # 3C. Forge Mock Services for Unresolved Subroutines + # 3C. Generate Mock Services for Unresolved Subroutines unresolved = lineage.get("unresolved_calls", []) for sub in unresolved: # 🛡️ Skip empty, dynamic, or invalid subroutine calls if not sub or not sub.strip(): continue - safe_sub_name = "".join( - word.capitalize() for word in sub.replace("-", "_").split("_") - ) + safe_sub_name = "".join(word.capitalize() for word in sub.replace("-", "_").split("_")) # If it stripped down to nothing, skip it to prevent writing "Service.java" if not safe_sub_name: continue # Ensure we don't accidentally overwrite a real service if it was already generated - out_path_mock = ( - java_dirs["service"] / f"{safe_sub_name}Service.java" - ) + out_path_mock = java_dirs["service"] / f"{safe_sub_name}Service.java" if not out_path_mock.exists(): mock_code = generate_mock_service(sub, args.pkg) if java_header: mock_code = java_header + mock_code out_path_mock.write_text(mock_code, encoding="utf-8") - print(f" [+] Forged Mock : {safe_sub_name}Service.java") + print(f" [+] Generated Mock : {safe_sub_name}Service.java") except Exception as e: - print(f" [!] 
Failed to forge architecture from {ir_file.name}: {e}") + print(f" [!] Failed to generate architecture from {ir_file.name}: {e}") - # 4. Forge Autonomous AI Agent Tickets + # 4. Generate Autonomous AI Agent Tickets slice_dir = clean_room_path / "05_microservice_slices" if slice_dir.exists(): for slice_file in slice_dir.glob("*_slice.json"): @@ -312,19 +285,15 @@ def main(): slice_data = json.loads(slice_file.read_text(encoding="utf-8")) prog_id = slice_file.name.split("_")[0] ir_file = ir_dir / f"{prog_id}_ir.json" - ir_state = ( - json.loads(ir_file.read_text(encoding="utf-8")) - if ir_file.exists() - else None - ) + ir_state = json.loads(ir_file.read_text(encoding="utf-8")) if ir_file.exists() else None ticket_json = generate_java_agent_ticket(slice_data, prog_id, ir_state) out_path = java_dirs["agent_jobs"] / f"{prog_id}_java_service_job.json" out_path.write_text(json.dumps(ticket_json, indent=2), encoding="utf-8") stats["agent_jobs"] += 1 - print(f" [+] Forged Agent Job: {out_path.name}") + print(f" [+] Generated Agent Job: {out_path.name}") except Exception as e: - print(f" [!] Failed to forge job from {slice_file.name}: {e}") + print(f" [!] Failed to generate job from {slice_file.name}: {e}") # 5. 
Generate Master CI/CD Audit Report audit_report_path = java_out_dir / "java_migration_audit.txt" @@ -332,29 +301,29 @@ def main(): f.write("==========================================================\n") f.write(" GITGALAXY JAVA SPRING BOOT MIGRATION AUDIT\n") f.write("==========================================================\n\n") - f.write(f" • Source Clean Room : {clean_room_path.name}\n") - f.write(f" • Target Artifact : {artifact_id}\n") - f.write(f" • Target Package : {args.pkg}\n") - f.write(f" • Corporate Header Applied : {'Yes' if java_header else 'No'}\n\n") + f.write(f" • Source Staging Environment : {clean_room_path.name}\n") + f.write(f" • Target Artifact : {artifact_id}\n") + f.write(f" • Target Package : {args.pkg}\n") + f.write(f" • Corporate Header Applied : {'Yes' if java_header else 'No'}\n\n") f.write("[1] GENERATED CLOUD SCAFFOLDING\n") f.write("----------------------------------------------------------\n") - f.write(f" • Build & Config Files Forged : {stats['config_files']}\n") - f.write(f" • JPA Entities Generated : {stats['entities']}\n") - f.write(f" • REST Controllers Generated : {stats['controllers']}\n") - f.write(f" • AI Agent Tickets Generated : {stats['agent_jobs']}\n\n") + f.write(f" • Build & Config Files Scaffolded : {stats['config_files']}\n") + f.write(f" • JPA Entities Generated : {stats['entities']}\n") + f.write(f" • REST Controllers Generated : {stats['controllers']}\n") + f.write(f" • AI Agent Tickets Generated : {stats['agent_jobs']}\n\n") f.write("==========================================================\n") print("\n" + "=" * 70) print(" 🏁 SPRING BOOT TRANSLATION COMPLETE") print(f" 📁 Location: {java_out_dir}") print("----------------------------------------------------------------------") - print(f" • Build & Config Files Forged : {stats['config_files']}") - print(f" • JPA Entities Generated : {stats['entities']}") - print(f" • REST Controllers Generated : {stats['controllers']}") - print(f" • AI Agent Tickets Generated 
: {stats['agent_jobs']}") + print(f" • Build & Config Files Scaffolded : {stats['config_files']}") + print(f" • JPA Entities Generated : {stats['entities']}") + print(f" • REST Controllers Generated : {stats['controllers']}") + print(f" • AI Agent Tickets Generated : {stats['agent_jobs']}") print("======================================================================\n") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/core/README.md b/gitgalaxy/core/README.md index df922095..d1ffbc7b 100644 --- a/gitgalaxy/core/README.md +++ b/gitgalaxy/core/README.md @@ -1,44 +1,77 @@ -# GitGalaxy: Core Lexical Parsing & Topology Engine +# GitGalaxy Core: Ingestion, Standardization, and Structural Detection -[![Core](https://img.shields.io/badge/Core-Lexical_Parsing-00BFFF.svg)](#) -[![Velocity](https://img.shields.io/badge/Velocity-Zero_AST_Overhead-00C957.svg)](#) -[![Architecture](https://img.shields.io/badge/Architecture-Structural_Extraction-8A2BE2.svg)](#) +[![Pipeline](https://img.shields.io/badge/Pipeline-Phase_0_to_7-00BFFF.svg)](#) +[![Architecture](https://img.shields.io/badge/Architecture-AST--Free_Heuristics-8A2BE2.svg)](#) +[![Performance](https://img.shields.io/badge/Performance-Zero--Trust_Ingestion-FF4500.svg)](#) -This directory contains the primary ingestion, lexical tokenization, and topological mapping layers for the **blAST Engine**. +Welcome to **GitGalaxy Core**. This directory serves as the frontline of the entire analysis engine. -These files form the core ingestion pipeline. They are responsible for reading raw source code from disk, filtering out irrelevant noise (like massive minified files or `.git` directories), slicing the code into measurable architectural components, and wiring the mathematical network graph. 
+Before any risk equations are calculated or machine learning models are applied in the `metrics/` or `security/` directories, raw source code must be safely ingested, sanitized, and structurally identified. The modules in this directory are responsible for exactly that: transforming a chaotic, multi-gigabyte repository on disk into a standardized, mathematical state in RAM. -> **⚠️ Configuration Warning:** Do not modify these core files to tune the engine's behavior. Almost all variables, thresholds, language regexes, and mathematical tuning parameters have been abstracted to the **[Standards Registry](../standards/README.md)**. If you need to tweak risk curves or add a language, do it there. +## The Why: Defensive Engineering & The AST-Free Paradigm -### 🗺️ The Architecture +Enterprise repositories are rarely pristine. They contain minified vendor blobs, undocumented legacy monoliths, embedded malware, and broken syntax that refuses to compile. Traditional static analysis tools attempt to build Abstract Syntax Trees (ASTs) for these files, resulting in Out-Of-Memory (OOM) crashes, infinite loops, and pipeline timeouts. -Each file in this core represents a discrete phase in the GitGalaxy ingestion pipeline. Read the official documentation links for deep dives into the underlying mathematics. +GitGalaxy operates on a different philosophy: **Visualizing functional intent over rigid syntax parsing.** We bypass ASTs entirely. Instead, this core utilizes high-velocity, ReDoS-proof regular expressions to extract **Structural Signatures**. To achieve processing speeds exceeding 90,000 lines of code per second, the pipeline relies on extreme defensive engineering to protect the CPU and RAM from saturation. -* **`aperture.py` (The Boundary Filter):** The primary perimeter gate. It enforces Zero-Trust ingestion rules, blocking steganography, minified blobs, and infrastructure directories before they consume system memory. 
- * 📖 **[Read the Aperture Filter Specs](https://squid-protocol.github.io/gitgalaxy/02-03-aperture-filter/)** +--- + +## The What: The Information Flow (Module Breakdown) + +Data flows through these modules sequentially. If a file is unparsable by this engine, we deal with it gracefully, attempting to extract as much information as possible while routing the file to the **Unparsable Artifacts** queue, preventing pipeline bottlenecks. + +### 1. `aperture.py` (The Boundary Filter) +**Role:** Zero-Trust Ingestion. +Information hits this filter first. It evaluates OS-level metadata (file path, extension, byte size) *before* executing any disk I/O. It actively shunts massive data dumps, neural network weights (`.safetensors`), and binary payloads masquerading as text files (via null-byte detection). This protects the Python memory space from immediate exhaustion. -* **`guidestar_lens.py` (The Metadata Resolver):** The contextual intelligence module. It parses standard project manifests (`package.json`, `Cargo.toml`), resolves explicit linguistic overrides via `.gitattributes`, and hunts for evasion tactics (like force-includes) hidden in `.gitignore`. 
- * 📖 **[Read the Prism Optics Specs](https://squid-protocol.github.io/gitgalaxy/02-07-the-prism/)** +### 3. `prism.py` (Payload & Surface Splitter) +**Role:** Lexical Tokenization. +This module takes the raw string data and surgically decouples it into mutually exclusive components: the **Executable Payload** (coding_stream) and the **Documentation Surface** (comment_stream). It utilizes an O(1) atomic literal shield to temporarily mask strings during the split, preventing the regex scanner from accidentally mutating URLs or string contents that mimic comment delimiters. -* **`detector.py` (The Function Slicer & Spatial Engine):** The architectural extractor. It splices files into discrete functions, calculates Big-O algorithmic nesting depth, tracks recursive functions, and assigns exact 3D coordinates using deterministic fractal distribution algorithms. - * 📖 **[Read the Detector Mechanics](https://squid-protocol.github.io/gitgalaxy/02-08-the-detector/)** +### 4. `detector.py` (The Structural Extractor) +**Role:** Structural Signature Identification. +This file categorizes different keyword terms into structural signature counts. It evaluates the Executable Payload to slice the code into discrete functional blocks, map intra-file invocations, and detect critical security behaviors (e.g., I/O boundaries, state mutation, RCE triggers). +* **Fluid-State Language Switching:** Rather than failing on polyglot files, the engine dynamically swaps syntax registries mid-file. It uses scope-aware handshakes to seamlessly isolate and parse embedded languages (e.g., evaluating SQL execution inside a Python string, or extracting JavaScript logic nested within HTML blocks) without losing context. 
+* **AST-Free Cyclomatic Complexity:** Instead of compiling an Abstract Syntax Tree to determine nesting depth, this module uses standard code indentation as a blazing-fast, highly accurate proxy for Big-O algorithmic complexity, allowing it to evaluate structural density at ~100,000 LOC/s. -* **`network_risk_sensor.py` (The Topology Mapper):** The mathematical routing layer. It wires the ingested files into a directed graph, executing PageRank mathematics to determine absolute Blast Radius, betweenness centrality, and ecosystem roles (Producer vs. Consumer). - * 📖 **[Read the Network Risk Sensor Specs](https://squid-protocol.github.io/gitgalaxy/02-16-network-risk-sensor/)** +### 5. `network_risk_sensor.py` (The Topology Mapper) +**Role:** Dependency Graphing. +Once files are structurally parsed, this module wires them together into a Directed Acyclic Graph (DAG) using their raw import statements. It executes PageRank mathematics to determine each file's absolute **Dependency Blast Radius**, identifies **Architectural Choke Points**, and classifies their **Ecosystem Role** (Producer vs. Consumer). -* **`state_rehydrator.py` (The Cache Manager):** The incremental differential scanner. It extracts the previous temporal state from the SQLite database and rehydrates it directly into RAM, enabling ultra-fast differential delta scanning for CI/CD pipelines. - * 📖 **[Read the State Rehydrator Specs](https://squid-protocol.github.io/gitgalaxy/02-22-state-rehydrator/)** +### 6. `spatial_mapper.py` (The Positioning Engine) +**Role:** 3D Geometric Resolution. +Transforms the mathematical DAG into a deterministic 3D Cartesian coordinate map for the WebGPU visualizer. It groups files into directory clusters relative to high-impact central nodes. -

+### 7. `state_rehydrator.py` (The Cache Manager) +**Role:** Incremental Delta Scanning. +During CI/CD pipelines, it is highly inefficient to re-parse 10,000 unchanged files for a 2-file pull request. This module extracts the previous structural state from the SQLite database and rehydrates it directly into RAM, allowing the pipeline to skip the heavy regex extraction phases for unchanged artifacts. --- -### 🌌 Powered by the blAST Engine +## Engineering Highlights + +If you are onboarding into the `core/` architecture, pay special attention to how we solve traditional static analysis scaling problems. By relying on high-velocity heuristics rather than heavy compilation steps, we achieve capabilities and speeds that standard tooling cannot match. + +* **Multi-Tiered ReDoS Defense Architecture (`detector.py` & `prism.py`):** Regular Expression Denial of Service (ReDoS) is a critical threat when scanning unknown or minified code. We do not rely on a single timeout guillotine. The engine utilizes a three-tiered defense: + 1. **O(1) Atomic Literal Shielding:** Temporarily masks string literals to prevent the regex engine from catastrophically backtracking on overlapping quotes. + 2. **Line-Length Limiters:** Identifies abnormally long lines (e.g., hex arrays or minified data blobs) and truncates them before regex evaluation, while perfectly preserving the mathematical Lines of Code (LOC) count. + 3. **OS-Level Interrupts:** If a malformed file still traps the engine in an evaluation loop, a hardware-level OS interrupt fires after 15 seconds. It safely terminates the isolated worker process, downgrades the file to `plaintext`, and ensures the CI/CD pipeline never hangs. +* **Dynamic Mid-File Language Switching (`detector.py`):** Standard parsers routinely fail or miscategorize polyglot files (e.g., SQL logic embedded within a Python string, or JavaScript nested inside HTML). Instead of failing, the engine dynamically swaps syntax registries mid-file. 
It uses scope-aware handshakes to isolate and correctly parse embedded languages, preserving perfect structural context across 50+ languages. +* **AST-Free Algorithmic Complexity (`detector.py`):** Compiling an Abstract Syntax Tree to determine cyclomatic nesting depth requires massive overhead. GitGalaxy bypasses this by using standard code indentation as a blazing-fast, highly accurate proxy for Big-O complexity. This allows the engine to evaluate structural density and recursive depth at speeds exceeding 90,000 LOC/second. +* **Topological Call Graphs & Architectural Test Coverage (`network_risk_sensor.py`):** Recreating a granular, cross-repository function call graph using Abstract Syntax Trees (ASTs) is computational overkill for DevSecOps. ASTs require perfectly compiling code, massive memory overhead, and brittle, language-specific parsers. We bypass this bottleneck by utilizing a high-velocity topological proxy. By mapping file-level `import` statements to establish ecosystem boundaries, and extracting targeted outbound function invocations via structural signatures, we achieve the necessary precision at a fraction of the compute cost. This allows us to calculate the systemic **Dependency Blast Radius** of specific logic purely in RAM. For example, by mapping outbound calls from test files directly to their production targets, we mathematically calculate the exact architectural vulnerability footprint of untested modules across polyglot microservices. +* **The Repository Knowledge Graph (Core Vision):** All of these extractions culminate in a unified mathematical model of the codebase. By treating files as **Nodes** and import statements as **Edges**, we stitch together a cross-repository Knowledge Graph. We then overlay our extracted *Structural Signatures* (state mutations, I/O boundaries, RCE triggers) directly onto these nodes as properties. 
This provides deep, queryable clarity into how vulnerable information flows across polyglot microservices, surfacing systemic risks that isolated file scanners miss. + +## 🌌 The GitGalaxy Ecosystem (Powered by the blAST Engine) + +GitGalaxy Core is the foundational ingestion layer of the broader **GitGalaxy Ecosystem**—a high-velocity, AST-free, LLM-free heuristic knowledge graph engine designed to extract knowledge from any repository. -This documentation is part of the [GitGalaxy Ecosystem](https://github.com/squid-protocol/gitgalaxy), an AST-free, LLM-free heuristic knowledge graph engine. +Explore the ecosystem: -* 🪐 **[Explore the GitHub Repository](https://github.com/squid-protocol/gitgalaxy)** for code, tools, and updates. -* 🔭 **[Visualize your own repository at GitGalaxy.io](https://gitgalaxy.io/)** using our interactive 3D WebGPU dashboard. \ No newline at end of file +* 🪐 **[Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** — Comprehensive deep dives into the engine's mathematics, pipeline architecture, and DevSecOps integration protocols. +* 🔭 **[GitGalaxy Visualizer](https://gitgalaxy.io/)** — Render your codebase's topological network locally in interactive 3D using hardware-accelerated WebGPU. +* 📖 **[The blAST Paradigm](https://squid-protocol.github.io/gitgalaxy/docs/wiki/01-03-the-blast-paradigm/)** — The architectural thesis, academic research, and structural math that makes AST-free parsing possible at scale. +* ⚙️ **[Language Calibration Standards](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md)** — The definitive engineering guide to extending our comparative lexical taxonomy for custom enterprise dialects. 
\ No newline at end of file diff --git a/gitgalaxy/core/aperture.py b/gitgalaxy/core/aperture.py index b4212c0f..18a01890 100644 --- a/gitgalaxy/core/aperture.py +++ b/gitgalaxy/core/aperture.py @@ -22,21 +22,28 @@ # --- CUSTOM EXCEPTION HIERARCHY --- + class ApertureError(Exception): """Base class for all errors generated by the Aperture filtering process.""" + pass + class InaccessibleArtifactError(ApertureError): """Raised when an artifact cannot be accessed due to OS permissions or path corruption.""" + pass + class SaturationError(ApertureError): """Raised when a signal is too dense or minified to be safely evaluated by the engine.""" + pass class FilterResult(TypedDict): """Structured telemetry returned by the Filter for the Pipeline Orchestrator.""" + is_in_scope: bool classification: str # e.g., 'source_code', 'binary_payload', 'generated_noise' reason: Optional[str] @@ -48,8 +55,8 @@ class FilterResult(TypedDict): class ApertureFilter: """ Primary ingestion filter for the analysis engine. Performs perimeter gating to ensure - only valid, maintainable source code reaches the CPU-bound Regex/AST detectors. - Integrates with GuideStar's Bayesian 'Intent Locks' to dynamically adjust suppression + only valid, maintainable source code reaches the CPU-bound Structural Signature extractors. + Integrates with GuideStar's Bayesian 'Contextual Baselines' to dynamically adjust suppression thresholds for known, high-priority artifacts (like package.json). """ @@ -111,20 +118,15 @@ def __init__( self.ignore_patterns = self._load_gitignore_patterns() - self.logger.info( - f"Aperture Filter Online | " - f"Tracking {len(self.whitelisted_extensions)} valid extensions." 
- ) + self.logger.info(f"Aperture Filter Online | Tracking {len(self.whitelisted_extensions)} valid extensions.") - def evaluate_path_integrity( - self, file_path: Union[str, Path], has_intent: bool = False - ) -> Tuple[bool, int, str]: + def evaluate_path_integrity(self, file_path: Union[str, Path], has_intent: bool = False) -> Tuple[bool, int, str]: """ [PHASE 0 ENTRY POINT] Performs high-speed path analysis to build the initial File Census. - + DEFENSIVE DESIGN: We determine if a file is physically valid based on OS metadata - *before* any disk I/O (file opening/reading) occurs. This prevents OS-level locks + *before* any disk I/O (file opening/reading) occurs. This prevents OS-level locks and drastically reduces Memory/CPU overhead on large monolithic repositories. """ path_obj = Path(file_path) @@ -142,16 +144,23 @@ def evaluate_path_integrity( size_bytes = 0 # --- TIER 0.1: THE SECRETS RADAR --- - if path_obj.name in self.config.get( - "SECRETS_EXACT", set() - ) or ext.lower() in self.config.get("SECRETS_EXTENSIONS", set()): + if path_obj.name in self.config.get("SECRETS_EXACT", set()) or ext.lower() in self.config.get( + "SECRETS_EXTENSIONS", set() + ): reason = f"CRITICAL LEAK (Exposed Secret: '{path_obj.name}')" return False, size_bytes, reason # --- TIER 0.2: THE NEURAL AUDITOR SHUNT (Model Weights) --- AI_MODEL_EXTS = { - ".safetensors", ".gguf", ".onnx", ".pt", ".pth", - ".bin", ".tflite", ".pb", ".h5", + ".safetensors", + ".gguf", + ".onnx", + ".pt", + ".pth", + ".bin", + ".tflite", + ".pb", + ".h5", } if ext.lower() in AI_MODEL_EXTS: reason = f"AI MODEL WEIGHTS (Bypassing Standard Logic: '{ext}')" @@ -168,7 +177,7 @@ def evaluate_path_integrity( if active_intent: self._intent_cache.add(normalized_path) -# --- TIER 1: CHECK EXPLICIT IGNORE RULES (.gitignore, node_modules, etc) --- + # --- TIER 1: CHECK EXPLICIT IGNORE RULES (.gitignore, node_modules, etc) --- if not self._check_ignore_rules(relative_path, has_intent=active_intent): reason = "Blocked 
(System Exclusion, Hidden Directory, or Dynamic Ignored Dir)" return False, size_bytes, reason @@ -199,11 +208,7 @@ def is_in_scope( """ path_obj = Path(file_path) normalized_path = path_obj.as_posix() - relative_path = ( - str(path_obj.relative_to(self.root)) - if path_obj.is_relative_to(self.root) - else normalized_path - ) + relative_path = str(path_obj.relative_to(self.root)) if path_obj.is_relative_to(self.root) else normalized_path active_intent = has_intent or (normalized_path in self._intent_cache) result: FilterResult = { @@ -259,9 +264,7 @@ def is_in_scope( result["reason"] = "Protocol Violation: Missing content buffer" return result - integrity = self._check_artifact_integrity( - content, relative_path, has_intent=active_intent - ) + integrity = self._check_artifact_integrity(content, relative_path, has_intent=active_intent) result["total_loc"] = integrity["loc"] if not integrity["valid"]: @@ -277,9 +280,7 @@ def is_in_scope( result["reason"] = f"Internal Exception: {str(e)}" return result - def _check_artifact_integrity( - self, content: str, rel_path: str, has_intent: bool = False - ) -> Dict[str, Any]: + def _check_artifact_integrity(self, content: str, rel_path: str, has_intent: bool = False) -> Dict[str, Any]: """ Deep-scans the content buffer for corruption, binary data, arrays, or documentation generator signatures. @@ -296,7 +297,7 @@ def _check_artifact_integrity( report["loc"] = len(lines_list) # --- TIER 3: OPAQUE BINARY DETECTION --- - # DEFENSIVE DESIGN: Checking for a null byte is the fastest, most reliable + # DEFENSIVE DESIGN: Checking for a null byte is the fastest, most reliable # heuristic to identify compiled binaries or images masquerading as text files. if "\x00" in content: report.update( @@ -310,7 +311,7 @@ def _check_artifact_integrity( # --- TIER 3.1: THE MONOLITH AMALGAMATION SHIELD --- # DEFENSIVE DESIGN: 30,000+ lines in a single file is usually an amalgamation (e.g. sqlite3.c). 
- # Standard Regex engines suffer from Catastrophic Backtracking on files of this magnitude. + # Standard Structural Signature extractors suffer from Catastrophic Backtracking on files of this magnitude. if report["loc"] > 30000: report.update( { @@ -398,7 +399,7 @@ def _check_artifact_integrity( # --- TIER 3.9: TEST DATA & ARRAY SHIELD --- # DEFENSIVE DESIGN: Massive comma-separated arrays or hex blobs (like embedded images - # inside C++ headers) contain 0 architectural logic, but will completely stall an AST parser. + # inside C++ headers) contain 0 executable payload, but will completely stall a structural parser. if report["loc"] > 500: hex_count = content.count("0x") + content.count("0X") if hex_count > report["loc"]: @@ -432,7 +433,7 @@ def _check_artifact_integrity( { "valid": False, "classification": "oversized_minified", - "reason": f"Blocked (Saturation: Line {i+1} exceeds {max_line} chars)", + "reason": f"Blocked (Saturation: Line {i + 1} exceeds {max_line} chars)", } ) return report @@ -481,9 +482,7 @@ def _check_ignore_rules(self, rel_path: str, has_intent: bool = False) -> bool: if any(fnmatch.fnmatch(p + "/", pattern) for p in parts): return False else: - if fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch( - filename, pattern - ): + if fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch(filename, pattern): return False return True @@ -503,4 +502,4 @@ def _load_gitignore_patterns(self) -> List[str]: except Exception as e: self.logger.warning(f"Failed to parse .gitignore: {e}") - return patterns \ No newline at end of file + return patterns diff --git a/gitgalaxy/core/detector.py b/gitgalaxy/core/detector.py index e8bfb079..b280de17 100644 --- a/gitgalaxy/core/detector.py +++ b/gitgalaxy/core/detector.py @@ -36,7 +36,7 @@ def get_token_mass(text: str, deep_scan: bool = False) -> Optional[int]: # ============================================================================== -# GitGalaxy Phase 2.5 & 7.5: Logic Splicer & Cartographer +# GitGalaxy 
Phase 2.5 & 7.5: Logic Splicer & Topological Mapper # Strategy v6.3.0 Protocol: Fluid-State Counters, Language Sliding & Semantic Modes # ============================================================================== @@ -97,18 +97,18 @@ class LogicData(TypedDict, total=False): # ============================================================================== -# THE OPTICAL CONFIGURATION MATRIX +# THE STRUCTURAL SIGNATURE CONFIGURATION MATRIX # ============================================================================== class ScopeParsingRegistry: """ - The Optical Calibration Matrix for GalaxyScope's Primary Detector. + The Structural Signature Calibration Matrix for GalaxyScope's Primary Detector. Defines the structural heuristics required to slice non-brace languages. DEFENSIVE ARCHITECTURE: - By categorizing languages into integration modes, the engine avoids building - heavy Abstract Syntax Trees (ASTs). It visualizes functional intent across + By categorizing languages into integration modes, the engine avoids building + heavy Abstract Syntax Trees (ASTs). It visualizes functional intent across 50+ languages natively without requiring the codebase to compile. 
- MODE D: Keyword Scope Tracking (Depth tracking via language-specific keywords) @@ -228,7 +228,7 @@ class ScopeParsingRegistry: @classmethod def get_config(cls, lang_id: str) -> Optional[dict]: - """Resolves aliases and returns the optical physics config for the language.""" + """Resolves aliases and returns the structural signature config for the language.""" if not lang_id: return None normalized_id = lang_id.lower() @@ -243,7 +243,7 @@ def get_mode(cls, lang_id: str) -> Optional[str]: # ------------------------------------------------------------------------------ -# THE DETECTOR (Optical Detector) +# THE DETECTOR (Structural Detector) # ------------------------------------------------------------------------------ @@ -251,16 +251,16 @@ class StructuralExtractor: """ GitGalaxy Structural Extractor (Primary Heuristic Logic & Function Mapper). - PURPOSE: Performs AST-less analysis of executable logic streams to extract + PURPOSE: Performs AST-less analysis of executable logic streams to extract functional nodes, calculate complexity, and detect structural security signatures. DEFENSIVE ARCHITECTURE (Lexical Heuristics vs. AST Parsing): - AST parsers often fail when encountering non-standard syntax, legacy dialects, - or partially-broken codebases. This extractor utilizes Fluid State Counters - and O(1) lexical masking to achieve high-fidelity node extraction at - ~100,000 LOC/sec, maintaining high performance without requiring + AST parsers often fail when encountering non-standard syntax, legacy dialects, + or partially-broken codebases. This extractor utilizes Fluid State Counters + and O(1) lexical masking to achieve high-fidelity node extraction at + ~100,000 LOC/sec, maintaining high performance without requiring fully-compilable source code. - + ARCHITECTURE: 1. Fluid State Counter: Dynamically swaps regex registries mid-file for embedded languages. 2. Bucket Continuation: Accumulates secondary language hits into the primary vector. 
@@ -321,31 +321,26 @@ def __init__( "branching": "branch", "io_ops": "io", "safety": "safety", - "danger": "danger", + "high_risk_execution": "high_risk_execution", "concurrency": "concurrency", - "logic_flux": "flux", + "logic_flux": "state_mutation", } self.MAX_SATELLITES = 250 self.MAX_DEPTH = 50 self.HANDSHAKE_LOOKAHEAD_LIMIT = 50000 - if ( - self.primary_lang_id not in self.languages - or "rules" not in self.languages.get(self.primary_lang_id, {}) - ): + if self.primary_lang_id not in self.languages or "rules" not in self.languages.get(self.primary_lang_id, {}): try: from gitgalaxy.standards.language_standards import LANGUAGE_DEFINITIONS - + # Apply the healed definitions to the instance state self.languages = LANGUAGE_DEFINITIONS lang_config = self.languages.get(self.primary_lang_id, {}) self.primary_rules = lang_config.get("rules", {}) self.primary_family = lang_config.get("lexical_family", "c_style_comment") - self.logger.warning( - f"[AUTO-HEAL] Re-injected LANGUAGE_DEFINITIONS for '{self.primary_lang_id}'" - ) + self.logger.warning(f"[AUTO-HEAL] Re-injected LANGUAGE_DEFINITIONS for '{self.primary_lang_id}'") except ImportError: pass @@ -361,18 +356,18 @@ def splice( self.raw_content_lines = raw_content.splitlines() if raw_content else [] regex_telemetry = {} - # We always extract the metadata first, even for Dark Matter files + # We always extract the metadata first, even for Unparsable Artifacts ghost_meta = self._decode_comment_stream(comment_stream) # ---> THE ECOSYSTEM GRAVITY OVERRIDE <--- # If the broader ecosystem safely locked a contested file (like a .h header) # into a C-family language, we trust the gravity and artificially boost the confidence. - # This prevents pure-macro headers from falling below the 0.42 floor and vanishing into Dark Matter. + # This prevents pure-macro headers from falling below the 0.42 floor and vanishing into Unparsable Artifacts. if self.primary_lang_id in ["c", "cpp", "objective-c"]: confidence = 1.0 - # 1. 
The Custom Singularity Bypass & Prose Deflection - # Rejects unverified artifacts AND Inert Matter files before wasting compute + # 1. The Custom Unparsable Artifact Bypass & Prose Deflection + # Rejects unverified artifacts AND Static Assets before wasting compute if confidence < 0.42 or self.primary_lang_id in ( "plaintext", "markdown", @@ -381,7 +376,7 @@ def splice( "csv", ): self.logger.debug( - f"[DIAGNOSTIC] Bypass triggered (Conf: {confidence:.2f} | Lang: {self.primary_lang_id}). Relegating to Dark Matter/Ghost Mass." + f"[DIAGNOSTIC] Bypass triggered (Conf: {confidence:.2f} | Lang: {self.primary_lang_id}). Relegating to Unparsable Artifacts." ) return { "equations": {}, @@ -421,24 +416,18 @@ def splice( try: line_count = sum(1 for l in code_stream.splitlines() if l.strip()) - # --- EXISTING OPTICAL PIPELINE --- + # --- EXISTING STRUCTURAL PIPELINE --- segments = self._partition_segments(code_stream, self.primary_lang_id) - equations, mitigation_telemetry, segment_spatial_maps, extracted_parents = ( - self.coding_analysis( - segments, regex_telemetry if profile_regex else None - ) + equations, mitigation_telemetry, segment_spatial_maps, extracted_parents = self.coding_analysis( + segments, regex_telemetry if profile_regex else None ) if extracted_parents: # Store the top 3 parent entities to prevent massive string bloat on huge files - ghost_meta["parent_entity"] = ", ".join( - list(dict.fromkeys(extracted_parents))[:3] - ) + ghost_meta["parent_entity"] = ", ".join(list(dict.fromkeys(extracted_parents))[:3]) - equations = self.comment_analysis( - comment_stream, self.primary_lang_id, equations - ) + equations = self.comment_analysis(comment_stream, self.primary_lang_id, equations) functions, sum_fxn_impact = self._function_slice( segments, @@ -458,11 +447,7 @@ def splice( for i, match in enumerate(class_matches): start_idx = match.start() # Scope ends at the next class declaration, or the end of the file - end_idx = ( - class_matches[i + 1].start() - if i + 
1 < len(class_matches) - else len(code_stream) - ) + end_idx = class_matches[i + 1].start() if i + 1 < len(class_matches) else len(code_stream) # Convert raw string indices to line numbers for spatial bounding start_line = code_stream.count("\n", 0, start_idx) + 1 @@ -485,31 +470,21 @@ def splice( class_methods = [] for func in functions: # If the function falls within the spatial bounds of the class - if ( - cls["_start_line"] - <= func.get("start_line", 0) - <= cls["_end_line"] - ): + if cls["_start_line"] <= func.get("start_line", 0) <= cls["_end_line"]: func["parent_class_name"] = cls["name"] class_methods.append(func) cls["method_count"] = len(class_methods) # State Entanglement: Density of state mutations (flux) inside the class methods - total_flux = sum( - m.get("hit_vector", {}).get("flux", 0) for m in class_methods - ) - cls["state_entanglement"] = round( - (total_flux / max(cls["method_count"], 1)) * 5.0, 2 - ) + total_flux = sum(m.get("hit_vector", {}).get("state_mutation", 0) for m in class_methods) + cls["state_entanglement"] = round((total_flux / max(cls["method_count"], 1)) * 5.0, 2) # LCOM (Lack of Cohesion of Methods): Approximation using arguments vs mutations total_args = sum(m.get("args", 0) for m in class_methods) if cls["method_count"] > 1: cohesion_ratio = total_flux / max(total_args, 1) - cls["lcom_score"] = round( - max(0.0, min(100.0, 100.0 - (cohesion_ratio * 25.0))), 2 - ) + cls["lcom_score"] = round(max(0.0, min(100.0, 100.0 - (cohesion_ratio * 25.0))), 2) else: cls["lcom_score"] = 0.0 @@ -518,16 +493,12 @@ def splice( del cls["_end_line"] branch_hits = equations.get("branch", 0) - linear_hits = equations.get("linear", 0) - total_control_flow_ratio = round( - branch_hits / max(branch_hits + linear_hits, 1), 3 - ) + linear_hits = equations.get("structural_boundaries", 0) + total_control_flow_ratio = round(branch_hits / max(branch_hits + linear_hits, 1), 3) # Use the newly standardized keys from the updated coding_analysis total_signals 
= sum(equations.values()) - logic_density = ( - round(total_signals / line_count, 3) if line_count > 0 else 0.0 - ) + logic_density = round(total_signals / line_count, 3) if line_count > 0 else 0.0 # --- NEW: INTRA-FILE ORPHAN & DUPLICATE DETECTOR --- import collections @@ -560,12 +531,10 @@ def splice( func["usage_status"] = usage_status if orphan_count > 0: - equations["design_slop_orphans"] = orphan_count + equations["orphaned_logic"] = orphan_count # Calculate total file footprint, preferring the unshielded raw text if available - file_token_mass = get_token_mass( - raw_content if raw_content else code_stream - ) + file_token_mass = get_token_mass(raw_content if raw_content else code_stream) result_payload = { "equations": equations, @@ -578,9 +547,7 @@ def splice( "mitigation_telemetry": mitigation_telemetry, "token_mass": file_token_mass, "financial_read_cost": ( - round((file_token_mass / 1000000) * 3.00, 5) - if file_token_mass is not None - else None + round((file_token_mass / 1000000) * 3.00, 5) if file_token_mass is not None else None ), } if profile_regex: @@ -591,9 +558,7 @@ def splice( # Let the Hardware Guillotine drop cleanly to the Worker thread! 
raise except Exception as e: - self.logger.error( - f"Catastrophic failure during structural splicing: {e}", exc_info=True - ) + self.logger.error(f"Catastrophic failure during structural splicing: {e}", exc_info=True) return { "equations": {}, "functions": [], @@ -616,9 +581,7 @@ def _decode_comment_stream(self, comment_stream: str) -> Dict[str, str]: m_owner = re_ownership.search(comment_stream) if m_owner: ownership_val = ( - m_owner.group(m_owner.lastindex).strip() - if m_owner.lastindex - else m_owner.group(0).strip() + m_owner.group(m_owner.lastindex).strip() if m_owner.lastindex else m_owner.group(0).strip() ) except Exception: pass @@ -664,11 +627,7 @@ def _decode_comment_stream(self, comment_stream: str) -> Dict[str, str]: break else: continue - if ( - re_boundary - and hasattr(re_boundary, "match") - and re_boundary.match(line_str) - ): + if re_boundary and hasattr(re_boundary, "match") and re_boundary.match(line_str): break purpose_buffer.append(line_str) has_block_text = True @@ -677,37 +636,21 @@ def _decode_comment_stream(self, comment_stream: str) -> Dict[str, str]: if active_capture == "line": if ( not line_str - or ( - re_boundary - and hasattr(re_boundary, "match") - and re_boundary.match(line_str) - ) - or ( - re_purpose_block - and hasattr(re_purpose_block, "match") - and re_purpose_block.match(line_str) - ) + or (re_boundary and hasattr(re_boundary, "match") and re_boundary.match(line_str)) + or (re_purpose_block and hasattr(re_purpose_block, "match") and re_purpose_block.match(line_str)) ): active_capture = None else: fallback_buffer.append(line_str) continue - if ( - re_purpose_block - and hasattr(re_purpose_block, "match") - and re_purpose_block.match(line_str) - ): + if re_purpose_block and hasattr(re_purpose_block, "match") and re_purpose_block.match(line_str): active_capture = "block" purpose_buffer = [] has_block_text = False continue - if ( - re_purpose_line - and hasattr(re_purpose_line, "match") - and not purpose_buffer - ): + if 
re_purpose_line and hasattr(re_purpose_line, "match") and not purpose_buffer: try: m_purpose = re_purpose_line.match(line_str) if m_purpose: @@ -732,7 +675,7 @@ def _decode_comment_stream(self, comment_stream: str) -> Dict[str, str]: return meta - def _extract_ghost_tether(self, start_line: int, lang_id: str) -> str: + def _extract_documentation_tether(self, start_line: int, lang_id: str) -> str: """Surgically extracts the human intent (docstring/comments) using exact spatial coordinates.""" if not hasattr(self, "raw_content_lines") or not self.raw_content_lines: return "" @@ -749,15 +692,11 @@ def _extract_ghost_tether(self, start_line: int, lang_id: str) -> str: prev = self.raw_content_lines[j].strip() if not prev: continue - if prev.startswith( - ("#", "//", "/*", "*", "///", "--", " FAST SINGLE-PASS COMMENT STRIP <--- # Ensures #var or #foo are not erroneously treated as comments if they are not at the start of a word. - safe_code = re.sub( - r"(^|[ \t])(?:#|--|//).*$", r"\1", safe_code, flags=re.MULTILINE - ) + safe_code = re.sub(r"(^|[ \t])(?:#|--|//).*$", r"\1", safe_code, flags=re.MULTILINE) # 2. Split both into parallel arrays original_lines = code.splitlines(keepends=True) @@ -1769,9 +1631,7 @@ def _slice_by_keywords( # The Ruby/Elixir Inline Modifier Guard if lang_key in ["ruby", "elixir"] and opens > 0: # Find all valid condition keywords on the line - inline_mods = len( - re.findall(r"(? 0: # Check if one of them is the actual start of the statement @@ -1835,9 +1695,7 @@ def _slice_by_keywords( current_line_offset += 1 current_char_offset += len(orig_line) - self.logger.debug( - "[DIAGNOSTIC] Mode D: Finished traversing. Processing remnants..." - ) + self.logger.debug("[DIAGNOSTIC] Mode D: Finished traversing. 
Processing remnants...") if stack_depth > 0 and current_satellite: block = "\n".join(current_satellite).strip() @@ -1872,9 +1730,7 @@ def _slice_by_keywords( satellites.append(sat) sum_fxn_impact += mag - self.logger.debug( - f"[DIAGNOSTIC] Mode D: Extracted {len(satellites)} satellites." - ) + self.logger.debug(f"[DIAGNOSTIC] Mode D: Extracted {len(satellites)} satellites.") return satellites, sum_fxn_impact def _slice_by_terminator( @@ -1909,15 +1765,9 @@ def _slice_by_terminator( def preserve_newlines(m): return '""' + "\n" * m.group(0).count("\n") - safe_code = re.sub( - r'"(?:\\.|[^"\\])*"', preserve_newlines, code, flags=re.DOTALL - ) - safe_code = re.sub( - r"'(?:\\.|[^'\\])*'", preserve_newlines, safe_code, flags=re.DOTALL - ) - safe_code = re.sub( - r"`(?:\\.|[^`\\])*`", preserve_newlines, safe_code, flags=re.DOTALL - ) + safe_code = re.sub(r'"(?:\\.|[^"\\])*"', preserve_newlines, code, flags=re.DOTALL) + safe_code = re.sub(r"'(?:\\.|[^'\\])*'", preserve_newlines, safe_code, flags=re.DOTALL) + safe_code = re.sub(r"`(?:\\.|[^`\\])*`", preserve_newlines, safe_code, flags=re.DOTALL) # ---> FAST SINGLE-PASS COMMENT STRIP <--- # Execute the regex once globally. Prevents 500,000+ regex calls on massive SQL dumps. 
@@ -1943,13 +1793,9 @@ def preserve_newlines(m): sat_start_char = current_char_offset match = igniter_pattern.search(safe_line) if match: - lang_key = ScopeParsingRegistry._ALIASES.get( - lang_id.lower(), lang_id.lower() - ) + lang_key = ScopeParsingRegistry._ALIASES.get(lang_id.lower(), lang_id.lower()) satellite_name = ( - f"{match.group(1).upper()}_Statement" - if "sql" in lang_key - else match.group(0).strip() + f"{match.group(1).upper()}_Statement" if "sql" in lang_key else match.group(0).strip() ) satellite_name = re.sub(r"[^a-zA-Z0-9_]", "", satellite_name) @@ -2023,12 +1869,12 @@ def _calculate_block_metrics( spatial_map: Dict[str, List[int]] = None, ) -> Tuple[FunctionNode, float]: """ - Calculates the structural weight, algorithmic complexity, and hit vector + Calculates the structural weight, algorithmic complexity, and hit vector for an extracted functional block. DEFENSIVE ARCHITECTURE (Big-O without ASTs): - ASTs require intense compilation overhead to determine cyclomatic nesting depth. - Because we prioritize functional intent, this engine uses standard indentation + ASTs require intense compilation overhead to determine cyclomatic nesting depth. + Because we prioritize functional intent, this engine uses standard indentation as a 95% accurate proxy for O(N) complexity at a fraction of the compute cost. 
""" args_pattern = rules.get("args") @@ -2044,24 +1890,20 @@ def _calculate_block_metrics( hit_vector[key] = count branch_hits = hit_vector.get("branch", 0) - linear_hits = hit_vector.get("linear", 0) + linear_hits = hit_vector.get("structural_boundaries", 0) else: # Fallback for untested manual calls branch_pattern = rules.get("branch") - linear_pattern = rules.get("linear") + linear_pattern = rules.get("structural_boundaries") branch_hits = ( len(branch_pattern.findall(block)) if hasattr(branch_pattern, "findall") - else ( - len(re.findall(str(branch_pattern), block)) if branch_pattern else 0 - ) + else (len(re.findall(str(branch_pattern), block)) if branch_pattern else 0) ) linear_hits = ( len(linear_pattern.findall(block)) if hasattr(linear_pattern, "findall") - else ( - len(re.findall(str(linear_pattern), block)) if linear_pattern else 0 - ) + else (len(re.findall(str(linear_pattern), block)) if linear_pattern else 0) ) total_hits = branch_hits + linear_hits @@ -2073,11 +1915,7 @@ def _calculate_block_metrics( # --- FAST CODING LOC HEURISTIC (Syntax Fixed!) --- # Quickly strip out blank lines and standard single-line comments to find the true logic mass # THE FIX: Preserve leading whitespace to calculate Big-O nesting depth! - raw_lines = [ - l - for l in block.splitlines() - if l.strip() and not l.lstrip().startswith(("#", "//", "/*", "*")) - ] + raw_lines = [l for l in block.splitlines() if l.strip() and not l.lstrip().startswith(("#", "//", "/*", "*"))] coding_loc = len(raw_lines) # --- NEW: BIG-O ALGORITHMIC COMPLEXITY TRACKER --- @@ -2099,24 +1937,20 @@ def _calculate_block_metrics( # Check if the function's name appears followed by a parenthesis/space inside its own body. # We check for > 1 because the first hit is the function definition itself! 
is_recursive = False - if ( - name - and len(name) > 2 - and name not in {"Unknown_Sat", "Anonymous_Block", "Main"} - ): + if name and len(name) > 2 and name not in {"Unknown_Sat", "Anonymous_Block", "Main"}: # Fast heuristic: Count occurrences. If it appears more than once, it's highly likely recursive. occurrence_count = len(re.findall(r"\b" + re.escape(name) + r"\b", block)) if occurrence_count > 1: is_recursive = True # --- NEW: FUNCTION-LEVEL DATABASE COMPLEXITY (Data Gravity) --- - # Mapped to active v6 schemas: 'io' (DB connections/SQL), 'flux' (mutations), and 'serialization_parsing' (JSON/ORMs). + # Mapped to active v6 schemas: 'io' (DB connections/SQL), 'state_mutation' (mutations), and 'serialization_parsing' (JSON/ORMs). db_complexity = 0 if hit_vector: db_complexity = ( (hit_vector.get("io", 0) * 3) + (hit_vector.get("serialization_parsing", 0) * 2) - + (hit_vector.get("flux", 0) * 1) + + (hit_vector.get("state_mutation", 0) * 1) ) # --- NEW: FUNCTION-LEVEL KEYWORD DENSITY (The Micro-Auditor) --- @@ -2129,11 +1963,7 @@ def _calculate_block_metrics( try: arg_match = args_pattern.search(block) if arg_match: - args_str = ( - arg_match.group(arg_match.lastindex) - if arg_match.lastindex - else arg_match.group(0) - ) + args_str = arg_match.group(arg_match.lastindex) if arg_match.lastindex else arg_match.group(0) if args_str and args_str.strip() != "()": if "," in args_str: args_count = args_str.count(",") + 1 @@ -2152,35 +1982,30 @@ def _calculate_block_metrics( effective_loc = min(loc, (total_signals + 1) * 10) # ---> THE FIX 2: SUB-LINEAR ARGUMENT DAMPENER & BIG-O SCALAR <--- - # Apply a square root to the arguments to prevent combinatorial mass explosions - # on edge-case mega-functions, while preserving the core physics philosophy. + # Apply a square root to the arguments to prevent combinatorial magnitude explosions + # on edge-case mega-functions, while preserving the core structural philosophy. 
arg_multiplier = math.sqrt(args_count + 1) # Apply Big O Depth as an exponential gravity multiplier. # O(N)=1.0x, O(N^2)=1.5x, O(N^3)=2.0x, etc. complexity_multiplier = 1.0 + ((big_o_depth - 1) * 0.5) - # Recursive functions are dangerous and mathematically dense. Double their mass. + # Recursive functions are dangerous and mathematically dense. Double their magnitude. if is_recursive: complexity_multiplier *= 2.0 # Calculate magnitude using the dampened arguments, Big-O depth, and logic-bounded length - magnitude = float( - (branch_hits + 1) * arg_multiplier * complexity_multiplier - + (0.05 * effective_loc) - ) + magnitude = float((branch_hits + 1) * arg_multiplier * complexity_multiplier + (0.05 * effective_loc)) - # ---> THE FIX: SPATIAL GEOMETRY MATH <--- + # ---> THE FIX: LOGIC TOPOLOGY MATH <--- # Calculate the Control Flow Ratio and the Fractal Fibonacci Angle (Theta) total_cf_signals = branch_hits + linear_hits - control_flow_ratio = ( - (branch_hits / total_cf_signals) if total_cf_signals > 0 else 0.0 - ) + control_flow_ratio = (branch_hits / total_cf_signals) if total_cf_signals > 0 else 0.0 angle = 22.5 + (1.0 - control_flow_ratio) * 67.5 - # ---> NEW: THE GHOST TETHER <--- + # ---> NEW: THE DOCUMENTATION TETHER <--- # Re-attach the human intent using the exact starting line coordinate! - docstring = self._extract_ghost_tether(start_line, self.primary_lang_id) + docstring = self._extract_documentation_tether(start_line, self.primary_lang_id) # ---> NEW: LEVEL 3 WIRING (Function Call Chains) <--- # We scan the block for any word followed by a parenthesis, minus common language keywords. 
@@ -2255,9 +2080,7 @@ def _calculate_block_metrics( "Boolean", } # Deduplicate and filter (excluding the function calling itself recursively) - calls_out = list( - set([c for c in raw_calls if c not in ignore_keywords and c != name]) - )[:20] + calls_out = list(set([c for c in raw_calls if c not in ignore_keywords and c != name]))[:20] sat: FunctionNode = { "name": name[:40], @@ -2292,7 +2115,7 @@ def _calculate_block_metrics( def _extract_name(self, raw_match: str) -> str: """ Heuristic Token Normalizer. - Safely extracts the functional identifier (function, class, or method name) from a raw + Safely extracts the functional identifier (function, class, or method name) from a raw regex capture block by isolating the last valid alphanumeric token before parameter boundaries. """ match_strip = raw_match.strip() @@ -2301,9 +2124,7 @@ def _extract_name(self, raw_match: str) -> str: if match_strip.startswith("-") or match_strip.startswith("+"): clean_objc = re.sub(r"^[-+]\s*(?:\([^)]+\))?\s*", "", match_strip) clean_objc = clean_objc.split(":")[0].split("(")[0].split("{")[0].strip() - words = [ - w for w in re.findall(r"[a-zA-Z0-9_.-]+", clean_objc) if w.strip("_-") - ] + words = [w for w in re.findall(r"[a-zA-Z0-9_.-]+", clean_objc) if w.strip("_-")] return words[0] if words else "Unknown_Block" # --- 1.5 Overloaded Operator Extraction (C++) --- @@ -2341,42 +2162,33 @@ def _extract_name(self, raw_match: str) -> str: clean = clean.split(":")[0].strip() else: # ---> Namespace Resolution Preservation (C++/PHP) <--- - # DEFENSIVE ARCHITECTURE: Rather than utilizing expensive regex lookaheads to ignore - # double-colons (::) while splitting on single colons (:) for type hints, we utilize + # DEFENSIVE ARCHITECTURE: Rather than utilizing expensive regex lookaheads to ignore + # double-colons (::) while splitting on single colons (:) for type hints, we utilize # a high-speed O(N) string replacement to temporarily mask the namespace operator. 
clean = clean.replace("::", "__NAMESPACE_SCOPE__") - + # Truncate at parameter lists, body openings, or return type hints clean = clean.split("(")[0].split("{")[0].split(":")[0].strip() - + # Restore the namespace operator clean = clean.replace("__NAMESPACE_SCOPE__", "::") # Allow standard characters, plus Makefiles ($/%), and Scopes (:) - words = [ - w for w in re.findall(r"[a-zA-Z0-9_./%$():-]+", clean) if w.strip("_-:") - ] + words = [w for w in re.findall(r"[a-zA-Z0-9_./%$():-]+", clean) if w.strip("_-:")] return words[-1] if words else "Unknown_Block" + def _classify_function(self, name: str, block: str, rules: Dict[str, Any]) -> str: tag_match = re.search(r"[\@](?:type|gal_type)[:\s]+(\w+)", block, re.IGNORECASE) if tag_match: return tag_match.group(1).lower() name_lower = name.lower() - if any( - v in name_lower for v in ["get", "fetch", "load", "read", "query", "select"] - ): + if any(v in name_lower for v in ["get", "fetch", "load", "read", "query", "select"]): return "io" - if any( - v in name_lower - for v in ["set", "write", "save", "update", "delete", "post", "send", "put"] - ): + if any(v in name_lower for v in ["set", "write", "save", "update", "delete", "post", "send", "put"]): return "mutation" - if any( - v in name_lower - for v in ["on", "handle", "click", "submit", "route", "rupt", "task"] - ): + if any(v in name_lower for v in ["on", "handle", "click", "submit", "route", "rupt", "task"]): return "event" if any( v in name_lower @@ -2398,15 +2210,11 @@ def _classify_function(self, name: str, block: str, rules: Dict[str, Any]) -> st if any(v in name_lower for v in ["test", "assert", "mock", "stub"]): return "verification" - danger_pattern = rules.get("danger") + danger_pattern = rules.get("high_risk_execution") io_pattern = rules.get("io") - if ( - danger_pattern - and hasattr(danger_pattern, "search") - and danger_pattern.search(block) - ): - return "danger" + if danger_pattern and hasattr(danger_pattern, "search") and 
danger_pattern.search(block): + return "high_risk_execution" if io_pattern and hasattr(io_pattern, "search") and io_pattern.search(block): return "io" diff --git a/gitgalaxy/core/guidestar_lens.py b/gitgalaxy/core/guidestar_lens.py index 4afd45b6..42018eae 100644 --- a/gitgalaxy/core/guidestar_lens.py +++ b/gitgalaxy/core/guidestar_lens.py @@ -24,7 +24,7 @@ class GuideStarLens: """ - The GuideStar Lens provides 'Social Proof' for files by parsing repository + The GuideStar Lens provides Contextual Baselines for files by parsing repository instructions and structural metadata. DEFENSIVE DESIGN: Before spinning up heavy regex engines or AST parsers, @@ -79,7 +79,7 @@ def scan_project_config(self): Phase 0.5: Main orchestration method that dispatches scouts to scan manifests, configurations, and explicit directives. """ - self.logger.info("GuideStar: Scanning sectors for Social & Roadmap Proof...") + self.logger.info("GuideStar: Scanning sectors for Contextual Baselines & Roadmap Proof...") # 1. Inspect package managers and build manifests self._scan_package_manifests() @@ -102,9 +102,7 @@ def get_intent_status(self, path: Union[str, Path]) -> Tuple[bool, Dict[str, Any """Returns the specific Intent Lock for a given file path based on strict, pattern, or sector match.""" path_obj = Path(path) filename = path_obj.name - rel_path = str( - path_obj.relative_to(self.root) if path_obj.is_absolute() else path_obj - ).replace("\\", "/") + rel_path = str(path_obj.relative_to(self.root) if path_obj.is_absolute() else path_obj).replace("\\", "/") # 1. 
Check direct filename match (e.g., 'main.py') lock = self.intent_locks.get(filename) @@ -180,7 +178,7 @@ def _inject_pattern_lock(self, pattern: str, lang: str, confidence: float, proof # ============================================================================== def _scan_package_manifests(self): - """Identifies authoritative project anchors and parses their internal logic.""" + """Identifies authoritative project contextual baselines and parses their internal logic.""" # Dynamically inject requirements.txt if it wasn't in the global config active_manifests = dict(self.MANIFEST_MAP) if "requirements.txt" not in active_manifests: @@ -210,16 +208,23 @@ def _deep_inspect_manifest(self, path: Path, filename: str, lang: str): elif filename in ("pyproject.toml", "Cargo.toml", "requirements.txt"): self._parse_toml_style_manifest(path, lang) except Exception as e: - self.logger.debug( - f"GuideStar: Deep inspection failed for '{filename}': {e}" - ) + self.logger.debug(f"GuideStar: Deep inspection failed for '{filename}': {e}") def _detect_ai_ecosystem(self, content: str, filename: str): """Scans manifest files for explicit AI/LLM orchestrators or tensor frameworks.""" ai_keywords = { - "langchain", "llama_index", "openai", "anthropic", "torch", - "tensorflow", "transformers", "huggingface_hub", "vllm", "ollama", - "chromadb", "pinecone", + "langchain", + "llama_index", + "openai", + "anthropic", + "torch", + "tensorflow", + "transformers", + "huggingface_hub", + "vllm", + "ollama", + "chromadb", + "pinecone", } found = [kw for kw in ai_keywords if kw in content.lower()] @@ -251,7 +256,9 @@ def _parse_package_json(self, path: Path): for name, cmd in scripts.items(): files = re.findall(r"([a-zA-Z0-9_\-\./]+\.(?:js|ts|mjs|cjs))", cmd) for f in files: - self._inject_intent_lock(f, "javascript", 0.85, f"Manifest Script (package.json:scripts:{name})") + self._inject_intent_lock( + f, "javascript", 0.85, f"Manifest Script (package.json:scripts:{name})" + ) except Exception: 
pass @@ -262,9 +269,7 @@ def _parse_makefile(self, path: Path): content = f.read() # Strategy 1: Find variable assignments like SRCS = main.c ... - matches = re.findall( - r"(?:SRCS|SOURCES|FILES|TARGET)\s*[+:]?=\s*(.*)", content, re.I - ) + matches = re.findall(r"(?:SRCS|SOURCES|FILES|TARGET)\s*[+:]?=\s*(.*)", content, re.I) for m in matches: files = m.split() for f in files: @@ -311,7 +316,7 @@ def _extract_execution_triggers(self, text: str): self._inject_intent_lock(filename, predicted_lang, 0.85, f"Execution Trigger ({prefix_clean})") # ============================================================================== - # EXPLICIT AUTHORITY + # EXPLICIT AUTHORITY # ============================================================================== def _scan_gitattributes(self): @@ -357,12 +362,12 @@ def _scan_gitattributes(self): 0.99, f"Authoritative Override (.gitattributes: {attr})", ) - self.logger.debug(f"GuideStar: Locked pattern '{pattern}' to '{engine_lang}' via .gitattributes") + self.logger.debug( + f"GuideStar: Locked pattern '{pattern}' to '{engine_lang}' via .gitattributes" + ) except Exception as e: - self.logger.debug( - f"GuideStar: Deep inspection failed for .gitattributes: {e}" - ) + self.logger.debug(f"GuideStar: Deep inspection failed for .gitattributes: {e}") # ============================================================================== # SECURITY EVASION DETECTION @@ -371,10 +376,10 @@ def _scan_gitattributes(self): def _scan_gitignore_evasion(self): """ Scans .gitignore for hostile force-includes (e.g., !payload.so). - - DEFENSIVE DESIGN: Attackers frequently use force-includes in .gitignore - to bypass standard directory exclusions (like node_modules/) and force - malicious compiled binaries to be tracked by the repository. 
We intercept + + DEFENSIVE DESIGN: Attackers frequently use force-includes in .gitignore + to bypass standard directory exclusions (like node_modules/) and force + malicious compiled binaries to be tracked by the repository. We intercept these here and flag them for the X-Ray Binary Sensor. """ gitignore_path = self.root / ".gitignore" @@ -395,7 +400,9 @@ def _scan_gitignore_evasion(self): if ext in hostile_bins: clean_path = line[1:].strip("/") - self.logger.critical(f"🚨 EVASION DETECTED: .gitignore is force-including a binary -> '{line}'") + self.logger.critical( + f"🚨 EVASION DETECTED: .gitignore is force-including a binary -> '{line}'" + ) self._inject_intent_lock( clean_path, @@ -405,9 +412,7 @@ def _scan_gitignore_evasion(self): ) except Exception as e: - self.logger.debug( - f"GuideStar: Evasion inspection failed for .gitignore: {e}" - ) + self.logger.debug(f"GuideStar: Evasion inspection failed for .gitignore: {e}") # ============================================================================== # DOCUMENTATION COVERAGE MAP @@ -416,16 +421,25 @@ def _scan_gitignore_evasion(self): def _calculate_documentation_coverage(self): """ Scans the repository for high-value architectural literature. - - PERFORMANCE OPTIMIZATION: Instead of opening and reading thousands of - Markdown files to determine their value, we use `os.stat()` to fetch - the physical byte size of the file. This is an extremely fast O(1) disk - operation that allows us to build a heat map of documentation density. + + PERFORMANCE OPTIMIZATION: Instead of opening and reading thousands of + Markdown files to determine their value, we use `os.stat()` to fetch + the physical byte size of the file. This is an extremely fast O(1) disk + operation that allows us to build a topological map of documentation coverage, + making the assumption the larger doc files have more information in them. 
""" anchor_patterns = { - "README.md", "README.txt", "README.rst", "ARCHITECTURE.md", - "DESIGN.md", "SPEC.md", "swagger.json", "openapi.yaml", - "openapi.json", "CONTRIBUTING.md", "USAGE.md", + "README.md", + "README.txt", + "README.rst", + "ARCHITECTURE.md", + "DESIGN.md", + "SPEC.md", + "swagger.json", + "openapi.yaml", + "openapi.json", + "CONTRIBUTING.md", + "USAGE.md", } for root_dir, dirs, files in os.walk(self.root): @@ -434,7 +448,7 @@ def _calculate_documentation_coverage(self): if any(part in self._gs_config.get("IGNORED_DIRECTORIES", set()) for part in dir_path.parts): continue - local_shield_mass = 0 + local_shield_footprint = 0 for file in files: if file in anchor_patterns or file.lower().endswith(".md"): @@ -444,17 +458,19 @@ def _calculate_documentation_coverage(self): # Ignore stubs (e.g., "# Project Title" and nothing else) if size_bytes > 150: - local_shield_mass += size_bytes + local_shield_footprint += size_bytes except OSError: pass - if local_shield_mass > 0: + if local_shield_footprint > 0: # 3000+ bytes of documentation provides a 100% (1.0) shield for this folder. 
- shield_strength = min(local_shield_mass / 3000.0, 1.0) + shield_strength = min(local_shield_footprint / 3000.0, 1.0) rel_dir = str(dir_path.relative_to(self.root)).replace("\\", "/") if rel_dir == ".": rel_dir = "__root__" self.documentation_coverage[rel_dir] = round(shield_strength, 3) - self.logger.debug(f"GuideStar: Projected {shield_strength*100:.1f}% Documentation Coverage over '{rel_dir}'") + self.logger.debug( + f"GuideStar: Projected {shield_strength * 100:.1f}% Documentation Coverage over '{rel_dir}'" + ) diff --git a/gitgalaxy/core/network_risk_sensor.py b/gitgalaxy/core/network_risk_sensor.py index 456da55a..505479c5 100644 --- a/gitgalaxy/core/network_risk_sensor.py +++ b/gitgalaxy/core/network_risk_sensor.py @@ -28,23 +28,17 @@ class NetworkRiskSensor: """ def __init__(self, parent_logger: Optional[logging.Logger] = None): - self.logger = ( - parent_logger.getChild("network_sensor") - if parent_logger - else logging.getLogger("network_sensor") - ) + self.logger = parent_logger.getChild("network_sensor") if parent_logger else logging.getLogger("network_sensor") self.RISK_SCHEMA = RECORDING_SCHEMAS.get("RISK_SCHEMA", []) - def extract_test_coverage_mapping( - self, files: List[Dict[str, Any]] - ) -> Dict[str, Dict[str, List[Dict[str, Any]]]]: + def extract_test_coverage_mapping(self, files: List[Dict[str, Any]]) -> Dict[str, Dict[str, List[Dict[str, Any]]]]: """ Maps function calls from test files to their imported production targets. Returns a dictionary mapping: production_file_path -> { production_function_name: [test_function_data] } - - DEFENSIVE DESIGN: Traditional code coverage only checks if a line was executed. - By mapping outbound AST calls from tests to production targets, we can calculate - the exact architectural "Blast Radius" of untested functions. + + DEFENSIVE DESIGN: Traditional code coverage only checks if a line was executed. 
+ By mapping outbound AST calls from tests to production targets, we can calculate + the exact architectural "Dependency Blast Radius" of untested functions. """ coverage_map = {} resolution_map = {} @@ -63,19 +57,14 @@ def extract_test_coverage_mapping( low_path = path.lower() # Structural heuristic for test files - is_test = any( - x in low_path - for x in ["/test/", "/tests/", "test_", "_test", ".spec.", ".test."] - ) + is_test = any(x in low_path for x in ["/test/", "/tests/", "test_", "_test", ".spec.", ".test."]) if not is_test: continue # Identify which production files this test file imports target_paths = set() for imp in f.get("raw_imports", []): - target_token = ( - imp[0] if isinstance(imp, tuple) and len(imp) == 2 else imp - ) + target_token = imp[0] if isinstance(imp, tuple) and len(imp) == 2 else imp target_path = resolution_map.get(target_token) if target_path and target_path != path: @@ -95,9 +84,7 @@ def extract_test_coverage_mapping( "impact": test_func.get("impact", 0.0), "target_count": target_count, "test_hits": test_func.get("hit_vector", {}).get("test", 0), - "test_skip_hits": test_func.get("hit_vector", {}).get( - "test_skip", 0 - ), + "test_skip_hits": test_func.get("hit_vector", {}).get("test_skip", 0), "decorators": test_func.get("hit_vector", {}).get("decorators", 0), } @@ -112,9 +99,7 @@ def extract_test_coverage_mapping( return coverage_map - def build_dependency_graph( - self, parsed_files: List[Dict[str, Any]] - ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]: + def build_dependency_graph(self, parsed_files: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]: """ Builds the directed graph and calculates multi-dimensional risk vectors. Modifies the 'telemetry' dictionary of each file in place. @@ -122,9 +107,7 @@ def build_dependency_graph( if not HAS_NETWORKX: return self._fallback_build_graph(parsed_files) - self.logger.info( - f"Network Risk Sensor: Initializing Directed Graph for {len(parsed_files)} nodes..." 
- ) + self.logger.info(f"Network Risk Sensor: Initializing Directed Graph for {len(parsed_files)} nodes...") G = nx.DiGraph() @@ -152,9 +135,7 @@ def build_dependency_graph( risk_vector=f.get("risk_vector", [0.0] * len(self.RISK_SCHEMA)), max_big_o=max_big_o, is_recursive=is_recursive, - db_complexity=( - max([func.get("db_complexity", 0) for func in funcs]) if funcs else 0 - ), + db_complexity=(max([func.get("db_complexity", 0) for func in funcs]) if funcs else 0), ) # 2. Wire the Edges (File-to-File Level 1 & Entity Level 2) @@ -181,9 +162,9 @@ def build_dependency_graph( G.add_edge(curr_path, target_path, weight=weight) # ========================================================================= - # 3. NETWORK MATHEMATICS (Blast Radius & Centrality) - # DEFENSIVE DESIGN: Centrality algorithms (Betweenness/Closeness) scale non-linearly - # at O(V^3). For massive monolithic repositories (>1500 nodes), we MUST implement + # 3. NETWORK MATHEMATICS (Dependency Blast Radius & Centrality) + # DEFENSIVE DESIGN: Centrality algorithms (Betweenness/Closeness) scale non-linearly + # at O(V^3). For massive monolithic repositories (>1500 nodes), we MUST implement # strict sampling or bypasses, otherwise the CI/CD pipeline will hit a timeout deadlock. # PageRank is safe as it uses iterative convergence. # ========================================================================= @@ -196,17 +177,13 @@ def build_dependency_graph( # Closeness Centrality has no built-in sampling. Hard bypass at 1500 nodes. if len(G.nodes()) > 1500: - self.logger.warning( - "Graph too massive for exact Closeness Centrality. Bypassing." - ) + self.logger.warning("Graph too massive for exact Closeness Centrality. 
Bypassing.") closeness = {n: 0.0 for n in G.nodes()} else: closeness = nx.closeness_centrality(G) except Exception as e: - self.logger.warning( - f"Network math failed to converge, defaulting to 0: {e}" - ) + self.logger.warning(f"Network math failed to converge, defaulting to 0: {e}") pagerank = {n: 0.0 for n in G.nodes()} betweenness = {n: 0.0 for n in G.nodes()} closeness = {n: 0.0 for n in G.nodes()} @@ -246,10 +223,8 @@ def build_dependency_graph( systemic_threat_vector = [] for local_risk in local_risk_vector: - # Systemic Threat = Blast Radius * Local Vulnerability Severity - systemic_threat_vector.append( - round(pr_normalized * (local_risk / 100.0), 3) - ) + # Systemic Threat = Dependency Blast Radius * Local Vulnerability Severity + systemic_threat_vector.append(round(pr_normalized * (local_risk / 100.0), 3)) # --- Algorithmic Network Bottleneck Detection --- max_big_o = G.nodes[path].get("max_big_o", 1) @@ -281,7 +256,7 @@ def build_dependency_graph( f["telemetry"]["popularity"] = in_d # ========================================================================= - # 6. MACRO-ECOSYSTEM PHYSICS (Repo-Level Health & Resilience) + # 6. MACRO-ECOSYSTEM TOPOLOGY (Repo-Level Health & Resilience) # ========================================================================= macro_metrics = { "modularity": 0.0, @@ -298,9 +273,7 @@ def build_dependency_graph( # A. Modularity (Spaghetti vs Microservice) try: if len(U) > 5000: - self.logger.warning( - "Graph too massive for Modularity. Bypassing." - ) + self.logger.warning("Graph too massive for Modularity. 
Bypassing.") macro_metrics["modularity"] = 0.0 else: # Attempt Louvain (blazing fast), fallback to Greedy (slow) @@ -309,22 +282,18 @@ def build_dependency_graph( except AttributeError: communities = community.greedy_modularity_communities(U) - macro_metrics["modularity"] = round( - community.modularity(U, communities), 4 - ) + macro_metrics["modularity"] = round(community.modularity(U, communities), 4) except Exception: pass # B. Assortativity (Resiliency) try: assort = nx.degree_assortativity_coefficient(G) - macro_metrics["assortativity"] = ( - round(assort, 4) if not math.isnan(assort) else 0.0 - ) + macro_metrics["assortativity"] = round(assort, 4) if not math.isnan(assort) else 0.0 except Exception: pass - # C. Cyclic Density (Static Friction / Dependency Loops) + # C. Cyclic Density (Circular Dependencies / Dependency Loops) try: sccs = list(nx.strongly_connected_components(G)) nodes_in_cycles = sum(len(c) for c in sccs if len(c) > 1) @@ -335,38 +304,28 @@ def build_dependency_graph( # D. Average Shortest Path (Coupling Distance) try: if len(U) > 5000: - self.logger.warning( - "Graph too massive for Avg Path Length. Bypassing." - ) + self.logger.warning("Graph too massive for Avg Path Length. Bypassing.") macro_metrics["avg_path_length"] = 0.0 else: largest_cc = max(nx.connected_components(U), key=len) subgraph = U.subgraph(largest_cc) - macro_metrics["avg_path_length"] = round( - nx.average_shortest_path_length(subgraph), 4 - ) + macro_metrics["avg_path_length"] = round(nx.average_shortest_path_length(subgraph), 4) except Exception: pass - # E. Articulation Points (Shatter Risk) + # E. 
Articulation Points (Fragmentation Risk) try: - macro_metrics["articulation_points"] = len( - list(nx.articulation_points(U)) - ) + macro_metrics["articulation_points"] = len(list(nx.articulation_points(U))) except Exception: pass except Exception as e: self.logger.warning(f"Macro network math failed: {e}") - self.logger.info( - "Network Risk Sensor: Vector Mathematics & Graph Topology Complete." - ) + self.logger.info("Network Risk Sensor: Vector Mathematics & Graph Topology Complete.") return parsed_files, macro_metrics - def _fallback_build_graph( - self, parsed_files: List[Dict[str, Any]] - ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]: + def _fallback_build_graph(self, parsed_files: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]: self.logger.warning( "[!] 'networkx' not found. Operating in Zero-Dependency Mode. Using linear counting for Ecosystem Roles." ) @@ -389,9 +348,7 @@ def _fallback_build_graph( for f in parsed_files: curr_path = f.get("path", "") for imp in f.get("raw_imports", []): - target_token = ( - imp[0] if isinstance(imp, tuple) and len(imp) == 2 else imp - ) + target_token = imp[0] if isinstance(imp, tuple) and len(imp) == 2 else imp if target_token in resolution_map: target_path = resolution_map[target_token] if target_path != curr_path: @@ -439,4 +396,4 @@ def _fallback_build_graph( "avg_path_length": 0.0, "articulation_points": 0, } - return parsed_files, macro_metrics \ No newline at end of file + return parsed_files, macro_metrics diff --git a/gitgalaxy/core/prism.py b/gitgalaxy/core/prism.py index ab7e207d..63460fa4 100644 --- a/gitgalaxy/core/prism.py +++ b/gitgalaxy/core/prism.py @@ -13,18 +13,18 @@ from gitgalaxy.standards.language_standards import LENS_CONFIG, PRISM_CONFIG # ============================================================================== -# GitGalaxy Phase 2: Lexical Comment Scanner (The Prism) +# GitGalaxy Phase 2: Payload & Surface Splitter (The Prism) # Strategy v6.2.0 Protocol: Safe 
Delimiter Extraction & Format Bypasses # ============================================================================== class PrismResult(TypedDict): """ - The dual-stream output of the Prism. + The dual-output of the Prism. Attributes: - code_stream (str): The pure executable logic stream. - comment_stream (str): The pure documentation/comment stream. + code_stream (str): The executable payload. + comment_stream (str): The documentation surface. coding_loc (int): Lines of code (non-empty, non-comment). doc_loc (int): Lines of comments/documentation. """ @@ -43,21 +43,21 @@ class PrismError(Exception): class Prism: """ - GitGalaxy Phase 2: The Prism (Lexical Stream Splitter) + GitGalaxy Phase 2: The Prism (Payload & Surface Splitter) - PURPOSE: Just as a physical prism splits a unified beam of light into distinct - spectrums, this class performs high-speed lexical scanning to separate a unified - file into pure executable logic and documentation streams while preserving string literals. + PURPOSE: Just as a physical prism splits a unified beam of light into distinct + spectrums, this class performs high-speed structural scanning to separate a unified + file into a pure executable payload and documentation surface while preserving string literals. DEFENSIVE ARCHITECTURE (Why Regex over AST?): - Standard Abstract Syntax Trees (ASTs) are brittle, language-specific, and require - compilable code. To achieve polyglot velocity and prioritize functional intent across + Standard Abstract Syntax Trees (ASTs) are brittle, language-specific, and require + compilable code. To achieve polyglot velocity and prioritize functional intent across 50+ languages, the Prism utilizes highly bounded, ReDoS-proof regular expressions. PIPELINE RULES (v6.2.0): 1. Format Bypass: Respects 'undeterminable' files by passing them untouched to prevent pipeline stalls. 2. Dynamic Regex Matrix: Pre-compiles standard comment rules at runtime based on the JSON configuration. - 3. 
O(1) String Literal Masking: Temporarily masks string literals to prevent the scanner from + 3. O(1) String Literal Masking: Temporarily masks string literals to prevent the scanner from accidentally mutating URLs or string contents that mimic comment delimiters. 4. Polyglot Delegation: Defers embedded language-mixing resolution to the primary Detector. """ @@ -81,9 +81,7 @@ def __init__( self.lexical_families = comment_definitions.get("mechanical_families", {}) self.languages = language_definitions - self.logger.debug( - "Initializing Prism and warming up regex matrix..." - ) + self.logger.debug("Initializing Prism and warming up regex matrix...") # --- TIER 1: STRING LITERAL MASKING --- # Defends against catastrophic backtracking and logic erosion inside strings @@ -92,7 +90,7 @@ def __init__( # --- TIER 2: REGEX PRE-COMPILATION --- self.REGEX_MATRIX: Dict[str, re.Pattern] = self._compile_regex_matrix() - # Phase 6.1 Handshake Registry (Synchronized securely via Universal Laws) + # Phase 6.1 Handshake Registry (Synchronized securely via Language Standards) self.EMBEDDED_TRIGGERS = [] for trigger_config in LENS_CONFIG.get("HANDSHAKE_REGISTRY", []): self.EMBEDDED_TRIGGERS.append( @@ -105,35 +103,21 @@ def __init__( ) # Performance Constants - self.EMBEDDED_LOOKAHEAD_LIMIT = LENS_CONFIG.get("THRESHOLDS", {}).get( - "HANDSHAKE_LOOKAHEAD_LIMIT", 50000 - ) - self.NESTED_PEEL_LIMIT = PRISM_CONFIG.get("THRESHOLDS", {}).get( - "NESTED_PEEL_LIMIT", 500 - ) - self.POSITIONAL_ANCHORS = PRISM_CONFIG.get( - "POSITIONAL_ANCHORS", {"*", "C", "c", "/", "!"} - ) + self.EMBEDDED_LOOKAHEAD_LIMIT = LENS_CONFIG.get("THRESHOLDS", {}).get("HANDSHAKE_LOOKAHEAD_LIMIT", 50000) + self.NESTED_PEEL_LIMIT = PRISM_CONFIG.get("THRESHOLDS", {}).get("NESTED_PEEL_LIMIT", 500) + self.POSITIONAL_ANCHORS = PRISM_CONFIG.get("POSITIONAL_ANCHORS", {"*", "C", "c", "/", "!"}) # Hardened Language Specific Extractors - self.PYTHON_DOC_PATTERN = re.compile( - PRISM_CONFIG.get("PYTHON_DOC_PATTERN", ""), re.M - 
) - self.PHP_HEREDOC_PATTERN = re.compile( - PRISM_CONFIG.get("PHP_HEREDOC_PATTERN", ""), re.M - ) - self.PHP_MULTILINE_STRING = re.compile( - PRISM_CONFIG.get("PHP_MULTILINE_STRING", ""), re.M - ) - - self.logger.info( - f"Lexical Scanner Online | Calibrated {len(self.REGEX_MATRIX)} syntax rules." - ) + self.PYTHON_DOC_PATTERN = re.compile(PRISM_CONFIG.get("PYTHON_DOC_PATTERN", ""), re.M) + self.PHP_HEREDOC_PATTERN = re.compile(PRISM_CONFIG.get("PHP_HEREDOC_PATTERN", ""), re.M) + self.PHP_MULTILINE_STRING = re.compile(PRISM_CONFIG.get("PHP_MULTILINE_STRING", ""), re.M) + + self.logger.info(f"Structural Scanner Online | Calibrated {len(self.REGEX_MATRIX)} syntax rules.") def split_streams(self, content: str, primary_lang: str) -> PrismResult: - """Decouples the signal into mutually exclusive streams (Executable Logic vs Documentation).""" + """Decouples the file into mutually exclusive components (Executable Payload vs Documentation Surface).""" if not content: - self.logger.debug("Lexical Scan skipped: Empty content buffer.") + self.logger.debug("Structural Scan skipped: Empty content buffer.") return { "code_stream": "", "comment_stream": "", @@ -143,9 +127,7 @@ def split_streams(self, content: str, primary_lang: str) -> PrismResult: # --- THE UNPARSABLE BYPASS (Spec 2.3.4.A.1) --- if primary_lang in ("undeterminable", "unknown"): - self.logger.debug( - f"Unparsable Bypass: '{primary_lang}' signal routed to Executable Logic intact." - ) + self.logger.debug(f"Unparsable Bypass: '{primary_lang}' signal routed to Executable Logic intact.") coding_loc = len([l for l in content.split("\n") if l.strip()]) return { "code_stream": content, @@ -157,9 +139,7 @@ def split_streams(self, content: str, primary_lang: str) -> PrismResult: # --- THE PROSE BYPASS --- # Simply add "xml" to the tuple! if primary_lang in ("markdown", "plaintext", "xml"): - self.logger.debug( - f"Prose Bypass: '{primary_lang}' signal routed to Documentation intact." 
- ) + self.logger.debug(f"Prose Bypass: '{primary_lang}' signal routed to Documentation intact.") doc_loc = len([l for l in content.split("\n") if l.strip()]) return { "code_stream": "", @@ -187,14 +167,10 @@ def split_streams(self, content: str, primary_lang: str) -> PrismResult: for lang_id, segment_text in segments: family = self.languages.get(lang_id, {}).get("lexical_family", "c_style_comment") - self.logger.debug( - f"Scanning segment [{lang_id}] using syntax family '{family}'..." - ) + self.logger.debug(f"Scanning segment [{lang_id}] using syntax family '{family}'...") # Strip comments from the segment - seg_code, seg_comments = self._strip_segment_comments( - segment_text, lang_id, family - ) + seg_code, seg_comments = self._strip_segment_comments(segment_text, lang_id, family) code_parts.append(seg_code) comment_parts.append(seg_comments) @@ -214,9 +190,7 @@ def split_streams(self, content: str, primary_lang: str) -> PrismResult: # This forces mutual exclusivity: if a line has code and a comment, it counts as Code. doc_loc = max(0, total_active_lines - coding_loc) - self.logger.debug( - f"Lexical Scan Complete: {coding_loc} Executable LOC | {doc_loc} Documentation LOC." - ) + self.logger.debug(f"Structural Scan Complete: {coding_loc} Executable LOC | {doc_loc} Documentation LOC.") return { "code_stream": final_code, @@ -227,7 +201,7 @@ def split_streams(self, content: str, primary_lang: str) -> PrismResult: except Exception as e: self.logger.error( - f"Catastrophic structural failure during lexical scan: {e}", + f"Catastrophic structural failure during structural scan: {e}", exc_info=True, ) raise PrismError(f"Prism failure: {e}") @@ -235,8 +209,8 @@ def split_streams(self, content: str, primary_lang: str) -> PrismResult: def _strip_segment_comments(self, text: str, lang_id: str, family: str) -> Tuple[str, str]: """Surgically strips documentation using an ordered, additive pipeline.""" lits = [] - - # 1. 
PRE-PROCESSING: Extract doc-mass BEFORE any early returns + + # 1. PRE-PROCESSING: Extract documentation surface BEFORE any early returns if lang_id in ("python", "micropython", "ruby"): text, python_lits = self._strip_python_docstrings(text) lits.extend(python_lits) @@ -249,13 +223,13 @@ def _strip_segment_comments(self, text: str, lang_id: str, family: str) -> Tuple code, nested_lits = self._strip_nested_comments(text) lits.extend(nested_lits) return code, "\n".join(lits) - + if family == "column_sensitive": code, pos_lits = self._strip_positional_comments(text) if pos_lits: lits.extend(pos_lits.splitlines()) return code, "\n".join(lits) - + if family == "single_line_only": code, single_lits = self._strip_single_line_comments(text) if single_lits: @@ -264,11 +238,12 @@ def _strip_segment_comments(self, text: str, lang_id: str, family: str) -> Tuple # 3. ATOMIC SHIELDING: Mask literals to prevent generic stripping masked_literals = [] + def shield_callback(m: re.Match) -> str: masked_literals.append(m.group(0)) - return f"__MASK_{len(masked_literals)-1}__" + return f"__MASK_{len(masked_literals) - 1}__" - text = re.sub(self.LITERAL_MASK_PATTERN, shield_callback, text, flags=re.S|re.M) + text = re.sub(self.LITERAL_MASK_PATTERN, shield_callback, text, flags=re.S | re.M) # 4. GENERIC STRIPPER pattern = self.REGEX_MATRIX.get(family) @@ -278,17 +253,17 @@ def shield_callback(m: re.Match) -> str: return code, "\n".join(lits) def strip_callback(m: re.Match) -> str: - if m.group(2): # Match group 2 is your documentation group + if m.group(2): # Match group 2 is your documentation group lits.append(m.group(2).strip()) return "" code = pattern.sub(strip_callback, text) - + # 5. 
RESTORE SHIELDED LITERALS code = re.sub(r"__MASK_(\d+)__", lambda m: masked_literals[int(m.group(1))], code) return code, "\n".join(lits) - + def _compile_regex_matrix(self) -> Dict[str, re.Pattern]: """Safely pre-compiles the standard regex matrix based on dynamic config lengths.""" matrix = {} @@ -364,24 +339,20 @@ def _compile_regex_matrix(self) -> Dict[str, re.Pattern]: flags |= re.IGNORECASE matrix[fam_key] = re.compile(full_pattern, flags) - self.logger.debug( - f"Regex matrix compiled for family: {fam_key}" - ) + self.logger.debug(f"Regex matrix compiled for family: {fam_key}") except re.error as e: - self.logger.error( - f"Regex compilation failed for family '{fam_key}': {e}" - ) + self.logger.error(f"Regex compilation failed for family '{fam_key}': {e}") return matrix def _strip_python_docstrings(self, text: str) -> Tuple[str, List[str]]: """Extracts triple-quoted strings as documentation.""" docs = [] - + # Use the relaxed pattern def callback(m: re.Match) -> str: docs.append(m.group(0).strip()) - return "\n" # Maintain line count stability + return "\n" # Maintain line count stability # Using re.DOTALL ensures [\s\S] matches newlines correctly clean = re.sub(r'(?:"""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\')', callback, text) @@ -405,9 +376,7 @@ def capture_lit(m: re.Match) -> str: return text, lits - def _partition_embedded_languages( - self, content: str, primary_id: str - ) -> List[Tuple[str, str]]: + def _partition_embedded_languages(self, content: str, primary_id: str) -> List[Tuple[str, str]]: """Splits content into language segments based on embedded language triggers.""" segments = [] last_idx = 0 @@ -463,21 +432,13 @@ def _partition_embedded_languages( if t["pair"]: open_char, close_char = t["pair"] - end_idx = self._find_balanced_end( - content, t["start"], open_char, close_char - ) + end_idx = self._find_balanced_end(content, t["start"], open_char, close_char) else: - search_limit = min( - t["trigger_end"] + self.EMBEDDED_LOOKAHEAD_LIMIT, len(content) 
- ) - end_match = t["end_pattern"].search( - content, pos=t["trigger_end"], endpos=search_limit - ) + search_limit = min(t["trigger_end"] + self.EMBEDDED_LOOKAHEAD_LIMIT, len(content)) + end_match = t["end_pattern"].search(content, pos=t["trigger_end"], endpos=search_limit) end_idx = end_match.end() if end_match else len(content) if not end_match and end_idx == search_limit: - self.logger.warning( - "Scanner Scope Guard: Failed to find closure within limit. Forcing clip." - ) + self.logger.warning("Scanner Scope Guard: Failed to find closure within limit. Forcing clip.") segments.append((t["target"], content[t["start"] : end_idx])) last_idx = end_idx @@ -487,9 +448,7 @@ def _partition_embedded_languages( return segments if segments else [(primary_id, content)] - def _find_balanced_end( - self, text: str, start_pos: int, opener: str, closer: str - ) -> int: + def _find_balanced_end(self, text: str, start_pos: int, opener: str, closer: str) -> int: """Balanced scoping implementation for paired-bracket embedded segments.""" depth = 0 in_string: Optional[str] = None @@ -522,16 +481,12 @@ def _find_balanced_end( elif char == closer: depth -= 1 if depth <= 0: - self.logger.debug( - f"Balanced scoping closed at offset +{i - start_pos} chars." - ) + self.logger.debug(f"Balanced scoping closed at offset +{i - start_pos} chars.") return i + 1 i += 1 - self.logger.warning( - f"Scanner Scope Guard: Failed to find balanced '{opener}{closer}'. Forcing closure." - ) + self.logger.warning(f"Scanner Scope Guard: Failed to find balanced '{opener}{closer}'. 
Forcing closure.") return limit def _strip_nested_comments(self, text: str) -> Tuple[str, List[str]]: @@ -598,15 +553,11 @@ def single_callback(m: re.Match) -> str: lits.append(unmask(block_content).strip()) # Remove from logic stream - protected_code = ( - protected_code[:start_idx] + protected_code[end_match.end() :] - ) + protected_code = protected_code[:start_idx] + protected_code[end_match.end() :] safety += 1 if safety >= self.NESTED_PEEL_LIMIT: - self.logger.warning( - f"Nested Peel Guard triggered: Reached max iteration limit ({self.NESTED_PEEL_LIMIT})." - ) + self.logger.warning(f"Nested Peel Guard triggered: Reached max iteration limit ({self.NESTED_PEEL_LIMIT}).") # 4. Final Logic Unmasking return unmask(protected_code), lits @@ -649,13 +600,13 @@ def _guard_metadata_signal(self, content: str) -> Tuple[str, str]: return first + "\n", lines[1] if len(lines) > 1 else "" return "", content - + def _strip_single_line_comments(self, text: str) -> Tuple[str, str]: """Generic single-line comment stripper (for '#' or ';' or '--').""" lines = text.splitlines() code, comments = [], [] - pattern = re.compile(r"(#|--|;|//)") - + pattern = re.compile(r"(#|--|;|//)") + for line in lines: if pattern.search(line): parts = pattern.split(line, 1) diff --git a/gitgalaxy/core/spatial_mapper.py b/gitgalaxy/core/spatial_mapper.py index f2dafddf..efd3884d 100644 --- a/gitgalaxy/core/spatial_mapper.py +++ b/gitgalaxy/core/spatial_mapper.py @@ -5,29 +5,29 @@ # This source code is licensed under the PolyForm Noncommercial License 1.0.0. # You may not use this file except in compliance with the License. 
# A copy of the license can be found in the LICENSE file in the root directory -# of this project, or at [https://polyformproject.org/licenses/noncommercial/1.0.0/](https://polyformproject.org/licenses/noncommercial/1.0.0/) +# of this project, or at https://polyformproject.org/licenses/noncommercial/1.0.0/ # ============================================================================== import math import hashlib import logging from typing import Dict, List, Any, Optional - # ------------------------------------------------------------------------------ # SPATIAL MAPPER (Phase 7.5: Spatial Positioning Engine) # ------------------------------------------------------------------------------ class SpatialMapper: """ - Transforms a flat list of files into a deterministic 3D Cartesian coordinate map. + Transforms a flat list of artifacts into a deterministic 3D Cartesian coordinate map. - Groups files into Directory Clusters (folders) and positions them relative to the - highest-impact central node (God Object) of each sector while maintaining spatial clearance. + Groups files into Directory Clusters (folders) and positions them relative to the + highest-impact central node (Critical Node) of each sector while maintaining spatial clearance. DEFENSIVE ARCHITECTURE (Angular Spatial Hashing): - Standard physics engines crash on O(N^2) collision detection loops when placing thousands - of nodes. This mapper neutralizes that by bucketing the map into 360 angular degrees. - A placement ray only checks the exact degree it points at, securing O(1) collision avoidance. + Standard layout engines crash on O(N^2) collision detection loops when placing thousands + of nodes. This mapper neutralizes that bottleneck by bucketing the map into 360 angular degrees. + A placement ray only checks the exact degree it points at (and its immediate neighbors), + securing O(1) collision avoidance. This guarantees extreme velocity even on massive enterprise monoliths. 
""" def __init__(self, parent_logger: Optional[logging.Logger] = None): @@ -40,10 +40,9 @@ def __init__(self, parent_logger: Optional[logging.Logger] = None): self.logger.setLevel(logging.INFO) # --- SPATIAL CONSTANTS --- - # Micro Angle: Nodes within folders follow the classic Golden Angle - self.MICRO_GOLDEN_ANGLE = math.pi * ( - 3.0 - math.sqrt(5.0) - ) # ~2.39996 rad (~137.5 deg) + # Micro Angle: Nodes within folders follow the classic mathematical Golden Angle + # for optimal, non-overlapping organic distribution. + self.MICRO_GOLDEN_ANGLE = math.pi * (3.0 - math.sqrt(5.0)) # ~2.39996 rad (~137.5 deg) # Macro Angle: Directory Clusters follow the user-tuned 92.4 degree step self.MACRO_GOLDEN_ANGLE = math.radians(92.4) @@ -51,23 +50,22 @@ def __init__(self, parent_logger: Optional[logging.Logger] = None): # Base expansion multipliers self.MICRO_SPACING = 250.0 # Internal node-to-node density baseline self.MACRO_STEP_FACTOR = 1.5 # Inter-cluster step multiplier (Center-to-Center) - self.MAX_TILT_DEG = ( - 15.0 # Max degrees a cluster can tilt from horizontal plane - ) - self.CORE_EXCLUSION_RADIUS = 600.0 # Clear center zone + self.MAX_TILT_DEG = 15.0 # Max degrees a cluster can tilt from the horizontal plane + self.CORE_EXCLUSION_RADIUS = 600.0 # Clear center zone to prevent origin overlapping self.JITTER_MAGNITUDE = 100 - def _calculate_spatial_clearance(self, mass: float) -> float: - """Determines the required tight clearance radius for a node based on mass.""" - visual_radius = 10 + (math.pow(max(mass, 1), 1 / 3) * 2) - clearance = 40 + (math.log2(max(mass, 2)) * 5) + def _calculate_spatial_clearance(self, magnitude: float) -> float: + """Determines the required tight clearance radius for a node based on its structural magnitude.""" + visual_radius = 10 + (math.pow(max(magnitude, 1), 1 / 3) * 2) + clearance = 40 + (math.log2(max(magnitude, 2)) * 5) return visual_radius + clearance def _hash_jitter(self, seed: str, amplitude: float) -> float: """ Applies a 
deterministic pseudo-random jitter based on a filename hash. - Ensures the same codebase generates the exact same geometry every time. + This ensures that running the analysis multiple times on the same codebase + generates the exact same geometry every time, providing visual stability across audits. """ if not seed: return 0.0 @@ -76,20 +74,16 @@ def _hash_jitter(self, seed: str, amplitude: float) -> float: normalized = (h / 0xFFFFFFFF) * 2.0 - 1.0 return normalized * amplitude - def map_repository( - self, parsed_files: List[Dict[str, Any]] - ) -> List[Dict[str, Any]]: + def map_repository(self, parsed_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Injects 3D coordinates using a Ray-Casting Dynamic Mask. - Ensures ecosystem graphs wrap around previous turns of the spiral by measuring + Ensures ecosystem graphs wrap cleanly around previous turns of the spiral by measuring all previously placed obstruction circles. """ if not parsed_files: return [] - self.logger.info( - f"Spatial Mapper: Executing Ray-Casting Dynamic Mask packing for {len(parsed_files)} nodes..." - ) + self.logger.info(f"Spatial Mapper: Executing Ray-Casting Dynamic Mask packing for {len(parsed_files)} nodes...") # 1. Sectorization (Directory Grouping) sectors: Dict[str, List[Dict[str, Any]]] = {} @@ -105,9 +99,9 @@ def map_repository( # 2. 
Hull Calculation sector_stats = [] for name, items in sectors.items(): - items.sort(key=self._get_mass, reverse=True) - central_node_mass = self._get_mass(items[0]) - central_footprint = self._calculate_spatial_clearance(central_node_mass) + items.sort(key=self._get_magnitude, reverse=True) + central_node_magnitude = self._get_magnitude(items[0]) + central_footprint = self._calculate_spatial_clearance(central_node_magnitude) hull_radius = central_footprint + (math.sqrt(len(items)) * self.MICRO_SPACING) sector_stats.append({"name": name, "nodes": items, "radius": hull_radius}) @@ -194,22 +188,18 @@ def map_repository( # Jitter and Tilt logic sec_y = self._hash_jitter(s_name, 250.0) - tilt_mag = math.radians( - self._hash_jitter(s_name + "_tilt_mag", self.MAX_TILT_DEG) - ) - tilt_dir = math.radians( - (self._hash_jitter(s_name + "_tilt_dir", 0.5) + 0.5) * 360.0 - ) + tilt_mag = math.radians(self._hash_jitter(s_name + "_tilt_mag", self.MAX_TILT_DEG)) + tilt_dir = math.radians((self._hash_jitter(s_name + "_tilt_dir", 0.5) + 0.5) * 360.0) - central_node_mass = self._get_mass(s_nodes[0]) - central_footprint = self._calculate_spatial_clearance(central_node_mass) + central_node_magnitude = self._get_magnitude(s_nodes[0]) + central_footprint = self._calculate_spatial_clearance(central_node_magnitude) for j, node in enumerate(s_nodes): f_name = node.get("name", node.get("filename", f"node_{j}")) if j == 0: lx, ly, lz = 0.0, 0.0, 0.0 else: - p_foot = self._calculate_spatial_clearance(self._get_mass(node)) + p_foot = self._calculate_spatial_clearance(self._get_magnitude(node)) local_r = central_footprint + p_foot + (math.sqrt(j) * self.MICRO_SPACING) local_th = j * self.MICRO_GOLDEN_ANGLE @@ -235,8 +225,8 @@ def map_repository( return parsed_files - def _get_mass(self, node: Dict[str, Any]) -> float: - """Safely extracts mass regardless of which JSON version the pipeline is using.""" + def _get_magnitude(self, node: Dict[str, Any]) -> float: + """Safely extracts structural 
magnitude regardless of which JSON version the pipeline is using.""" if "forensics" in node: return float(node["forensics"].get("structural_mass", 0.0)) return float(node.get("file_impact", node.get("sum_fxn_impact", 0.0))) \ No newline at end of file diff --git a/gitgalaxy/core/state_rehydrator.py b/gitgalaxy/core/state_rehydrator.py index c014dc35..b052051b 100644 --- a/gitgalaxy/core/state_rehydrator.py +++ b/gitgalaxy/core/state_rehydrator.py @@ -1,7 +1,13 @@ # ============================================================================== -# state_rehydrator.py -# GitGalaxy: SQLite to RAM Memory Rehydration +# GitGalaxy +# Copyright (c) 2026 Joe Esquibel +# +# This source code is licensed under the PolyForm Noncommercial License 1.0.0. +# You may not use this file except in compliance with the License. +# A copy of the license can be found in the LICENSE file in the root directory +# of this project, or at https://polyformproject.org/licenses/noncommercial/1.0.0/ # ============================================================================== + import sqlite3 from pathlib import Path from typing import Dict, Any @@ -11,11 +17,11 @@ class StateRehydrator: """ Restores the GitGalaxy engine's memory state from a previous SQLite audit. - DEFENSIVE DESIGN: During a 'Delta Scan' (incremental update), it is incredibly - inefficient to re-parse 10,000 unchanged files just to figure out how 2 modified - files impact them. This class rehydrates the previous architectural state directly - into RAM, allowing the engine to instantly execute dependency resolution without - triggering the CPU-bound logic splicers. + DEFENSIVE DESIGN: During a 'Delta Scan' (incremental update), it is incredibly + inefficient to re-parse 10,000 unchanged files just to figure out how 2 modified + files impact them. 
This class rehydrates the previous architectural state directly + into RAM, allowing the engine to instantly execute dependency resolution without + triggering the CPU-bound structural signature extractors. """ def __init__(self, db_path: str): @@ -68,8 +74,8 @@ def load_latest_state(self, repo_name: str) -> Dict[str, Any]: for f in file_rows: rel_path = f["file_path"] - # DEFENSIVE DESIGN: We must perfectly reconstruct the dictionary schema - # expected by `galaxyscope.py` so the Orchestrator can execute its + # DEFENSIVE DESIGN: We must perfectly reconstruct the dictionary schema + # expected by `galaxyscope.py` so the Orchestrator can execute its # downstream graph recalculation without throwing KeyError exceptions. ram_state[rel_path] = { "path": rel_path, @@ -93,6 +99,6 @@ def load_latest_state(self, repo_name: str) -> Dict[str, Any]: } conn.close() - + # Return the standardized payload return {"commit_hash": latest_hash, "ram_cache": ram_state} \ No newline at end of file diff --git a/gitgalaxy/galaxyscope.py b/gitgalaxy/galaxyscope.py index 7ac6c242..c48a5014 100644 --- a/gitgalaxy/galaxyscope.py +++ b/gitgalaxy/galaxyscope.py @@ -51,7 +51,7 @@ from gitgalaxy.standards.gitgalaxy_config import ( APERTURE_CONFIG, PRIORITY_WHITELIST, - COMMENT_DEFINITIONS, + LEXICAL_FAMILY_HEURISTICS, GUIDESTAR_CONFIG, TEST_NAMING_CONVENTIONS, ) @@ -76,14 +76,11 @@ _worker_state = {} -# ------START: Updated _init_worker to accept git_tracked files for intent caching - - -def redos_guillotine(signum, frame): +def execution_timeout_failsafe(signum, frame): """ Hardware-level OS interrupt for Catastrophic Backtracking (ReDoS) protection. - Registered via the Unix 'signal' library, this guillotine forcibly halts the worker + Registered via the Unix 'signal' library, this failsafe forcibly halts the worker process if a malformed file traps the regex engine in an exponential evaluation loop for more than 15 seconds, preventing pipeline starvation. 
""" @@ -100,20 +97,22 @@ def _init_worker( ): """ Initializes the CPU-bound optical modules within the worker process's isolated memory. + + ARCHITECTURAL DECISION (ISOLATED WORKER MEMORY): Python's Global Interpreter Lock (GIL) prevents true multi-threading for CPU-bound tasks. To map a massive repository at extreme velocity, GitGalaxy spawns entirely separate OS - processes. This function acts as the boot-loader for those child processes. It instantiates - the heavy regex matrices (The Splicer, Prism, etc.) entirely within the child's isolated - RAM. This prevents the OS from attempting to pickle/serialize massive compiled regex objects - across the IPC (Inter-Process Communication) boundary, which would instantly crash the pipeline. + processes. This boot-loader instantiates the heavy regex matrices entirely within the + child's isolated RAM. This prevents the OS from attempting to pickle/serialize massive + compiled regex objects across the IPC (Inter-Process Communication) boundary. """ from gitgalaxy.core.detector import StructuralExtractor as OpticalDetector + logging.getLogger().setLevel(log_level) worker_logger = logging.getLogger("GalaxyScope.Worker") root = Path(root_str) lang_defs = config.get("LANGUAGE_DEFINITIONS", {}) - comm_defs = config.get("COMMENT_DEFINITIONS", {}) + lexical_heuristics = config.get("LEXICAL_FAMILY_HEURISTICS", {}) aperture_cfg = config.get("APERTURE_CONFIG", {}) priority_whitelist = config.get("PRIORITY_WHITELIST", []) @@ -123,9 +122,7 @@ def _init_worker( # 1. Force-warm the fallbacks immediately. # This silences the [AUTO-HEAL] warnings and compiles the regex engine for these IDs. for fallback_id in ["plaintext", "markdown"]: - detector_cache[fallback_id] = OpticalDetector( - fallback_id, lang_defs, parent_logger=worker_logger - ) + detector_cache[fallback_id] = OpticalDetector(fallback_id, lang_defs, parent_logger=worker_logger) # 2. Warm up active project languages based on extensions found in Pass 0. 
active_langs = set() @@ -137,9 +134,7 @@ def _init_worker( for lang_id in active_langs: if lang_id not in detector_cache: - detector_cache[lang_id] = OpticalDetector( - lang_id, lang_defs, parent_logger=worker_logger - ) + detector_cache[lang_id] = OpticalDetector(lang_id, lang_defs, parent_logger=worker_logger) # --- NEW: Decide the Rules of Engagement before booting the engines --- if config.get("PARANOID_MODE", False): @@ -156,21 +151,15 @@ def _init_worker( "worker_logger": worker_logger, "git_tracked": git_tracked, "census": census, - "filter": ApertureFilter( - root, lang_defs, aperture_cfg, parent_logger=worker_logger - ), - "guidestar": GuideStarLens( - root, priority_whitelist, parent_logger=worker_logger - ), - "detector": LanguageDetector(lang_defs, comm_defs), - "prism": Prism(comm_defs, lang_defs, parent_logger=worker_logger), + "filter": ApertureFilter(root, lang_defs, aperture_cfg, parent_logger=worker_logger), + "guidestar": GuideStarLens(root, priority_whitelist, parent_logger=worker_logger), + "detector": LanguageDetector(lang_defs, lexical_heuristics), + "prism": Prism(lexical_heuristics, lang_defs, parent_logger=worker_logger), "detector_cache": detector_cache, "word_tokenizer": re.compile(r"\b\w+\b"), # --- NEW: Boot the Analysis Engines into worker memory --- "chronometer": Chronometer(root, parent_logger=worker_logger), - "signal": SignalProcessor( - aperture_config=config, parent_logger=worker_logger - ), + "signal": SignalProcessor(aperture_config=config, parent_logger=worker_logger), "security": SecurityLens(policy=_active_policy), # -------------------------------------------------------- } @@ -178,6 +167,7 @@ def _init_worker( _worker_state["guidestar"].scan_project_config() + def _process_file_worker(rel_path: str) -> Dict[str, Any]: """Processes a single file path using the worker's cached hardware modules.""" @@ -228,20 +218,14 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: try: # Phase 1: Aperture Filter t_aperture = 
time.perf_counter() - is_valid, size_bytes, reason = aperture.evaluate_path_integrity( - full_path_str, has_intent=has_prior - ) + is_valid, size_bytes, reason = aperture.evaluate_path_integrity(full_path_str, has_intent=has_prior) if is_file_profiling: phase_times["1_Aperture_Filter"] = time.perf_counter() - t_aperture if not is_valid: - # ---> NEW: THE X-RAY BINARY SENSOR <--- + # ---> NEW: THE BINARY ANALYSIS SENSOR <--- # Intercept binary and blacklisted extensions for deep inspection - if ( - "Binary Format" in reason - or "Blacklisted Extension" in reason - or "Embedded Data Payload" in reason - ): + if "Binary Format" in reason or "Blacklisted Extension" in reason or "Embedded Data Payload" in reason: try: with open(full_path_str, "rb") as f: # Read the first 8KB to check headers and entropy @@ -251,19 +235,15 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: binary_threats = security.scan_binary(head, ext) if binary_threats: - logger.critical( - f"🚨 X-RAY TRIGGERED: Weaponized binary detected at '{rel_path}'!" 
- ) + logger.critical(f"🚨 BINARY ANALYSIS TRIGGERED: Weaponized binary detected at '{rel_path}'!") - # Threat Escalation: Forge a synthetic star and force it into the visible galaxy + # Threat Escalation: Forge a synthetic artifact and force it into the repository graph from gitgalaxy.metrics.signal_processor import SignalProcessor hit_vector = [0] * len(SignalProcessor.SIGNAL_SCHEMA) for t_key, t_val in binary_threats.items(): if t_key in SignalProcessor.SIGNAL_SCHEMA: - hit_vector[ - SignalProcessor.SIGNAL_SCHEMA.index(t_key) - ] = t_val + hit_vector[SignalProcessor.SIGNAL_SCHEMA.index(t_key)] = t_val observation["status"] = "success" observation["reason"] = None @@ -274,7 +254,7 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: "is_minified": False, "lock_tier": 0, "intensity": 1.0, - "source_proof": "X-Ray Binary Sensor", + "source_proof": "Binary Analysis Sensor", "size_bytes": size_bytes, "total_loc": 1, "coding_loc": 1, @@ -284,14 +264,10 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: "equations": binary_threats, "satellites": [], "logic_density": 100.0, - "sum_fxn_impact": 5000.0, # Massive gravity! + "sum_fxn_impact": 5000.0, # Massive structural impact! 
"total_control_flow_ratio": 0.0, "threat_snippets": { - "binary_xray": [ - binary_threats.get( - "threat_snippet", "Unknown Threat" - ) - ] + "binary_xray": [binary_threats.get("threat_snippet", "Unknown Threat")] }, "metadata": { "alert": "WEAPONIZED BINARY DETECTED", @@ -301,9 +277,9 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: observation["processing_time"] = time.time() - t_start return observation except Exception as e: - logger.debug(f"X-Ray failed on '{rel_path}': {e}") + logger.debug(f"Binary Analysis failed on '{rel_path}': {e}") - # If no threats found, or it wasn't a binary, dump to Excluded Artifacts as usual + # If no threats found, or it wasn't a binary, dump to Unparsable Artifacts as usual observation["status"] = "parser_bypass" observation["reason"] = reason observation["size_bytes"] = size_bytes @@ -316,7 +292,7 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: with open(full_path_str, "r", encoding="utf-8", errors="ignore") as f: content_buffer = f.read() except FileNotFoundError: - # Replaces the Phantom Check! Fast, zero-overhead disk failure routing. + # Fast, zero-overhead disk failure routing. 
observation["status"] = "phantom" observation["reason"] = "Phantom file (missing on disk)" observation["processing_time"] = time.time() - t_start @@ -328,9 +304,7 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: if is_file_profiling: phase_times["2_Disk_IO"] = time.perf_counter() - t_io - filter_res = aperture.is_in_scope( - full_path_str, content=content_buffer, has_intent=has_prior - ) + filter_res = aperture.is_in_scope(full_path_str, content=content_buffer, has_intent=has_prior) if not filter_res["is_in_scope"]: observation["status"] = "parser_bypass" observation["reason"] = filter_res["reason"] @@ -338,7 +312,7 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: return observation # ========================================================================= - # THE HARDWARE GUILLOTINE (GLOBAL ReDoS Protection) + # THE EXECUTION TIMEOUT FAILSAFE (GLOBAL ReDoS Protection) # ========================================================================= import signal import sys @@ -347,7 +321,7 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: is_posix = sys.platform != "win32" if is_posix: - signal.signal(signal.SIGALRM, redos_guillotine) + signal.signal(signal.SIGALRM, execution_timeout_failsafe) signal.alarm(15) # 15-second fuse for POSIX systems try: @@ -366,16 +340,14 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: lang_id = detection_result["lang_id"] - # ---> NEW: INERT MATTER IDENTIFICATION <--- + # ---> NEW: STATIC ASSET IDENTIFICATION <--- is_inert = lang_id in ("plaintext", "markdown", "json", "yaml", "csv") is_supported = lang_id in lang_defs or is_inert if lang_id in ("undeterminable", "unknown") or not is_supported: observation["status"] = "parser_bypass" observation["reason"] = f"Unsupported Format (.{lang_id})" - observation["identity_confidence"] = detection_result.get( - "intensity", 0.0 - ) + observation["identity_confidence"] = detection_result.get("intensity", 0.0) observation["processing_time"] = 
time.time() - t_start return observation @@ -389,22 +361,14 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: if avg_line_length > 800 or (size_bytes > 50000 and total_loc < 15): is_minified = True - vendor_paths = ( - _worker_state["config"] - .get("APERTURE_CONFIG", {}) - .get("VENDOR_MINIFICATION_PATHS", []) - ) + vendor_paths = _worker_state["config"].get("APERTURE_CONFIG", {}).get("VENDOR_MINIFICATION_PATHS", []) safe_path = full_path_str.replace("\\", "/") - if re.search(r"\.min\.[a-z]+$", full_path_str, re.I) or any( - v in safe_path for v in vendor_paths - ): + if re.search(r"\.min\.[a-z]+$", full_path_str, re.I) or any(v in safe_path for v in vendor_paths): is_minified = True if is_minified: - logger.debug( - f"[WORKER-TRACE] MINIFIED/VENDOR DETECTED: {rel_path}. Bypassing structural Splicer." - ) + logger.debug(f"[WORKER-TRACE] MINIFIED/VENDOR DETECTED: {rel_path}. Bypassing structural extraction.") logic_data = {"equations": {}, "coding_loc": total_loc, "doc_loc": 0} refraction = { "coding_loc": total_loc, @@ -422,16 +386,12 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: if lang_id not in detector_cache: from gitgalaxy.core.detector import OpticalDetector - detector_cache[lang_id] = OpticalDetector( - lang_id, lang_defs, parent_logger=logger - ) + detector_cache[lang_id] = OpticalDetector(lang_id, lang_defs, parent_logger=logger) opt_detector = detector_cache[lang_id] # --- INJECTED DEBUG TRACE --- - logger.debug( - f"[WORKER-TRACE] >>> ENTERING DETECTOR: {rel_path} (Lang: {lang_id})" - ) + logger.debug(f"[WORKER-TRACE] >>> ENTERING DETECTOR: {rel_path} (Lang: {lang_id})") # Phase 5: Optical Detector t_detector_phase = time.perf_counter() @@ -454,7 +414,7 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: logic_data["metadata"] = {} logic_data["metadata"]["doc_umbrella"] = guidestar.documentation_coverage.get(dir_path, 0.0) - logger.debug(f"[WORKER-TRACE] <<< EXITING SPLICER: {rel_path}") + 
logger.debug(f"[WORKER-TRACE] <<< EXITING EXTRACTOR: {rel_path}") # --- Phase 5.5: Security Lens (Passive Observers) --- t_security = time.perf_counter() @@ -463,9 +423,7 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: if not is_inert: # Handle the new nested dictionary - sec_results = security.scan_content( - content_buffer, filter_res.get("total_loc", 0) - ) + sec_results = security.scan_content(content_buffer, filter_res.get("total_loc", 0)) for sec_key, hit_count in sec_results["counts"].items(): logic_data["equations"][f"sec_{sec_key}"] = hit_count @@ -484,58 +442,36 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: if not is_inert: # 1. Extract raw file dependencies - import_regex = ( - lang_defs.get(lang_id, {}) - .get("rules", {}) - .get("_dependency_capture") - ) + import_regex = lang_defs.get(lang_id, {}).get("rules", {}).get("_dependency_capture") if import_regex: try: for match in import_regex.finditer(content_buffer): - extracted_path = next( - (g for g in match.groups() if g), None - ) + extracted_path = next((g for g in match.groups() if g), None) if extracted_path: # Handle comma-separated blocks and brackets (e.g., Rust/Scala: {A, B}, Python: a, b as c) - clean_group = extracted_path.replace("{", "").replace( - "}", "" - ) + clean_group = extracted_path.replace("{", "").replace("}", "") for item in clean_group.split(","): # Strip 'as alias' and whitespace to isolate the pure module name - clean_module = re.split(r"\s+as\s+", item)[ - 0 - ].strip() + clean_module = re.split(r"\s+as\s+", item)[0].strip() if clean_module: raw_imports.add(clean_module) except Exception: - logging.exception( - "Import extraction failed for language '%s'.", lang_id - ) + logging.exception("Import extraction failed for language '%s'.", lang_id) # 2. 
Extract Named Tokens dynamically via Language Standards - named_token_regex = ( - lang_defs.get(lang_id, {}) - .get("rules", {}) - .get("_named_token_capture") - ) + named_token_regex = lang_defs.get(lang_id, {}).get("rules", {}).get("_named_token_capture") if named_token_regex: try: for match in named_token_regex.finditer(content_buffer): - extracted_group = next( - (g for g in match.groups() if g), None - ) + extracted_group = next((g for g in match.groups() if g), None) if extracted_group: # Split by comma and strip 'as' aliases to isolate the pure token for token in extracted_group.split(","): - clean_token = re.split(r"\s+as\s+", token)[ - 0 - ].strip() + clean_token = re.split(r"\s+as\s+", token)[0].strip() if clean_token: named_tokens.add(clean_token) except Exception: - logging.exception( - "Named token extraction failed for language '%s'.", lang_id - ) + logging.exception("Named token extraction failed for language '%s'.", lang_id) if is_file_profiling: phase_times["6_Import_Regex"] = time.perf_counter() - t_imports @@ -544,9 +480,7 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: t_token = time.perf_counter() popularity_hits = set() if not is_inert: - popularity_hits = ( - set(tokenizer.findall(refraction["code_stream"])) & census - ) + popularity_hits = set(tokenizer.findall(refraction["code_stream"])) & census t_end = time.perf_counter() if is_file_profiling: phase_times["7_Token_Intersection"] = t_end - t_token @@ -554,28 +488,19 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: # Append the new blind-spot telemetry to the regex output if is_profiling and not is_inert: logic_data["regex_telemetry"] = logic_data.get("regex_telemetry", {}) - logic_data["regex_telemetry"][f"{lang_id}::Worker_Imports"] = ( - t_token - t_imports - ) - logic_data["regex_telemetry"][ - f"{lang_id}::Worker_Popularity_Tokens" - ] = t_end - t_token + logic_data["regex_telemetry"][f"{lang_id}::Worker_Imports"] = t_token - t_imports + 
logic_data["regex_telemetry"][f"{lang_id}::Worker_Popularity_Tokens"] = t_end - t_token except TimeoutError: - # The bomb went off anywhere in Phase 3 through 7! - logger.warning( - f"⏳ TIMEOUT GUILLOTINE: '{rel_path}' exceeded 15s. Banishing to Singularity." - ) + logger.warning(f"TIMEOUT FAILSAFE: '{rel_path}' exceeded 15s. Relegating to Unparsable Artifacts.") observation["status"] = "parser_bypass" - observation["reason"] = ( - "Unparsable (Structural Saturation / Global Regex Timeout)" - ) + observation["reason"] = "Unparsable (Structural Saturation / Global Regex Timeout)" observation["size_bytes"] = filter_res.get("size_bytes", 0) observation["identity_confidence"] = detection_result.get("intensity", 0.0) observation["processing_time"] = time.time() - t_start return observation finally: - # IMPORTANT: Defuse the bomb immediately upon success! + # IMPORTANT: Clear the timeout failsafe immediately upon success! if is_posix: signal.alarm(0) # ========================================================================= @@ -596,15 +521,11 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: "raw_imports": list(raw_imports), "named_tokens": list(named_tokens), # <--- NEW: Send tokens to Orchestrator "popularity_hits": popularity_hits, - "regex_telemetry": ( - logic_data.pop("regex_telemetry", {}) if is_profiling else {} - ), + "regex_telemetry": (logic_data.pop("regex_telemetry", {}) if is_profiling else {}), } data_payload.update(logic_data) - data_payload["control_flow_ratio"] = logic_data.get( - "total_control_flow_ratio", 0.0 - ) + data_payload["control_flow_ratio"] = logic_data.get("total_control_flow_ratio", 0.0) data_payload["file_impact"] = logic_data.get("sum_fxn_impact", 0.0) observation.update( @@ -626,22 +547,20 @@ def _process_file_worker(rel_path: str) -> Dict[str, Any]: # ---> NEW: REAL-TIME SLOW FILE ALERT <--- if total_time > 10.0: - logger.warning( - f"🐌 SLOW PARSE DETECTED: '{rel_path}' took {total_time:.2f} seconds." 
- ) + logger.warning(f"🐌 SLOW PARSE DETECTED: '{rel_path}' took {total_time:.2f} seconds.") return observation # ============================================================================== -# GitGalaxy Phase 3: Pipeline Orchestrator (The GalaxyScope) +# GitGalaxy Phase 3: Pipeline Orchestrator # Bayesian Optics & Parser Bypasses # ============================================================================== class Orchestrator: """ - Mission Control: The GitGalaxy Central Processing Core. + Orchestrator Core: The GitGalaxy Central Processing Core. This class operates as the Hub in GitGalaxy's Hub-and-Spoke architecture. It is strictly a traffic cop—it delegates all heavy lifting to specialized computational engines. @@ -649,13 +568,13 @@ class Orchestrator: 1. Pre-Flight (Phase 0): The root path is scanned to build a 'Census' of tracked files, consulting Git/OS boundaries, .gitattributes, and dynamic micro-mass limits. 2. Parallel Extraction (Phase 1): Bypasses the GIL by spawning isolated worker processes. - Workers perform the heavy regex DNA/token extraction and filter out inert data. - 3. Structural Physics (Phases 2-4): Returns extracted features to the main thread. Maps out + Workers perform the heavy Structural Signature and token extraction and filter out Static Assets. + 3. Structural Impact Analysis (Phases 2-4): Returns extracted features to the main thread. Maps out DAGs (Directed Acyclic Graphs) and converts token frequencies into actionable metrics. Note: - risk exposures are calculated metrics from the DNA/regex hits, not the hits themselves. + risk exposures are calculated metrics from the raw Structural Signatures, not the hits themselves. 4. Threat Inference (Phases 5-10): Executes ML pipelines (XGBoost) and zero-trust policies (AppSec/Supply Chain Firewalls) to hunt behavioral anomalies. - 5. Output Routing (Phases 11-12): Destructively pivots the global RAM state into columnar + 5. 
Output Routing (Phases 11-12): Transforms the global RAM state into columnar JSON payloads, native SQLite bases, and LLM-ready markdown artifacts. """ @@ -678,26 +597,22 @@ def __init__( # CORE SENSOR SUBMODULES (The Spokes) # ============================================================================== # Perimeter shield rejecting unreadable/binary matter before deep scanning - self.filter = ApertureFilter( - self.root, lang_defs, aperture_cfg, parent_logger=logger - ) + self.filter = ApertureFilter(self.root, lang_defs, aperture_cfg, parent_logger=logger) # Bayesian prior injector (evaluates intent via Manifests, Readmes, .gitattributes) - self.guidestar = GuideStarLens( - self.root, priority_whitelist, parent_logger=logger - ) + self.guidestar = GuideStarLens(self.root, priority_whitelist, parent_logger=logger) # Temporal engine extracting Git volatility, churn velocity, and ownership entropy self.chronometer = Chronometer(self.root, parent_logger=logger) self.spatial_mapper = SpatialMapper(parent_logger=logger) - # The primary heuristic math engine converting raw DNA hits to risk exposure vectors + # The primary heuristic math engine converting raw Structural Signatures to risk exposure vectors self.processor = SignalProcessor(aperture_config=config, parent_logger=logger) # Third-Gate gatekeeper identifying and dropping un-parseable data dumps self.auditor = StatisticalAuditor(parent_logger=logger) - # Constructs the physical import DAG and calculates PageRank/Blast Radius + # Constructs the physical import DAG and calculates PageRank/Downstream Exposure self.network_sensor = NetworkRiskSensor(parent_logger=logger) # ============================================================================== @@ -710,7 +625,7 @@ def __init__( # LLM Recorder: Generates token-compressed RAG context text for AI Agents self.llm_recorder = LLMRecorder(parent_logger=logger) # DB Recorder: Archives relational tables natively to SQLite3 - self.db_recorder = 
RecordKeeper(parent_logger=logger) # <--- Add this line + self.db_recorder = RecordKeeper(parent_logger=logger) # --- NEW: THE SMART THREAT SWITCH (MAIN THREAD) --- if self.config.get("PARANOID_MODE", False): @@ -722,9 +637,7 @@ def __init__( self.security_analyzer = SecurityLens(policy=_active_policy) # Multi-class XGBoost threat classification model - self.model_auditor = SecurityAuditor( - model_path="gitgalaxy_malware_xgb_multiclass.json", parent_logger=logger - ) + self.model_auditor = SecurityAuditor(model_path="gitgalaxy_malware_xgb_multiclass.json", parent_logger=logger) # -------------------------------------------------- # ============================================================================== @@ -767,12 +680,12 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): PIPELINE ONBOARDING (Execution Flow): The method enforces a strict chronological dependency chain. For example, Workers (Phase 1) must run before Relational Analysis (Phase 3) so that - we have exact code tokens in RAM before mapping the API Blast Radius. + we have exact code tokens in RAM before mapping the API Downstream Exposure. Likewise, Network Topology (Phase 4) is required before XGBoost Inference (Phase 9) since a file's centrality influences its logic bomb threat weighting. 
""" start_time = time.time() - logger.info(f"--- MISSION_IGNITION: {self.root.name} (v{self.version}) ---") + logger.info(f"--- PIPELINE_START: {self.root.name} (v{self.version}) ---") if not HAS_NETWORKX or not HAS_TIKTOKEN or not ML_AVAILABLE or not HAS_PYYAML: missing_libs = [] @@ -788,44 +701,22 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): pip_cmd = f"pip install {' '.join(missing_libs)}" logger.warning("") - logger.warning( - " ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓" - ) - logger.warning( - " ┃ ⚠️ ZERO-DEPENDENCY MODE ACTIVE ┃" - ) - logger.warning( - " ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫" - ) - logger.warning( - " ┃ Missing computational engines. Metrics will be safely set to NULL: ┃" - ) + logger.warning(" ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓") + logger.warning(" ┃ ⚠️ ZERO-DEPENDENCY MODE ACTIVE ┃") + logger.warning(" ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫") + logger.warning(" ┃ Missing computational engines. 
Metrics will be safely set to NULL: ┃") if not HAS_NETWORKX: - logger.warning( - " ┃ - networkx (Network Topology, Blast Radius, Choke Points) ┃" - ) + logger.warning(" ┃ - networkx (Network Topology, Downstream Exposure, Choke Points) ┃") if not HAS_TIKTOKEN: - logger.warning( - " ┃ - tiktoken (Absolute Token Mass, Financial Read Cost) ┃" - ) + logger.warning(" ┃ - tiktoken (Absolute Token Mass, Financial Read Cost) ┃") if not ML_AVAILABLE: - logger.warning( - " ┃ - xgboost, pandas (Advanced ML Threat Inference & Taxonomy) ┃" - ) + logger.warning(" ┃ - xgboost, pandas (Advanced ML Threat Inference & Taxonomy) ┃") if not HAS_PYYAML: - logger.warning( - " ┃ - pyyaml (Required for parsing .yaml/.yml Swagger/OpenAPI specs) ┃" - ) - logger.warning( - " ┃ ┃" - ) - logger.warning( - " ┃ To unlock absolute precision, run: ┃" - ) + logger.warning(" ┃ - pyyaml (Required for parsing .yaml/.yml Swagger/OpenAPI specs) ┃") + logger.warning(" ┃ ┃") + logger.warning(" ┃ To unlock absolute precision, run: ┃") logger.warning(f" ┃ {pip_cmd}".ljust(75) + "┃") - logger.warning( - " ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛" - ) + logger.warning(" ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛") logger.warning("") try: @@ -834,88 +725,66 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): t_phase = time.time() self.guidestar.scan_project_config() self._build_file_census() - logger.info( - f"⏱️ MACRO-CLOCK [Phase 0 - Radar]: {time.time() - t_phase:.2f}s" - ) + logger.info(f"⏱️ EXECUTION_TIME [Phase 0 - Radar]: {time.time() - t_phase:.2f}s") # PHASE 1: Workers & IPC Extraction # Bypasses the GIL, deploying CPU-heavy regex scanning into isolated Memory spaces. 
t_phase = time.time() self._extract_features_parallel() - logger.info( - f"⏱️ MACRO-CLOCK [Phase 1 - Workers & IPC]: {time.time() - t_phase:.2f}s" - ) + logger.info(f"⏱️ EXECUTION_TIME [Phase 1 - Workers & IPC]: {time.time() - t_phase:.2f}s") # PHASE 2: Dependency Resolution (Import Graph) # Reconstructs inter-file linkages. Executes *before* Relational Analysis so we # can mathematically define a file's public exposure index. t_phase = time.time() self._resolve_dependency_graph() - logger.info( - f"⏱️ MACRO-CLOCK [Phase 2 - Dependency Resolution]: {time.time() - t_phase:.2f}s" - ) + logger.info(f"⏱️ EXECUTION_TIME [Phase 2 - Dependency Resolution]: {time.time() - t_phase:.2f}s") - # PHASE 3: Relational Analysis (Structural Physics) + # PHASE 3: Structural Impact Analysis # Fuses chronological Git telemetry with raw token counts to calculate multi-dimensional # risks (e.g., Tech Debt, Cognitive Load, State Flux). t_phase = time.time() self._calculate_risk_exposures() - logger.info( - f"⏱️ MACRO-CLOCK [Phase 3 - Relational Analysis]: {time.time() - t_phase:.2f}s" - ) + logger.info(f"⏱️ EXECUTION_TIME [Phase 3 - Structural Impact Analysis]: {time.time() - t_phase:.2f}s") - # PHASE 4: Network Topology & Blast Radius + # PHASE 4: Network Topology & Downstream Exposure # Computes PageRank and Betweenness Centrality on the assembled Dependency Graph. t_phase = time.time() - self.parsed_files, network_macro = self.network_sensor.build_dependency_graph( - self.parsed_files - ) - logger.info( - f"⏱️ MACRO-CLOCK [Phase 4 - Network Topology]: {time.time() - t_phase:.2f}s" - ) + self.parsed_files, network_macro = self.network_sensor.build_dependency_graph(self.parsed_files) + logger.info(f"⏱️ EXECUTION_TIME [Phase 4 - Network Topology]: {time.time() - t_phase:.2f}s") # PHASE 5: Zero-Trust Guardrails (AI & AppSec) - # Enforces explicit system rules identifying Prompt Injections or Context Window shredders. 
+ # Enforces explicit system rules identifying Prompt Injections or Context Window Exhaustion. t_phase = time.time() dev_firewall = DevAgentFirewall(parent_logger=logger) self.parsed_files = dev_firewall.evaluate_ecosystem(self.parsed_files) appsec_sensor = AIAppSecSensor(parent_logger=logger) self.parsed_files = appsec_sensor.hunt_threats(self.parsed_files) - logger.info( - f"⏱️ MACRO-CLOCK [Phase 5 - Zero-Trust Guardrails]: {time.time() - t_phase:.2f}s" - ) + logger.info(f"⏱️ EXECUTION_TIME [Phase 5 - Zero-Trust Guardrails]: {time.time() - t_phase:.2f}s") # PHASE 6: Spectral Audit & Verification # Uses standard deviations to identify and drop un-parseable data dumps or log files. t_phase = time.time() repository_graph, unparsable_audits = self.auditor.audit(self.parsed_files) total_unparsable = self.unparsable_files + unparsable_audits - logger.info( - f"⏱️ MACRO-CLOCK [Phase 6 - Spectral Audit]: {time.time() - t_phase:.2f}s" - ) + logger.info(f"⏱️ EXECUTION_TIME [Phase 6 - Spectral Audit]: {time.time() - t_phase:.2f}s") - # PHASE 7: Cartography & 3D Mapping + # PHASE 7: Dependency Graphing & Visualization # Assigns coordinates based on topological hierarchies for WebGL. t_phase = time.time() if repository_graph: repository_graph = self.spatial_mapper.map_repository(repository_graph) files_mapped_count = len(repository_graph) if repository_graph else 0 - logger.info( - f"⏱️ MACRO-CLOCK [Phase 7 - 3D Cartography]: {time.time() - t_phase:.2f}s" - ) + logger.info(f"⏱️ EXECUTION_TIME [Phase 7 - Dependency Graphing]: {time.time() - t_phase:.2f}s") # PHASE 8: Metrics Synthesis & Forensics # Aggregates raw outputs for the LLM payload generation. 
t_phase = time.time() - summary = self.processor.summarize_galaxy_metrics( - repository_graph, total_unparsable - ) + summary = self.processor.summarize_galaxy_metrics(repository_graph, total_unparsable) summary["network_macro"] = network_macro report = self.processor.generate_forensic_report(repository_graph) - logger.info( - f"⏱️ MACRO-CLOCK [Phase 8 - Metrics Synthesis]: {time.time() - t_phase:.2f}s" - ) + logger.info(f"⏱️ EXECUTION_TIME [Phase 8 - Metrics Synthesis]: {time.time() - t_phase:.2f}s") # PHASE 9: ML Threat Inference & Graph Resolution # Processes the fully formed context through XGBoost trees to isolate embedded Trojans/Stealers. @@ -926,17 +795,13 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): repository_graph = self.model_auditor.audit_repository( repository_graph, is_shadow_patch=is_shadow_patch ) - logger.info( - f"⏱️ MACRO-CLOCK [Phase 9 - ML Threat Inference]: {time.time() - t_phase:.2f}s" - ) + logger.info(f"⏱️ EXECUTION_TIME [Phase 9 - ML Threat Inference]: {time.time() - t_phase:.2f}s") # ========================================================== # PHASE 10: ECOSYSTEM SECURITY AUDITS # Evaluates structural boundaries (Ghost APIs, Supply Chain spoofing). # ========================================================== - logger.info( - "Phase 10: Executing Ecosystem Security Audits (X-Ray, Firewall, API Mapper)..." - ) + logger.info("Phase 10: Executing Ecosystem Security Audits (X-Ray, Firewall, API Mapper)...") # 1. Gather all manifests instantly using the Phase 0 stem_map (Zero Disk Walk) target_manifests = set(GUIDESTAR_CONFIG.get("MANIFEST_MAP", {}).keys()) @@ -949,9 +814,7 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): # 2. 
Build the global translation map from gitgalaxy.security.manifest_parser import ManifestParser - alias_map = ManifestParser(parent_logger=logger).build_resolution_map( - manifest_paths - ) + alias_map = ManifestParser(parent_logger=logger).build_resolution_map(manifest_paths) ecosystem_audits = { "api_mapper": run_api_audit(self.root), @@ -987,21 +850,14 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): "xgboost": not ML_AVAILABLE, "pyyaml": not HAS_PYYAML, }, - "zero_dependency_mode": ( - not HAS_NETWORKX - or not HAS_TIKTOKEN - or not ML_AVAILABLE - or not HAS_PYYAML - ), + "zero_dependency_mode": (not HAS_NETWORKX or not HAS_TIKTOKEN or not ML_AVAILABLE or not HAS_PYYAML), } if "unparsable_files" not in summary: summary["unparsable_files"] = {} # Pass the array into the function, and merge the results directly - summary["unparsable_files"].update( - self._summarize_anomalies(total_unparsable) - ) + summary["unparsable_files"].update(self._summarize_anomalies(total_unparsable)) # --- PURE OUTPUT ROUTER --- # Respect the exact path provided, just ensure the parent folder exists @@ -1029,12 +885,8 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): try: out_path = Path(output_file) safe_suffix = out_path.suffix if out_path.suffix else ".json" - audit_output = str( - out_path.with_name(f"{out_path.stem}_audit{safe_suffix}") - ) - logger.debug( - f"AUDIT: Generating comprehensive human-readable forensic log -> {audit_output}" - ) + audit_output = str(out_path.with_name(f"{out_path.stem}_audit{safe_suffix}")) + logger.debug(f"AUDIT: Generating comprehensive human-readable forensic log -> {audit_output}") self.audit_recorder.generate_report( parsed_files=repository_graph, @@ -1054,9 +906,7 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): if not exclusive_mode or self.config.get("LLM_ONLY"): try: output_dir = str(Path(output_file).parent) - logger.info( - f"LLM: Generating AI translation artifacts -> {output_dir}" - ) + 
logger.info(f"LLM: Generating AI translation artifacts -> {output_dir}") self.llm_recorder.generate_artifacts( parsed_files=repository_graph, @@ -1075,22 +925,12 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): # --- Phase 12.3: SQLite Recorder (Native Database) --- if not exclusive_mode or self.config.get("DB_ONLY"): try: - db_output = str( - Path(output_file).with_name( - f"{Path(output_file).stem}_master.db" - ) - ) - logger.info( - f"SQLITE: Generating repository-specific database -> {db_output}" - ) + db_output = str(Path(output_file).with_name(f"{Path(output_file).stem}_master.db")) + logger.info(f"SQLITE: Generating repository-specific database -> {db_output}") self.db_recorder.record_mission( - parsed_files=( - list(repository_graph) if repository_graph else [] - ), # <--- PASS A COPY - unparsable_files=( - list(total_unparsable) if total_unparsable else [] - ), # <--- PASS A COPY + parsed_files=(list(repository_graph) if repository_graph else []), # <--- PASS A COPY + unparsable_files=(list(total_unparsable) if total_unparsable else []), # <--- PASS A COPY summary=summary, session_meta=session_meta, output_path=db_output, @@ -1102,9 +942,7 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): ) # --- Phase 12.4: GPU Recorder (Destructive Columnar Pivot) --- - gpu_output = str( - Path(output_file).with_name(f"{Path(output_file).stem}_gpu.json") - ) + gpu_output = str(Path(output_file).with_name(f"{Path(output_file).stem}_gpu.json")) if not exclusive_mode or self.config.get("GPU_ONLY"): logger.info(f"GPU: Generating minified payload -> {gpu_output}") @@ -1120,17 +958,13 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): payload["meta"]["session"] = session_meta self.gpu_recorder.save_minified(payload, gpu_output) - logger.info( - f"--- MISSION_SUCCESS: {files_mapped_count} files mapped in {duration}s ---" - ) - logger.info( - f"--- ENGINE_TELEMETRY: Processed {total_loc:,} lines of code at {loc_per_sec:,} LOC/s ---" - 
) + logger.info(f"--- PIPELINE_SUCCESS: {files_mapped_count} files mapped in {duration}s ---") + logger.info(f"--- ENGINE_TELEMETRY: Processed {total_loc:,} lines of code at {loc_per_sec:,} LOC/s ---") logger.info(f"--- ARCHIVES_SEALED: {gpu_output} & {audit_output} ---") if not HAS_NETWORKX or not HAS_TIKTOKEN: logger.warning( - " ⚠️ NOTE: Mission completed in Zero-Dependency Mode. Run `pip install networkx tiktoken` for full precision." + " ⚠️ NOTE: Pipeline completed in Zero-Dependency Mode. Run `pip install networkx tiktoken` for full precision." ) if self.config.get("FILE_SPEED"): @@ -1149,10 +983,8 @@ def execute_pipeline(self, output_file: str = "galaxy.json"): print(" 🌌 READY FOR VISUALIZATION (100% LOCAL / ZERO UPLOAD)") print("=" * 75) - print( - " 1. Open your browser to: \033[94m\033[4m[https://gitgalaxy.io/](https://gitgalaxy.io/)\033[0m" - ) - print(f" 2. Drag and drop '{output_file}'") + print(" 1. Open your browser to: \033[94m\033[4m[https://gitgalaxy.io/](https://gitgalaxy.io/)\033[0m") + print(f" 2. 
Drag and drop '{gpu_output}'") print("\n * PRIVACY SECURED: Your data never leaves your machine.") print(" All architectural rendering executes locally in your browser.") print("=" * 75 + "\n") @@ -1177,9 +1009,7 @@ def _inspect_path(rel_path): path_obj = Path(rel_path) full_path = self.root / path_obj has_intent, _ = self.guidestar.get_intent_status(path_obj) - is_valid, size_bytes, reason = self.filter.evaluate_path_integrity( - full_path, has_intent=has_intent - ) + is_valid, size_bytes, reason = self.filter.evaluate_path_integrity(full_path, has_intent=has_intent) return rel_path, path_obj, is_valid, size_bytes, reason # Use 32 threads to saturate the disk I/O queue @@ -1191,18 +1021,11 @@ def _inspect_path(rel_path): # ---> NEW: THE NEIGHBORHOOD MICRO-MASS QUOTA <--- # Exempt mainframe files (COBOL/JCL) from being flagged as micro-debris safe_ext = path_obj.suffix.lower() - if ( - is_valid - and size_bytes < self.MICRO_MASS_BYTES - and safe_ext not in {".cpy", ".cbl", ".cob", ".jcl"} - ): + if is_valid and size_bytes < self.MICRO_MASS_BYTES and safe_ext not in {".cpy", ".cbl", ".cob", ".jcl"}: dir_path = str(path_obj.parent) self.neighborhood_tracker[dir_path] += 1 - if ( - self.neighborhood_tracker[dir_path] - > self.MICRO_MASS_GRACE_LIMIT - ): + if self.neighborhood_tracker[dir_path] > self.MICRO_MASS_GRACE_LIMIT: is_valid = False reason = "Excluded: Neighborhood Micro-Mass Limit Exceeded" # ------------------------------------------------ @@ -1219,7 +1042,7 @@ def _inspect_path(rel_path): self.ext_tally[ext] = self.ext_tally.get(ext, 0) + 1 self.ext_tally[name] = self.ext_tally.get(name, 0) + 1 else: - # Route directly to Dark Matter, bypassing the Multi-Processing pool + # Route directly to Unparsable Artifacts, bypassing the Multi-Processing pool self.unparsable_files.append( { "path": rel_path, @@ -1230,56 +1053,35 @@ def _inspect_path(rel_path): ) self._record_anomaly(rel_path, reason) - logger.info( - f"CENSUS_COMPLETE: Found {len(git_paths)} tracked 
artifacts via Git." - ) + logger.info(f"CENSUS_COMPLETE: Found {len(git_paths)} tracked artifacts via Git.") except (subprocess.CalledProcessError, FileNotFoundError): self.git_tracked_files = set() logger.warning("GIT_NOT_FOUND: Reverting to standard filesystem walk.") self._fallback_filesystem_walk() - def _fallback_filesystem_walk(self): - """ - Standard OS-level filesystem walk for non-Git projects or ZIP archives. - """ - self.cleanup() - def _fallback_filesystem_walk(self): """ Standard OS-level filesystem walk for non-Git projects or ZIP archives. Acts as the fallback mechanism if `git ls-files` fails. Evaluates every file against - the Aperture filter's Black Holes and dynamic micro-mass quotas, discarding ignored + the Aperture filter's Ignored Directories and dynamic micro-mass quotas, discarding ignored assets before they are added to the active Census. """ for root, dirs, files in os.walk(self.root): # Add [0] to extract just the boolean 'is_valid' - dirs[:] = [ - d - for d in dirs - if self.filter.evaluate_path_integrity(Path(root) / d)[0] - ] + dirs[:] = [d for d in dirs if self.filter.evaluate_path_integrity(Path(root) / d)[0]] for file in files: full_p = Path(root) / file - is_valid, size_bytes, reason = self.filter.evaluate_path_integrity( - full_p - ) + is_valid, size_bytes, reason = self.filter.evaluate_path_integrity(full_p) # ---> NEW: THE NEIGHBORHOOD MICRO-MASS QUOTA <--- # Exempt mainframe files (COBOL/JCL) from being flagged as micro-debris safe_ext = full_p.suffix.lower() - if ( - is_valid - and size_bytes < self.MICRO_MASS_BYTES - and safe_ext not in {".cpy", ".cbl", ".cob", ".jcl"} - ): + if is_valid and size_bytes < self.MICRO_MASS_BYTES and safe_ext not in {".cpy", ".cbl", ".cob", ".jcl"}: dir_path = str(full_p.parent.relative_to(self.root)) self.neighborhood_tracker[dir_path] += 1 - if ( - self.neighborhood_tracker[dir_path] - > self.MICRO_MASS_GRACE_LIMIT - ): + if self.neighborhood_tracker[dir_path] > self.MICRO_MASS_GRACE_LIMIT: 
is_valid = False reason = "Excluded: Neighborhood Micro-Mass Limit Exceeded" # ------------------------------------------------ @@ -1310,18 +1112,16 @@ def _fallback_filesystem_walk(self): def _extract_features_parallel(self): """ - Phase 1: Parallel Refraction & Matter Eviction via Multi-Core Map-Reduce. + Phase 1: Parallel Extraction & Asset Filtering via Multi-Core Map-Reduce. Dispatches the physical file paths to the isolated worker pool (bypassing the GIL). As the workers complete their high-speed regex extraction, this method consumes the futures dynamically to prevent O(N^2) polling wait states. It catches structural - saturations (ReDoS), logs processing telemetry, and aggregates the extracted DNA into - the global RAM cache. + saturations (ReDoS), logs processing telemetry, and aggregates the extracted + Structural Signatures into the global RAM cache. """ total_files = len(self.stem_map) - logger.info( - f"PASS_1: Optical sequence initiated for {total_files} artifacts via ProcessPoolExecutor." - ) + logger.info(f"PASS_1: Optical sequence initiated for {total_files} artifacts via ProcessPoolExecutor.") if total_files == 0: return @@ -1348,8 +1148,7 @@ def _extract_features_parallel(self): ) as executor: # Map futures to their file paths in a tracking dictionary active_futures = { - executor.submit(_process_file_worker, rel_path): rel_path - for rel_path in self.stem_map.values() + executor.submit(_process_file_worker, rel_path): rel_path for rel_path in self.stem_map.values() } # THE STARVATION MONITOR (Event-Driven Generator) @@ -1364,9 +1163,7 @@ def _extract_features_parallel(self): completed_count += 1 if completed_count % 50 == 0: - logger.info( - f"PROGRESS: Surveyed {completed_count}/{total_files} coordinates." 
- ) + logger.info(f"PROGRESS: Surveyed {completed_count}/{total_files} coordinates.") try: res = future.result() @@ -1378,57 +1175,43 @@ def _extract_features_parallel(self): if self.config.get("FILE_SPEED"): p_times = res.get("phase_times", {}) for phase, duration in p_times.items(): - self.file_speed_telemetry["phase_totals"][ - phase - ] += duration + self.file_speed_telemetry["phase_totals"][phase] += duration self.file_speed_telemetry["file_count"] += 1 if self.config.get("SPLICING_SPEED"): process_time = res.get("processing_time", 0) # 1. Always track the globally slowest files (bounded to save RAM) - self.splicing_telemetry["top_slowest"].append( - {"path": rel_path, "time": process_time} - ) + self.splicing_telemetry["top_slowest"].append({"path": rel_path, "time": process_time}) # Keep the array tiny: Sort and truncate every 50 files if len(self.splicing_telemetry["top_slowest"]) > 50: - self.splicing_telemetry["top_slowest"].sort( - key=lambda x: x["time"], reverse=True - ) - self.splicing_telemetry["top_slowest"] = ( - self.splicing_telemetry["top_slowest"][:10] - ) + self.splicing_telemetry["top_slowest"].sort(key=lambda x: x["time"], reverse=True) + self.splicing_telemetry["top_slowest"] = self.splicing_telemetry["top_slowest"][:10] # 2. Cap Regex Telemetry at 5,000 files to save RAM if not self.splicing_telemetry["regex_limit_reached"]: regex_stats = res["data"].pop("regex_telemetry", {}) for regex_name, duration in regex_stats.items(): - self.splicing_telemetry["regex_totals"][ - regex_name - ] += duration + self.splicing_telemetry["regex_totals"][regex_name] += duration self.splicing_telemetry["files_sampled"] += 1 if self.splicing_telemetry["files_sampled"] >= 5000: - self.splicing_telemetry[ - "regex_limit_reached" - ] = True + self.splicing_telemetry["regex_limit_reached"] = True logger.warning( "SPLICING SPEED: 5,000 file sample reached. Halting regex telemetry (Global file speeds still tracking)." 
) elif status == "parser_bypass": logger.debug( - f"SINGULARITY_BYPASS: '{rel_path}' lacks structural integrity. Relegating to Excluded Artifacts." + f"UNPARSABLE_BYPASS: '{rel_path}' lacks structural integrity. Relegating to Excluded Artifacts." ) self.unparsable_files.append( { "path": rel_path, "reason": res["reason"], - "identity_confidence": res.get( - "identity_confidence", 0.0 - ), + "identity_confidence": res.get("identity_confidence", 0.0), "size_bytes": res.get("size_bytes", 0), } ) @@ -1444,12 +1227,8 @@ def _extract_features_parallel(self): except concurrent.futures.TimeoutError: logger.error("\n" + "=" * 75) logger.error(" SYSTEM HALT: Worker Thread Starvation") - logger.error( - " All CPU workers have exceeded the 60.0s execution limit." - ) - logger.error( - " This indicates Catastrophic Backtracking (ReDoS) in the regex engine." - ) + logger.error(" All CPU workers have exceeded the 60.0s execution limit.") + logger.error(" This indicates Catastrophic Backtracking (ReDoS) in the regex engine.") logger.error(" The following artifacts paralyzed the thread pool:") for future in active_futures: @@ -1467,23 +1246,17 @@ def _extract_features_parallel(self): ) logger.error("=" * 75 + "\n") - logger.warning( - "Aborting synthesis to unfreeze the terminal. Please check the Anti-ReDoS shields." - ) + logger.warning("Aborting synthesis to unfreeze the terminal. Please check the Anti-ReDoS shields.") executor.shutdown(wait=False, cancel_futures=True) - raise TimeoutError( - "Mission aborted due to worker starvation (ReDoS or IPC Deadlock)." - ) + raise TimeoutError("Mission aborted due to worker starvation (ReDoS or IPC Deadlock).") def _resolve_dependency_graph(self): """ Pass 1.5: Optimized relational token aggregation & Fuzzy Suffix Matching. Defused O(N^2) Bomb using O(1) Pre-Sliced Suffix Hash Maps. """ - logger.info( - "PASS_1.5: Resolving import graphs via O(1) Pre-computed Suffix Hash Maps..." 
- ) + logger.info("PASS_1.5: Resolving import graphs via O(1) Pre-computed Suffix Hash Maps...") self.popularity_scores = {rel_path: 0 for rel_path in self.stem_map.values()} repo_file_paths = set(self.stem_map.values()) @@ -1499,11 +1272,7 @@ def _resolve_dependency_graph(self): stem_to_paths[s].append(repo_file) norm_repo = repo_file.replace("\\", "/") - repo_no_ext = ( - norm_repo.rsplit(".", 1)[0] - if "." in Path(norm_repo).name - else norm_repo - ) + repo_no_ext = norm_repo.rsplit(".", 1)[0] if "." in Path(norm_repo).name else norm_repo parts_ext = norm_repo.split("/") for i in range(len(parts_ext)): @@ -1621,11 +1390,7 @@ def _resolve_dependency_graph(self): # --- THE FALLBACK: Stem Matching --- if not matched_internal: guess_stem = Path(clean_path).stem.lower() - if ( - guess_stem in stem_to_paths - and guess_stem not in stop_stems - and len(guess_stem) >= 3 - ): + if guess_stem in stem_to_paths and guess_stem not in stop_stems and len(guess_stem) >= 3: for target_path in stem_to_paths[guess_stem]: if clean_path in target_path or guess_stem == clean_path: self.popularity_scores[target_path] += 1 @@ -1643,9 +1408,7 @@ def _resolve_dependency_graph(self): # ========================================================================= # ---> NEW: THE AIR-GAPPED TYPOSQUATTING RADAR <--- # ========================================================================= - logger.info( - "PASS_1.5: Running Air-Gapped Typosquatting & Dependency Confusion Radar..." 
- ) + logger.info("PASS_1.5: Running Air-Gapped Typosquatting & Dependency Confusion Radar...") anchors = [] orphans = [] @@ -1733,8 +1496,7 @@ def _get_deletes(word): self.ram_cache[rel_path]["equations"] = {} self.ram_cache[rel_path]["equations"]["sec_homoglyphs"] = ( - self.ram_cache[rel_path]["equations"].get("sec_homoglyphs", 0) - + 1 + self.ram_cache[rel_path]["equations"].get("sec_homoglyphs", 0) + 1 ) if "metadata" not in self.ram_cache[rel_path]: @@ -1747,9 +1509,7 @@ def _get_deletes(word): break # Move to next orphan if typosquat_hits > 0: - logger.warning( - f"Intercepted {typosquat_hits} typosquatting attempts via repository baseline analysis." - ) + logger.warning(f"Intercepted {typosquat_hits} typosquatting attempts via repository baseline analysis.") # Evict memory before Pass 2 for rel_path, meta in self.ram_cache.items(): @@ -1760,12 +1520,12 @@ def _calculate_risk_exposures(self): """ Phase 3: Universal Exposure Framework & Signal Processing. - Translates raw regex DNA hits into 18-point physical risk vectors (e.g., Tech Debt, + Translates raw Structural Signatures into 18-point physical risk vectors (e.g., Tech Debt, Cognitive Load, State Flux). This pass applies architectural dampeners (like testing umbrellas and documentation shields), resolves test coverage graphs, and routes extracted metadata to the SignalProcessor for final heuristic scoring. """ - logger.info("PASS_2: Calculating structural physics and Tiered Normalization.") + logger.info("PASS_2: Calculating structural impact and Tiered Normalization.") # ============================================================== # NEW: CALCULATE FOLDER CONTEXTS (For Domain Ontologies) @@ -1804,9 +1564,7 @@ def _calculate_risk_exposures(self): # directly to the production functions they verify. 
# ============================================================== logger.info("PASS_2: Extracting Test Coverage Mapping...") - test_coverage_map = self.network_sensor.extract_test_coverage_mapping( - list(self.ram_cache.values()) - ) + test_coverage_map = self.network_sensor.extract_test_coverage_mapping(list(self.ram_cache.values())) # ============================================================== # ============================================================== @@ -1833,10 +1591,7 @@ def _calculate_risk_exposures(self): # Base Multiplier is 1.0. High-quality docs can double the shield (2.0) multiplier = 1.0 + min(instructional_mass / 50.0, 1.0) - if ( - folder not in instructional_multipliers - or multiplier > instructional_multipliers.get(folder, 0.0) - ): + if folder not in instructional_multipliers or multiplier > instructional_multipliers.get(folder, 0.0): instructional_multipliers[folder] = multiplier # Apply the multiplier to the existing doc_umbrellas @@ -1855,10 +1610,7 @@ def _calculate_risk_exposures(self): loc = meta.get("coding_loc", 0) total_loc += loc # Identify if the file lives in a test folder or is a test file - if ( - re.search(r"/tests?/|/testing/|\.test$", rel_path.lower()) - or "test" in Path(rel_path).stem.lower() - ): + if re.search(r"/tests?/|/testing/|\.test$", rel_path.lower()) or "test" in Path(rel_path).stem.lower(): test_loc += loc # Calculate percentage of repo dedicated to testing @@ -1878,23 +1630,21 @@ def _calculate_risk_exposures(self): meta["metadata"] = {} # Grab the winning language for this folder (defaulting to the file's own language) - meta["metadata"]["folder_dominant_lang"] = folder_dominant_langs.get( - folder, meta.get("lang_id", "unknown") - ) + meta["metadata"]["folder_dominant_lang"] = folder_dominant_langs.get(folder, meta.get("lang_id", "unknown")) # ----------------------------------------------------------------- # ================================================================= - # ---> THE NETWORK GRAVITY FIX 
<--- + # ---> THE CONTEXTUAL BASELINE FIX <--- # If the file is imported by the ecosystem, its "orphans" are actually its API. # ================================================================= popularity = self.popularity_scores.get(rel_path, 0) if popularity > 0 and "equations" in meta: - orphans = meta["equations"].get("design_slop_orphans", 0) + orphans = meta["equations"].get("orphaned_logic", 0) if orphans > 0: # 1. Convert the dead weight into API Exposure meta["equations"]["api"] = meta["equations"].get("api", 0) + orphans # 2. Wipe the Technical Debt - meta["equations"]["design_slop_orphans"] = 0 + meta["equations"]["orphaned_logic"] = 0 # 3. Heal the function metadata for func in meta.get("functions", []): @@ -1910,12 +1660,8 @@ def _calculate_risk_exposures(self): # Because 'self.census' is global, this naturally bridges 'src/main' and 'src/test' # without writing brittle directory-parsing logic. test_patterns = self.config.get("TEST_NAMING_CONVENTIONS", []) - sibling_candidates = [ - pattern.format(stem=stem) for pattern in test_patterns - ] - meta["is_protected"] = any( - cand in self.census for cand in sibling_candidates - ) + sibling_candidates = [pattern.format(stem=stem) for pattern in test_patterns] + meta["is_protected"] = any(cand in self.census for cand in sibling_candidates) # Pass the mapped test coverage data to the risk engine meta["test_coverage_map"] = test_coverage_map.get(rel_path, {}) @@ -1927,12 +1673,10 @@ def _calculate_risk_exposures(self): ) # ========================================================= - # THE GRAVITY SHIELD: APPLY STRUCTURAL MASS DAMPENERS + # THE BASELINE SHIELD: APPLY STRUCTURAL IMPACT DAMPENERS # SignalProcessor handles % Risks, but Orchestrator handles raw Mass. 
# ========================================================= - mass_modifiers = self.config.get("PATH_MODIFIERS", {}).get( - "Structural Mass", [] - ) + mass_modifiers = self.config.get("PATH_MODIFIERS", {}).get("Structural Mass", []) mass_multiplier = 1.0 # Normalize path for safe cross-platform regex matching @@ -1943,9 +1687,7 @@ def _calculate_risk_exposures(self): break # First match wins # Apply the dampener to the physical mass - forensic_result["file_impact"] = round( - forensic_result.get("file_impact", 0.0) * mass_multiplier, 2 - ) + forensic_result["file_impact"] = round(forensic_result.get("file_impact", 0.0) * mass_multiplier, 2) # ========================================================= # ========================================================= @@ -1955,9 +1697,7 @@ def _calculate_risk_exposures(self): ghost_meta = meta.get("metadata", {}) # Legacy Telemetry - telemetry_payload["control_flow_ratio"] = meta.get( - "control_flow_ratio", 0.0 - ) + telemetry_payload["control_flow_ratio"] = meta.get("control_flow_ratio", 0.0) telemetry_payload["popularity"] = self.popularity_scores.get(rel_path, 0) # THE FIX: Replace the brittle regex ownership with the dominant Git author @@ -1967,9 +1707,7 @@ def _calculate_risk_exposures(self): telemetry_payload["ownership"] = dominant_author else: # Fallback to the comment regex if Git is dormant or unavailable - telemetry_payload["ownership"] = ghost_meta.get( - "ownership", "Unknown Architect" - ) + telemetry_payload["ownership"] = ghost_meta.get("ownership", "Unknown Architect") # THE FIX: Conditionally inject historical metadata into the domain_context # ONLY if the PROJECT_OVERRIDES regex successfully extracted it. 
@@ -1982,9 +1720,7 @@ def _calculate_risk_exposures(self): telemetry_payload["roadmap_locked"] = meta.get("prior_lock", False) telemetry_payload["identity_lock_tier"] = meta.get("lock_tier", 4) telemetry_payload["identity_confidence"] = meta.get("intensity", 0.0) - telemetry_payload["identity_source_proof"] = meta.get( - "source_proof", "Discovery" - ) + telemetry_payload["identity_source_proof"] = meta.get("source_proof", "Discovery") telemetry_payload["threat_snippets"] = meta.get("threat_snippets", {}) self.parsed_files.append( @@ -2003,17 +1739,11 @@ def _calculate_risk_exposures(self): # Extract files flagged as secret leaks from the unparsable queue # and forcefully inject them into the parsed map for visualization. # ================================================================== - leaks = [ - cand - for cand in self.unparsable_files - if "CRITICAL LEAK" in cand.get("reason", "") - ] + leaks = [cand for cand in self.unparsable_files if "CRITICAL LEAK" in cand.get("reason", "")] # Remove them from Excluded Artifacts so they aren't double-counted in the summary self.unparsable_files = [ - cand - for cand in self.unparsable_files - if "CRITICAL LEAK" not in cand.get("reason", "") + cand for cand in self.unparsable_files if "CRITICAL LEAK" not in cand.get("reason", "") ] from gitgalaxy.metrics.signal_processor import SignalProcessor @@ -2022,18 +1752,18 @@ def _calculate_risk_exposures(self): rel_path = leak["path"] logger.critical(f"Threat Escalation: Forcing {rel_path} onto the 3D Map!") - synthetic_star = { + synthetic_artifact = { "name": Path(rel_path).name, "path": rel_path, - "lang_id": "plaintext", # <-- Bypasses the Spectral Auditor as Inert Matter + "lang_id": "plaintext", # <-- Bypasses the Spectral Auditor as Static Assets "coding_loc": 1, "total_loc": 1, "classification": "critical_secret_leak", # 18-point risk vector. Index 17 is secrets_risk. Peg it to 100%. 
"risk_vector": [0.0] * 13 + [0.0, 0.0, 0.0, 0.0, 100.0], "hit_vector": [0] * len(SignalProcessor.SIGNAL_SCHEMA), - # ---> CARTOGRAPHER GRAVITY <--- - # This makes the radius massive and pushes all other files away + # ---> TOPOLOGY BASELINE <--- + # This makes the structural impact score massive and pushes all other files away "file_impact": 5000.0, "telemetry": { "ownership": "Secrets Radar", @@ -2043,27 +1773,21 @@ def _calculate_risk_exposures(self): }, } - if "sec_private_info" in SignalProcessor.SIGNAL_SCHEMA: - idx = SignalProcessor.SIGNAL_SCHEMA.index("sec_private_info") - synthetic_star["hit_vector"][idx] = 1 + if "sec_hardcoded_secrets" in SignalProcessor.SIGNAL_SCHEMA: + idx = SignalProcessor.SIGNAL_SCHEMA.index("sec_hardcoded_secrets") + synthetic_artifact["hit_vector"][idx] = 1 - self.parsed_files.append(synthetic_star) + self.parsed_files.append(synthetic_artifact) # ================================================================== # AI MODEL WEIGHTS: Binary Header Extraction # Extract large model binaries (.gguf, .safetensors) from the unparsable queue, # parse their metadata headers without loading them into RAM, and map them. # ================================================================== - models = [ - cand - for cand in self.unparsable_files - if "AI MODEL WEIGHTS" in cand.get("reason", "") - ] - + models = [cand for cand in self.unparsable_files if "AI MODEL WEIGHTS" in cand.get("reason", "")] + self.unparsable_files = [ - cand - for cand in self.unparsable_files - if "AI MODEL WEIGHTS" not in cand.get("reason", "") + cand for cand in self.unparsable_files if "AI MODEL WEIGHTS" not in cand.get("reason", "") ] if models: @@ -2076,18 +1800,16 @@ def _calculate_risk_exposures(self): size_bytes = model.get("size_bytes", 0) full_path_str = str(self.root / rel_path) - logger.info( - f"🧠 TENSOR SCAN: Auditing local model weights for {rel_path}..." 
- ) + logger.info(f"🧠 TENSOR SCAN: Auditing local model weights for {rel_path}...") # Perform the zero-RAM binary header audit audit_results = tensor_scanner.audit_model(full_path_str) - # Model weights are incredibly dense. We give them a massive file_impact (Gravity). - # 1 GB = ~100.0 Gravity points, capped at 10,000 to prevent breaking the 3D renderer. - gravity_mass = min((size_bytes / (1024 * 1024 * 1024)) * 100.0, 10000.0) + # Model weights are incredibly dense. We give them a massive file_impact. + # 1 GB = ~100.0 impact points, capped at 10,000 to prevent breaking the 3D renderer. + structural_impact_score = min((size_bytes / (1024 * 1024 * 1024)) * 100.0, 10000.0) - synthetic_star = { + synthetic_artifact = { "name": Path(rel_path).name, "path": rel_path, "lang_id": "binary_threat", # Forces it to render uniquely in the UI @@ -2096,7 +1818,7 @@ def _calculate_risk_exposures(self): "classification": "ai_model_weights", "risk_vector": [0.0] * len(SignalProcessor.RISK_SCHEMA), "hit_vector": [0] * len(SignalProcessor.SIGNAL_SCHEMA), - "file_impact": max(gravity_mass, 500.0), # Minimum massive gravity + "file_impact": max(structural_impact_score, 500.0), # Minimum massive structural impact "telemetry": { "ownership": "Tensor Scanner", "domain_context": { @@ -2114,16 +1836,16 @@ def _calculate_risk_exposures(self): # Force the hit_vector to register as local compute so the AI Topology catches it if "llm_local_compute" in SignalProcessor.SIGNAL_SCHEMA: idx = SignalProcessor.SIGNAL_SCHEMA.index("llm_local_compute") - synthetic_star["hit_vector"][idx] = 100 # Massive hit spike + synthetic_artifact["hit_vector"][idx] = 100 # Massive hit spike - self.parsed_files.append(synthetic_star) + self.parsed_files.append(synthetic_artifact) def _prepare_target(self, target_input: Union[str, Path]) -> Path: """ Validates the user's target input and constructs an ephemeral extraction environment if necessary. 
If the target is a compressed archive (.zip), this method generates a secure, isolated temporary - directory in the host OS to unpack the contents. This ensures the physics engine can analyze + directory in the host OS to unpack the contents. This ensures the engine can analyze cloud-downloaded repositories without permanently polluting the user's local file system. """ input_path = Path(target_input) @@ -2131,9 +1853,7 @@ def _prepare_target(self, target_input: Union[str, Path]) -> Path: raise InaccessibleArtifactError(f"Target missing: {target_input}") if input_path.suffix.lower() == ".zip": - logger.info( - f"ARCHIVE_DETECTED: Unpacking {input_path.name} to temporary lead shielding." - ) + logger.info(f"ARCHIVE_DETECTED: Unpacking {input_path.name} to temporary lead shielding.") try: self.temp_dir = tempfile.mkdtemp(prefix="refraction_") with zipfile.ZipFile(input_path, "r") as zip_ref: @@ -2157,9 +1877,7 @@ def cleanup(self): try: shutil.rmtree(self.temp_dir) except Exception as e: - logger.warning( - f"CLEANUP_FAILED: Could not remove {self.temp_dir} ({e})" - ) + logger.warning(f"CLEANUP_FAILED: Could not remove {self.temp_dir} ({e})") def _record_anomaly(self, path: Union[str, Path], message: str): """Records failure telemetry.""" @@ -2167,9 +1885,7 @@ def _record_anomaly(self, path: Union[str, Path], message: str): logger.debug(f"ANOMALY: {name} | {message}") self.anomalies.append({"star": name, "diagnostic": message}) - def _summarize_anomalies( - self, total_singularity: List[Dict[str, Any]] - ) -> Dict[str, Any]: + def _summarize_anomalies(self, total_unparsable: List[Dict[str, Any]]) -> Dict[str, Any]: """ Bridges isolated worker failures back to the main thread's forensic ledger. @@ -2200,9 +1916,9 @@ def _summarize_anomalies( # 2. 
Build the hierarchical composition_by_extension_and_reason composition = {} - for dark in total_singularity: - path = dark.get("path", "") - reason = dark.get("reason", "Unknown Reason") + for unparsable in total_unparsable: + path = unparsable.get("path", "") + reason = unparsable.get("reason", "Unknown Reason") # Extract and normalize the extension using the engine's REGEX SHIELD ext = Path(path).suffix.lower() @@ -2219,9 +1935,7 @@ def _summarize_anomalies( # Sort extensions by total count, and reasons within them by count summary["composition_by_extension_and_reason"] = { ext: dict(sorted(reasons.items(), key=lambda x: x[1], reverse=True)) - for ext, reasons in sorted( - composition.items(), key=lambda x: sum(x[1].values()), reverse=True - ) + for ext, reasons in sorted(composition.items(), key=lambda x: sum(x[1].values()), reverse=True) } return summary @@ -2364,7 +2078,7 @@ def execute_incremental_scan( Instead of re-scanning a 10,000-file repository for a 2-file PR, this method ingests the previous structural state from RAM/SQLite, evicts the deleted/modified files, and only runs the heavy regex optics on the newly added or changed files. It then - triggers the 'Ripple Effect' to recalculate global blast radiuses and PageRank + triggers the 'Ripple Effect' to recalculate global Downstream Exposures and PageRank scores for the entire ecosystem before sealing the updated database. """ start_time = time.time() @@ -2399,22 +2113,18 @@ def execute_incremental_scan( self.census.add(stem) self.ext_tally[ext] = self.ext_tally.get(ext, 0) + 1 self.ext_tally[name] = self.ext_tally.get(name, 0) + 1 - self.stem_map[rel_path] = ( - rel_path # Instruct Pass 1 to ONLY process these - ) + self.stem_map[rel_path] = rel_path # Instruct Pass 1 to ONLY process these # 4. Execute the Surgical Scan (Only parses new files) self._extract_features_parallel() - # 5. The Ripple Effect (Recalculate Blast Radius for ALL files) + # 5. 
The Ripple Effect (Recalculate Downstream Exposure for ALL files) self.stem_map = {f: f for f in self.ram_cache.keys()} self._resolve_dependency_graph() self._calculate_risk_exposures() # Re-map the directed graph because nodes/edges have mutated - self.parsed_files, network_macro = self.network_sensor.build_dependency_graph( - self.parsed_files - ) + self.parsed_files, network_macro = self.network_sensor.build_dependency_graph(self.parsed_files) # 6. Audit Verification & ML Threat Inference repository_graph, unparsable_audits = self.auditor.audit(self.parsed_files) @@ -2422,9 +2132,7 @@ def execute_incremental_scan( repository_graph = self.model_auditor.audit_repository(repository_graph) # 7. Synthesis and Database Forging - summary = self.processor.summarize_galaxy_metrics( - repository_graph, unparsable_audits - ) + summary = self.processor.summarize_galaxy_metrics(repository_graph, unparsable_audits) summary["network_macro"] = network_macro session_meta = { "engine": f"GitGalaxy Scope v{self.version} (Delta Mode)", @@ -2460,7 +2168,7 @@ def execute_incremental_scan( # ============================================================================== -# MISSION CONTROL: THE ENTRY POINT +# ORCHESTRATOR CORE: THE ENTRY POINT # ============================================================================== @@ -2479,12 +2187,8 @@ def main(): parser = argparse.ArgumentParser(description="GitGalaxy GalaxyScope v2") parser.add_argument("target", help="Path to repo or ZIP") - parser.add_argument( - "--output", default=None, help="Optional output filename override" - ) - parser.add_argument( - "--debug", action="store_true", help="Turn on verbose Analytical logging" - ) + parser.add_argument("--output", default=None, help="Optional output filename override") + parser.add_argument("--debug", action="store_true", help="Turn on verbose Analytical logging") parser.add_argument( "--paranoid", action="store_true", @@ -2499,18 +2203,10 @@ def main(): ) # --- EXCLUSIVE RECORDER FLAGS 
--- - parser.add_argument( - "--llm-only", action="store_true", help="Run ONLY the LLM recorder" - ) - parser.add_argument( - "--gpu-only", action="store_true", help="Run ONLY the GPU recorder" - ) - parser.add_argument( - "--audit-only", action="store_true", help="Run ONLY the Audit recorder" - ) - parser.add_argument( - "--db-only", action="store_true", help="Run ONLY the native SQLite recorder" - ) + parser.add_argument("--llm-only", action="store_true", help="Run ONLY the LLM recorder") + parser.add_argument("--gpu-only", action="store_true", help="Run ONLY the GPU recorder") + parser.add_argument("--audit-only", action="store_true", help="Run ONLY the Audit recorder") + parser.add_argument("--db-only", action="store_true", help="Run ONLY the native SQLite recorder") parser.add_argument( "--splicing-speed", action="store_true", @@ -2556,9 +2252,7 @@ def main(): final_output = args.output else: if DEFAULT_OUT_DIR: - final_output = str( - Path(DEFAULT_OUT_DIR) / f"{project_name}_galaxy.json" - ) + final_output = str(Path(DEFAULT_OUT_DIR) / f"{project_name}_galaxy.json") else: final_output = f"{project_name}_galaxy.json" @@ -2573,9 +2267,7 @@ def main(): merged_aperture = copy.deepcopy(base_aperture) if project_name in project_overrides: - logging.info( - f"🌌 DIALECT DETECTED: Injecting Project Overrides for '{project_name}'" - ) + logging.info(f"🌌 DIALECT DETECTED: Injecting Project Overrides for '{project_name}'") dialect_dict = project_overrides[project_name] for lang, overrides in dialect_dict.items(): @@ -2585,15 +2277,13 @@ def main(): merged_aperture["IGNORED_DIRECTORIES"] = set() merged_aperture["IGNORED_DIRECTORIES"].update(overrides["exclude_dirs"]) logging.debug( - f" -> Patched Aperture Shield (Added {len(overrides['exclude_dirs'])} Black Holes)." + f" -> Patched Aperture Shield (Added {len(overrides['exclude_dirs'])} Ignored Directories)." 
) if "exclude_paths" in overrides: if "CONTRABAND_PATTERNS" not in merged_aperture: merged_aperture["CONTRABAND_PATTERNS"] = [] - merged_aperture["CONTRABAND_PATTERNS"].extend( - overrides["exclude_paths"] - ) + merged_aperture["CONTRABAND_PATTERNS"].extend(overrides["exclude_paths"]) logging.debug( f" -> Patched Contraband Shield (Added {len(overrides['exclude_paths'])} exact paths)." ) @@ -2604,9 +2294,7 @@ def main(): merged_langs[lang]["extensions"] = overrides["extensions"] logging.debug(f" -> Patched '{lang}' extensions.") - rules_patch = { - k: v for k, v in overrides.items() if k != "extensions" - } + rules_patch = {k: v for k, v in overrides.items() if k != "extensions"} if rules_patch and "rules" in merged_langs[lang]: merged_langs[lang]["rules"].update(rules_patch) logging.debug(f" -> Patched '{lang}' geometry rules.") @@ -2626,14 +2314,12 @@ def main(): # --------------------------------------------------------- full_config = { "LANGUAGE_DEFINITIONS": merged_langs, - "COMMENT_DEFINITIONS": COMMENT_DEFINITIONS, + "LEXICAL_FAMILY_HEURISTICS": LEXICAL_FAMILY_HEURISTICS, "APERTURE_CONFIG": merged_aperture, "PATH_MODIFIERS": PATH_MODIFIERS, "PRIORITY_WHITELIST": PRIORITY_WHITELIST, "TEST_NAMING_CONVENTIONS": TEST_NAMING_CONVENTIONS, - "DOCUMENTATION_LANGUAGES": ASSET_MASKS.get( - "DOCUMENTATION_LANGUAGES", set() - ), + "DOCUMENTATION_LANGUAGES": ASSET_MASKS.get("DOCUMENTATION_LANGUAGES", set()), "PARANOID_MODE": args.paranoid, "SHADOW_PATCH_DETECTED": args.shadow_patch_detected, # <--- Pass the flag # --- PASS EXCLUSIVE FLAGS TO ORCHESTRATOR --- @@ -2662,4 +2348,4 @@ def main(): # This tells Python to run main() if you call the file directly, # but allows PyPI to map to main() dynamically. 
if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/licensing.py b/gitgalaxy/licensing.py index dbb51784..c846e99e 100644 --- a/gitgalaxy/licensing.py +++ b/gitgalaxy/licensing.py @@ -84,12 +84,6 @@ def enforce_licensing_guard(tool_name: str = "GitGalaxy Engine v2"): Evaluates runtime environment for PolyForm compliance. Injects operational friction or audit tripwires for unverified environments. """ - # --- THE PYTEST BYPASS --- - # Keeps our CI/CD logs clean by instantly exiting during automated tests. - if "PYTEST_CURRENT_TEST" in os.environ or os.environ.get("GITGALAXY_ENV") == "development": - return - # ------------------------- - # --- ZERO-DEPENDENCY .ENV LOADER --- # Python doesn't read .env files natively. This parses it manually # so we don't force users to pip install python-dotenv. @@ -103,7 +97,7 @@ def enforce_licensing_guard(tool_name: str = "GitGalaxy Engine v2"): if line and not line.startswith("#") and "=" in line: key, val = line.split("=", 1) # Only inject if it's not already set in the system environment - os.environ.setdefault(key.strip(), val.strip().strip('"\'')) + os.environ.setdefault(key.strip(), val.strip().strip("\"'")) except Exception: pass # Fail gracefully if the .env file is locked by OS permissions # ----------------------------------- @@ -197,13 +191,11 @@ def enforce_licensing_guard(tool_name: str = "GitGalaxy Engine v2"): " Incident has been flagged. 
Executing under maximum compliance friction.", file=sys.stderr, ) - print( - " Contact joe@gitgalaxy.io to acquire a valid commercial key.", file=sys.stderr - ) + print(" Contact joe@gitgalaxy.io to acquire a valid commercial key.", file=sys.stderr) print( "\n >>> Enforcing 10-second synchronization delay for compliance visibility...", file=sys.stderr, ) print("=" * 80 + "\n", file=sys.stderr) sys.stderr.flush() - time.sleep(10.0) \ No newline at end of file + time.sleep(10.0) diff --git a/gitgalaxy/metrics/README.md b/gitgalaxy/metrics/README.md index 17db0a2a..2599e16c 100644 --- a/gitgalaxy/metrics/README.md +++ b/gitgalaxy/metrics/README.md @@ -1,36 +1,63 @@ -# GitGalaxy: Static Analysis & Heuristics Engine +# GitGalaxy Metrics: Heuristic Synthesis & Statistical Auditing -[![Analysis](https://img.shields.io/badge/Analysis-Sigmoid_Curves-00BFFF.svg)](#) -[![Machine Learning](https://img.shields.io/badge/ML-K--Means_Clustering-00C957.svg)](#) -[![Security](https://img.shields.io/badge/Security-Statistical_Auditing-8A2BE2.svg)](#) +[![Architecture](https://img.shields.io/badge/Architecture-Heuristic_Synthesis-8A2BE2.svg)](#) +[![Reliability](https://img.shields.io/badge/Reliability-False__Positive_Eradication-00BFFF.svg)](#) +[![Performance](https://img.shields.io/badge/Performance-Zero--RAM_Auditing-FF4500.svg)](#) -This directory contains the mathematical core of the **blAST Engine**. +Welcome to **GitGalaxy Metrics**. If the `core/` directory is the extraction layer (identifying raw structural signals), this directory is the analytical brain. -Once the `/core` lexical parsing layer has sliced the source code into structural signals, this engine takes over. It applies mathematical heuristics to generate 0-100% risk exposures, classifies architectural intent using Machine Learning, and maps the temporal churn of the repository over time. 
+It is responsible for consuming raw lexical data, merging it with temporal Git telemetry, and translating it into actionable, multi-dimensional risk vectors. This is where raw data becomes architectural intelligence. -> **⚠️ Configuration Warning:** Do not modify these core analysis files to tune the engine's behavior. Almost all variables, Sigmoid curve slopes, risk thresholds, and path modifiers have been abstracted to the **[Standards Registry](../standards/README.md)**. +## The Why: False-Positive Eradication & Alert Fatigue -### 🗺️ The Architecture +Traditional Static Application Security Testing (SAST) tools suffer from a fatal flaw: they flag raw vulnerabilities in a vacuum. A raw execution command inside a deprecated, unimported sandbox script generates the same critical alert as one sitting in your primary routing controller. This lack of context generates massive false-positive fatigue, eventually causing engineering teams to ignore the scanner entirely. -* **`signal_processor.py`:** The core analytical engine. It takes the raw regex hits (e.g., number of branches, number of allocations) and applies Sigmoid curves and domain-specific path modifiers to calculate Cognitive Load, Tech Debt, and Security risks. It also executes statistical anomaly detection to identify obfuscated malware or mismatched files hiding in foreign ecosystems. - * 📖 **[Read the Signal Processing Equations](https://squid-protocol.github.io/gitgalaxy/02-09-signal-processing/)** +The GitGalaxy Metrics engine is engineered to solve this through **Contextual Synthesis**. It operates on a fundamental rule: **Risk exposures are calculated metrics derived from structural hits; they are not the hits themselves.** The engine applies mathematical dampeners (like testing umbrellas, network isolation, and documentation shields) to raw signals. 
For example, a high "state flux" signal in a file with 100% test coverage and zero downstream dependents has its ultimate risk exposure mathematically dampened, reflecting true ecosystem reality rather than isolated syntax panic. -* **`chronometer.py`:** The version control telemetry engine. It interfaces with the local `.git` history using a high-speed stream-processing pipeline to calculate the pulse rate (churn) and stability of individual files over a rolling dynamic window. - * 📖 **[Read the Chronometer Mechanics](https://squid-protocol.github.io/gitgalaxy/02-15-chronometer/)** +--- + +## The What: Core Modules & Data Flow + +All modules in this directory are engineered to operate strictly in constant $O(1)$ or linear $O(N)$ time complexity. Because expensive disk I/O and regex parsing have already concluded in the `core/` phase, these mathematical operations execute across tens of thousands of files in milliseconds. + +### 1. `signal_processor.py` (The Mathematical Core) +The primary heuristic synthesis engine. It translates raw structural hits into an 18-point risk vector evaluating dimensions like Technical Debt, Cognitive Load, and State Flux. +* **Dual-Axis Anomaly Detection:** Evaluates threats using both global repository baselines and local language models. It leverages **Architectural Drift (Z-Score)** to mathematically flag files that blend in globally but violate their local ecosystem's structural norms. +* **Autonomous Execution Vectors & AI Topology:** Analyzes the density of LLM orchestration tools, vector databases, and execution loops to classify the repository's AI footprint. It explicitly flags vulnerabilities where raw **Prompt Injection Surfaces** flow directly into OS-level execution. + +### 2. `statistical_auditor.py` (The Quality Gate) +A statistical gatekeeper designed to protect downstream Machine Learning models and LLMs from context poisoning.
+* **Anomaly Quarantine:** Calculates the Median Absolute Deviation (MAD) of logic density across the assembled repository graph. Files that fall outside mathematical norms (e.g., massive auto-generated JSON dumps masquerading as code) are automatically quarantined and dropped before they can distort the final architectural map. -* **`neural_auditor.py`:** The machine learning inference engine. It uses pre-trained K-means clustering models to assign every file and function to a specific "Archetype" (e.g., *The God Node*, *Declarative Glue*, *Async Orchestrator*). It also contains sensors to surgically audit massive LLM weights (`.safetensors`, `.gguf`) without loading them into RAM. - * 📖 **[Read the Neural Auditor & Archetypes Specs](https://squid-protocol.github.io/gitgalaxy/02-19-neural-auditor/)** +### 3. `chronometer.py` (The Temporal Engine) +Static code analysis is often blind to human behavior. The Chronometer bridges this gap by merging temporal Git telemetry with structural static analysis. +* **Authorship Centralization:** Extracts Git volatility, churn velocity, and ownership entropy without requiring heavy dependency parsing. By cross-referencing code mass with developer churn, the engine calculates centralization—identifying critical, load-bearing files authored and understood entirely by a single developer. -* **`spectral_auditor.py`:** The quality control gate. It uses statistical normalization to detect Structural Drift. If a file acts as a statistical outlier compared to its peers (e.g., a massive data dump disguised as code), it is quarantined to prevent it from corrupting the architectural knowledge graph. - * 📖 **[Read the Spectral Audit Specs](https://squid-protocol.github.io/gitgalaxy/02-11-spectral-audit/)** +### 4. `tensor_scanner.py` (The AI Infrastructure Auditor) +As repositories increasingly embed local AI models, standard parsers crash attempting to evaluate gigabyte-scale binaries. 
+* **Zero-RAM Auditing:** Performs binary header audits on local model weights (`.gguf`, `.safetensors`). It surgically extracts architectural metadata without loading the massive weights into system memory, safely visualizing AI infrastructure without risking OS-level crashes. -

+--- + +## 🧠 Engineering Highlights (Architectural Defenses) + +If you are evaluating the `metrics/` architecture, pay special attention to how we bypass the computational and statistical bottlenecks of enterprise-scale analysis: + +* **Zombie Process & FD Leak Prevention (`chronometer.py`):** Parsing a decade-long Git log for a monolithic repository will crash CI/CD runners by exhausting RAM and stalling the CPU. We enforce a dual-axis kill switch (volume targets and hard timeouts) via buffered `Popen` streams. To prevent zombie processes, we execute strict `SIGKILL` and `communicate()` flushing, ensuring OS file descriptors are perfectly sterilized even when the stream is aborted early. +* **Heuristic Extension Consensus (`statistical_auditor.py`):** Certain file extensions (`.h`, `.m`) are ambiguous across languages (C vs. C++ vs. Objective-C). Instead of guessing, the engine surveys the macro-state of the repository. If 80% of the repository's confidently parsed `.h` files are confirmed as C++, the auditor mathematically forces all ambiguous headers to align with the ecosystem consensus, resolving collisions dynamically without AST compilation. +* **The Impossible Density Law (`statistical_auditor.py`):** Normal human code rarely sustains > 1.5 structural signature hits per physical line. If a file sustains > 3.0 across 30+ lines, it is mathematically guaranteed to be minified, obfuscated, or packed with embedded binaries. The auditor catches these "Packed Payloads" and shunts them out of the standard risk pool, preventing malicious obfuscation from hiding in the noise. +* **Memory Exhaustion Vulnerability Detector (`signal_processor.py`):** If the processor detects high algorithmic complexity ($O(N^3)$ or recursive depth) combined with high state mutation in a function that lacks lazy evaluation/generators, it flags the function as a severe Memory Exhaustion Vulnerability.
It mathematically multiplies the state flux risk, instantly highlighting architectural bottlenecks that could bloat RAM. +* **Zero-RAM Exhaustion Guards (`tensor_scanner.py`):** A malicious actor can craft a tiny `.safetensors` file claiming its JSON header is 500GB, triggering a catastrophic memory exhaustion attack when a Python parser attempts to read it. Our tensor scanner reads strictly the first 8 bytes to extract the header size and enforces a hard 100MB cap, mathematically guaranteeing pipeline survival in $O(1)$ space complexity. --- -### 🌌 Powered by the blAST Engine +## 🌌 The GitGalaxy Ecosystem (Powered by the blAST Engine) + +GitGalaxy Metrics is the analytical processing layer of the broader **GitGalaxy Ecosystem**—a high-velocity, AST-free, LLM-free heuristic knowledge graph engine designed for planetary-scale repositories. -This documentation is part of the [GitGalaxy Ecosystem](https://github.com/squid-protocol/gitgalaxy), an AST-free, LLM-free heuristic knowledge graph engine. +Explore the ecosystem: -* 🪐 **[Explore the GitHub Repository](https://github.com/squid-protocol/gitgalaxy)** for code, tools, and updates. -* 🔭 **[Visualize your own repository at GitGalaxy.io](https://gitgalaxy.io/)** using our interactive 3D WebGPU dashboard. \ No newline at end of file +* 🪐 **[Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** — Comprehensive deep dives into the engine's mathematics, pipeline architecture, and DevSecOps integration protocols. +* 🔭 **[GitGalaxy Visualizer](https://gitgalaxy.io/)** — Render your codebase's topological network locally in interactive 3D using hardware-accelerated WebGPU. +* 📖 **[The blAST Paradigm](https://squid-protocol.github.io/gitgalaxy/docs/wiki/01-03-the-blast-paradigm/)** — The architectural thesis, academic research, and structural math that make AST-free parsing possible at scale.
+* ⚙️ **[Language Calibration Standards](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md)** — The definitive engineering guide to extending our comparative lexical taxonomy for custom enterprise dialects. \ No newline at end of file diff --git a/gitgalaxy/metrics/chronometer.py b/gitgalaxy/metrics/chronometer.py index b850e699..c792079a 100644 --- a/gitgalaxy/metrics/chronometer.py +++ b/gitgalaxy/metrics/chronometer.py @@ -65,9 +65,7 @@ def __init__(self, root_path: Path, parent_logger: Optional[logging.Logger] = No self.repo_min_time = time.time() self.repo_max_time = time.time() - self.logger.debug( - f"Initializing Time-Series Analyzer for: '{self.root.name}'..." - ) + self.logger.debug(f"Initializing Time-Series Analyzer for: '{self.root.name}'...") # 1. Git Binary Verification & Boundary Survey self._initialize_history_scan() @@ -81,9 +79,7 @@ def _initialize_history_scan(self): try: subprocess.run(["git", "--version"], capture_output=True, check=True) self.is_git_enabled = True - self.logger.debug( - "Git binary verified. Commencing Deep Boundary Survey." - ) + self.logger.debug("Git binary verified. Commencing Deep Boundary Survey.") except (subprocess.CalledProcessError, FileNotFoundError): self.logger.warning("Git binary not found. 
Falling back to OS Walk.") @@ -141,26 +137,20 @@ def _determine_commit_bounds(self): if res_min_time.stdout.strip(): self.repo_min_time = float(res_min_time.stdout.strip()) - self.logger.debug( - f"Boundaries Locked (Git): {self.repo_min_time} to {self.repo_max_time}" - ) + self.logger.debug(f"Boundaries Locked (Git): {self.repo_min_time} to {self.repo_max_time}") return except Exception as e: - self.logger.warning( - f"Git boundary survey failed, falling back to FS scan: {e}" - ) + self.logger.warning(f"Git boundary survey failed, falling back to FS scan: {e}") # Fallback: OS Walk for boundaries utilizing global Aperture configs - black_holes = self.aperture_config.get("IGNORED_DIRECTORIES", set()) + ignored_dirs = self.aperture_config.get("IGNORED_DIRECTORIES", set()) scan_limit = self.chrono_config.get("FALLBACK_SCAN_LIMIT", 25000) min_t, max_t = float("inf"), 0.0 count = 0 for root, dirs, files in os.walk(self.root): # Skip noise sectors dynamically - dirs[:] = [ - d for d in dirs if not d.startswith(".") and d not in black_holes - ] + dirs[:] = [d for d in dirs if not d.startswith(".") and d not in ignored_dirs] for f in files: try: m = os.path.getmtime(os.path.join(root, f)) @@ -222,19 +212,16 @@ def _scan_git_history(self): # ====================================================================== # DEFENSIVE ARCHITECTURE: Compute & RAM Starvation Guard - # Parsing a decade-long Git log for a monolithic repository will crash - # the CI/CD runner by exhausting available RAM and stalling the CPU. - # We enforce a dual-axis kill switch: + # Parsing a decade-long Git log for a monolithic repository will crash + # the CI/CD runner by exhausting available RAM and stalling the CPU. + # We enforce a dual-axis kill switch: # Axis 1 (Volume): Stop scanning once 50% of active files are mapped (max 5000). # Axis 2 (Time): Hard abort after 'timeout_limit' seconds. 
# ====================================================================== required_files = min(int(total_files * 0.50), 5000) timeout_limit = self.chrono_config.get("STREAM_TIMEOUT_SECONDS", 15.0) - self.logger.info( - f"Chronometer: Engaging 1-Year Historical Sweep. " - f"Budget: {timeout_limit}s" - ) + self.logger.info(f"Chronometer: Engaging 1-Year Historical Sweep. Budget: {timeout_limit}s") ignored_hashes = self._load_ignored_revs() @@ -262,9 +249,7 @@ def _scan_git_history(self): duration = time.time() - start_time # Filter our churn map to only count currently tracked files for the final pct - coverage_achieved = len( - [k for k in self.churn_map.keys() if k in tracked_files] - ) + coverage_achieved = len([k for k in self.churn_map.keys() if k in tracked_files]) pct = coverage_achieved / max(total_files, 1) * 100 self.logger.info( @@ -328,9 +313,7 @@ def _stream_git_log( skip_current_commit = False current_ts = float(parts[1]) - current_author = ( - parts[2].strip() if len(parts) > 2 else "Unknown" - ) + current_author = parts[2].strip() if len(parts) > 2 else "Unknown" continue if skip_current_commit: @@ -351,9 +334,7 @@ def _stream_git_log( # Track Ownership Entropy if path_key not in self.author_map: self.author_map[path_key] = {} - self.author_map[path_key][current_author] = ( - self.author_map[path_key].get(current_author, 0) + 1 - ) + self.author_map[path_key][current_author] = self.author_map[path_key].get(current_author, 0) + 1 # Track Stability (MTime) if current_ts > self.mtime_map.get(path_key, 0.0): @@ -366,10 +347,10 @@ def _stream_git_log( finally: # ================================================================== # DEFENSIVE ARCHITECTURE: Zombie Process & FD Leak Prevention - # Because our Compute Guards will frequently break the Popen stream + # Because our Compute Guards will frequently break the Popen stream # *before* Git finishes outputting the log, the OS pipe remains open. 
- # If we do not explicitly send a SIGKILL and flush the File Descriptors - # via communicate(), we will spawn thousands of Zombie Processes that + # If we do not explicitly send a SIGKILL and flush the File Descriptors + # via communicate(), we will spawn thousands of Zombie Processes that # will eventually take down the host machine. # ================================================================== if process: @@ -397,9 +378,9 @@ def get_file_history_metrics(self, rel_path: str) -> Dict[str, Any]: """ ======================================================================== DEFENSIVE ARCHITECTURE: Zero-I/O Thread Safety - This method is called thousands of times per second by the isolated - Multi-Processing worker pool during Phase 1. If it triggered disk reads - or Git CLI commands, it would cause an IPC deadlock. All lookups here + This method is called thousands of times per second by the isolated + Multi-Processing worker pool during Phase 1. If it triggered disk reads + or Git CLI commands, it would cause an IPC deadlock. All lookups here are guaranteed to be O(1) RAM dictionary accesses. 
======================================================================== """ @@ -423,4 +404,4 @@ def get_file_history_metrics(self, rel_path: str) -> Dict[str, Any]: "repo_max_time": self.repo_max_time, "is_git_tracked": self.is_git_enabled, "authors": self.author_map.get(lookup_key, {}), - } \ No newline at end of file + } diff --git a/gitgalaxy/metrics/signal_processor.py b/gitgalaxy/metrics/signal_processor.py index 4944093f..aceafeb0 100644 --- a/gitgalaxy/metrics/signal_processor.py +++ b/gitgalaxy/metrics/signal_processor.py @@ -16,7 +16,7 @@ from gitgalaxy.standards import analysis_lens # ============================================================================== -# GitGalaxy Phase 4: Signal Processor (The Physics Engine) +# GitGalaxy Phase 4: Signal Processor (The Structural Signature Analysis Engine) # Strategy v6.2.0 Protocol: Temporal Normalization & Universal Exposure # ============================================================================== @@ -67,23 +67,17 @@ def __init__( # ====================================================================== # ---> NEW (DYNAMIC) <--- inference_model = getattr(config, "GENERAL_FILE_INFERENCE_MODEL", {}) - self.SCALER_MEDIANS = inference_model.get( - "SCALER_MEDIANS", [0.0] * 100 - ) # Safe fallback size + self.SCALER_MEDIANS = inference_model.get("SCALER_MEDIANS", [0.0] * 100) # Safe fallback size self.SCALER_IQRS = inference_model.get("SCALER_IQRS", [1.0] * 100) # Dynamically grab whichever ARCHETYPES_K key exists (e.g. 
ARCHETYPES_K9) - arch_key = next( - (k for k in inference_model.keys() if k.startswith("ARCHETYPES_K")), None - ) + arch_key = next((k for k in inference_model.keys() if k.startswith("ARCHETYPES_K")), None) self.GLOBAL_ARCHETYPES = inference_model.get(arch_key, {}) if arch_key else {} # ---> NEW: Fetch Language-Specific Clustering Models <--- - self.LANGUAGE_INFERENCE_MODELS = getattr( - config, "SPECIFIC_FILE_INFERENCE_MODEL", {} - ) + self.LANGUAGE_INFERENCE_MODELS = getattr(config, "SPECIFIC_FILE_INFERENCE_MODEL", {}) - # Fetch Physics Constants + # Fetch Structural Constants physics = getattr(config, "ENGINE_CONSTANTS", {}) self.WEIGHT_RISK = physics.get("WEIGHT_RISK", 2.5) self.WEIGHT_DEFENSE = physics.get("WEIGHT_DEFENSE", 1.0) @@ -120,24 +114,16 @@ def __init__( "memory": 5.0, "logic_bomb": 3.0, }, # C code hiding in a JS app = Trojan - "infra_in_web": { - "logic_bomb": 4.0 - }, # Shell script hiding in a JS app = Backdoor - "web_in_systems": { - "flux": 3.0 - }, # JS embedded in C firmware = Bizarre architecture + "infra_in_web": {"logic_bomb": 4.0}, # Shell script hiding in a JS app = Backdoor + "web_in_systems": {"state_mutation": 3.0}, # JS embedded in C firmware = Bizarre architecture }, ) # ---> NEW: Fetch the Archetype Matrix - self.CONTEXT_VIOLATION_MATRIX = security_profiles.get( - "CONTEXT_VIOLATION_MATRIX", {} - ) + self.CONTEXT_VIOLATION_MATRIX = security_profiles.get("CONTEXT_VIOLATION_MATRIX", {}) + + self.logger.info("Signal Processor Online | Context-Aware Risk Schema & ML Archetypes loaded.") - self.logger.info( - "Signal Processor Online | Context-Aware Risk Schema & ML Archetypes loaded." 
- ) - def _classify_archetype( self, scaled_vector: List[float], archetypes_dict: Dict[str, List[float]] ) -> Tuple[str, float, Dict[str, float]]: @@ -167,15 +153,13 @@ def _classify_archetype( return best_match, round(min_dist, 3), fingerprint - def _get_context_multipliers( - self, file_lang: str, folder_lang: str - ) -> Dict[str, float]: + def _get_context_multipliers(self, file_lang: str, folder_lang: str) -> Dict[str, float]: """ Calculates risk multipliers by comparing an asset's language to its directory environment. Detects architectural boundary violations and embedded payloads (e.g., C code in a JS directory). """ # Default multipliers if no specific context rules apply - multipliers = {"memory": 1.0, "logic_bomb": 1.0, "flux": 1.0, "injection": 1.0} + multipliers = {"memory": 1.0, "logic_bomb": 1.0, "state_mutation": 1.0, "injection": 1.0} file_lang = file_lang.lower() folder_lang = folder_lang.lower() if folder_lang else file_lang @@ -198,7 +182,6 @@ def _get_context_multipliers( if file_eco == folder_eco: return self.NATIVE_WEIGHTS.get(file_eco, multipliers) - # SCENARIO 2: The Entity is an Alien (Context Mismatch) alien_key = f"{file_eco}_in_{folder_eco}" alien_penalties = self.ECOSYSTEM_MISMATCH_WEIGHTS.get(alien_key, {}) @@ -216,9 +199,9 @@ def _get_context_multipliers( def _calculate_silo_risk(self, authors: dict) -> float: """ - Calculates the 'Bus Factor' risk of a file. - 100% = A single developer wrote the entire file (High Silo Risk). - 0% = Perfectly distributed across multiple developers (Low Silo Risk). + Calculates the Authorship Centralization risk of a file. + 100% = A single developer wrote the entire file (High Centralization). + 0% = Perfectly distributed across multiple developers (Low Centralization). 
""" if not authors: return 0.0 @@ -238,7 +221,7 @@ def calculate_risk_vector( raw_signals: Dict[str, int], umbrella_bonus: float = 0.0, ) -> Dict[str, Any]: - """Calculates risk exposure, temporal physics, and per-file physical impact.""" + """Calculates risk exposure, temporal analysis, and per-file structural impact.""" rel_path = meta.get("path", "unknown") loc = 1 # Safe fallback for the except block @@ -317,11 +300,7 @@ def calculate_risk_vector( secrets_exact = aperture_cfg.get("SECRETS_EXACT", set()) aperture_reason = ghost_meta.get("aperture_reason", "") - is_critical_leak = ( - "CRITICAL LEAK" in aperture_reason - or ext in secrets_exts - or filename in secrets_exact - ) + is_critical_leak = "CRITICAL LEAK" in aperture_reason or ext in secrets_exts or filename in secrets_exact if is_critical_leak: temporal_data = meta.get("temporal_telemetry", {}) @@ -350,7 +329,7 @@ def calculate_risk_vector( return { "risk_vector": blanket_risk_vector, "hit_vector": [0] * len(self.SIGNAL_SCHEMA), - "file_impact": 150.0, # Massive physical footprint for the 3D map + "file_impact": 150.0, # Massive structural footprint for the topological map "telemetry": { "archetype": getattr(config, "STATIC_ARCHETYPES", {}).get( "data", "Static: Declarative Data & Configurations" @@ -376,27 +355,19 @@ def calculate_risk_vector( # 2. Check for ANY malicious intent (eval, network fetching, etc.) intent_mass = ( - raw_signals.get("sec_danger", 0) + raw_signals.get("sec_high_risk_execution", 0) + raw_signals.get("sec_io", 0) - + raw_signals.get("sec_safety_neg", 0) + + raw_signals.get("sec_safety_bypasses", 0) ) if intent_mass > 0: - self.logger.critical( - f"🚨 OBFUSCATION DETECTED: {rel_path} contains obscured execution/IO!" 
- ) + self.logger.critical(f"🚨 OBFUSCATION DETECTED: {rel_path} contains obscured execution/IO!") if "obscured_payload" in self.RISK_SCHEMA: - blanket_risk_vector[ - self.RISK_SCHEMA.index("obscured_payload") - ] = 100.0 + blanket_risk_vector[self.RISK_SCHEMA.index("obscured_payload")] = 100.0 if "logic_bomb" in self.RISK_SCHEMA: - blanket_risk_vector[self.RISK_SCHEMA.index("logic_bomb")] = ( - 100.0 - ) + blanket_risk_vector[self.RISK_SCHEMA.index("logic_bomb")] = 100.0 if "injection_surface" in self.RISK_SCHEMA: - blanket_risk_vector[ - self.RISK_SCHEMA.index("injection_surface") - ] = 100.0 + blanket_risk_vector[self.RISK_SCHEMA.index("injection_surface")] = 100.0 return { "risk_vector": blanket_risk_vector, @@ -421,9 +392,7 @@ def calculate_risk_vector( # STATIC LITERATURE OVERRIDE # Treat pure literature as static structural assets, skipping logic math # ================================================================== - doc_languages = self.asset_masks.get( - "DOCUMENTATION_LANGUAGES", {"markdown", "plaintext", "rst", "text"} - ) + doc_languages = self.asset_masks.get("DOCUMENTATION_LANGUAGES", {"markdown", "plaintext", "rst", "text"}) if lang_id.lower() in doc_languages: temporal_data = meta.get("temporal_telemetry", {}) @@ -439,15 +408,11 @@ def calculate_risk_vector( blanket_risk_vector = [0.0] * len(self.RISK_SCHEMA) if "churn" in self.RISK_SCHEMA: - blanket_risk_vector[self.RISK_SCHEMA.index("churn")] = min( - raw_churn_freq * 10, 100.0 - ) + blanket_risk_vector[self.RISK_SCHEMA.index("churn")] = min(raw_churn_freq * 10, 100.0) if "documentation" in self.RISK_SCHEMA: - blanket_risk_vector[self.RISK_SCHEMA.index("documentation")] = ( - 0.0 # <-- The Fix! 0% Risk. - ) - if "civil_war" in self.RISK_SCHEMA: - blanket_risk_vector[self.RISK_SCHEMA.index("civil_war")] = 50.0 + blanket_risk_vector[self.RISK_SCHEMA.index("documentation")] = 0.0 # <-- The Fix! 0% Risk. 
+ if "tabs_vs_spaces" in self.RISK_SCHEMA: + blanket_risk_vector[self.RISK_SCHEMA.index("tabs_vs_spaces")] = 50.0 return { "risk_vector": blanket_risk_vector, @@ -459,7 +424,7 @@ def calculate_risk_vector( ), "control_flow_ratio": 0.0, "ownership_entropy": 0.0, # <-- FIX: Documentation has no logic entropy - "author_distribution": 0.0, # <-- FIX: Plaintext changelogs don't have a Bus Factor + "author_distribution": 0.0, # <-- FIX: Plaintext changelogs don't have Authorship Centralization risk "ownership": dominant_author, "domain_context": ghost_meta, }, @@ -479,19 +444,15 @@ def calculate_risk_vector( folder_lang = ghost_meta.get("folder_dominant_lang", lang_id) eco_mp = self._get_context_multipliers(lang_id, folder_lang) - self.logger.debug( - f"[{rel_path}] Physics Calc | Lang: {lang_id} (Fc: {fc:.2f}, Irc: {irc}, Ot: {ot:.2f})" - ) - + self.logger.debug(f"[{rel_path}] Structural Calc | Lang: {lang_id} (Fc: {fc:.2f}, Irc: {irc}, Ot: {ot:.2f})") + hit_vector = [raw_signals.get(key, 0) for key in self.SIGNAL_SCHEMA] # ------------------------------------------------------------------ # 1. TEMPORAL PRE-PROCESSING (Raw Extraction) # ------------------------------------------------------------------ temporal_data = meta.get("temporal_telemetry", {}) - stability_score, raw_churn_freq = self._calc_raw_temporal_signals( - temporal_data - ) + stability_score, raw_churn_freq = self._calc_raw_temporal_signals(temporal_data) # ------------------------------------------------------------------ # 1.5 BUILD THE ML VECTOR & CLASSIFY ARCHETYPE @@ -507,9 +468,7 @@ def calculate_risk_vector( encapsulation_ratio = 1.0 # Safe by default if no state exists else: # 1.0 = Perfect (0 globals). 0.0 = Terrible (All globals). 
- encapsulation_ratio = max( - 0.0, 1.0 - (global_vars / max(total_vars + global_vars, 1)) - ) + encapsulation_ratio = max(0.0, 1.0 - (global_vars / max(total_vars + global_vars, 1))) logic_loc = max(int(round(meta.get("coding_loc", 0) * cfr)), 1) safe_denom = max(logic_loc, meta.get("coding_loc", 1)) @@ -522,14 +481,10 @@ def calculate_risk_vector( max_big_o = 1 max_db_complexity = 0 - func_ml_brain = getattr( - analysis_lens, "GENERAL_FUNCTION_INFERENCE_MODEL", {} - ) + func_ml_brain = getattr(analysis_lens, "GENERAL_FUNCTION_INFERENCE_MODEL", {}) f_medians = func_ml_brain.get("SCALER_MEDIANS", []) f_iqrs = func_ml_brain.get("SCALER_IQRS", []) - f_arch_key = next( - (k for k in func_ml_brain.keys() if k.startswith("ARCHETYPES_K")), None - ) + f_arch_key = next((k for k in func_ml_brain.keys() if k.startswith("ARCHETYPES_K")), None) f_centroids = func_ml_brain.get(f_arch_key, {}) if f_arch_key else {} # Bulletproof fallback names if the model dictionary forgets them @@ -563,9 +518,7 @@ def calculate_risk_vector( if functions: complexities = [f.get("branch", 0) for f in functions] max_func_comp = max(complexities) - avg_func_args = sum([f.get("args", 0) for f in functions]) / len( - functions - ) + avg_func_args = sum([f.get("args", 0) for f in functions]) / len(functions) max_big_o = max([f.get("big_o_depth", 1) for f in functions]) max_db_complexity = max([f.get("db_complexity", 0) for f in functions]) has_recursion = any([f.get("is_recursive", False) for f in functions]) @@ -595,26 +548,18 @@ def calculate_risk_vector( scaled_vec = [] for i, val in enumerate(raw_vec): med = f_medians[i] if i < len(f_medians) else 0.0 - iqr = ( - f_iqrs[i] if i < len(f_iqrs) and f_iqrs[i] > 0 else 1.0 - ) + iqr = f_iqrs[i] if i < len(f_iqrs) and f_iqrs[i] > 0 else 1.0 scaled_vec.append((val - med) / iqr) min_dist = float("inf") for c_key, centroid in f_centroids.items(): - dist = math.sqrt( - sum((a - b) ** 2 for a, b in zip(scaled_vec, centroid)) - ) + dist = math.sqrt(sum((a - 
b) ** 2 for a, b in zip(scaled_vec, centroid))) if dist < min_dist: min_dist = dist try: # If the key is numbered like "Cluster 0", extract the 0 c_idx = int(str(c_key).split(" ")[-1]) - s["archetype"] = ( - f_names[c_idx] - if c_idx < len(f_names) - else c_key - ) + s["archetype"] = f_names[c_idx] if c_idx < len(f_names) else c_key except ValueError: # If the key is already the name (e.g., "Interfaces"), use it directly! s["archetype"] = str(c_key) @@ -624,9 +569,9 @@ def calculate_risk_vector( sorted_comps = sorted(float(c) for c in complexities) n = len(sorted_comps) index = range(1, n + 1) - func_gini = ( - sum((2 * i - n - 1) * c for i, c in zip(index, sorted_comps)) - ) / (n * sum(sorted_comps)) + func_gini = (sum((2 * i - n - 1) * c for i, c in zip(index, sorted_comps))) / ( + n * sum(sorted_comps) + ) # ---> END FUNCTION-LEVEL ML CLASSIFICATION <--- raw_imports_count = len(meta.get("raw_imports", [])) @@ -643,7 +588,7 @@ def calculate_risk_vector( for key in self.SIGNAL_SCHEMA: # ---> THE DIMENSIONAL FIX: Ignore hardware_bridge and cryptography <--- if key in { - "civil_war", + "tabs_vs_spaces", "indent_tabs", "indent_spaces", "hardware_bridge", @@ -673,11 +618,7 @@ def calculate_risk_vector( scaled_vector_global = [] for i, val in enumerate(raw_vector): median = self.SCALER_MEDIANS[i] if i < len(self.SCALER_MEDIANS) else 0.0 - safe_iqr = ( - self.SCALER_IQRS[i] - if i < len(self.SCALER_IQRS) and self.SCALER_IQRS[i] > 0 - else 1.0 - ) + safe_iqr = self.SCALER_IQRS[i] if i < len(self.SCALER_IQRS) and self.SCALER_IQRS[i] > 0 else 1.0 scaled_vector_global.append((val - median) / safe_iqr) global_archetype, global_drift, arch_fingerprint = self._classify_archetype( @@ -695,27 +636,19 @@ def calculate_risk_vector( lang_iqrs = lang_brain.get("SCALER_IQRS", []) # Find the dynamic K-key (e.g., ARCHETYPES_K11) - arch_key = next( - (k for k in lang_brain.keys() if k.startswith("ARCHETYPES_K")), None - ) + arch_key = next((k for k in lang_brain.keys() if 
k.startswith("ARCHETYPES_K")), None) lang_archetypes = lang_brain.get(arch_key, {}) if arch_key else {} if lang_medians and lang_iqrs and lang_archetypes: scaled_vector_local = [] for i, val in enumerate(raw_vector): - median = ( - lang_medians[i] - if i < len(lang_medians) - else self.SCALER_MEDIANS[i] - ) - iqr = ( - lang_iqrs[i] if i < len(lang_iqrs) else self.SCALER_IQRS[i] - ) + median = lang_medians[i] if i < len(lang_medians) else self.SCALER_MEDIANS[i] + iqr = lang_iqrs[i] if i < len(lang_iqrs) else self.SCALER_IQRS[i] safe_iqr = iqr if iqr > 0 else 1.0 scaled_vector_local.append((val - median) / safe_iqr) - local_archetype, local_drift, local_fingerprint = ( - self._classify_archetype(scaled_vector_local, lang_archetypes) + local_archetype, local_drift, local_fingerprint = self._classify_archetype( + scaled_vector_local, lang_archetypes ) # ------------------------------------------------------------------ @@ -724,24 +657,18 @@ def calculate_risk_vector( # ---> HIGHER-ORDER SYNTHESIS: The OOM (Out of Memory) Bomb <--- # If O(N^3) or recursive, AND high flux, AND NO lazy_evaluation -> Massive Flux Multiplier oom_multiplier = 1.0 - if (max_big_o >= 3 or has_recursion) and raw_signals.get("flux", 0) > 0: + if (max_big_o >= 3 or has_recursion) and raw_signals.get("state_mutation", 0) > 0: if raw_signals.get("lazy_evaluation", 0) == 0: oom_multiplier = 3.0 # Ticking OOM bomb (Bloating RAM) else: oom_multiplier = 0.5 # Safely streamed (O(1) memory) - mp_map["flux"] = mp_map.get("flux", 1.0) * oom_multiplier + mp_map["state_mutation"] = mp_map.get("state_mutation", 1.0) * oom_multiplier # -------------------------------------------------------------- - cog_score, cog_raw = self._calc_cog_load( - loc, raw_signals, irc, fc, mp_map.get("cog", 1.0), func_gini - ) - saf_score = self._calc_safety( - loc, raw_signals, irc, fc, mp_map.get("safety", 1.0) - ) - debt_score = self._calc_tech_debt( - loc, raw_signals, irc, mp_map.get("debt", 1.0) - ) + cog_score, cog_raw = 
self._calc_cog_load(loc, raw_signals, irc, fc, mp_map.get("cog", 1.0), func_gini) + saf_score = self._calc_safety(loc, raw_signals, irc, fc, mp_map.get("safety", 1.0)) + debt_score = self._calc_tech_debt(loc, raw_signals, irc, mp_map.get("debt", 1.0)) test_score = self._calc_verification( loc, @@ -784,23 +711,15 @@ def calculate_risk_vector( "safety_score": saf_score, "tech_debt": debt_score, "verification": test_score, - "api_exposure": self._calc_api_exposure( - raw_signals, total_loc, popularity - ), - "concurrency": self._calc_concurrency( - loc, raw_signals, irc, mp_map.get("async", 1.0), functions - ), - "state_flux": self._calc_state_flux( - loc, raw_signals, irc, mp_map.get("flux", 1.0) - ), - "graveyard": self._calc_graveyard( - total_loc, raw_signals, mp_map.get("dead", 1.0) - ), + "api_exposure": self._calc_api_exposure(raw_signals, total_loc, popularity), + "concurrency": self._calc_concurrency(loc, raw_signals, irc, mp_map.get("async", 1.0), functions), + "state_flux": self._calc_state_flux(loc, raw_signals, irc, mp_map.get("state_mutation", 1.0)), + "dead_code": self._calc_graveyard(total_loc, raw_signals, mp_map.get("dead", 1.0)), "spec_match": spec_score, "stability": stability_score, "churn": 0.0, "documentation": doc_score, - "civil_war": self._calc_civil_war(raw_signals), + "tabs_vs_spaces": self._calc_civil_war(raw_signals), "algorithmic_dos": self._calc_algorithmic_dos( loc, raw_signals, @@ -839,20 +758,16 @@ def calculate_risk_vector( lang_id, global_archetype, ), - "secrets_risk": self._calc_secrets_risk( - loc, raw_signals, mp_map.get("secrets", 1.0) - ), + "secrets_risk": self._calc_secrets_risk(loc, raw_signals, mp_map.get("secrets", 1.0)), } # ------------------------------------------------------------------ # 3. 
VECTOR ASSEMBLY (Locked to RISK_SCHEMA order) # ------------------------------------------------------------------ - risk_vector_ordered = [ - round(exposure_vector[key], 4) for key in self.RISK_SCHEMA - ] + risk_vector_ordered = [round(exposure_vector[key], 4) for key in self.RISK_SCHEMA] # ------------------------------------------------------------------ - # 4. CALCULATE FILE IMPACT (The Mass) + # 4. CALCULATE FILE IMPACT (Structural Magnitude) # ------------------------------------------------------------------ functions = meta.get("functions", []) func_start = raw_signals.get("func_start", 0) @@ -871,21 +786,16 @@ def calculate_risk_vector( temp_effective_loc = min(loc, (temp_signals + 1) * 10) temp_arg_multiplier = math.sqrt(temp_args + 1) - sum_function_impacts = ( - (temp_branches + 1) * temp_arg_multiplier - + (0.05 * temp_effective_loc) - ) * 10 + sum_function_impacts = ((temp_branches + 1) * temp_arg_multiplier + (0.05 * temp_effective_loc)) * 10 api_exposure = raw_signals.get("api", 0) concurrency = raw_signals.get("concurrency", 0) - flux = raw_signals.get("flux", 0) + flux = raw_signals.get("state_mutation", 0) - file_mass = ( - sum_function_impacts + api_exposure + concurrency + flux + (loc / 50.0) - ) + file_mass = sum_function_impacts + api_exposure + concurrency + flux + (loc / 50.0) # ------------------------------------------------------------------ - # 5. EXECUTE OWNERSHIP ENTROPY MATH & SILO RISK + # 5. 
EXECUTE OWNERSHIP ENTROPY MATH & AUTHORSHIP CENTRALIZATION # ------------------------------------------------------------------ authors_map = meta.get("authors", {}) ownership_score = self._calc_ownership_entropy(authors_map) @@ -908,9 +818,7 @@ def calculate_risk_vector( "raw_churn_freq": raw_churn_freq, "func_complexity_gini": func_gini, "max_algorithmic_complexity": ( - "O(2^N) [Recursive]" - if has_recursion - else (f"O(N^{max_big_o})" if max_big_o > 1 else "O(N)") + "O(2^N) [Recursive]" if has_recursion else (f"O(N^{max_big_o})" if max_big_o > 1 else "O(N)") ), "max_db_complexity": max_db_complexity, "ownership_entropy": ownership_score, @@ -932,7 +840,7 @@ def calculate_risk_vector( except Exception as e: self.logger.error( - f"Catastrophic physics failure on artifact '{rel_path}': {e}", + f"Catastrophic structural failure on artifact '{rel_path}': {e}", exc_info=True, ) return { @@ -967,11 +875,7 @@ def get_avg(metric_name): if metric_name not in self.RISK_SCHEMA: return 0.0 idx = self.RISK_SCHEMA.index(metric_name) - scores = [ - f["risk_vector"][idx] - for f in parsed_files - if "risk_vector" in f and len(f["risk_vector"]) > idx - ] + scores = [f["risk_vector"][idx] for f in parsed_files if "risk_vector" in f and len(f["risk_vector"]) > idx] return round(statistics.mean(scores), 3) if scores else 0.0 lang_comp = {} @@ -992,9 +896,7 @@ def get_avg(metric_name): [ f for f in parsed_files - if "risk_vector" in f - and len(f["risk_vector"]) > churn_idx - and f["risk_vector"][churn_idx] > 80.0 + if "risk_vector" in f and len(f["risk_vector"]) > churn_idx and f["risk_vector"][churn_idx] > 80.0 ] ) volatility_idx = round(high_volatility / max(len(parsed_files), 1), 3) @@ -1053,18 +955,14 @@ def get_avg(metric_name): "count": count, "pct": round((count / len(parsed_files)) * 100.0, 1), } - for name, count in sorted( - archetype_counts.items(), key=lambda x: x[1], reverse=True - ) + for name, count in sorted(archetype_counts.items(), key=lambda x: x[1], 
reverse=True) } ecosystem_fingerprint["static_mass"] = { name: { "count": count, "pct": round((count / len(parsed_files)) * 100.0, 1), } - for name, count in sorted( - static_counts.items(), key=lambda x: x[1], reverse=True - ) + for name, count in sorted(static_counts.items(), key=lambda x: x[1], reverse=True) } # --- NEW: AI TOPOLOGY & LLM INTELLIGENCE --- @@ -1079,19 +977,13 @@ def get_avg(metric_name): "ml_traditional", "dl_frameworks", ] - ai_indices = { - k: self.SIGNAL_SCHEMA.index(k) - for k in ai_sensor_keys - if k in self.SIGNAL_SCHEMA - } + ai_indices = {k: self.SIGNAL_SCHEMA.index(k) for k in ai_sensor_keys if k in self.SIGNAL_SCHEMA} # Isolate the physical files harboring AI logic ai_files = [] for f in parsed_files: hv = f.get("hit_vector", []) - file_ai_mass = sum( - hv[idx] for k, idx in ai_indices.items() if idx < len(hv) - ) + file_ai_mass = sum(hv[idx] for k, idx in ai_indices.items() if idx < len(hv)) if file_ai_mass > 0: ai_files.append(f) @@ -1108,8 +1000,7 @@ def get_avg(metric_name): ( f.get("hit_vector", [])[self.SIGNAL_SCHEMA.index("llm_orchestrator")] if "llm_orchestrator" in self.SIGNAL_SCHEMA - and len(f.get("hit_vector", [])) - > self.SIGNAL_SCHEMA.index("llm_orchestrator") + and len(f.get("hit_vector", [])) > self.SIGNAL_SCHEMA.index("llm_orchestrator") else 0 ) for f in parsed_files @@ -1118,8 +1009,7 @@ def get_avg(metric_name): ( f.get("hit_vector", [])[self.SIGNAL_SCHEMA.index("llm_vector_store")] if "llm_vector_store" in self.SIGNAL_SCHEMA - and len(f.get("hit_vector", [])) - > self.SIGNAL_SCHEMA.index("llm_vector_store") + and len(f.get("hit_vector", [])) > self.SIGNAL_SCHEMA.index("llm_vector_store") else 0 ) for f in parsed_files @@ -1128,8 +1018,7 @@ def get_avg(metric_name): ( f.get("hit_vector", [])[self.SIGNAL_SCHEMA.index("llm_local_compute")] if "llm_local_compute" in self.SIGNAL_SCHEMA - and len(f.get("hit_vector", [])) - > self.SIGNAL_SCHEMA.index("llm_local_compute") + and len(f.get("hit_vector", [])) > 
self.SIGNAL_SCHEMA.index("llm_local_compute") else 0 ) for f in parsed_files @@ -1158,8 +1047,7 @@ def get_avg(metric_name): ( f.get("hit_vector", [])[self.SIGNAL_SCHEMA.index("ai_logic_loop")] if "ai_logic_loop" in self.SIGNAL_SCHEMA - and len(f.get("hit_vector", [])) - > self.SIGNAL_SCHEMA.index("ai_logic_loop") + and len(f.get("hit_vector", [])) > self.SIGNAL_SCHEMA.index("ai_logic_loop") else 0 ) for f in parsed_files @@ -1170,8 +1058,7 @@ def get_avg(metric_name): ( f.get("hit_vector", [])[self.SIGNAL_SCHEMA.index("ml_traditional")] if "ml_traditional" in self.SIGNAL_SCHEMA - and len(f.get("hit_vector", [])) - > self.SIGNAL_SCHEMA.index("ml_traditional") + and len(f.get("hit_vector", [])) > self.SIGNAL_SCHEMA.index("ml_traditional") else 0 ) for f in parsed_files @@ -1180,8 +1067,7 @@ def get_avg(metric_name): ( f.get("hit_vector", [])[self.SIGNAL_SCHEMA.index("dl_frameworks")] if "dl_frameworks" in self.SIGNAL_SCHEMA - and len(f.get("hit_vector", [])) - > self.SIGNAL_SCHEMA.index("dl_frameworks") + and len(f.get("hit_vector", [])) > self.SIGNAL_SCHEMA.index("dl_frameworks") else 0 ) for f in parsed_files @@ -1222,9 +1108,7 @@ def get_avg(metric_name): "Repository contains local model execution or tensor math. Expect heavy GPU memory allocation." ) elif llm_vector_total > 0 and llm_api_total > 0: - ai_topology["classification"] = ( - "RAG Pipeline (Retrieval-Augmented Generation)" - ) + ai_topology["classification"] = "RAG Pipeline (Retrieval-Augmented Generation)" ai_topology["insights"].append( "Active vector database integration detected. Architecture centers around data chunking and context retrieval." 
) @@ -1253,18 +1137,11 @@ def get_avg(metric_name): if ai_files: # Find the most heavily relied-upon AI node in the graph ai_files.sort( - key=lambda x: ( - x.get("telemetry", {}) - .get("network_metrics", {}) - .get("pagerank_score") - or 0.0 - ), + key=lambda x: x.get("telemetry", {}).get("network_metrics", {}).get("pagerank_score") or 0.0, reverse=True, ) primary_ai_node = ai_files[0] - net_mets = primary_ai_node.get("telemetry", {}).get( - "network_metrics", {} - ) + net_mets = primary_ai_node.get("telemetry", {}).get("network_metrics", {}) role = net_mets.get("ecosystem_role", "Unknown") pr = net_mets.get("normalized_blast_radius") or 0.0 @@ -1276,11 +1153,11 @@ def get_avg(metric_name): if pr > 1.0: ai_topology["insights"].append( - f"Systemic Risk (High): The AI components are deeply embedded with a massive Blast Radius (PageRank: {pr}). Hallucinations or prompt injections here will cascade catastrophically across the system." + f"Systemic Risk (High): The AI components are deeply embedded with a massive Dependency Blast Radius (PageRank: {pr}). Hallucinations or prompt injections here will cascade catastrophically across the system." ) elif pr < 0.2: ai_topology["insights"].append( - "Containment (Low Risk): The AI components are safely isolated at the edge of the network with a minimal blast radius." + "Containment (Low Risk): The AI components are safely isolated at the edge of the network with a minimal dependency blast radius." 
) if btw > 0.05: @@ -1311,20 +1188,13 @@ def get_avg(metric_name): if repo_model and parsed_files: # Rebuild the ratios based purely on the K-Means features - feature_counts = { - feat: archetype_counts.get(feat, 0) for feat in repo_model["features"] - } - live_ratios = [ - feature_counts[feat] / len(parsed_files) - for feat in repo_model["features"] - ] + feature_counts = {feat: archetype_counts.get(feat, 0) for feat in repo_model["features"]} + live_ratios = [feature_counts[feat] / len(parsed_files) for feat in repo_model["features"]] distances = [] for i in range(repo_model["k_clusters"]): centroid = repo_model["centroids"][f"Cluster {i}"] - dist = math.sqrt( - sum((a - b) ** 2 for a, b in zip(live_ratios, centroid)) - ) + dist = math.sqrt(sum((a - b) ** 2 for a, b in zip(live_ratios, centroid))) distances.append(dist) assigned_idx = distances.index(min(distances)) @@ -1409,7 +1279,7 @@ def _normalize_temporal_metrics(self, parsed_files: List[Dict[str, Any]]): file_data["risk_vector"][idx] = round(final_churn, 2) # ========================================================================== - # FORENSIC EQUATIONS (The Physics Models) + # FORENSIC EQUATIONS (The Structural Models) # ========================================================================== def _calc_raw_temporal_signals(self, temp: Dict[str, Any]) -> Tuple[float, float]: @@ -1461,7 +1331,7 @@ def _calc_ownership_entropy(self, authors: Dict[str, int]) -> float: def _calc_civil_war(self, raw_signals: Dict[str, int]) -> float: """ - Calculates Layout Consistency (Tabs vs Spaces). + Calculates Formatting Inconsistencies (Tabs vs Spaces). 0 = Pure Tabs (Consistent), 100 = Pure Spaces (Consistent), 50 = High Discrepancy. 
""" tab_lines = raw_signals.get("indent_tabs", 0) @@ -1497,10 +1367,10 @@ def _calc_cog_load( raw_signals.get(k, 0) for k in [ "branch", - "flux", + "state_mutation", "concurrency", - "heat_triggers", - "danger", + "reflection_metaprogramming", + "high_risk_execution", ] ] ) / safe_loc + (irc / safe_loc) @@ -1511,15 +1381,13 @@ def _calc_cog_load( return 0.0, 0.0 branch_density = branches / safe_loc - flux_density = raw_signals.get("flux", 0) / safe_loc + flux_density = raw_signals.get("state_mutation", 0) / safe_loc concurrency_density = raw_signals.get("concurrency", 0) / safe_loc - heat_density = raw_signals.get("heat_triggers", 0) / safe_loc - danger_density = raw_signals.get("danger", 0) / safe_loc + heat_density = raw_signals.get("reflection_metaprogramming", 0) / safe_loc + danger_density = raw_signals.get("high_risk_execution", 0) / safe_loc clamped_branch = min(branch_density * 1.0, t.get("branch_clamp", 0.5)) - clamped_flux = min( - flux_density * t.get("flux_mult", 2.0), t.get("flux_clamp", 0.75) - ) + clamped_flux = min(flux_density * t.get("flux_mult", 2.0), t.get("flux_clamp", 0.75)) heavy_logic = ( (concurrency_density * t.get("async_mult", 3.0)) + (heat_density * t.get("heat_mult", 5.0)) @@ -1533,20 +1401,14 @@ def _calc_cog_load( if func_gini > 0.7: gini_multiplier = 1.0 + (func_gini * 0.5) - total_density = ( - clamped_branch + clamped_flux + heavy_logic + (irc / safe_loc) - ) * gini_multiplier + total_density = (clamped_branch + clamped_flux + heavy_logic + (irc / safe_loc)) * gini_multiplier if safe_loc <= 2 and total_density == 0: return 0.0, total_density try: raw_score = 100.0 / ( - 1.0 - + math.exp( - -t.get("sigmoid_slope", 4.0) - * (total_density - t.get("sigmoid_offset", 0.75)) - ) + 1.0 + math.exp(-t.get("sigmoid_slope", 4.0) * (total_density - t.get("sigmoid_offset", 0.75))) ) except OverflowError: raw_score = 100.0 if total_density > t.get("sigmoid_offset", 0.75) else 0.0 @@ -1556,16 +1418,14 @@ def _calc_cog_load( return min(raw_score 
* cooling * mp, 100.0), total_density - def _calc_safety( - self, loc: int, raw_signals: Dict[str, int], irc: int, fc: float, mp: float - ) -> float: + def _calc_safety(self, loc: int, raw_signals: Dict[str, int], irc: int, fc: float, mp: float) -> float: safe_loc = max(loc, 1) t = self.risk_tuning.get("safety", {}) attack_hits = ( - (raw_signals.get("danger", 0) * t.get("danger_weight", 4.0)) - + (raw_signals.get("safety_neg", 0) * t.get("safety_neg_weight", 1.5)) - + (raw_signals.get("flux", 0) * t.get("flux_weight", 0.5)) + (raw_signals.get("high_risk_execution", 0) * t.get("danger_weight", 4.0)) + + (raw_signals.get("safety_bypasses", 0) * t.get("safety_neg_weight", 1.5)) + + (raw_signals.get("state_mutation", 0) * t.get("flux_weight", 0.5)) ) defense_hits = ( (raw_signals.get("safety", 0) * self.WEIGHT_DEFENSE) @@ -1584,13 +1444,11 @@ def _calc_safety( net_exposure = (attack - defense) - systems_buffer try: - score = 100.0 / ( - 1.0 + math.exp(-t.get("sigmoid_slope", 12.0) * net_exposure) - ) + score = 100.0 / (1.0 + math.exp(-t.get("sigmoid_slope", 12.0) * net_exposure)) except OverflowError: score = 100.0 if net_exposure > 0 else 0.0 - danger_density = (raw_signals.get("danger", 0) + raw_signals.get("safety_neg", 0)) / safe_loc + danger_density = (raw_signals.get("high_risk_execution", 0) + raw_signals.get("safety_bypasses", 0)) / safe_loc if danger_density > t.get("vulnerability_density_min", 0.03) and attack > defense: floor = min( t.get("breach_floor_max", 80.0), @@ -1600,25 +1458,17 @@ def _calc_safety( return max(score, 0.0) - def _calc_tech_debt( - self, loc: int, raw_signals: Dict[str, int], irc: int, mp: float - ) -> float: + def _calc_tech_debt(self, loc: int, raw_signals: Dict[str, int], irc: int, mp: float) -> float: t = self.risk_tuning.get("tech_debt", {}) good_debt = raw_signals.get("planned_debt", 0) bad_debt = raw_signals.get("fragile_debt", raw_signals.get("keyword_debt", 0)) stubs = raw_signals.get("func_empty", 0) # --- NEW: UNTRACKED 
COMPLEXITY (SLOP) --- - orphans = raw_signals.get("design_slop_orphans", 0) - duplicates = raw_signals.get("design_slop_duplicates", 0) - - if ( - good_debt == 0 - and bad_debt == 0 - and stubs == 0 - and orphans == 0 - and duplicates == 0 - ): + orphans = raw_signals.get("orphaned_logic", 0) + duplicates = raw_signals.get("duplicate_logic", 0) + + if good_debt == 0 and bad_debt == 0 and stubs == 0 and orphans == 0 and duplicates == 0: return 0.0 # Implicit debt carries a heavier baseline penalty because it is invisible to standard linters @@ -1640,9 +1490,7 @@ def _calc_tech_debt( threshold = t.get("threshold", 5.0) try: - raw_score = 100.0 / ( - 1.0 + math.exp(-t.get("sigmoid_slope", 0.5) * (density - threshold)) - ) + raw_score = 100.0 / (1.0 + math.exp(-t.get("sigmoid_slope", 0.5) * (density - threshold))) except OverflowError: raw_score = 100.0 if density > threshold else 0.0 @@ -1694,7 +1542,7 @@ def _calc_documentation( net_exposure = max(0.0, risk_hits - (defense_hits / 2.0)) density = (net_exposure / max(loc, 1)) * 100.0 - # 4. THE MULTIPLIERS (Blast Radius & Bus Factor) + # 4. THE MULTIPLIERS (Dependency Blast Radius & Authorship Centralization) # Undocumented code is exponentially more dangerous if it is highly # integrated (popularity) or siloed to a single developer. 
network_multiplier = 1.0 + (popularity / 10.0) @@ -1706,9 +1554,7 @@ def _calc_documentation( try: # We use a negative slope because high density = high risk exposure - raw_risk = 100.0 / ( - 1.0 + math.exp(-t.get("sigmoid_slope", 0.2) * (density - threshold)) - ) + raw_risk = 100.0 / (1.0 + math.exp(-t.get("sigmoid_slope", 0.2) * (density - threshold))) except OverflowError: raw_risk = 100.0 if density > threshold else 0.0 @@ -1775,9 +1621,7 @@ def _calc_verification( # Parameterization Multiplier param_multiplier = 2.0 if test.get("decorators", 0) > 0 else 1.0 - effective_test_impact_sum += ( - raw_impact * param_multiplier - ) / target_count + effective_test_impact_sum += (raw_impact * param_multiplier) / target_count defensive_ratio = effective_test_impact_sum / func_impact @@ -1786,7 +1630,7 @@ def _calc_verification( total_untested_impact += untested_impact # Add file-level danger as raw unverified mass - file_level_danger = float(raw_signals.get("danger", 0)) + file_level_danger = float(raw_signals.get("high_risk_execution", 0)) total_untested_impact += file_level_danger # Step D: Executable Density Normalization & Ecosystem Modifiers @@ -1797,7 +1641,7 @@ def _calc_verification( # umbrella_bonus is max 50.0. If bonus is 50, dampener is 0.5. 
guidestar_dampener = max(1.0 - (umbrella_bonus / 100.0), 0.1) - # Network Blast Radius (Amplifier) + # Dependency Blast Radius (Amplifier) blast_radius = mp + min(popularity * 0.2, 3.0) adjusted_density = (raw_density * guidestar_dampener) * blast_radius @@ -1807,9 +1651,7 @@ def _calc_verification( slope = t.get("sigmoid_slope", 0.25) try: - base_score = 100.0 / ( - 1.0 + math.exp(-slope * (adjusted_density - threshold)) - ) + base_score = 100.0 / (1.0 + math.exp(-slope * (adjusted_density - threshold))) except OverflowError: base_score = 100.0 if adjusted_density > threshold else 0.0 @@ -1818,39 +1660,32 @@ def _calc_verification( return 0.0 # Breach Cap: If untested mass is overwhelmingly larger than verified, cap to Fragile (80+) - if ( - total_untested_impact > (total_function_impact * 0.8) - and total_function_impact > 50.0 - ): + if total_untested_impact > (total_function_impact * 0.8) and total_function_impact > 50.0: return max(base_score, 80.0) return min(base_score, 100.0) def _calc_graveyard(self, total_loc: float, raw_signals: Dict[str, int], mp: float) -> float: - hits = raw_signals.get("graveyard", 0) + hits = raw_signals.get("dead_code", 0) if hits == 0: return 0.0 - t = self.risk_tuning.get("graveyard", {}) + t = self.risk_tuning.get("dead_code", {}) deprecated_lines = hits * t.get("hit_mult", 3.0) density = (deprecated_lines / max(total_loc, t.get("safe_mass_floor", 50.0))) * 100.0 threshold = t.get("threshold_base", 10.0) / max(mp, 0.1) try: - score = 100.0 / ( - 1.0 + math.exp(-t.get("sigmoid_slope", 0.3) * (density - threshold)) - ) + score = 100.0 / (1.0 + math.exp(-t.get("sigmoid_slope", 0.3) * (density - threshold))) except OverflowError: score = 100.0 if density > threshold else 0.0 return min(score, 100.0) - - def _calc_api_exposure( - self, raw_signals: dict, total_loc: int, popularity: int = 0 - ) -> float: + + def _calc_api_exposure(self, raw_signals: dict, total_loc: int, popularity: int = 0) -> float: """ - YIN: Publicly exposed 
surfaces (api). - YANG: Internal/Private boundaries (encapsulation). + RISK: Publicly exposed surfaces (api). + MITIGATION: Internal/Private boundaries (encapsulation). """ api_hits = float(raw_signals.get("api", 0)) encapsulation = float(raw_signals.get("encapsulation", 0)) @@ -1884,8 +1719,8 @@ def _calc_concurrency( functions: List[Dict[str, Any]] = None, ) -> float: """ - YIN: Threads/Async execution + Thread Starvation (O(N) Bombs). - YANG: Mutex/Locks/Semaphores (sync_locks). + RISK: Threads/Async execution + Thread Starvation (O(N) Bombs). + MITIGATION: Mutex/Locks/Semaphores (sync_locks). """ tuning = self.risk_tuning.get("concurrency", {}) loc_padding = tuning.get("loc_padding", 150) @@ -1921,20 +1756,18 @@ def _calc_concurrency( return self._sigmoid(density, threshold, slope) * 100.0 * mp - def _calc_state_flux( - self, loc: int, raw_signals: Dict[str, int], irc: int, mp: float - ) -> float: + def _calc_state_flux(self, loc: int, raw_signals: Dict[str, int], irc: int, mp: float) -> float: """ - YIN: State mutation (flux). - YANG: Immutability enforcements (freeze_hits). + RISK: State mutation (flux). + MITIGATION: Immutability enforcements (freeze_hits). """ tuning = self.risk_tuning.get("state_flux", {}) # THE FIX: Dropped padding to 0 so mutations immediately impact density loc_padding = tuning.get("loc_padding", 0) - raw_flux = float(raw_signals.get("flux", 0)) - freeze_hits = float(raw_signals.get("freeze_hits", 0)) + raw_flux = float(raw_signals.get("state_mutation", 0)) + freeze_hits = float(raw_signals.get("immutability_locks", 0)) # MITIGATION BALANCE: Subtract immutability from raw mutation. 
net_volatility = max(0.0, raw_flux - (freeze_hits * 0.5)) @@ -1979,15 +1812,15 @@ def _calc_obscured_payload( arch_matrix = self.CONTEXT_VIOLATION_MATRIX.get(archetype, {}) arch_multiplier = arch_matrix.get("obscured_payload_multiplier", 1.0) - obfuscation_indicators = (raw_signals.get("sec_heat_triggers", 0) * 5.0) + ( - raw_signals.get("sec_bitwise_hits", 0) * 2.0 + obfuscation_indicators = (raw_signals.get("sec_reflection_metaprogramming", 0) * 5.0) + ( + raw_signals.get("sec_bitwise_ops", 0) * 2.0 ) - malicious_payload = raw_signals.get("sec_safety_neg", 0) * 3.0 + malicious_payload = raw_signals.get("sec_safety_bypasses", 0) * 3.0 exfiltration = raw_signals.get("sec_io", 0) * 4.0 - rce_indicators = raw_signals.get("sec_danger", 0) * 5.0 - state_corruption = raw_signals.get("sec_flux", 0) * 3.0 - dead_code_threat = raw_signals.get("sec_graveyard", 0) * 2.0 - secrets = raw_signals.get("sec_private_info", 0) * 1.5 + rce_indicators = raw_signals.get("sec_high_risk_execution", 0) * 5.0 + state_corruption = raw_signals.get("sec_state_mutation", 0) * 3.0 + dead_code_threat = raw_signals.get("sec_dead_code", 0) * 2.0 + secrets = raw_signals.get("sec_hardcoded_secrets", 0) * 1.5 # Extension mismatch is proof of active evasion. Assign it a massive 20.0x mass. 
evasion_indicators = (raw_signals.get("sec_shadow_imports", 0) * 10.0) + ( @@ -2075,11 +1908,11 @@ def _calc_logic_bomb( arch_matrix = self.CONTEXT_VIOLATION_MATRIX.get(archetype, {}) arch_multiplier = arch_matrix.get("logic_bomb_multiplier", 1.0) - trigger = raw_signals.get("branch", 0) + (raw_signals.get("halt_hits", 0) * 3.0) + trigger = raw_signals.get("branch", 0) + (raw_signals.get("thread_sleeps", 0) * 3.0) payload = ( - (raw_signals.get("bailout_hits", 0) * 2.0) + (raw_signals.get("panics_and_aborts", 0) * 2.0) + (raw_signals.get("cleanup", 0) * 1.5) - + (raw_signals.get("sec_danger", 0) * 4.0) + + (raw_signals.get("sec_high_risk_execution", 0) * 4.0) ) # ---> THE AGENTIC SHIELD <--- @@ -2104,11 +1937,11 @@ def _calc_logic_bomb( dos_mass = attack_surface * (max_big_o**2) * 10.0 # 2. State Flux Bomb (Memory Exhaustion) - flux = raw_signals.get("flux", 0) + raw_signals.get("globals", 0) + flux = raw_signals.get("state_mutation", 0) + raw_signals.get("globals", 0) dos_mass += flux * (max_big_o**2) * 5.0 # 3. 
The Shielding Dampener (Safety Guardrails) - if raw_signals.get("safety", 0) > 0 or raw_signals.get("bailout_hits", 0) > 0: + if raw_signals.get("safety", 0) > 0 or raw_signals.get("panics_and_aborts", 0) > 0: dos_mass *= 0.25 # 75% reduction if guardrails exist sabotage_mass += dos_mass @@ -2128,15 +1961,11 @@ def _calc_logic_bomb( if sabotage_mass == 0: return 0.0 - explicit_threats = raw_signals.get("sec_graveyard", 0) + raw_signals.get("sec_heat_triggers", 0) + explicit_threats = raw_signals.get("sec_dead_code", 0) + raw_signals.get("sec_reflection_metaprogramming", 0) if max_big_o >= 3: explicit_threats += 1 # Preserve DoS Mass from being zeroed out - if ( - explicit_threats == 0 - and taint_confirmed == 0 - and not getattr(self, "is_paranoid", False) - ): + if explicit_threats == 0 and taint_confirmed == 0 and not getattr(self, "is_paranoid", False): sabotage_mass *= 0.05 # Fetch tuning parameters @@ -2156,10 +1985,8 @@ def _calc_logic_bomb( score = 100.0 if density > threshold else 0.0 return min(score * mp, 100.0) - - def _calc_injection_surface( - self, loc: int, raw_signals: Dict[str, int], mp: float, archetype: str - ) -> float: + + def _calc_injection_surface(self, loc: int, raw_signals: Dict[str, int], mp: float, archetype: str) -> float: """ Calculates Injection Surface Exposure (XSS, SQLi, RCE, SSTI). Looks for external network input flowing near dynamic execution without safety nets. 
@@ -2169,12 +1996,10 @@ def _calc_injection_surface( arch_multiplier = arch_matrix.get("injection_surface_multiplier", 1.0) input_vectors = raw_signals.get("sec_io", 0) + (raw_signals.get("ssr_boundaries", 0) * 2.0) - execution_vectors = (raw_signals.get("sec_danger", 0) * 4.0) + ( - raw_signals.get("sec_safety_neg", 0) * 2.0 - ) + execution_vectors = (raw_signals.get("sec_high_risk_execution", 0) * 4.0) + (raw_signals.get("sec_safety_bypasses", 0) * 2.0) # ---> LLM EXECUTION VULNERABILITY (Prompt Injection to Exec) <--- - if raw_signals.get("sec_danger", 0) > 0 and ( + if raw_signals.get("sec_high_risk_execution", 0) > 0 and ( raw_signals.get("llm_orchestrator", 0) > 0 or raw_signals.get("ai_tools", 0) > 0 ): # If an AI can trigger eval/exec/OS commands, it's a massive vulnerability @@ -2183,9 +2008,7 @@ def _calc_injection_surface( else: # ---> STATIC AI COMPUTE DAMPENER (Standard safe agents) <--- agent_dampener = ( - 1.0 - + (raw_signals.get("scientific", 0) * 2.0) - + (raw_signals.get("llm_local_compute", 0) * 2.0) + 1.0 + (raw_signals.get("scientific", 0) * 2.0) + (raw_signals.get("llm_local_compute", 0) * 2.0) ) execution_vectors = execution_vectors / agent_dampener @@ -2204,12 +2027,8 @@ def _calc_injection_surface( if injection_mass == 0: return 0.0 - explicit_threats = raw_signals.get("sec_danger", 0) + raw_signals.get("sec_io", 0) - if ( - explicit_threats == 0 - and taint_confirmed == 0 - and not getattr(self, "is_paranoid", False) - ): + explicit_threats = raw_signals.get("sec_high_risk_execution", 0) + raw_signals.get("sec_io", 0) + if explicit_threats == 0 and taint_confirmed == 0 and not getattr(self, "is_paranoid", False): injection_mass *= 0.10 # Fetch tuning parameters @@ -2266,7 +2085,7 @@ def _calc_memory_corruption( (raw_signals.get("pointers", 0) * 2.5) + (raw_signals.get("memory_alloc", 0) * 3.0) + (raw_signals.get("inline_asm", 0) * 5.0) - + (raw_signals.get("cast_hits", 0) * 1.5) + + (raw_signals.get("explicit_casts", 0) * 1.5) ) if 
raw_memory_mass == 0: @@ -2277,9 +2096,9 @@ def _calc_memory_corruption( net_risk = max(raw_memory_mass - mitigation_mass, 0.0) * arch_multiplier explicit_threats = ( - raw_signals.get("sec_danger", 0) - + raw_signals.get("sec_safety_neg", 0) - + raw_signals.get("sec_heat_triggers", 0) + raw_signals.get("sec_high_risk_execution", 0) + + raw_signals.get("sec_safety_bypasses", 0) + + raw_signals.get("sec_reflection_metaprogramming", 0) ) if explicit_threats == 0 and not getattr(self, "is_paranoid", False): net_risk *= 0.05 @@ -2310,26 +2129,20 @@ def _calc_secrets_risk(self, loc: int, raw_signals: Dict[str, int], mp: float) - Calculates Secrets Risk Exposure (Credential Exposure). Looks for hardcoded credentials. Trusts the SecurityLens RHS-string sensor. """ - base_leak = raw_signals.get("sec_private_info", 0) * 10.0 + base_leak = raw_signals.get("sec_hardcoded_secrets", 0) * 10.0 if base_leak == 0: return 0.0 careless_amplifiers = ( - 1.0 - + raw_signals.get("print_hits", 0) - + raw_signals.get("graveyard", 0) - + raw_signals.get("globals", 0) + 1.0 + raw_signals.get("debug_prints", 0) + raw_signals.get("dead_code", 0) + raw_signals.get("globals", 0) ) # LLM API keys are massive targets. If they are calling APIs without globals, spike the risk. 
if raw_signals.get("llm_api", 0) > 0 and raw_signals.get("globals", 0) == 0: careless_amplifiers *= 3.0 - if ( - not getattr(self, "is_paranoid", False) - and raw_signals.get("sec_heat_triggers", 0) == 0 - ): + if not getattr(self, "is_paranoid", False) and raw_signals.get("sec_reflection_metaprogramming", 0) == 0: careless_amplifiers = min(careless_amplifiers, 2.0) leak_mass = base_leak * careless_amplifiers @@ -2390,15 +2203,13 @@ def _calc_algorithmic_dos( hv = func.get("hit_vector", {}) api_hits = hv.get("api", 0) io_hits = hv.get("io", 0) + hv.get("sec_io", 0) - flux_hits = hv.get("flux", 0) + hv.get("globals", 0) + flux_hits = hv.get("state_mutation", 0) + hv.get("globals", 0) choke_multiplier = 1.0 + api_hits + io_hits + flux_hits func_threat *= choke_multiplier # 3. The Dampeners (Guardrails) - safety_hits = ( - hv.get("safety", 0) + hv.get("bailout_hits", 0) + hv.get("cleanup", 0) - ) + safety_hits = hv.get("safety", 0) + hv.get("panics_and_aborts", 0) + hv.get("cleanup", 0) if safety_hits > 0: func_threat *= 0.5 # 50% reduction for bounded iteration @@ -2434,9 +2245,7 @@ def _calc_algorithmic_dos( # REPORTING UTILITIES # -------------------------------------------------------------------------- - def generate_forensic_report( - self, parsed_files: List[Dict[str, Any]] - ) -> Dict[str, Any]: + def generate_forensic_report(self, parsed_files: List[Dict[str, Any]]) -> Dict[str, Any]: """[FORENSIC RANKING] Generates Top/Bottom 3 for dynamically indexed exposures.""" if not parsed_files: return {} @@ -2462,11 +2271,7 @@ def generate_forensic_report( # ==================================================================== # NEW: CALCULATE CUMULATIVE RISK (Excluding Formatting Inconsistency) # ==================================================================== - civil_war_idx = ( - self.RISK_SCHEMA.index("civil_war") - if "civil_war" in self.RISK_SCHEMA - else -1 - ) + civil_war_idx = self.RISK_SCHEMA.index("tabs_vs_spaces") if "tabs_vs_spaces" in 
self.RISK_SCHEMA else -1 def get_cumulative_risk(f): rv = f.get("risk_vector", []) @@ -2474,31 +2279,15 @@ def get_cumulative_risk(f): return 0.0 # Sum all exposures except civil_war return sum( - val - for i, val in enumerate(rv) - if i != civil_war_idx and i < len(rv) and isinstance(val, (int, float)) + val for i, val in enumerate(rv) if i != civil_war_idx and i < len(rv) and isinstance(val, (int, float)) ) - sorted_by_cumulative = sorted( - active_files, key=get_cumulative_risk, reverse=True - ) + sorted_by_cumulative = sorted(active_files, key=get_cumulative_risk, reverse=True) # --- NEW: CALCULATE SYSTEMIC ARCHITECTURAL BOTTLENECKS --- - flux_idx = ( - self.RISK_SCHEMA.index("state_flux") - if "state_flux" in self.RISK_SCHEMA - else -1 - ) - err_idx = ( - self.RISK_SCHEMA.index("safety_score") - if "safety_score" in self.RISK_SCHEMA - else -1 - ) - doc_idx = ( - self.RISK_SCHEMA.index("documentation") - if "documentation" in self.RISK_SCHEMA - else -1 - ) + flux_idx = self.RISK_SCHEMA.index("state_flux") if "state_flux" in self.RISK_SCHEMA else -1 + err_idx = self.RISK_SCHEMA.index("safety_score") if "safety_score" in self.RISK_SCHEMA else -1 + doc_idx = self.RISK_SCHEMA.index("documentation") if "documentation" in self.RISK_SCHEMA else -1 bottlenecks = { "cascading_state_mutation": [], @@ -2518,23 +2307,17 @@ def get_cumulative_risk(f): flux_risk = ( float(rv[flux_idx]) - if flux_idx >= 0 - and len(rv) > flux_idx - and isinstance(rv[flux_idx], (int, float)) + if flux_idx >= 0 and len(rv) > flux_idx and isinstance(rv[flux_idx], (int, float)) else 0.0 ) err_risk = ( float(rv[err_idx]) - if err_idx >= 0 - and len(rv) > err_idx - and isinstance(rv[err_idx], (int, float)) + if err_idx >= 0 and len(rv) > err_idx and isinstance(rv[err_idx], (int, float)) else 0.0 ) doc_risk = ( float(rv[doc_idx]) - if doc_idx >= 0 - and len(rv) > doc_idx - and isinstance(rv[doc_idx], (int, float)) + if doc_idx >= 0 and len(rv) > doc_idx and isinstance(rv[doc_idx], (int, float)) else 
0.0 ) @@ -2543,7 +2326,7 @@ def get_cumulative_risk(f): "path": p, "score": round(btw * flux_risk, 3), "btw": round(btw, 4), - "flux": flux_risk, + "state_mutation": flux_risk, } ) bottlenecks["fragile_dependency_chain"].append( @@ -2595,12 +2378,10 @@ def get_cumulative_risk(f): } for idx, rk in enumerate(self.RISK_SCHEMA): - report["exposures"][rk] = self._rank_list( - active_files, key_path=["risk_vector", idx] - ) + report["exposures"][rk] = self._rank_list(active_files, key_path=["risk_vector", idx]) return report - + def _get_locational_multipliers(self, path: str) -> Dict[str, float]: """Matches path against regex configurations and extracts applicable Modifiers.""" active_multipliers = {} @@ -2613,7 +2394,7 @@ def _get_locational_multipliers(self, path: str) -> Dict[str, float]: "Dead Code Exposure": "dead", "API Exposure": "api", "Concurrency Exposure": "async", - "State Flux Exposure": "flux", + "State Flux Exposure": "state_mutation", "Specification Exposure": "spec", "Churn Exposure": "churn", "Algorithmic DoS Exposure": "algorithmic_dos", @@ -2640,9 +2421,7 @@ def _get_locational_multipliers(self, path: str) -> Dict[str, float]: return active_multipliers - def _rank_list( - self, parsed_files: List[Dict[str, Any]], key_path: List[Any] - ) -> Dict[str, List[Dict[str, Any]]]: + def _rank_list(self, parsed_files: List[Dict[str, Any]], key_path: List[Any]) -> Dict[str, List[Dict[str, Any]]]: """Extracts top and bottom ranks safely navigating dictionaries and lists.""" def get_val(f): @@ -2676,9 +2455,7 @@ def get_val(f): ], } - def _generate_function_rankings( - self, parsed_files: List[Dict[str, Any]] - ) -> Dict[str, List[Dict[str, Any]]]: + def _generate_function_rankings(self, parsed_files: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: all_funcs = [] for f in parsed_files: for func in f.get("functions", []): diff --git a/gitgalaxy/metrics/statistical_auditor.py b/gitgalaxy/metrics/statistical_auditor.py index d74b8c3c..501c380c 100644 --- 
a/gitgalaxy/metrics/statistical_auditor.py +++ b/gitgalaxy/metrics/statistical_auditor.py @@ -14,7 +14,7 @@ # ============================================================================== # GitGalaxy Phase 7: Spectral Auditor (Quality Control) -# Strategy v6.2.0 Protocol: Bayesian Accountability & Inert Dark Matter +# Strategy v6.2.0 Protocol: Bayesian Accountability & Unparsable Artifacts # ============================================================================== @@ -22,8 +22,8 @@ class StatisticalAuditor: """ GitGalaxy Statistical Auditor. - PURPOSE: Acts as the 3rd-gate quality control filter to catch structural anomalies - and data dumps using language-specific Median Absolute Deviation (MAD) outliers + PURPOSE: Acts as the 3rd-gate quality control filter to catch structural anomalies + and data dumps using language-specific Median Absolute Deviation (MAD) outliers and explicit hard-floor density checks. ARCHITECTURE: @@ -48,26 +48,26 @@ def __init__( self.logger.debug("Initializing Statistical Auditor (Data Quality Gating)...") - # Save the language definitions so we can check for execution geometry later + # Save the language definitions so we can check for execution topology later self.lang_defs = lang_defs or {} # SCHEMA CONSTANTS (32 Signal Keys representing pure active logic) self.SIGNAL_KEYS = [ "branch", "args", - "linear", + "structural_boundaries", "func_start", "class_start", "import", "api", "decorators", "safety", - "safety_neg", - "danger", - "flux", - "heat_triggers", + "safety_bypasses", + "high_risk_execution", + "state_mutation", + "reflection_metaprogramming", "keyword_debt", - "private_info", + "hardcoded_secrets", "io", "concurrency", "ui_framework", @@ -87,9 +87,7 @@ def __init__( "inline_asm", ] - def audit( - self, parsed_files: List[Dict[str, Any]] - ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + def audit(self, parsed_files: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: """Executes 
statistical gating to identify data-dumps and structural outliers.""" import os # Required for extension splitting in Consensus Engine @@ -97,23 +95,19 @@ def audit( self.logger.debug("Statistical Audit skipped: Empty file roster provided.") return [], [] - self.logger.info( - f"Scanning {len(parsed_files)} artifacts for structural anomalies and data dumps..." - ) + self.logger.info(f"Scanning {len(parsed_files)} artifacts for structural anomalies and data dumps...") total_files = max(len(parsed_files), 1) orphan_threshold = max(3, int(math.log10(total_files) * 2)) - self.logger.debug( - f"Dynamic Ecosystem Orphan Threshold set to: <= {orphan_threshold} files." - ) + self.logger.debug(f"Dynamic Ecosystem Orphan Threshold set to: <= {orphan_threshold} files.") verified_files, unparsable_files = [], [] # ====================================================================== # DEFENSIVE ARCHITECTURE: Heuristic Extension Consensus - # Certain file extensions (like .h or .m) are ambiguous across languages - # (C vs C++ vs Objective-C). If the regex parser lacked high confidence, - # we check the macro-state of the repository. If 80% of the repository's + # Certain file extensions (like .h or .m) are ambiguous across languages + # (C vs C++ vs Objective-C). If the regex parser lacked high confidence, + # we check the macro-state of the repository. If 80% of the repository's # confidently parsed .h files are C++, we force the ambiguous file to align. # ====================================================================== confident_artifacts = [] @@ -232,20 +226,14 @@ def audit( for lid, group in by_lang.items(): if lid in ("undeterminable", "unknown"): for artifact in group: - unparsable_files.append( - self._format_for_exclusion( - artifact, "Pre-filtered Noise (Pre-Audit)" - ) - ) - self.logger.debug( - f"[{lid}] Bypassed {len(group)} artifacts (already excluded)." 
- ) + unparsable_files.append(self._format_for_exclusion(artifact, "Pre-filtered Noise (Pre-Audit)")) + self.logger.debug(f"[{lid}] Bypassed {len(group)} artifacts (already excluded).") continue # ================================================================== # DEFENSIVE ARCHITECTURE: Dynamic Auditability Check - # Prevent pure data files (YAML, JSON, CSV) from triggering the - # statistical outliers by checking if their language definition + # Prevent pure data files (YAML, JSON, CSV) from triggering the + # statistical outliers by checking if their language definition # even contains executable logic signals. # ================================================================== is_inert = False @@ -254,21 +242,17 @@ def audit( rules = self.lang_defs[lid].get("rules", {}) # POSITIVE COUNT: How many actual, active logic sensors exist? - active_signals = sum( - 1 for key in self.SIGNAL_KEYS if rules.get(key) is not None - ) + active_signals = sum(1 for key in self.SIGNAL_KEYS if rules.get(key) is not None) if active_signals == 0: is_inert = True else: is_inert = True # Unknown/Undefined languages are inert by default - # Immediately bypass inert matter from all statistical checks + # Immediately bypass static assets from all statistical checks if is_inert: verified_files.extend(group) - self.logger.debug( - f"[{lid}] Bypassed {len(group)} artifact(s) (Inert Data Format: 0 Active Signals)." - ) + self.logger.debug(f"[{lid}] Bypassed {len(group)} artifact(s) (Static Asset: 0 Active Signals).") continue # ================================================================== @@ -279,24 +263,21 @@ def audit( # Require an absolute Tier 0 Convergent Lock for orphans to survive. # If ALL files in this tiny group are Tier 1 or worse (> 0), banish them. 
all_weak_claims = all( - artifact.get("telemetry", {}).get( - "identity_lock_tier", artifact.get("lock_tier", 4) - ) - > 0 + artifact.get("telemetry", {}).get("identity_lock_tier", artifact.get("lock_tier", 4)) > 0 for artifact in group ) if all_weak_claims: - relegation_reason = f"Statistically Insignificant Sample (Population {len(group)}). Reverting to plaintext." + relegation_reason = ( + f"Statistically Insignificant Sample (Population {len(group)}). Reverting to plaintext." + ) self.logger.warning(f"[{lid}] {relegation_reason}") for artifact in group: - # Strip the hallucination, keep the mass visible in the 3D map + # Strip the hallucination, keep the mass visible in the topological map artifact["lang_id"] = "plaintext" - artifact["telemetry"]["identity_source_proof"] = ( - "Low-Sample Guard Fallback" - ) - artifact["equations"] = {} # Inert matter has no logic equations + artifact["telemetry"]["identity_source_proof"] = "Low-Sample Guard Fallback" + artifact["equations"] = {} # Static assets have no logic equations verified_files.append(artifact) continue @@ -311,9 +292,7 @@ def audit( equations = artifact.get("equations", {}) signal_hits = sum(equations.get(k, 0) for k in self.SIGNAL_KEYS) # Denominator MUST be total physical lines to detect 'hollowness' - total_physical_loc = max( - artifact.get("total_loc", artifact.get("coding_loc", 1)), 1 - ) + total_physical_loc = max(artifact.get("total_loc", artifact.get("coding_loc", 1)), 1) artifact["_rho"] = signal_hits / total_physical_loc # Polyglot Defense: Only add pure files to the statistical baseline @@ -332,10 +311,7 @@ def audit( # 2. 
Confidence Anchor (At least one file with C > 0.85) has_anchor = any( - artifact.get("telemetry", {}).get( - "identity_confidence", artifact.get("intensity", 0.0) - ) - > 0.85 + artifact.get("telemetry", {}).get("identity_confidence", artifact.get("intensity", 0.0)) > 0.85 for artifact in group ) @@ -388,12 +364,8 @@ def audit( # Extract telemetry from Phase 1 OR fallback to root meta keys telemetry = artifact.get("telemetry", {}) lock_tier = telemetry.get("identity_lock_tier", artifact.get("lock_tier", 4)) - source_proof = telemetry.get( - "identity_source_proof", artifact.get("source_proof", "Discovery") - ) - confidence = telemetry.get( - "identity_confidence", artifact.get("intensity", 0.0) - ) + source_proof = telemetry.get("identity_source_proof", artifact.get("source_proof", "Discovery")) + confidence = telemetry.get("identity_confidence", artifact.get("intensity", 0.0)) # ZERO-DENSITY THRESHOLD: Hard Floor check for data dumps disguised as code if loc > 50 and rho == 0 and not is_minified: @@ -406,9 +378,7 @@ def audit( # to be minified, obfuscated, or packed with embedded binaries. elif loc > 30 and rho > 3.0 and not is_minified: is_outlier = True - relegation_reason = ( - f"Packed Payload Guard (Impossible Density: {rho:.2f} hits/line)" - ) + relegation_reason = f"Packed Payload Guard (Impossible Density: {rho:.2f} hits/line)" # THE ROBUST Z-SCORE (MAD) # Bypassed if the file is a heavy polyglot (its density is blended) @@ -416,15 +386,11 @@ def audit( mi = (0.6745 * (rho - median_rho)) / mad # 4. Probabilistic Threshold Gating (T_adj = -3.5 * Ci) - t_adj = -5 * max( - confidence, 0.1 - ) # Floor confidence to prevent 0 threshold + t_adj = -5 * max(confidence, 0.1) # Floor confidence to prevent 0 threshold if mi < t_adj: is_outlier = True - relegation_reason = ( - f"Statistical Anomaly (Z-Score: {mi:.2f} < {t_adj:.2f})" - ) + relegation_reason = f"Statistical Anomaly (Z-Score: {mi:.2f} < {t_adj:.2f})" # 4. 
Routing logic for Outliers if is_outlier: @@ -467,9 +433,7 @@ def audit( ) # Format it as Noise to save memory and ensure schema consistency - unparsable_files.append( - self._format_for_exclusion(artifact, relegation_reason) - ) + unparsable_files.append(self._format_for_exclusion(artifact, relegation_reason)) relegated_count += 1 else: verified_files.append(artifact) @@ -512,7 +476,7 @@ def _is_dead_code(self, artifact: Dict[str, Any]) -> bool: total_signals = sum(equations.values()) # Condition 2: Over 50% of the active signals are commented-out structural logic - if total_signals > 0 and equations.get("graveyard", 0) > (total_signals * 0.5): + if total_signals > 0 and equations.get("dead_code", 0) > (total_signals * 0.5): return True except Exception as e: @@ -520,9 +484,7 @@ def _is_dead_code(self, artifact: Dict[str, Any]) -> bool: return False - def _format_for_exclusion( - self, artifact: Dict[str, Any], reason: str - ) -> Dict[str, Any]: + def _format_for_exclusion(self, artifact: Dict[str, Any], reason: str) -> Dict[str, Any]: """ Formats an audited artifact to match the Orchestrator's Exclusion Queue schema. This ensures structural inertia and prevents the JSON archive from bloating. 
@@ -535,15 +497,9 @@ def _format_for_exclusion( "size_bytes": artifact.get("size_bytes", 0), # Preserve Phase 1 Telemetry for SBOM Traceability "failed_claim": artifact.get("lang_id", "unknown"), - "identity_confidence": telemetry.get( - "identity_confidence", artifact.get("intensity", 0.0) - ), - "identity_lock_tier": telemetry.get( - "identity_lock_tier", artifact.get("lock_tier", 4) - ), - "identity_source_proof": telemetry.get( - "identity_source_proof", artifact.get("source_proof", "Discovery") - ), + "identity_confidence": telemetry.get("identity_confidence", artifact.get("intensity", 0.0)), + "identity_lock_tier": telemetry.get("identity_lock_tier", artifact.get("lock_tier", 4)), + "identity_source_proof": telemetry.get("identity_source_proof", artifact.get("source_proof", "Discovery")), } def _is_threat(self, artifact: Dict[str, Any]) -> bool: diff --git a/gitgalaxy/metrics/tensor_scanner.py b/gitgalaxy/metrics/tensor_scanner.py index 1f0fe6e2..997ed0ae 100644 --- a/gitgalaxy/metrics/tensor_scanner.py +++ b/gitgalaxy/metrics/tensor_scanner.py @@ -18,11 +18,7 @@ class TensorScanner: """ def __init__(self, parent_logger: logging.Logger = None): - self.logger = ( - parent_logger.getChild("tensor_scanner") - if parent_logger - else logging.getLogger("tensor_scanner") - ) + self.logger = parent_logger.getChild("tensor_scanner") if parent_logger else logging.getLogger("tensor_scanner") def audit_model(self, file_path: str) -> Dict[str, Any]: """Routes the binary to the correct header parser.""" @@ -56,9 +52,9 @@ def _parse_safetensors(self, file_path: str) -> Dict[str, Any]: # ================================================================== # DEFENSIVE ARCHITECTURE: O(1) Memory Footprint # We explicitly do NOT use `torch.load()` or `safetensors.safe_open()`. - # Loading a 70B parameter model into RAM would instantly trigger an - # OOM (Out of Memory) kill in CI/CD pipelines. 
By only reading the - # first 8 bytes to extract the JSON header size, we keep the memory + # Loading a 70B parameter model into RAM would instantly trigger an + # OOM (Out of Memory) kill in CI/CD pipelines. By only reading the + # first 8 bytes to extract the JSON header size, we keep the memory # footprint microscopic. # ================================================================== header_size_bytes = f.read(8) @@ -69,15 +65,13 @@ def _parse_safetensors(self, file_path: str) -> Dict[str, Any]: # ================================================================== # DEFENSIVE ARCHITECTURE: Denial of Service (DoS) / Memory Bomb Guard - # A malicious actor could craft a tiny safetensor file that claims its - # JSON header is 500GB. When Python attempts to read those bytes, it - # causes a catastrophic memory exhaustion attack. We hard-cap the read + # A malicious actor could craft a tiny safetensor file that claims its + # JSON header is 500GB. When Python attempts to read those bytes, it + # causes a catastrophic memory exhaustion attack. We hard-cap the read # buffer at 100MB to mathematically guarantee pipeline survival. # ================================================================== if header_size > 100 * 1024 * 1024: - raise ValueError( - f"Safetensors header is suspiciously large: {header_size} bytes" - ) + raise ValueError(f"Safetensors header is suspiciously large: {header_size} bytes") # 2. Read the JSON header header_json_bytes = f.read(header_size) @@ -85,9 +79,7 @@ def _parse_safetensors(self, file_path: str) -> Dict[str, Any]: # 3. Extract Metadata metadata = header.get("__metadata__", {}) - architecture = metadata.get( - "architecture", metadata.get("format", "Unknown Transformer") - ) + architecture = metadata.get("architecture", metadata.get("format", "Unknown Transformer")) # 4. 
Calculate Parameters (Sum of the product of all tensor shapes) total_params = 0 @@ -116,9 +108,9 @@ def _parse_gguf(self, file_path: str) -> Dict[str, Any]: # ================================================================== # DEFENSIVE ARCHITECTURE: Algorithmic Complexity Guard # The GGUF format uses a deeply nested binary tree for KV pairs. - # Writing a pure Python binary tree walker introduces a massive risk of + # Writing a pure Python binary tree walker introduces a massive risk of # infinite loops (ReDoS equivalents) if the parsed file is malformed. - # Instead, we read a flat 1MB chunk and extract known ASCII signatures. + # Instead, we read a flat 1MB chunk and extract known ASCII signatures. # This guarantees an O(1) time complexity and O(1) space complexity. # ================================================================== chunk = f.read(1024 * 1024) @@ -156,4 +148,4 @@ def _format_params(self, count: int) -> str: return f"{count / 1_000_000_000:.1f}B" elif count >= 1_000_000: return f"{count / 1_000_000:.1f}M" - return str(count) \ No newline at end of file + return str(count) diff --git a/gitgalaxy/recorders/README.md b/gitgalaxy/recorders/README.md index 321bf5e9..f5402e6b 100644 --- a/gitgalaxy/recorders/README.md +++ b/gitgalaxy/recorders/README.md @@ -1,38 +1,43 @@ -# GitGalaxy: Telemetry & Data Serialization Layer +# GitGalaxy Recorders: Telemetry & Data Serialization Engine -[![Telemetry](https://img.shields.io/badge/Telemetry-Multi--Format-00BFFF.svg)](#) -[![WebGPU](https://img.shields.io/badge/WebGPU-Columnar_JSON-00C957.svg)](#) -[![Context](https://img.shields.io/badge/Context-LLM_Optimized-8A2BE2.svg)](#) +This directory houses the high-speed data serialization and export engines for GitGalaxy. -This directory contains the high-speed data serialization engines that translate the internal RAM state of the **blAST Engine** into actionable, forensic telemetry formats for downstream systems. 
+As the final phase of the pipeline, the `recorders` directory is responsible for translating the massive, in-memory topological graphs and structural risk vectors into highly optimized payloads for downstream consumers. GitGalaxy is strictly headless and API-first; these modules ensure the data can be instantly consumed by interactive 3D WebGPU visualizers, autonomous AI coding agents, SIEM pipelines, and enterprise data warehouses. -GitGalaxy is designed to be completely headless and API-first. These serialization tools ensure that the mathematical graph generated by the core engine can be instantly consumed by interactive 3D visualizers, autonomous AI coding agents, or enterprise SIEM pipelines. +## Architectural Philosophy & Defensive Engineering -> **⚠️ Configuration Warning:** Do not modify these core serialization files to add new data columns or change schema mappings. All schemas, UI string labels, and key mappings have been abstracted to the **[Standards Registry](../standards/README.md)**. +Converting a multi-dimensional graph of 50,000+ files into a portable format presents a severe memory and I/O bottleneck. Standard serialization libraries will duplicate the graph in RAM, triggering immediate OS-level Out-Of-Memory (OOM) kills. The Recorders are defensively engineered to mitigate these constraints: -### 🗺️ The Architecture +### 1. Destructive RAM Eviction (Memory Management) +To process monolithic repositories on standard hardware, the `gpu_recorder.py` employs a "Destructive Pivot." Instead of copying data, it systematically `.pop()`s elements from the main Orchestrator arrays to build its new schema, manually invoking Python's Garbage Collector (`gc.collect()`) at phase boundaries. This ensures the pipeline's memory footprint strictly decreases during the export phase, preventing OOM crashes. -Each file represents a highly specialized data serialization strategy. 
Read the official documentation links for deep dives into the underlying schema formatting. +### 2. Columnar Pivoting & Text Interning (AoS to SoA) +WebGPU and frontend rendering engines struggle with deeply nested, row-based JSON (Array of Structs). The engine structurally pivots the telemetry into flat, numerical columns (Structure of Arrays). Repetitive metadata—such as file extensions, author names, or diagnostic reasons—are aggressively minified into O(1) integer arrays using Text Interning. This drastically reduces the network payload size and client-side RAM overhead. -* **`record_keeper.py` (The SQL Telemetry Layer):** The native SQLite3 recorder. It transforms the live RAM state directly into a highly relational database (`_master.db`), bypassing intermediate JSON parsing to create a time-series schema perfectly aligned for Master Database aggregation and complex querying. - * 📖 **[Read the SQLite Database Specs](https://squid-protocol.github.io/gitgalaxy/02-21-record-keeper/)** +### 3. Zero-Overhead Relational Mapping +The `record_keeper.py` bypasses intermediate JSON/CSV dumping entirely. It maps the in-memory Python dictionaries directly into a native SQLite3 database. To prevent I/O deadlocks during massive batch inserts, it explicitly enforces `PRAGMA journal_mode = WAL;` (Write-Ahead Logging) and relaxed synchronous modes, guaranteeing high-speed execution without sacrificing relational integrity. -* **`gpu_recorder.py` (The Visual Payload Generator):** Generates the highly optimized `_GPU_galaxy.json` payload. This recorder performs a destructive RAM eviction (destroying the Python dictionaries as it writes) to compress the data into a Columnar Arrays of Structs (AoS to SoA) format, utilizing string interning for extreme WebGPU rendering performance. - * 📖 **[Read the GPU Payload Formatting Specs](https://squid-protocol.github.io/gitgalaxy/02-13-gpu-recorder/)** +### 4. 
Token-Density Optimization for LLMs +Dumping raw telemetry into an LLM context window saturates the agent with noise and wastes tokens. The `llm_recorder.py` acts as a statistical translation layer. It mathematically isolates the repository's structural dependencies (high blast radius), undocumented choke points (high centrality, zero documentation), and cascading mutations (high centrality, high state flux), generating a dense, highly opinionated Markdown brief that maximizes AI comprehension per token. -* **`llm_recorder.py` (The AI Context Layer):** The AI Translation Layer. It generates condensed, token-optimized Markdown (`_llm.md`) and a relational knowledge graph (`_graph.sqlite`). These artifacts are explicitly designed to be ingested by autonomous AI coding agents (like Claude or Cursor) or RAG pipelines, highlighting architectural choke points, monolithic bottlenecks (God Nodes), and AI threat scores. - * 📖 **[Read the LLM Context Optimization Specs](https://squid-protocol.github.io/gitgalaxy/02-14-llm-recorder/)** +--- + +## The Core Pipeline (Exit Strategies) -* **`audit_recorder.py` (The Compliance & Audit Layer):** Generates a verbose, human-readable forensic log (`_audit.json`). This file retains the raw dictionary structure and translates internal metrics into descriptive English labels. Designed strictly for compliance, debugging, and deep-dive architectural analysis. - * 📖 **[Read the Forensic Audit Specs](https://squid-protocol.github.io/gitgalaxy/02-12-audit-recorder/)** +Each file in this directory represents a specialized data exit strategy, tailored for a specific downstream consumer: -

+* **`gpu_recorder.py` (The Visual Payload Generator):** Generates the `_gpu.json` payload. It performs the Destructive RAM Eviction and Columnar Pivot, compressing the multi-dimensional graph into a minified manifest built strictly for high-performance WebGPU rendering engines. +* **`record_keeper.py` (The SQL Telemetry Layer):** Generates the `_master.db` artifact. A native SQLite3 recorder that captures the complete forensic state of the scan. It creates a robust, time-series schema designed for Enterprise Data Warehouse (EDW) aggregation, SQL-based security auditing, and delta-scan rehydration. +* **`llm_recorder.py` (The AI Context Layer):** Generates the `_llm.md` and `_graph.sqlite` artifacts. It calculates repository-wide statistical metrics (Min/Max/Mean for all 18 risk dimensions) and produces a targeted brief that grants autonomous AI agents (like Claude or Cursor) total ecosystem awareness before they write a single line of code. +* **`audit_recorder.py` (The Compliance & Forensic Layer):** Generates the `_audit.json` log. Designed for compliance, security debugging, and human review. It cryptographically binds the scan to a specific Git Commit Hash (acting as a Structural Health Bill of Materials), decodes the internal XGBoost ML Threat taxonomy, and maps raw integers back to descriptive, enterprise-friendly terminology. --- -### 🌌 Powered by the blAST Engine +## 🌌 Powered by the blAST Engine -This documentation is part of the [GitGalaxy Ecosystem](https://github.com/squid-protocol/gitgalaxy), an AST-free, LLM-free heuristic knowledge graph engine. +This documentation is part of the [GitGalaxy Ecosystem](https://squid-protocol.github.io/gitgalaxy/), an AST-free, LLM-free heuristic knowledge graph engine. -* 🪐 **[Explore the GitHub Repository](https://github.com/squid-protocol/gitgalaxy)** for code, tools, and updates. -* 🔭 **[Visualize your own repository at GitGalaxy.io](https://gitgalaxy.io/)** using our interactive 3D WebGPU dashboard. 
\ No newline at end of file +* 🪐 **[GitGalaxy Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** - Deep dives into the mathematics and pipeline architecture. +* 🔭 **[GitGalaxy Visualizer](https://gitgalaxy.io/)** - Render your codebase locally in 3D using WebGPU. +* 📖 **[The blAST Paradigm Wiki](https://squid-protocol.github.io/gitgalaxy/docs/wiki/01-03-the-blast-paradigm/)** - The academic and structural thesis backing the engine. +* ⚙️ **[Language Calibration Standards](https://squid-protocol.github.io/gitgalaxy/gitgalaxy/standards/how_to_add_a_language.md)** - Guide to extending the comparative lexical taxonomy. \ No newline at end of file diff --git a/gitgalaxy/recorders/audit_recorder.py b/gitgalaxy/recorders/audit_recorder.py index 4034cc2f..170d1467 100644 --- a/gitgalaxy/recorders/audit_recorder.py +++ b/gitgalaxy/recorders/audit_recorder.py @@ -26,19 +26,15 @@ class AuditRecorder: """ Forensic Audit Recorder. - PURPOSE: Generates a verbose, human-readable forensic JSON log from in-memory - telemetry state. Designed for enterprise compliance, security debugging, and + PURPOSE: Generates a verbose, human-readable forensic JSON log from in-memory + telemetry state. Designed for enterprise compliance, security debugging, and Software Supply Chain Security (SSCS) deep-dive analysis. 
""" def __init__(self, parent_logger=None): import logging - self.logger = ( - parent_logger.getChild("audit_recorder") - if parent_logger - else logging.getLogger("audit_recorder") - ) + self.logger = parent_logger.getChild("audit_recorder") if parent_logger else logging.getLogger("audit_recorder") # --- DYNAMIC SCHEMA FETCH --- schemas = getattr(config, "RECORDING_SCHEMAS", {}) @@ -98,9 +94,7 @@ def generate_report( "Analysis Context": { "Engine Identity": session_meta.get("engine", "GitGalaxy Scope v6.2.0"), "Target Root Name": session_meta.get("target", "Unknown"), - "Absolute Project Path": session_meta.get( - "target_directory", "Unknown" - ), + "Absolute Project Path": session_meta.get("target_directory", "Unknown"), "Analysis ISO Timestamp": session_meta.get("timestamp"), "Total Scan Duration": f"{session_meta.get('duration_seconds', 0.0)} seconds", }, @@ -108,9 +102,7 @@ def generate_report( "Active Branch": git_audit.get("branch", "N/A"), "Commit Hash (SHA-1)": git_audit.get("commit_hash", "N/A"), "Remote Origin URL": git_audit.get("remote_url", "Local/Disconnected"), - "Last Code Integration Date": git_audit.get( - "latest_commit_date", "Unknown" - ), + "Last Code Integration Date": git_audit.get("latest_commit_date", "Unknown"), }, } @@ -119,16 +111,14 @@ def generate_report( exposure_labels = schemas.get("EXPOSURE_LABELS", {}) # Pre-calculate labels for vectors to avoid repeating work in the inner loop - risk_labels = [ - exposure_labels.get(k, self.format_label(k)) for k in self.RISK_SCHEMA - ] + risk_labels = [exposure_labels.get(k, self.format_label(k)) for k in self.RISK_SCHEMA] hit_labels = [self.format_label(k) for k in self.HIT_SCHEMA] # --- DIRECTORY GROUP SORTING & HIERARCHY --- pretty_directory_groups = {} directory_groups_meta = summary.get("directory_groups", {}) - # Sort folders descending by physical mass + # Sort folders descending by structural magnitude sorted_directory_groups = sorted( directory_groups_meta.items(), key=lambda x: 
x[1].get("total_mass", 0.0), @@ -138,7 +128,7 @@ def generate_report( # Initialize the ordered dictionary with directory-level aggregates for d_name, d_data in sorted_directory_groups: pretty_directory_groups[d_name] = { - "Directory Group Mass": d_data.get("total_mass", 0.0), + "Directory Group Magnitude": d_data.get("total_mass", 0.0), "File Count": d_data.get("file_count", 0), "Average Risk Exposures": { exposure_labels.get(k, self.format_label(k)): f"{v}%" @@ -159,18 +149,30 @@ def generate_report( # DEFENSIVE GUARD: Synthesize default risk vectors for documentation # Prevents matrix dimension desyncs if the pipeline bypassed static physics for pure text. doc_languages = {"markdown", "plaintext", "rst", "text", "md"} - if lang_raw in doc_languages and len( - file_data.get("risk_vector", []) - ) < len(self.RISK_SCHEMA): + if lang_raw in doc_languages and len(file_data.get("risk_vector", [])) < len(self.RISK_SCHEMA): file_data["risk_vector"] = [ - 0.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 100.0, 100.0, 0.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, + 100.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 100.0, + 100.0, + 0.0, + 100.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, ] telemetry["control_flow_ratio"] = 0.0 if not file_data.get("file_impact"): - file_data["file_impact"] = round( - max(file_data.get("total_loc", 1) / 50.0, 1.0), 2 - ) + file_data["file_impact"] = round(max(file_data.get("total_loc", 1) / 50.0, 1.0), 2) # --- DYNAMIC IDENTITY BLOCK --- identity_block = { @@ -189,9 +191,7 @@ def generate_report( display_key = "System Purpose" identity_block[display_key] = custom_val - identity_block["Lock Tier"] = file_data.get( - "lock_tier", telemetry.get("identity_lock_tier", 4) - ) + identity_block["Lock Tier"] = file_data.get("lock_tier", telemetry.get("identity_lock_tier", 4)) identity_block["Identity Proof"] = telemetry.get( "identity_source_proof", file_data.get("source_proof", "Discovery") ) @@ -201,9 +201,7 @@ def generate_report( # --- EXPOSURE 
FORMATTER --- exposures_dict = {} - for label, v in zip( - risk_labels, file_data.get("risk_vector") or [0.0] * len(risk_labels) - ): + for label, v in zip(risk_labels, file_data.get("risk_vector") or [0.0] * len(risk_labels)): if label in ["Civil War Exposure", "Indentation Consistency"]: if v == 0.0: exposures_dict["Indentation Consistency"] = "Tabs" @@ -212,24 +210,18 @@ def generate_report( elif v == 50.0: exposures_dict["Indentation Consistency"] = "Neutral / Deadlocked" else: - exposures_dict["Indentation Consistency"] = ( - f"Mixed ({100 - v:.1f}% Tabs / {v:.1f}% Spaces)" - ) + exposures_dict["Indentation Consistency"] = f"Mixed ({100 - v:.1f}% Tabs / {v:.1f}% Spaces)" else: exposures_dict[label] = f"{round(v, 2)}%" arch = telemetry.get("archetype", "Unknown Archetype") if d_name not in folder_archetype_counts: folder_archetype_counts[d_name] = {} - folder_archetype_counts[d_name][arch] = ( - folder_archetype_counts[d_name].get(arch, 0) + 1 - ) + folder_archetype_counts[d_name][arch] = folder_archetype_counts[d_name].get(arch, 0) + 1 mitigation_data = telemetry.get("mitigation_telemetry", {}) formatted_mitigations = { - key.replace("_", " ").title(): f"{val} instances" - for key, val in mitigation_data.items() - if val > 0 + key.replace("_", " ").title(): f"{val} instances" for key, val in mitigation_data.items() if val > 0 } # Assemble the individual artifact profile @@ -244,49 +236,35 @@ def generate_report( "Repository Archetype": arch, "Repository Drift (Z-Score)": telemetry.get("global_drift", 0.0), "Repository Fingerprint": ( - { - k: round(v, 3) - for k, v in telemetry.get( - "archetype_fingerprint", {} - ).items() - } + {k: round(v, 3) for k, v in telemetry.get("archetype_fingerprint", {}).items()} if isinstance(telemetry.get("archetype_fingerprint"), dict) else {} ), "File Archetype": telemetry.get("local_archetype", "N/A"), "File Drift (Z-Score)": telemetry.get("local_drift", 0.0), "File Fingerprint": ( - { - k: round(v, 3) - for k, v in 
telemetry.get("local_fingerprint", {}).items() - } + {k: round(v, 3) for k, v in telemetry.get("local_fingerprint", {}).items()} if isinstance(telemetry.get("local_fingerprint"), dict) else {} ), "Total LOC": file_data.get("total_loc", 0), "Coding LOC": file_data.get("coding_loc", 0), "Documentation LOC": file_data.get("doc_loc", 0), - "Structural Mass": round(file_data.get("file_impact", 0.0), 3), + "Structural Magnitude": round(file_data.get("file_impact", 0.0), 3), "Control Flow Ratio": f"{round(telemetry.get('control_flow_ratio', 0.0) * 100, 1)}%", "Popularity Rank": telemetry.get("popularity", 0), "Raw Churn Frequency": telemetry.get("raw_churn_freq", 0.0), - "Author Distribution": telemetry.get("author_distribution", 0.0), + "Authorship Centralization": telemetry.get("author_distribution", 0.0), "Ownership Entropy": telemetry.get("ownership_entropy", 0.0), - "Raw Cognitive Density": telemetry.get("densities", {}).get( - "cog_raw", 0.0 - ), + "Raw Cognitive Density": telemetry.get("densities", {}).get("cog_raw", 0.0), }, "4. Vulnerability & Risk Exposures": exposures_dict, "5. Function Analysis": [ { "Function Name": func.get("name", "Unknown"), - "Structural Impact": func.get( - "impact", func.get("magnitude", 0.0) - ), + "Structural Impact": func.get("impact", func.get("magnitude", 0.0)), "Lines of Code (LOC)": func.get("loc", 0), - "Control Flow Branches": func.get( - "branch", func.get("branch_count", 0) - ), + "Control Flow Branches": func.get("branch", func.get("branch_count", 0)), "Input Parameters": func.get("args", func.get("args_count", 0)), "Control Flow Ratio": f"{round((func.get('control_flow_ratio') or func.get('cf_ratio') or 0.0) * 100, 1)}%", "Start Line": func.get("start_line", 0), @@ -299,28 +277,23 @@ def generate_report( formatted_mitigations if formatted_mitigations else "None Detected" ), "7. 
Structural Signatures (Net Mitigated Signals)": { - label: v - for label, v in zip( - hit_labels, file_data.get("hit_vector") or [0] * len(hit_labels) - ) + label: v for label, v in zip(hit_labels, file_data.get("hit_vector") or [0] * len(hit_labels)) }, "8. Dependency Network": { - "Direct Upstream (Fragility)": file_data.get( - "dependency_network", {} - ).get("direct_upstream", len(file_data.get("raw_imports", []))), - "Direct Downstream (Blast Radius)": file_data.get( - "dependency_network", {} - ).get("direct_downstream", telemetry.get("popularity", 0)), - "Total Upstream (Absolute Fragility)": file_data.get( - "dependency_network", {} - ).get("total_upstream", 0), - "Total Downstream (Absolute Blast Radius)": file_data.get( - "dependency_network", {} - ).get("total_downstream", 0), + "Direct Upstream (Fragility)": file_data.get("dependency_network", {}).get( + "direct_upstream", len(file_data.get("raw_imports", [])) + ), + "Direct Downstream (Dependency Blast Radius)": file_data.get("dependency_network", {}).get( + "direct_downstream", telemetry.get("popularity", 0) + ), + "Total Upstream (Absolute Fragility)": file_data.get("dependency_network", {}).get( + "total_upstream", 0 + ), + "Total Downstream (Absolute Dependency Blast Radius)": file_data.get("dependency_network", {}).get( + "total_downstream", 0 + ), }, - "9. Extracted Dependencies": sorted( - list(file_data.get("raw_imports", [])) - ), + "9. 
Extracted Dependencies": sorted(list(file_data.get("raw_imports", []))), } # Map the file into its parent directory group @@ -340,13 +313,11 @@ def generate_report( # Calculate percentages and sort highest to lowest fingerprint = { name: f"{round((count / folder_files) * 100.0, 1)}%" - for name, count in sorted( - arch_counts.items(), key=lambda x: x[1], reverse=True - ) + for name, count in sorted(arch_counts.items(), key=lambda x: x[1], reverse=True) } reordered_d_data = { - "Directory Group Mass": d_data.get("Directory Group Mass", 0.0), + "Directory Group Magnitude": d_data.get("Directory Group Magnitude", 0.0), "File Count": d_data.get("File Count", folder_files), "Ecosystem Fingerprint (Archetypes)": fingerprint, "Average Risk Exposures": d_data.get("Average Risk Exposures", {}), @@ -365,11 +336,7 @@ def generate_report( # Physically weighs the file on disk if the pipeline dropped the byte count try: - actual_size = ( - os.path.getsize(abs_path) - if abs_path.exists() - else unparsable.get("size_bytes", 0) - ) + actual_size = os.path.getsize(abs_path) if abs_path.exists() else unparsable.get("size_bytes", 0) except Exception: actual_size = unparsable.get("size_bytes", 0) @@ -377,21 +344,15 @@ def generate_report( { "Path": rel_path, "Forensic Category": "Excluded Artifact", - "Diagnostic Reason": unparsable.get( - "reason", "Security Shielding (Format Excluded)" - ), + "Diagnostic Reason": unparsable.get("reason", "Security Shielding (Format Excluded)"), "Size": f"{actual_size} bytes", "Identity Confidence": f"{round(unparsable.get('identity_confidence', 0.0) * 100, 1)}%", - "Discovery Proof": unparsable.get( - "identity_source_proof", "Radar Scan" - ), + "Discovery Proof": unparsable.get("identity_source_proof", "Radar Scan"), } ) - # 3.2 Append optically bypassed artifacts to the local output list - for anon_path in summary.get("unparsable_files", {}).get( - "unparsable_artifacts", [] - ): + # 3.2 Append structurally bypassed artifacts to the local output 
list + for anon_path in summary.get("unparsable_files", {}).get("unparsable_artifacts", []): pretty_unparsable.append( { "Path": anon_path, @@ -399,7 +360,7 @@ def generate_report( "Diagnostic Reason": "Engine Bypass (Dense Structure or Unrecognized Syntax)", "Size": "Unknown (Parser Bypass)", "Identity Confidence": "0.0% (Scan Yielded No Data)", - "Discovery Proof": "Lexical Splicer Shielding", + "Discovery Proof": "Structural Signature Extractor Shielding", } ) @@ -428,13 +389,13 @@ def generate_report( } sec_hit_mapping = { - "sec_danger": "Dynamic Code Execution (RCE)", - "sec_safety_neg": "Security Control & Safety Bypasses", + "sec_high_risk_execution": "Dynamic Code Execution (RCE)", + "sec_safety_bypasses": "Security Control & Safety Bypasses", "sec_io": "Network & I/O Exfiltration Vectors", - "sec_flux": "Prototype Pollution & Global State Flux", - "sec_heat_triggers": "Obfuscation & Encoding Signatures", - "sec_graveyard": "Commented-out Executable Logic (Shadow Logic)", - "sec_bitwise_hits": "Low-Level Cryptographic & Bitwise Operations", + "sec_state_mutation": "Prototype Pollution & Global State Flux", + "sec_reflection_metaprogramming": "Obfuscation & Encoding Signatures", + "sec_dead_code": "Commented-out Executable Logic (Shadow Logic)", + "sec_bitwise_ops": "Low-Level Cryptographic & Bitwise Operations", "sec_shadow_imports": "Steganographic Payload Imports", "sec_homoglyphs": "Unicode Homoglyphs & Typosquatting", } @@ -457,16 +418,8 @@ def generate_report( } # Safe index lookups mapping formal schema names back to array indices - risk_indices = { - k: self.RISK_SCHEMA.index(k) - for k in sec_risk_mapping.keys() - if k in self.RISK_SCHEMA - } - hit_indices = { - k: self.HIT_SCHEMA.index(k) - for k in sec_hit_mapping.keys() - if k in self.HIT_SCHEMA - } + risk_indices = {k: self.RISK_SCHEMA.index(k) for k in sec_risk_mapping.keys() if k in self.RISK_SCHEMA} + hit_indices = {k: self.HIT_SCHEMA.index(k) for k in sec_hit_mapping.keys() if k in 
self.HIT_SCHEMA} # Sweep the files for security anomalies for file_data in parsed_files: @@ -506,9 +459,7 @@ def generate_report( mapping = sec_risk_mapping[r_key] if score >= mapping["threshold"]: label = mapping["label"] - vuln_exposures[label]["Critical Files"].append( - {"Path": path, "Score": f"{score:.1f}%"} - ) + vuln_exposures[label]["Critical Files"].append({"Path": path, "Score": f"{score:.1f}%"}) vuln_exposures[label]["Artifacts Flagged"] += 1 hit_vector = file_data.get("hit_vector") @@ -521,13 +472,9 @@ def generate_report( # --- THE FALSE POSITIVE GUARD: Decouple Active Threats from Passive Surface Risks --- # Count actual malicious regex hits (ignoring the _description metadata string) - malicious_hits_total = sum( - v for k, v in raw_threat_hits.items() if isinstance(v, int) - ) + malicious_hits_total = sum(v for k, v in raw_threat_hits.items() if isinstance(v, int)) - has_malware = ( - vuln_exposures["Hidden Malware Risk Exposure"]["Artifacts Flagged"] > 0 - ) + has_malware = vuln_exposures["Hidden Malware Risk Exposure"]["Artifacts Flagged"] > 0 has_secrets = vuln_exposures["Secrets Risk Exposure"]["Artifacts Flagged"] > 0 # Sort and map the ML (XGBoost) hit list descending by confidence @@ -544,9 +491,7 @@ def generate_report( # Tiered Status Routing (ML acts as the supreme authority) if ml_threat_files: audit_status = "ML_CONFIRMED_THREAT_DETECTED" - elif ( - quarantined_files or has_malware or has_secrets or malicious_hits_total > 0 - ): + elif quarantined_files or has_malware or has_secrets or malicious_hits_total > 0: audit_status = "CRITICAL_THREATS_DETECTED (Rule-Based)" elif any(v["Artifacts Flagged"] > 0 for v in vuln_exposures.values()): audit_status = "ELEVATED_SURFACE_RISK" @@ -578,15 +523,11 @@ def generate_report( if "ml_clusters" in global_fingerprint or "static_mass" in global_fingerprint: if "ml_clusters" in global_fingerprint: pretty_global_fingerprint["Active Execution Logic (ML Clusters)"] = { - k: f"{v['pct']}% ({v['count']} 
files)" - for k, v in global_fingerprint["ml_clusters"].items() + k: f"{v['pct']}% ({v['count']} files)" for k, v in global_fingerprint["ml_clusters"].items() } if "static_mass" in global_fingerprint: - pretty_global_fingerprint[ - "Inert Structural Mass (Static Categories)" - ] = { - k: f"{v['pct']}% ({v['count']} files)" - for k, v in global_fingerprint["static_mass"].items() + pretty_global_fingerprint["Static Assets (Static Categories)"] = { + k: f"{v['pct']}% ({v['count']} files)" for k, v in global_fingerprint["static_mass"].items() } else: # Legacy Schema Fallback @@ -612,7 +553,7 @@ def generate_report( "2. Global Ecosystem Summary": summary, "3. Forensic Security & Vulnerability Audit": security_audit, "4. High-Value Forensic Report": forensic_report, - "5. Unparsable Files (Excluded Artifacts Queue)": pretty_unparsable, + "5. Unparsable Artifacts (Excluded Artifacts Queue)": pretty_unparsable, "6. Parsed Files (Scanned Artifacts)": pretty_directory_groups, } @@ -621,9 +562,7 @@ def generate_report( try: with open(target_path, "w", encoding="utf-8") as f: json.dump(mission_audit, f, indent=4, ensure_ascii=False) - self.logger.info( - f"Audit Success: Forensic manifest sealed -> {target_path}" - ) + self.logger.info(f"Audit Success: Forensic manifest sealed -> {target_path}") except Exception as e: self.logger.error(f"Audit Write Error: {e}") @@ -634,9 +573,7 @@ def decode_galaxy(input_path, output_path=None): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="GitGalaxy v6.2.0 Forensic Audit Recorder CLI" - ) + parser = argparse.ArgumentParser(description="GitGalaxy v6.2.0 Forensic Audit Recorder CLI") parser.add_argument("input", help="Path to columnar galaxy.json") parser.add_argument("--out", help="Optional output path") - args = parser.parse_args() \ No newline at end of file + args = parser.parse_args() diff --git a/gitgalaxy/recorders/gpu_recorder.py b/gitgalaxy/recorders/gpu_recorder.py index 7c093188..e46ceb0b 100644 --- 
a/gitgalaxy/recorders/gpu_recorder.py +++ b/gitgalaxy/recorders/gpu_recorder.py @@ -28,24 +28,20 @@ class GPURecorder: PURPOSE: Transforms heavily nested, row-based artifact data into flattened, numerical columns (Structure of Arrays) optimized for GPU/WebGL ingestion. - - MECHANICS: Minifies repetitive strings via Text Interning (Lookups). Executes - destructive RAM eviction by aggressively `.pop()`ing the central pipeline lists + + MECHANICS: Minifies repetitive strings via Text Interning (Lookups). Executes + destructive RAM eviction by aggressively `.pop()`ing the central pipeline lists and manually triggering Python's Garbage Collector. - NOTE: While internal Python logic uses formal DevSecOps terminology (e.g., 'Artifacts', - 'Directory Groups'), the output JSON explicitly retains the legacy visual taxonomy - ('galaxy', 'singularity', 'c_ids') to maintain strict compatibility with the + NOTE: While internal Python logic uses formal DevSecOps terminology (e.g., 'Artifacts', + 'Directory Groups'), the output JSON explicitly retains the legacy visual taxonomy + ('galaxy', 'singularity', 'c_ids') to maintain strict compatibility with the downstream WebGL rendering engine. 
""" def __init__(self, version: str, parent_logger: Optional[logging.Logger] = None): self.version = version - self.logger = ( - parent_logger.getChild("gpu_recorder") - if parent_logger - else logging.getLogger("gpu_recorder") - ) + self.logger = parent_logger.getChild("gpu_recorder") if parent_logger else logging.getLogger("gpu_recorder") # --- DYNAMIC SCHEMA FETCH --- schemas = getattr(analysis_lens, "RECORDING_SCHEMAS", {}) @@ -109,14 +105,14 @@ def record_mission( "tel_pop": [], "tel_cfr": [], "ai_threats": [], - "satellite_data_flat": [], # Output retains WebGL 'satellite' namespace for functions + "satellite_data_flat": [], # Output retains WebGL 'satellite' namespace for functions "satellite_offsets": [0], "imports": [], - "c_ids": [], # Directory Group / Constellation mappings - "a_ids": [], # Ecosystem Baseline / Archetype IDs + "c_ids": [], # Directory Group / Subsystem mappings + "a_ids": [], # Ecosystem Baseline / Archetype IDs "a_dists": [], - "edges": [], # Inbound dependency pointers - "outbound_edges": [], # Outbound dependency pointers + "edges": [], # Inbound dependency pointers + "outbound_edges": [], # Outbound dependency pointers } # The 'Singularity' array maps 1:1 to Excluded Artifacts @@ -157,9 +153,7 @@ def record_mission( # 1. Directory Group Mapping d_name = file_data.get("directory_group", "__monolith__") - repository_graph["c_ids"].append( - self._intern(d_name, self.dir_group_lookup) - ) + repository_graph["c_ids"].append(self._intern(d_name, self.dir_group_lookup)) # 2. 
Dynamic Architectural Fingerprint Extraction fingerprint = tel.get("archetype_fingerprint", {}) @@ -173,7 +167,7 @@ def record_mission( sec_name, sec_dist = sorted_archs[1] file_a_ids.append(self._intern(prim_name, self.archetype_lookup)) - file_a_dists.append(int(round(prim_dist * 1000))) # Quantize to save bytes + file_a_dists.append(int(round(prim_dist * 1000))) # Quantize to save bytes # Identify architectural drift (Anti-Patterns) if (sec_dist - prim_dist) <= 0.9: @@ -197,20 +191,12 @@ def record_mission( repository_graph["m_locs"].append(int(file_data.get("coding_loc", 0))) repository_graph["d_locs"].append(int(file_data.get("doc_loc", 0))) - # 4. Quantized Physics Metrics + # 4. Quantized Structural Metrics repository_graph["mass"].append(int(round(file_data.get("file_impact", 0.0) * 10))) - repository_graph["author_distribution"].append( - int(round(tel.get("author_distribution", 0.0) * 1000)) - ) - repository_graph["ownership_entropy"].append( - int(round(tel.get("ownership_entropy", 0.0) * 1000)) - ) - repository_graph["raw_churn_freq"].append( - int(round(tel.get("raw_churn_freq", 0.0) * 1000)) - ) - repository_graph["cog_raw"].append( - int(round(tel.get("densities", {}).get("cog_raw", 0.0) * 1000)) - ) + repository_graph["author_distribution"].append(int(round(tel.get("author_distribution", 0.0) * 1000))) + repository_graph["ownership_entropy"].append(int(round(tel.get("ownership_entropy", 0.0) * 1000))) + repository_graph["raw_churn_freq"].append(int(round(tel.get("raw_churn_freq", 0.0) * 1000))) + repository_graph["cog_raw"].append(int(round(tel.get("densities", {}).get("cog_raw", 0.0) * 1000))) repository_graph["pos_x"].append(int(round(file_data.get("pos_x", 0.0) * 10))) repository_graph["pos_y"].append(int(round(file_data.get("pos_y", 0.0) * 10))) @@ -218,27 +204,17 @@ def record_mission( # 5. 
Flat Array Mapping (Structure of Arrays) repository_graph["risks_flat"].extend( - [ - int(v * 10) - for v in file_data.get("risk_vector", [0] * len(self.RISK_SCHEMA)) - ] + [int(v * 10) for v in file_data.get("risk_vector", [0] * len(self.RISK_SCHEMA))] ) repository_graph["hits_flat"].extend( - [ - int(v) - for v in file_data.get("hit_vector", [0] * len(self.HIT_SCHEMA)) - ] + [int(v) for v in file_data.get("hit_vector", [0] * len(self.HIT_SCHEMA))] ) # 6. Telemetry Interning domain_ctx = tel.get("domain_context", {}) - repository_graph["tel_aid"].append( - self._intern(tel.get("ownership", "unknown"), self.author_lookup) - ) + repository_graph["tel_aid"].append(self._intern(tel.get("ownership", "unknown"), self.author_lookup)) repository_graph["tel_pid"].append( - self._intern( - tel.get("identity_source_proof", "Discovery"), self.proof_lookup - ) + self._intern(tel.get("identity_source_proof", "Discovery"), self.proof_lookup) ) repository_graph["tel_purp"].append( self._intern( @@ -248,9 +224,7 @@ def record_mission( ) repository_graph["tel_lt"].append(tel.get("identity_lock_tier", 4)) repository_graph["tel_pop"].append(tel.get("popularity", 0)) - repository_graph["tel_cfr"].append( - int(round(tel.get("control_flow_ratio", 0.0) * 1000)) - ) + repository_graph["tel_cfr"].append(int(round(tel.get("control_flow_ratio", 0.0) * 1000))) # 7. Threat Score Quantization ai_score_str = domain_ctx.get("AI Threat Score", "0.0%") @@ -273,9 +247,7 @@ def record_mission( func.get("branch", 0), int(func.get("angle", 0) * 10), func.get("args", 0), - self._intern( - func.get("texture", "standard"), self.texture_lookup - ), + self._intern(func.get("texture", "standard"), self.texture_lookup), int(func.get("control_flow_ratio", 0.0) * 1000), int(func.get("impact", func.get("magnitude", 0)) * 10), int(func.get("start_line", 0)), @@ -295,9 +267,7 @@ def record_mission( # 9. 
Dependency Resolution raw_imports = sorted(list(file_data.get("raw_imports", []))) - repository_graph["imports"].append( - [self._intern(imp, self.import_lookup) for imp in raw_imports] - ) + repository_graph["imports"].append([self._intern(imp, self.import_lookup) for imp in raw_imports]) current_outbound = [] for imp in raw_imports: @@ -325,20 +295,14 @@ def record_mission( excluded_artifacts["paths"].append(path) excluded_artifacts["exts"].append(self._intern(ext, self.ext_lookup)) - excluded_artifacts["reasons"].append( - self._intern(unparsable.get("reason", "anomaly"), self.reason_lookup) - ) + excluded_artifacts["reasons"].append(self._intern(unparsable.get("reason", "anomaly"), self.reason_lookup)) excluded_artifacts["sizes"].append(int(unparsable.get("size_bytes", 0))) - excluded_artifacts["confidences"].append( - int(round(unparsable.get("identity_confidence", 0.0) * 1000)) - ) + excluded_artifacts["confidences"].append(int(round(unparsable.get("identity_confidence", 0.0) * 1000))) del unparsable # Evict detached dict references gc.collect() - self.logger.debug( - "GPU_RECORDER: RAM Eviction complete. Python GC cycle triggered." - ) + self.logger.debug("GPU_RECORDER: RAM Eviction complete. 
Python GC cycle triggered.") # ============================================================================== # SUMMARY FLATTENING (UI Diagnostics) @@ -425,9 +389,7 @@ def save_minified(self, payload: Dict[str, Any], filename: str): try: with open(target_path, "w", encoding="utf-8") as f: - json.dump( - payload, f, indent=None, separators=(",", ":"), ensure_ascii=False - ) + json.dump(payload, f, indent=None, separators=(",", ":"), ensure_ascii=False) self.logger.info(f"GPU Manifest Sealed -> {target_path}") except Exception as e: self.logger.error(f"Failed to seal GPU manifest: {e}") \ No newline at end of file diff --git a/gitgalaxy/recorders/llm_recorder.py b/gitgalaxy/recorders/llm_recorder.py index 7635c67c..0969719d 100644 --- a/gitgalaxy/recorders/llm_recorder.py +++ b/gitgalaxy/recorders/llm_recorder.py @@ -16,7 +16,7 @@ # ============================================================================== # GitGalaxy Phase 10: LLM Recorder (The AI Translation Layer) -# Strategy v6.3.0 Protocol: Token Density, Distribution Physics & Context Graphs +# Strategy v6.3.0 Protocol: Token Density, Distribution Topology & Context Graphs # ============================================================================== @@ -25,7 +25,7 @@ class LLMRecorder: PURPOSE: Translates raw GitGalaxy telemetry into AI-optimized artifacts. FEATURES: - 1. Statistical Physics: Calculates Min/Max/Mean/Median/Mode for all risks. + 1. Statistical Topologies: Calculates Min/Max/Mean/Median/Mode for all risks. 2. Syntactic Bottlenecks: Isolates I/O and Dependency choke points. 3. High-Impact Functions: Ranks top 10 functions by structural magnitude. 4. Relational Knowledge Graph: Builds a SQLite DB for autonomous agents. 
@@ -47,11 +47,7 @@ def __init__(self, parent_logger: Optional[logging.Logger] = None): def _parse_threat_score(self, artifact: Dict) -> Tuple[float, str]: """Safely extracts and converts the AI threat score string to a float.""" - score_str = ( - artifact.get("telemetry", {}) - .get("domain_context", {}) - .get("AI Threat Score", "0.0%") - ) + score_str = artifact.get("telemetry", {}).get("domain_context", {}).get("AI Threat Score", "0.0%") try: return float(score_str.replace("%", "")), score_str except ValueError: @@ -154,7 +150,7 @@ def _build_markdown( lines = [] lines.append(f"# ARCHITECTURAL_BRIEF: {target}") lines.append( - "> INSTRUCTION: Deterministic Syntactic Analysis. Base architectural insights on Mass, Extracted Signatures, and Risk overlays.\n" + "> INSTRUCTION: Deterministic Syntactic Analysis. Base architectural insights on Structural Magnitude, Extracted Signatures, and Risk overlays.\n" ) # --- 0. FORENSIC TRACEABILITY --- @@ -162,15 +158,9 @@ def _build_markdown( lines.append("| Metadata | Value |") lines.append("|---|---|") lines.append(f"| **Engine** | `{session_meta.get('engine', 'Unknown')}` |") - lines.append( - f"| **Target Path** | `{session_meta.get('target_directory', 'Unknown')}` |" - ) - lines.append( - f"| **Timestamp** | `{session_meta.get('timestamp', 'Unknown')}` |" - ) - lines.append( - f"| **Scan Duration** | `{session_meta.get('duration_seconds', 0.0)}s` |" - ) + lines.append(f"| **Target Path** | `{session_meta.get('target_directory', 'Unknown')}` |") + lines.append(f"| **Timestamp** | `{session_meta.get('timestamp', 'Unknown')}` |") + lines.append(f"| **Scan Duration** | `{session_meta.get('duration_seconds', 0.0)}s` |") lines.append(f"| **Git Branch** | `{git_audit.get('branch', 'N/A')}` |") lines.append(f"| **Git Commit** | `{git_audit.get('commit_hash', 'N/A')}` |") lines.append(f"| **Git Remote** | `{git_audit.get('remote_url', 'N/A')}` |") @@ -198,9 +188,7 @@ def _build_markdown( lines.append("## 0.5 AI THREAT AUDIT 
STATUS") if ml_threats: lines.append("> **🚨 ML_CONFIRMED_THREAT_DETECTED**") - lines.append( - f"> XGBoost Structural Signatures model identified {len(ml_threats)} malicious artifacts." - ) + lines.append(f"> XGBoost Structural Signatures model identified {len(ml_threats)} malicious artifacts.") else: lines.append("> **✅ SECURE_NO_THREATS_DETECTED**") lines.append("> XGBoost Structural Signatures model found no malicious artifacts.") @@ -237,11 +225,9 @@ def _build_markdown( "> * **Defensive Guardrails:** `safety` (Error handling), `freeze_hits` (immutability), `cleanup` (state destruction)." ) - # --- 2. 13-POINT RISK PHYSICS (THE EQUATIONS) --- - lines.append("## 2. THE 13-POINT RISK EXPOSURE PHYSICS (EQUATIONS & CONTEXT)") - lines.append( - "> **How the SAST Engine Calculates Risk Exposure (Lower Risk 0 - Higher Risk Exposure 100%):**" - ) + # --- 2. 13-POINT RISK ANALYSIS (THE EQUATIONS) --- + lines.append("## 2. THE 13-POINT RISK EXPOSURE ANALYSIS (EQUATIONS & CONTEXT)") + lines.append("> **How the SAST Engine Calculates Risk Exposure (Lower Risk 0 - Higher Risk Exposure 100%):**") lines.append( "> Most scores use a Sigmoid curve based on density (Hits / LOC) to prevent massive files from mathematically hiding their flaws." ) @@ -268,17 +254,13 @@ def _build_markdown( "> 7. **State Flux Risk Exposure:** Measures the frequency of data mutation and variable reassignment." ) lines.append( - "> 8. **Graveyard (commented out code):** Measures the presence of abandoned, commented-out logic blocks." + "> 8. **Commented Logic (dead code):** Measures the presence of abandoned, commented-out logic blocks." ) lines.append( "> 9. **Spec Match Risk Exposure:** Measures how closely code aligns with formal specifications or architectural requirements." ) - lines.append( - "> 10. **Stability:** Measures the recency of edits relative to the repository's entire lifespan." - ) - lines.append( - "> 11. 
**Deep Churn:** Measures the historical volatility and frequency of modification." - ) + lines.append("> 10. **Stability:** Measures the recency of edits relative to the repository's entire lifespan.") + lines.append("> 11. **Deep Churn:** Measures the historical volatility and frequency of modification.") lines.append( "> 12. **Documentation Risk Exposure:** Measures the lack of structured documentation and ownership metadata." ) @@ -308,7 +290,7 @@ def _build_markdown( "> **19. Function Magnitude (Impact Score):** Measures the physical footprint and 'heaviness' of a specific function. `((BranchHits + 1) * (Args + 1) + (0.05 * LOC)) * 10`. This is NOT a risk score." ) lines.append( - "> **20. File Magnitude (Total Mass):** Measures the total gravitational pull of a file. `Sum(Function Impacts) + API + Concurrency + Flux + (LOC / 50)`. This is NOT a risk score." + "> **20. File Magnitude (Total Impact):** Measures the total structural impact of a file. `Sum(Function Impacts) + API + Concurrency + Flux + (LOC / 50)`. This is NOT a risk score." 
) lines.append("") @@ -318,17 +300,11 @@ def _build_markdown( lines.append("|---|---|") lines.append(f"| Total Artifacts | {sum_data.get('total_files', 0)} |") lines.append(f"| Analyzed Artifacts (Scanned) | {visible_count} |") - lines.append( - f"| Excluded Artifacts (Unparsable data, binaries, unsupported formats) | {total_excluded} |" - ) + lines.append(f"| Excluded Artifacts (Unparsable data, binaries, unsupported formats) | {total_excluded} |") lines.append(f"| Total LOC | {sum_data.get('total_loc', 0)} |") lines.append(f"| Volatility Index | {sum_data.get('volatility_index', 0.0)} |") - lines.append( - f"| % Scanned of codebase = | {sum_data.get('Percent_Visible', 0)}% |" - ) - lines.append( - f"| Dominant Lang | {sum_data.get('dominant_language', 'UNK').upper()} |" - ) + lines.append(f"| % Scanned of codebase = | {sum_data.get('Percent_Visible', 0)}% |") + lines.append(f"| Dominant Lang | {sum_data.get('dominant_language', 'UNK').upper()} |") lines.append("") # --- 3.5 MACRO-NETWORK TOPOLOGY --- @@ -360,13 +336,9 @@ def _build_markdown( lines.append("| Lang | Files | LOC | Share |") lines.append("|---|---|---|---|") total_visible = max(visible_count, 1) - for lang, stats in sorted( - comp.items(), key=lambda x: x[1].get("files", 0), reverse=True - ): + for lang, stats in sorted(comp.items(), key=lambda x: x[1].get("files", 0), reverse=True): pct = (stats.get("files", 0) / total_visible) * 100 - lines.append( - f"| {lang.upper()} | {stats.get('files', 0)} | {stats.get('loc', 0)} | {pct:.1f}% |" - ) + lines.append(f"| {lang.upper()} | {stats.get('files', 0)} | {stats.get('loc', 0)} | {pct:.1f}% |") lines.append("") # --- 4.5 REPOSITORY ECOSYSTEM BASELINE --- @@ -392,7 +364,7 @@ def _build_markdown( ) lines.append("") - lines.append("## 4.6 FILE ARCHETYPES & STATIC MASS") + lines.append("## 4.6 FILE ARCHETYPES & STATIC ASSETS") fingerprint = summary.get("ecosystem_fingerprint", {}) ml_clusters = fingerprint.get("ml_clusters", {}) static_mass = 
fingerprint.get("static_mass", {}) @@ -417,9 +389,7 @@ def _build_markdown( lines.append("## 5. EXCLUDED ARTIFACTS (Unparsable or Shielded Files)") lines.append(f"*Total Excluded Artifacts: {total_excluded}*\n") - comp_breakdown = summary.get("unparsable_files", {}).get( - "composition_by_extension_and_reason", {} - ) + comp_breakdown = summary.get("unparsable_files", {}).get("composition_by_extension_and_reason", {}) if comp_breakdown: lines.append("**Composition by Extension & Reason:**") @@ -439,7 +409,7 @@ def _build_markdown( lines.append("") # --- 6. RISK DISTRIBUTIONS --- - lines.append("## 6. RISK EXPOSURE PHYSICS (0-100%)") + lines.append("## 6. RISK EXPOSURE ANALYSIS (0-100%)") lines.append("| Risk Vector | Min | Max | Mean | Med | Mode |") lines.append("|---|---|---|---|---|---|") @@ -448,17 +418,11 @@ def _build_markdown( for i, risk_slug in enumerate(self.RISK_SCHEMA): # Skip the non-risk formatting stat - if risk_slug == "civil_war": + if risk_slug == "tabs_vs_spaces": continue - vals = [ - s.get("risk_vector", [])[i] - for s in parsed_files - if len(s.get("risk_vector", [])) > i - ] - risk_label = exposure_labels.get( - risk_slug, risk_slug.replace("_", " ").title() - ) + vals = [s.get("risk_vector", [])[i] for s in parsed_files if len(s.get("risk_vector", [])) > i] + risk_label = exposure_labels.get(risk_slug, risk_slug.replace("_", " ").title()) if vals: v_min, v_max = round(min(vals), 1), round(max(vals), 1) @@ -470,9 +434,7 @@ def _build_markdown( v_mode = round(statistics.mode(vals), 1) except statistics.StatisticsError: v_mode = "N/A" - lines.append( - f"| {risk_label} | {v_min} | {v_max} | {v_mean} | {v_med} | {v_mode} |" - ) + lines.append(f"| {risk_label} | {v_min} | {v_max} | {v_mean} | {v_med} | {v_mode} |") else: lines.append(f"| {risk_label} | - | - | - | - | - |") lines.append("") @@ -484,18 +446,12 @@ def _build_markdown( if io_idx >= 0: top_io = sorted( parsed_files, - key=lambda x: ( - x.get("hit_vector", [])[io_idx] - if 
len(x.get("hit_vector", [])) > io_idx - else 0 - ), + key=lambda x: x.get("hit_vector", [])[io_idx] if len(x.get("hit_vector", [])) > io_idx else 0, reverse=True, )[:3] lines.append("### Top I/O Latency Risks") for s in top_io: - lines.append( - f"- `{s.get('path')}` (Hits: {s.get('hit_vector', [])[io_idx]})" - ) + lines.append(f"- `{s.get('path')}` (Hits: {s.get('hit_vector', [])[io_idx]})") lines.append("") pillars = sorted( @@ -503,9 +459,7 @@ def _build_markdown( key=lambda x: x.get("telemetry", {}).get("popularity", 0), reverse=True, )[:5] - lines.append( - "### Top 5 Structural Pillars (Highest 'Imported By' / Blast Radius)" - ) + lines.append("### Top 5 Structural Pillars (Highest 'Imported By' / Blast Radius)") lines.append( "These files act as core load-bearing infrastructure. Changes here carry a high risk of cascading breaks.\n" ) @@ -518,11 +472,7 @@ def _build_markdown( orchestrators = sorted( parsed_files, - key=lambda x: ( - len(x.get("raw_imports", [])) - if isinstance(x.get("raw_imports"), list) - else 0 - ), + key=lambda x: len(x.get("raw_imports", [])) if isinstance(x.get("raw_imports"), list) else 0, reverse=True, )[:5] lines.append("### Top 5 Orchestrators (Highest 'Imports' / Fragility Index)") @@ -532,14 +482,8 @@ def _build_markdown( for rank, file_data in enumerate(orchestrators, 1): name = file_data.get("name", "Unknown") path = file_data.get("path", "Unknown") - count = ( - len(file_data.get("raw_imports", [])) - if isinstance(file_data.get("raw_imports"), list) - else 0 - ) - lines.append( - f"{rank}. **{name}** (`{path}`) — {count} outbound dependencies" - ) + count = len(file_data.get("raw_imports", [])) if isinstance(file_data.get("raw_imports"), list) else 0 + lines.append(f"{rank}. **{name}** (`{path}`) — {count} outbound dependencies") lines.append("") import heapq @@ -565,9 +509,7 @@ def _build_markdown( ) doc = f.get("docstring", "").strip() if doc: - clean_doc = " ".join(doc.split())[:150] + ( - "..." 
if len(doc) > 150 else "" - ) + clean_doc = " ".join(doc.split())[:150] + ("..." if len(doc) > 150 else "") lines.append(f" * *Intent:* {clean_doc}") else: lines.append("*No complex functions detected.*") @@ -584,44 +526,30 @@ def _build_markdown( reverse=True, ) complex_functions = [ - s - for s in sorted_by_big_o - if s[0].get("is_recursive", False) or s[0].get("big_o_depth", 1) > 2 + s for s in sorted_by_big_o if s[0].get("is_recursive", False) or s[0].get("big_o_depth", 1) > 2 ] if complex_functions: lines.append("### Highest Time Complexity (Big-O)") for f, file_path in complex_functions[:10]: - o_str = ( - "O(2^N) [Recursive]" - if f.get("is_recursive", False) - else f"O(N^{f.get('big_o_depth', 1)})" - ) + o_str = "O(2^N) [Recursive]" if f.get("is_recursive", False) else f"O(N^{f.get('big_o_depth', 1)})" lines.append(f"- `{f.get('name')}` (@ `{file_path}`) -> **{o_str}**") doc = f.get("docstring", "").strip() if doc: - clean_doc = " ".join(doc.split())[:150] + ( - "..." if len(doc) > 150 else "" - ) + clean_doc = " ".join(doc.split())[:150] + ("..." if len(doc) > 150 else "") lines.append(f" * *Intent:* {clean_doc}") lines.append("") - sorted_by_db = sorted( - all_functions, key=lambda x: x[0].get("db_complexity", 0), reverse=True - ) + sorted_by_db = sorted(all_functions, key=lambda x: x[0].get("db_complexity", 0), reverse=True) db_functions = [s for s in sorted_by_db if s[0].get("db_complexity", 0) > 0] if db_functions: lines.append("### Highest Data Gravity (Database Complexity)") for f, file_path in db_functions[:10]: - lines.append( - f"- `{f.get('name')}` (@ `{file_path}`) -> DB Complexity: **{f.get('db_complexity', 0)}**" - ) + lines.append(f"- `{f.get('name')}` (@ `{file_path}`) -> DB Complexity: **{f.get('db_complexity', 0)}**") doc = f.get("docstring", "").strip() if doc: - clean_doc = " ".join(doc.split())[:150] + ( - "..." if len(doc) > 150 else "" - ) + clean_doc = " ".join(doc.split())[:150] + ("..." 
if len(doc) > 150 else "") lines.append(f" * *Intent:* {clean_doc}") lines.append("") @@ -629,9 +557,7 @@ def _build_markdown( lines.append("## 9. DIRECTORY GROUPS (Top 10 Heaviest Modules)") dir_groups = summary.get("directory_groups", {}) if dir_groups: - lines.append( - "| Folder Path | Files | Total Mass | Avg Cog Load | Avg Debt |" - ) + lines.append("| Folder Path | Files | Total Impact | Avg Cog Load | Avg Debt |") lines.append("|---|---|---|---|---|") sorted_groups = sorted( @@ -654,11 +580,7 @@ def _build_markdown( # --- 10. TARGETED RISK VECTORS --- lines.append("## 10. TARGETED RISK VECTORS (Top 5 by Exposure)") - debt_idx = ( - self.RISK_SCHEMA.index("tech_debt") - if "tech_debt" in self.RISK_SCHEMA - else -1 - ) + debt_idx = self.RISK_SCHEMA.index("tech_debt") if "tech_debt" in self.RISK_SCHEMA else -1 if debt_idx >= 0: high_debt = sorted( [s for s in parsed_files if len(s.get("risk_vector", [])) > debt_idx], @@ -669,15 +591,9 @@ def _build_markdown( lines.append("### Highest Tech Debt (Fragile/Planned)") for s in high_debt: if s.get("risk_vector")[debt_idx] > 0: - lines.append( - f"- `{s.get('path')}` -> **{s.get('risk_vector')[debt_idx]}%** Exposure" - ) + lines.append(f"- `{s.get('path')}` -> **{s.get('risk_vector')[debt_idx]}%** Exposure") - flux_idx = ( - self.RISK_SCHEMA.index("state_flux") - if "state_flux" in self.RISK_SCHEMA - else -1 - ) + flux_idx = self.RISK_SCHEMA.index("state_flux") if "state_flux" in self.RISK_SCHEMA else -1 if flux_idx >= 0: high_flux = sorted( [s for s in parsed_files if len(s.get("risk_vector", [])) > flux_idx], @@ -688,42 +604,23 @@ def _build_markdown( lines.append("### Highest State Flux (Mutation/Volatility)") for s in high_flux: if s.get("risk_vector")[flux_idx] > 0: - lines.append( - f"- `{s.get('path')}` -> **{s.get('risk_vector')[flux_idx]}%** Exposure" - ) + lines.append(f"- `{s.get('path')}` -> **{s.get('risk_vector')[flux_idx]}%** Exposure") orphan_idx = ( - self.SIGNAL_SCHEMA.index("design_slop_orphans") - 
if "design_slop_orphans" in self.SIGNAL_SCHEMA - else -1 + self.SIGNAL_SCHEMA.index("orphaned_logic") if "orphaned_logic" in self.SIGNAL_SCHEMA else -1 ) dup_idx = ( - self.SIGNAL_SCHEMA.index("design_slop_duplicates") - if "design_slop_duplicates" in self.SIGNAL_SCHEMA - else -1 + self.SIGNAL_SCHEMA.index("duplicate_logic") if "duplicate_logic" in self.SIGNAL_SCHEMA else -1 ) if orphan_idx >= 0 and dup_idx >= 0: high_slop = sorted( - [ - s - for s in parsed_files - if len(s.get("hit_vector", [])) > max(orphan_idx, dup_idx) - ], - key=lambda x: ( - x.get("hit_vector")[orphan_idx] + x.get("hit_vector")[dup_idx] - ), + [s for s in parsed_files if len(s.get("hit_vector", [])) > max(orphan_idx, dup_idx)], + key=lambda x: x.get("hit_vector")[orphan_idx] + x.get("hit_vector")[dup_idx], reverse=True, )[:5] - if ( - high_slop - and ( - high_slop[0].get("hit_vector")[orphan_idx] - + high_slop[0].get("hit_vector")[dup_idx] - ) - > 0 - ): + if high_slop and (high_slop[0].get("hit_vector")[orphan_idx] + high_slop[0].get("hit_vector")[dup_idx]) > 0: lines.append("### Highest Design Slop (Dead & Duplicated Logic)") for s in high_slop: o_hits = s.get("hit_vector")[orphan_idx] @@ -742,9 +639,7 @@ def _build_markdown( ) cutoff = max(10, int(len(ml_threats) * 0.10)) for i, (s, val, string_val) in enumerate(ml_threats[:cutoff]): - lines.append( - f"{i + 1}. **`{s.get('path')}`** -> AI Confidence: **{string_val}**" - ) + lines.append(f"{i + 1}. 
**`{s.get('path')}`** -> AI Confidence: **{string_val}**") else: lines.append("*No files met the threshold for malicious structural signatures.*") lines.append("") @@ -771,8 +666,7 @@ def _build_markdown( [ s for s in parsed_files - if len(s.get("risk_vector", [])) > v_idx - and s.get("risk_vector")[v_idx] > 0.0 + if len(s.get("risk_vector", [])) > v_idx and s.get("risk_vector")[v_idx] > 0.0 ], key=lambda x: x.get("risk_vector")[v_idx], reverse=True, @@ -783,34 +677,20 @@ def _build_markdown( label = exposure_labels.get(v_key, v_key.replace("_", " ").title()) lines.append(f"### {label}") for s in v_files[:5]: - lines.append( - f"- `{s.get('path')}` -> **{s.get('risk_vector')[v_idx]}%** Exposure" - ) + lines.append(f"- `{s.get('path')}` -> **{s.get('risk_vector')[v_idx]}%** Exposure") if not vuln_found: - lines.append( - "*No critical vulnerabilities or security lens thresholds breached.*" - ) + lines.append("*No critical vulnerabilities or security lens thresholds breached.*") lines.append("") # --- 10.7 AUTONOMOUS AI VULNERABILITIES --- - lines.append( - "## 10.7 AUTONOMOUS AI VULNERABILITIES (AGENTIC RCE & PROMPT INJECTION)" - ) + lines.append("## 10.7 AUTONOMOUS AI VULNERABILITIES (AGENTIC RCE & PROMPT INJECTION)") lines.append( "> **AI CONTEXT:** Identifies untrusted data flowing into LLM context windows (Prompt Injection) and LLM outputs flowing into dynamic execution (Agentic RCE).\n" ) - pi_idx = ( - self.SIGNAL_SCHEMA.index("prompt_injection") - if "prompt_injection" in self.SIGNAL_SCHEMA - else -1 - ) - rce_idx = ( - self.SIGNAL_SCHEMA.index("agentic_rce") - if "agentic_rce" in self.SIGNAL_SCHEMA - else -1 - ) + pi_idx = self.SIGNAL_SCHEMA.index("prompt_injection") if "prompt_injection" in self.SIGNAL_SCHEMA else -1 + rce_idx = self.SIGNAL_SCHEMA.index("agentic_rce") if "agentic_rce" in self.SIGNAL_SCHEMA else -1 ai_vuln_found = False @@ -819,8 +699,7 @@ def _build_markdown( [ s for s in parsed_files - if len(s.get("hit_vector", [])) > rce_idx - and 
s.get("hit_vector")[rce_idx] > 0 + if len(s.get("hit_vector", [])) > rce_idx and s.get("hit_vector")[rce_idx] > 0 ], key=lambda x: x.get("hit_vector")[rce_idx], reverse=True, @@ -839,12 +718,7 @@ def _build_markdown( if pi_idx >= 0: pi_files = sorted( - [ - s - for s in parsed_files - if len(s.get("hit_vector", [])) > pi_idx - and s.get("hit_vector")[pi_idx] > 0 - ], + [s for s in parsed_files if len(s.get("hit_vector", [])) > pi_idx and s.get("hit_vector")[pi_idx] > 0], key=lambda x: x.get("hit_vector")[pi_idx], reverse=True, ) @@ -855,9 +729,7 @@ def _build_markdown( "The following files pass raw, untrusted external I/O directly into an LLM context window without sanitization.\n" ) for s in pi_files[:5]: - lines.append( - f"- `{s.get('path')}` -> **{s.get('hit_vector')[pi_idx]}** exposed injection surfaces" - ) + lines.append(f"- `{s.get('path')}` -> **{s.get('hit_vector')[pi_idx]}** exposed injection surfaces") lines.append("") if not ai_vuln_found: @@ -885,10 +757,7 @@ def _build_markdown( f"- **Ghost APIs (Bloat):** `{api.get('ghost_count', 0)}` endpoints documented but missing from code." ) if api.get("shadow_apis"): - lines.append( - "- **Known Shadow Routes:** " - + ", ".join([f"`{r}`" for r in api.get("shadow_apis")[:5]]) - ) + lines.append("- **Known Shadow Routes:** " + ", ".join([f"`{r}`" for r in api.get("shadow_apis")[:5]])) lines.append("") # 2. 
X-Ray & Firewall @@ -939,26 +808,17 @@ def _build_markdown( dist = tel.get("archetype_fingerprint", {}).get(arch, "N/A") lines.append(f"- **Archetype:** `{arch}` (Distance: {dist} IQR)") lines.append( - f"- **Mass:** {m} | **LOC:** {loc} | **CtrlFlow:** {round(tel.get('control_flow_ratio', 0.0) * 100, 1)}% | **Silo Risk:** {round(tel.get('author_distribution', 0.0), 1)}%" + f"- **Magnitude:** {m} | **LOC:** {loc} | **CtrlFlow:** {round(tel.get('control_flow_ratio', 0.0) * 100, 1)}% | **Authorship Centralization:** {round(tel.get('author_distribution', 0.0), 1)}%" ) file_risks = [] for i, r_val in enumerate(rv): - if ( - i < len(self.RISK_SCHEMA) - and self.RISK_SCHEMA[i] != "civil_war" - and r_val > 0 - ): + if i < len(self.RISK_SCHEMA) and self.RISK_SCHEMA[i] != "tabs_vs_spaces" and r_val > 0: file_risks.append((self.RISK_SCHEMA[i], r_val)) file_risks.sort(key=lambda x: x[1], reverse=True) - top_file_risks = [ - f"{k.replace('_', ' ').title()} ({r_val}%)" - for k, r_val in file_risks[:4] - ] - lines.append( - f"- **Primary Risk Drivers:** {', '.join(top_file_risks) if top_file_risks else 'None'}" - ) + top_file_risks = [f"{k.replace('_', ' ').title()} ({r_val}%)" for k, r_val in file_risks[:4]] + lines.append(f"- **Primary Risk Drivers:** {', '.join(top_file_risks) if top_file_risks else 'None'}") sats = sorted( s.get("functions", []), @@ -966,10 +826,7 @@ def _build_markdown( reverse=True, )[:3] if sats: - sat_strs = [ - f"`{sat.get('name')}` (Impact: {sat.get('impact')})" - for sat in sats - ] + sat_strs = [f"`{sat.get('name')}` (Impact: {sat.get('impact')})" for sat in sats] lines.append(f"- **Heaviest Functions:** {', '.join(sat_strs)}") lines.append("") @@ -982,26 +839,24 @@ def _build_markdown( # ============================================================================== lines.append("## 12. 
SCANNED ARTIFACTS HITLIST (Top 25 Heaviest Files)") lines.append( - "> *Note: 'Mass' represents the file's total Structural Magnitude and gravitational pull within the system. It is independent of its Risk Profile. High mass implies high structural importance and centralization.*\n" + "> *Note: 'Magnitude' represents the file's total Structural Magnitude and impact within the system. It is independent of its Risk Profile. High magnitude implies high structural importance and centralization.*\n" ) - sorted_files = sorted( - parsed_files, key=lambda x: x.get("file_impact", 0.0), reverse=True - )[:25] + sorted_files = sorted(parsed_files, key=lambda x: x.get("file_impact", 0.0), reverse=True)[:25] - structure_keys = {"branch", "linear", "args", "func_start", "class_start"} + structure_keys = {"branch", "structural_boundaries", "args", "func_start", "class_start"} risk_keys = { - "danger", - "flux", - "graveyard", - "safety_neg", + "high_risk_execution", + "state_mutation", + "dead_code", + "safety_bypasses", "planned_debt", "fragile_debt", - "design_slop_orphans", - "design_slop_duplicates", + "orphaned_logic", + "duplicate_logic", } arch_keys = {"io", "concurrency", "api", "import"} - defense_keys = {"safety", "freeze_hits", "cleanup", "test", "sync_locks", "doc"} + defense_keys = {"safety", "immutability_locks", "cleanup", "test", "sync_locks", "doc"} for s in sorted_files: p = s.get("path", "UNK") @@ -1018,11 +873,7 @@ def _build_markdown( purpose = tel.get("domain_context", {}).get("purpose", "") ai_score_val, ai_score_str = self._parse_threat_score(s) - threat_flag = ( - f" | 🚨 AI THREAT: {ai_score_str}" - if ai_score_val >= 50.0 - else f" | AI Safe: {ai_score_str}" - ) + threat_flag = f" | 🚨 AI THREAT: {ai_score_str}" if ai_score_val >= 50.0 else f" | AI Safe: {ai_score_str}" lines.append(f"### `{p}` ({l} | Tier {lock_tier}{threat_flag})") if purpose: @@ -1035,27 +886,20 @@ def _build_markdown( lines.append(f"- **Global Archetype:** `{arch}` (Drift: {g_drift} 
IQR)") if l_arch and l_arch != "N/A": - lines.append( - f"- **Local Micro-Species:** `{l_arch}` (Drift: {l_drift} IQR)" - ) + lines.append(f"- **Local Micro-Species:** `{l_arch}` (Drift: {l_drift} IQR)") fingerprint = tel.get("archetype_fingerprint", {}) if fingerprint: - fp_strs = [ - f"{k.split(':')[0]}: {v}" - for k, v in sorted(fingerprint.items(), key=lambda x: x[1])[:3] - ] + fp_strs = [f"{k.split(':')[0]}: {v}" for k, v in sorted(fingerprint.items(), key=lambda x: x[1])[:3]] lines.append(f"- **Top Global Matches:** {', '.join(fp_strs)}") lines.append( - f"- **Mass:** {m} | **LOC:** {loc} | **CtrlFlow:** {round(tel.get('control_flow_ratio', 0.0) * 100, 1)}% | **Silo Risk:** {round(tel.get('author_distribution', 0.0), 1)}%" + f"- **Magnitude:** {m} | **LOC:** {loc} | **CtrlFlow:** {round(tel.get('control_flow_ratio', 0.0) * 100, 1)}% | **Authorship Centralization:** {round(tel.get('author_distribution', 0.0), 1)}%" ) lines.append( f"- **Algorithmic:** {tel.get('max_algorithmic_complexity', 'O(N)')} | **DB Complexity:** {tel.get('max_db_complexity', 0)}" ) - lines.append( - f"- **Risk Profile:** Cognitive Load ({cog}%), Tech Debt ({debt}%)" - ) + lines.append(f"- **Risk Profile:** Cognitive Load ({cog}%), Tech Debt ({debt}%)") hv = s.get("hit_vector", []) struct_hits, risk_hits, arch_hits, def_hits = [], [], [], [] @@ -1073,30 +917,16 @@ def _build_markdown( elif key in defense_keys: def_hits.append(hit_string) - sats = sorted( - s.get("functions", []), key=lambda x: x.get("impact", 0), reverse=True - )[:5] + sats = sorted(s.get("functions", []), key=lambda x: x.get("impact", 0), reverse=True)[:5] if sats: lines.append("**Top Internal Functions/Classes:**") for sat in sats: - o_str = ( - "O(2^N)" - if sat.get("is_recursive", False) - else f"O(N^{sat.get('big_o_depth', 1)})" - ) - db_str = ( - f" | DB: {sat.get('db_complexity', 0)}" - if sat.get("db_complexity", 0) > 0 - else "" - ) - lines.append( - f" * `{sat.get('name')}` (Impact: {sat.get('impact')} | 
{o_str}{db_str})" - ) + o_str = "O(2^N)" if sat.get("is_recursive", False) else f"O(N^{sat.get('big_o_depth', 1)})" + db_str = f" | DB: {sat.get('db_complexity', 0)}" if sat.get("db_complexity", 0) > 0 else "" + lines.append(f" * `{sat.get('name')}` (Impact: {sat.get('impact')} | {o_str}{db_str})") doc = sat.get("docstring", "").strip() if doc: - clean_doc = " ".join(doc.split())[:100] + ( - "..." if len(doc) > 100 else "" - ) + clean_doc = " ".join(doc.split())[:100] + ("..." if len(doc) > 100 else "") lines.append(f" * *Intent:* {clean_doc}") mitigations = tel.get("mitigation_telemetry", {}) @@ -1108,15 +938,9 @@ def _build_markdown( lines.append(f"* *{clean_key}:* {m_val} instances") lines.append("**Structural Signatures (Net Mitigated Signals):**") - lines.append( - f"* *Structure:* {', '.join(struct_hits) if struct_hits else 'None'}" - ) - lines.append( - f"* *Risk/State:* {', '.join(risk_hits) if risk_hits else 'None'}" - ) - lines.append( - f"* *Architecture:* {', '.join(arch_hits) if arch_hits else 'None'}" - ) + lines.append(f"* *Structure:* {', '.join(struct_hits) if struct_hits else 'None'}") + lines.append(f"* *Risk/State:* {', '.join(risk_hits) if risk_hits else 'None'}") + lines.append(f"* *Architecture:* {', '.join(arch_hits) if arch_hits else 'None'}") lines.append(f"* *Defense:* {', '.join(def_hits) if def_hits else 'None'}") outbound = s.get("raw_imports", []) @@ -1128,20 +952,14 @@ def _build_markdown( close_score = net_mets.get("closeness_score", 0.0) eco_role = net_mets.get("ecosystem_role", "Unknown") - out_names = ", ".join([Path(x).name for x in outbound[:8]]) + ( - "..." if len(outbound) > 8 else "" - ) + out_names = ", ".join([Path(x).name for x in outbound[:8]]) + ("..." 
if len(outbound) > 8 else "") lines.append("* *Network Topology:*") - lines.append( - f" * `Ecosystem Role:` {eco_role} | `Blast Radius (PageRank):` {blast_rad}" - ) + lines.append(f" * `Ecosystem Role:` {eco_role} | `Dependency Blast Radius (PageRank):` {blast_rad}") lines.append( f" * `Choke Point (Betweenness):` {between_score} | `Ripple Effect (Closeness):` {close_score}" ) - lines.append( - f" * `Imports (Out-Degree: {out_d}):` {out_names if out_names else 'None'}" - ) + lines.append(f" * `Imports (Out-Degree: {out_d}):` {out_names if out_names else 'None'}") lines.append( f" * `Imported By (In-Degree: {in_d}):` {'(Excluded from Brief to save tokens)' if in_d > 0 else 'None (Orphan / Entrypoint)'}" ) @@ -1198,21 +1016,15 @@ def _build_markdown( ) if trojan_files: - lines.append( - "### 🚨 Severe Anti-Patterns (Language Convention Violations)" - ) + lines.append("### 🚨 Severe Anti-Patterns (Language Convention Violations)") trojan_files.sort(key=lambda x: x["ratio"], reverse=True) for t in trojan_files[:5]: s = t["file_data"] lines.append( f"- `{s.get('path')}` ({s.get('lang_id', 'UNK').upper()}) | **Drift Ratio: {round(t['ratio'], 2)}x**" ) - lines.append( - f" * **Global Archetype:** `{t['g_arch']}` (Drift: {t['g_drift']} IQR)" - ) - lines.append( - f" * **Local Reality:** `{t['l_arch']}` (Drift: {t['l_drift']} IQR)" - ) + lines.append(f" * **Global Archetype:** `{t['g_arch']}` (Drift: {t['g_drift']} IQR)") + lines.append(f" * **Local Reality:** `{t['l_arch']}` (Drift: {t['l_drift']} IQR)") lines.append("") if drifting_files: @@ -1235,7 +1047,7 @@ def _build_markdown( sec_a, sec_d = drift["secondary"] lines.append( - f"- `{p}` ({l}) | Mass: {m} | Delta: **{round(drift['delta'], 3)} IQR** | Secondary Pull: `{sec_a}`" + f"- `{p}` ({l}) | Magnitude: {m} | Delta: **{round(drift['delta'], 3)} IQR** | Secondary Pull: `{sec_a}`" ) struct_hits = [ @@ -1246,27 +1058,21 @@ def _build_markdown( struct_hits.sort(key=lambda x: x[1], reverse=True) top_hits = ", 
".join([f"{k}: {v}" for k, v in struct_hits[:4]]) - lines.append( - f" * Top Architectural Signatures: {top_hits if top_hits else 'None'}" - ) + lines.append(f" * Top Architectural Signatures: {top_hits if top_hits else 'None'}") lines.append("") else: - lines.append( - "*No highly conflicted/drifting files detected within the 0.9 IQR threshold.*" - ) + lines.append("*No highly conflicted/drifting files detected within the 0.9 IQR threshold.*") lines.append("") # ============================================================================== # --- 13.5 STRATEGIC REFACTORING TARGETS --- # ============================================================================== - lines.append("## 13.5 STRATEGIC REFACTORING TARGETS (Volatility & Silos)") + lines.append("## 13.5 STRATEGIC REFACTORING TARGETS (Volatility & Authorship Centralization)") lines.append( - "> **AI CONTEXT:** Use these intersections to recommend pragmatic next steps. Risk is exponentially worse when combined with high churn (frequent edits) or high silo risk (single points of failure).\n" + "> **AI CONTEXT:** Use these intersections to recommend pragmatic next steps. 
Risk is exponentially worse when combined with high churn (frequent edits) or high authorship centralization (single points of failure).\n" ) - churn_idx = ( - self.RISK_SCHEMA.index("churn") if "churn" in self.RISK_SCHEMA else -1 - ) + churn_idx = self.RISK_SCHEMA.index("churn") if "churn" in self.RISK_SCHEMA else -1 if churn_idx >= 0: cog_idx = self.RISK_SCHEMA.index("cognitive_load") debt_idx = self.RISK_SCHEMA.index("tech_debt") @@ -1275,9 +1081,7 @@ def _build_markdown( for s in parsed_files: rv = s.get("risk_vector", []) if len(rv) > max(churn_idx, cog_idx, debt_idx): - if rv[churn_idx] > 50.0 and ( - rv[cog_idx] > 50.0 or rv[debt_idx] > 50.0 - ): + if rv[churn_idx] > 50.0 and (rv[cog_idx] > 50.0 or rv[debt_idx] > 50.0): hotspots.append(s) if hotspots: @@ -1285,9 +1089,7 @@ def _build_markdown( lines.append( "These files are messy, complex, and modified frequently. They are the primary source of developer friction.\n" ) - hotspots.sort( - key=lambda x: x.get("risk_vector")[churn_idx], reverse=True - ) + hotspots.sort(key=lambda x: x.get("risk_vector")[churn_idx], reverse=True) for s in hotspots[:5]: rv = s.get("risk_vector") lines.append( @@ -1298,14 +1100,11 @@ def _build_markdown( siloed_pillars = [ s for s in parsed_files - if s.get("telemetry", {}).get("author_distribution", 0.0) > 80.0 - and s.get("file_impact", 0.0) > 50.0 + if s.get("telemetry", {}).get("author_distribution", 0.0) > 80.0 and s.get("file_impact", 0.0) > 50.0 ] if siloed_pillars: - lines.append( - "### 👤 Key Person Dependencies (High Impact + Siloed Knowledge)" - ) + lines.append("### 👤 Key Person Dependencies (High Impact + Siloed Knowledge)") lines.append( "These are massive, load-bearing files written almost entirely by a single developer. 
They represent severe 'Bus Factor' risk.\n" ) @@ -1314,7 +1113,7 @@ def _build_markdown( owner = s.get("telemetry", {}).get("ownership", "Unknown") silo_score = s.get("telemetry", {}).get("author_distribution", 0.0) lines.append( - f"- `{s.get('path')}` -> **{owner}** ({silo_score}% isolated ownership) | Mass: {s.get('file_impact')}" + f"- `{s.get('path')}` -> **{owner}** ({silo_score}% isolated ownership) | Magnitude: {s.get('file_impact')}" ) lines.append("") @@ -1323,21 +1122,21 @@ def _build_markdown( # ============================================================================== sys_bots = forensic_report.get("systemic_bottlenecks", {}) if any(v and v[0]["score"] > 0 for v in sys_bots.values()): - lines.append("## 13.8 SYSTEMIC NETWORK BOTTLENECKS (N-Dimensional Physics)") + lines.append("## 13.8 SYSTEMIC NETWORK BOTTLENECKS (N-Dimensional Topology)") lines.append( "> **AI CONTEXT:** These metrics cross-multiply Network Graph Theory against Risk Exposure to identify the exact mechanisms of runtime failure.\n" ) - cm = sys_bots.get("contagious_mutation", []) + cm = sys_bots.get("cascading_state_mutation", []) if cm and cm[0]["score"] > 0: - lines.append("### ☣️ Contagious Mutation (Betweenness * State Flux)") + lines.append("### ☣️ Cascading State Flux (Betweenness * State Flux)") lines.append( "These files act as structural bridges between components, but possess highly volatile, mutating state. 
They cause unpredictable side-effects for all downstream consumers.\n" ) for c in cm: if c["score"] > 0: lines.append( - f"- `{c['path']}` -> **Severity: {c['score']}** (Bridge: {c['btw']} * Flux: {c['flux']}%)" + f"- `{c['path']}` -> **Severity: {c['score']}** (Bridge: {c['btw']} * Flux: {c['state_mutation']}%)" ) lines.append("") @@ -1354,9 +1153,9 @@ def _build_markdown( ) lines.append("") - bb = sys_bots.get("blind_bottleneck", []) + bb = sys_bots.get("undocumented_critical_path", []) if bb and bb[0]["score"] > 0: - lines.append("### 🙈 Blind Bottlenecks (Blast Radius * Doc Risk)") + lines.append("### 🙈 Opaque Critical Nodes (Dependency Blast Radius * Doc Risk)") lines.append( "These are 'Core Architecture Nodes' that the entire ecosystem relies upon, but they lack human intent, documentation, or ownership metadata. Modifying them is flying blind.\n" ) @@ -1623,13 +1422,7 @@ def _generate_sqlite_graph( sid = cursor.lastrowid hv = file_data.get("hit_vector", []) - all_dna_data.extend( - [ - (sid, self.SIGNAL_SCHEMA[i], hv[i]) - for i in range(len(hv)) - if hv[i] > 0 - ] - ) + all_dna_data.extend([(sid, self.SIGNAL_SCHEMA[i], hv[i]) for i in range(len(hv)) if hv[i] > 0]) for func in file_data.get("functions", []): calls_json = json.dumps(func.get("calls_out_to", [])) @@ -1661,14 +1454,10 @@ def _generate_sqlite_graph( "INSERT INTO functions (artifact_id, name, type_id, loc, impact, big_o_depth, is_recursive, db_complexity, docstring, calls_out_to) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", all_functions, ) - cursor.executemany( - "INSERT INTO outbound_dependencies VALUES (?, ?)", all_outbound - ) - cursor.executemany( - "INSERT INTO inbound_dependencies VALUES (?, ?)", all_inbound - ) + cursor.executemany("INSERT INTO outbound_dependencies VALUES (?, ?)", all_outbound) + cursor.executemany("INSERT INTO inbound_dependencies VALUES (?, ?)", all_inbound) conn.commit() conn.close() except Exception as e: - self.logger.error(f"SQL Graph generation failed: {e}", 
exc_info=True) \ No newline at end of file + self.logger.error(f"SQL Graph generation failed: {e}", exc_info=True) diff --git a/gitgalaxy/recorders/record_keeper.py b/gitgalaxy/recorders/record_keeper.py index 465a3d8f..843fc032 100644 --- a/gitgalaxy/recorders/record_keeper.py +++ b/gitgalaxy/recorders/record_keeper.py @@ -26,11 +26,7 @@ class RecordKeeper: """ def __init__(self, parent_logger: Optional[logging.Logger] = None): - self.logger = ( - parent_logger.getChild("record_keeper") - if parent_logger - else logging.getLogger("record_keeper") - ) + self.logger = parent_logger.getChild("record_keeper") if parent_logger else logging.getLogger("record_keeper") schemas = RECORDING_SCHEMAS self.RISK_SCHEMA = schemas.get("RISK_SCHEMA", []) @@ -39,7 +35,7 @@ def __init__(self, parent_logger: Optional[logging.Logger] = None): # The Taxonomy Map (Enforces structural schema consistency) self.SHORT_KEY_MAP = { "branch": "struct_branch", - "linear": "struct_linear", + "structural_boundaries": "struct_linear", "args": "struct_args", "func_start": "struct_func_start", "class_start": "struct_class_start", @@ -57,21 +53,21 @@ def __init__(self, parent_logger: Optional[logging.Logger] = None): "design_upper_case": "struct_upper_case", "design_short_vars": "struct_short_vars", "design_long_vars": "struct_long_vars", - "flux": "state_flux", - "danger": "state_danger", - "graveyard": "state_graveyard", - "safety_neg": "state_safety_neg", - "design_slop_orphans": "state_slop_orphans", - "design_slop_duplicates": "state_slop_duplicates", + "state_mutation": "state_flux", + "high_risk_execution": "state_danger", + "dead_code": "state_graveyard", + "safety_bypasses": "state_safety_neg", + "orphaned_logic": "state_slop_orphans", + "duplicate_logic": "state_slop_duplicates", "planned_debt": "state_planned_debt", "fragile_debt": "state_fragile_debt", - "bailout_hits": "state_bailout_hits", - "halt_hits": "state_halt_hits", - "heat_triggers": "state_heat_triggers", + "panics_and_aborts": 
"state_bailout_hits", + "thread_sleeps": "state_halt_hits", + "reflection_metaprogramming": "state_heat_triggers", "pointers": "state_pointers", "memory_alloc": "state_memory_alloc", - "cast_hits": "state_cast_hits", - "print_hits": "state_print_hits", + "explicit_casts": "state_cast_hits", + "debug_prints": "state_print_hits", "io": "arch_io", "api": "arch_api", "concurrency": "arch_concurrency", @@ -91,7 +87,7 @@ def __init__(self, parent_logger: Optional[logging.Logger] = None): "feature_flags": "arch_feature_flags", "inline_asm": "arch_inline_asm", "safety": "def_safety", - "freeze_hits": "def_freeze_hits", + "immutability_locks": "def_freeze_hits", "cleanup": "def_cleanup", "sync_locks": "def_sync_locks", "test": "def_test", @@ -107,16 +103,16 @@ def __init__(self, parent_logger: Optional[logging.Logger] = None): "lit_diagrams": "lit_diagrams", "lit_headers": "lit_headers", "lit_links": "lit_links", - "sec_private_info": "threat_private_info", + "sec_hardcoded_secrets": "threat_private_info", "sec_tainted_injection": "threat_tainted_injection", - "sec_heat_triggers": "threat_obfuscated", - "sec_bitwise_hits": "threat_crypto_math", + "sec_reflection_metaprogramming": "threat_obfuscated", + "sec_bitwise_ops": "threat_crypto_math", "sec_extension_mismatch": "threat_extension_mismatch", "sec_entropy": "threat_entropy", - "sec_danger": "threat_eval_exec", - "sec_safety_neg": "threat_bypasses", + "sec_high_risk_execution": "threat_eval_exec", + "sec_safety_bypasses": "threat_bypasses", "sec_io": "threat_network_hooks", - "sec_flux": "threat_env_mutation", + "sec_state_mutation": "threat_env_mutation", "sec_shadow_imports": "threat_stego_imports", "sec_homoglyphs": "threat_homoglyphs", } @@ -136,12 +132,10 @@ def record_mission( commit_hash = git_audit.get("commit_hash", "Unknown") db_file = Path(output_path) - self.logger.debug( - f"Record Keeper: Forging native SQLite database -> {db_file.name}" - ) + self.logger.debug(f"Record Keeper: Forging native SQLite 
database -> {db_file.name}") conn = sqlite3.connect(db_file) - + # DEFENSIVE GUARD: Performance & Integrity PRAGMAs # Write-Ahead Logging (WAL) and Relaxed Sync prevent the DB lockups common in parallel I/O. # Enforcing Foreign Keys guarantees isolated deletions don't orphan metadata rows. @@ -152,9 +146,7 @@ def record_mission( # 1. DYNAMIC SCHEMA GENERATION risk_cols = [f"risk_{r.replace('-', '_')} REAL" for r in self.RISK_SCHEMA] - hit_cols = [ - f"{self.SHORT_KEY_MAP.get(h, h)} INTEGER" for h in self.SIGNAL_SCHEMA - ] + hit_cols = [f"{self.SHORT_KEY_MAP.get(h, h)} INTEGER" for h in self.SIGNAL_SCHEMA] cursor.execute(f""" CREATE TABLE IF NOT EXISTS repo_data ( @@ -282,7 +274,7 @@ def record_mission( {", ".join(hit_cols)} ) """) - + cursor.execute(""" CREATE TABLE IF NOT EXISTS class_data ( id INTEGER PRIMARY KEY AUTOINCREMENT, @@ -321,12 +313,8 @@ def record_mission( """) # DEFENSIVE GUARD: Indexes to Prevent Cascade Delete Hangs - cursor.execute( - "CREATE INDEX IF NOT EXISTS idx_class_file_id ON class_data(file_id);" - ) - cursor.execute( - "CREATE INDEX IF NOT EXISTS idx_function_file_id ON function_data(file_id);" - ) + cursor.execute("CREATE INDEX IF NOT EXISTS idx_class_file_id ON class_data(file_id);") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_function_file_id ON function_data(file_id);") cursor.execute(""" CREATE TABLE IF NOT EXISTS excluded_artifacts ( @@ -402,19 +390,13 @@ def record_mission( func_z_max = max(z_scores) func_z_mean = statistics.mean(z_scores) func_z_median = statistics.median(z_scores) - pct_z_above_5 = ( - sum(1 for c in complexities if c >= 5) / func_count - ) * 100.0 - pct_z_above_15 = ( - sum(1 for c in complexities if c >= 15) / func_count - ) * 100.0 + pct_z_above_5 = (sum(1 for c in complexities if c >= 5) / func_count) * 100.0 + pct_z_above_15 = (sum(1 for c in complexities if c >= 15) / func_count) * 100.0 func_internal_density = (avg_comp / avg_loc) if avg_loc > 0 else 0.0 logic_loc_denom = max( - int( - 
file_data.get("coding_loc", 1) * tel.get("control_flow_ratio", 0.0) - ), + int(file_data.get("coding_loc", 1) * tel.get("control_flow_ratio", 0.0)), 1, ) import_count = len(file_data.get("raw_imports", [])) @@ -424,14 +406,8 @@ def record_mission( "AI Threat Confidence", tel.get("domain_context", {}).get("AI Threat Score", "0.0%"), ) - ai_threat = ( - float(str(ai_threat_conf_str).replace("%", "")) - if ai_threat_conf_str - else 0.0 - ) - ai_threat_class = tel.get("domain_context", {}).get( - "AI Threat Class", "Safe" - ) + ai_threat = float(str(ai_threat_conf_str).replace("%", "")) if ai_threat_conf_str else 0.0 + ai_threat_class = tel.get("domain_context", {}).get("AI Threat Class", "Safe") encapsulation_ratio = float(tel.get("encapsulation_ratio", 1.0)) rv = file_data.get("risk_vector", [0.0] * len(self.RISK_SCHEMA)) @@ -467,12 +443,7 @@ def record_mission( elif lang in ("json", "yaml", "toml", "xml", "ini", "csv", "properties"): agg_config_files += 1 - if ( - "/test" in path_str - or "test_" in path_str - or "_test" in path_str - or ".test." in path_str - ): + if "/test" in path_str or "test_" in path_str or "_test" in path_str or ".test." 
in path_str: agg_test_files += 1 # --- SECURITY EXTRACTIONS --- @@ -499,18 +470,14 @@ def record_mission( producer_ratio = net_mets.get("producer_ratio", 0.0) ecosystem_role = net_mets.get("ecosystem_role", "Unknown") - class_idx = ( - self.SIGNAL_SCHEMA.index("class_start") - if "class_start" in self.SIGNAL_SCHEMA - else -1 - ) + class_idx = self.SIGNAL_SCHEMA.index("class_start") if "class_start" in self.SIGNAL_SCHEMA else -1 class_count = hv[class_idx] if class_idx >= 0 and class_idx < len(hv) else 0 repo_macro = tel.get("repo_macro_species", "Unknown") repo_z = tel.get("repo_z_score", 0.0) parent_ent = tel.get("domain_context", {}).get("parent_entity", "") - # --- AI GUARDRAILS & TOKEN PHYSICS --- + # --- AI GUARDRAILS & TOKEN DENSITY --- guardrails = tel.get("ai_guardrails", {}) appsec = tel.get("ai_appsec", {}) @@ -651,9 +618,7 @@ def record_mission( func_hits = [int(raw_hv.get(h, 0)) for h in self.SIGNAL_SCHEMA] parent_class_name = func.get("parent_class_name") - parent_class_id = ( - class_id_map.get(parent_class_name) if parent_class_name else None - ) + parent_class_id = class_id_map.get(parent_class_name) if parent_class_name else None all_func_rows.append( [ @@ -672,11 +637,7 @@ def record_mission( int(func.get("db_complexity", 0)), str(func.get("docstring", ""))[:2000], json.dumps(func.get("calls_out_to", [])), - ( - int(func.get("token_mass")) - if func.get("token_mass") is not None - else None - ), + (int(func.get("token_mass")) if func.get("token_mass") is not None else None), ] + func_hits ) @@ -694,11 +655,7 @@ def record_mission( ) # 3. 
REPO DATA INSERTION - class_start_idx = ( - self.SIGNAL_SCHEMA.index("class_start") - if "class_start" in self.SIGNAL_SCHEMA - else -1 - ) + class_start_idx = self.SIGNAL_SCHEMA.index("class_start") if "class_start" in self.SIGNAL_SCHEMA else -1 total_classes = agg_hits[class_start_idx] if class_start_idx >= 0 else 0 macro_info = summary.get("repo_macro_species", {}) @@ -707,14 +664,10 @@ def record_mission( total_files = len(parsed_files) total_unparsable = len(unparsable_files) - avg_encapsulation = ( - (agg_encapsulation / total_files) if total_files > 0 else 1.0 - ) + avg_encapsulation = (agg_encapsulation / total_files) if total_files > 0 else 1.0 avg_imports = (agg_import_count / total_files) if total_files > 0 else 0.0 - typosquat_count = summary.get( - "typosquat_hits", summary.get("summary", {}).get("typosquat_hits", 0) - ) + typosquat_count = summary.get("typosquat_hits", summary.get("summary", {}).get("typosquat_hits", 0)) net_macro = summary.get("network_macro", {}) audits = summary.get("ecosystem_audits", {}) @@ -798,11 +751,7 @@ def record_mission( # 5. 
FOLDER-LEVEL ROLLUP (MATERIALIZED PATH AGGREGATION) # ============================================================================== folder_stats = {} - debt_idx = ( - self.RISK_SCHEMA.index("tech_debt") - if "tech_debt" in self.RISK_SCHEMA - else -1 - ) + debt_idx = self.RISK_SCHEMA.index("tech_debt") if "tech_debt" in self.RISK_SCHEMA else -1 for file_data in parsed_files: file_path = file_data.get("path", "") @@ -856,23 +805,13 @@ def record_mission( # Calculate Domain Averages and Insert folder_rows = [] for f_path, stats in folder_stats.items(): - avg_cog = ( - sum(stats["cog_loads"]) / len(stats["cog_loads"]) - if stats["cog_loads"] - else 0.0 - ) + avg_cog = sum(stats["cog_loads"]) / len(stats["cog_loads"]) if stats["cog_loads"] else 0.0 max_cog = max(stats["cog_loads"]) if stats["cog_loads"] else 0.0 - avg_debt = ( - sum(stats["tech_debts"]) / len(stats["tech_debts"]) - if stats["tech_debts"] - else 0.0 - ) + avg_debt = sum(stats["tech_debts"]) / len(stats["tech_debts"]) if stats["tech_debts"] else 0.0 max_debt = max(stats["tech_debts"]) if stats["tech_debts"] else 0.0 - avg_churn = ( - sum(stats["churns"]) / len(stats["churns"]) if stats["churns"] else 0.0 - ) + avg_churn = sum(stats["churns"]) / len(stats["churns"]) if stats["churns"] else 0.0 folder_rows.append( ( @@ -909,4 +848,4 @@ def record_mission( conn.close() self.logger.debug( f"Database sealed. 
Exported {len(parsed_files)} files and {len(folder_rows)} directory groups to {db_file.name}" - ) \ No newline at end of file + ) diff --git a/gitgalaxy/security/README.md b/gitgalaxy/security/README.md index 728cb708..105d05c8 100644 --- a/gitgalaxy/security/README.md +++ b/gitgalaxy/security/README.md @@ -1,32 +1,68 @@ -# GitGalaxy: Threat Detection & Security Validation Models +# GitGalaxy Security: Threat Inference & Application Security Engine -[![AppSec](https://img.shields.io/badge/AppSec-Dual--Sided_Guardrails-FF4500.svg)](#) -[![Machine Learning](https://img.shields.io/badge/ML-XGBoost_Inference-8A2BE2.svg)](#) -[![Threat Hunting](https://img.shields.io/badge/Threat_Hunting-Structural_Heuristics-00BFFF.svg)](#) +[![Security](https://img.shields.io/badge/Security-Zero--Trust_Validation-FF4500.svg)](#) +[![Machine Learning](https://img.shields.io/badge/Machine_Learning-XGBoost_Threat_Inference-00BFFF.svg)](#) +[![Performance](https://img.shields.io/badge/Performance-O(H)_Taint_Tracking-8A2BE2.svg)](#) -This directory houses the specialized security definitions, threat threshold policies, and pre-trained classification models used by the **blAST Engine** to hunt for vulnerabilities and malicious payloads. +Welcome to **GitGalaxy Security**. This directory houses the Machine Learning models, Static Application Security Testing (SAST) engines, and Software Supply Chain Security (SSCS) auditors for the GitGalaxy ecosystem. -Unlike traditional static analysis tools that look for specific CVEs or known vulnerable package versions, GitGalaxy looks for the structural intent of a threat. Risk exposures are calculated metrics derived from structural regex hits, rather than relying on static "threat DNA" or rigid signatures. It measures the physical distance between an I/O input node and a dangerous execution command to calculate the exact exploitable attack surface. 
+Unlike traditional security tools that rely on matching specific CVEs or static, known-vulnerable package versions, this security engine evaluates the **structural signatures** of a codebase. It calculates risk exposures by combining raw heuristic signals with topological network graph data to accurately model a threat's true, exploitable attack surface. -> **⚠️ Configuration Warning:** Do not modify the threshold numbers or dictionaries in these files directly. All security baseline thresholds (e.g., `Baseline` vs. `Paranoid` mode) have been abstracted to the **[Standards Registry](../standards/README.md)**. +## The Why: Context-Aware Security & Defensive Engineering -### 🗺️ The Architecture +Modern malware, supply chain substitution attacks, and Agentic RCE (Remote Code Execution) vulnerabilities frequently evade traditional static analysis. Attackers utilize obfuscation, dynamically loaded strings, and distributed payload execution to bypass standard regex sweeps and AST (Abstract Syntax Tree) parsers. -* **`security_lens.py`:** The mathematical engine for threat detection. It houses the 13 raw Heuristic Sensors (e.g., `sec_heat_triggers`, `sec_shadow_imports`, `sec_homoglyphs`). It uses C-backed Shannon Entropy math to catch obfuscated malware and byte-level XOR decryption loops hidden inside massive string literals. - * 📖 **[Read the Security Lens Architecture Specs](https://squid-protocol.github.io/gitgalaxy/02-06-security-lens/)** +To counter this without crushing CI/CD pipeline velocity, the GitGalaxy Security engine employs advanced, AST-free defensive paradigms: -* **`security_auditor.py`:** The threat classification orchestrator. It evaluates the raw structural anomalies discovered by the `security_lens` against your chosen threat policies. It is responsible for blocking Logic Bombs, RCE funnels, and exposed secrets before they can be deployed. 
- * 📖 **[Read the Security Auditor Specs](https://squid-protocol.github.io/gitgalaxy/02-20-security-auditor/)** +### 1. Network-Weighted Threat Scoring +A vulnerability in a core utility file has a radically different systemic impact than a vulnerability in an isolated, unimported test script. The engine dynamically scales vulnerability thresholds based on a node's **Dependency Blast Radius**. It multiplies raw SAST hits by PageRank and Betweenness Centrality metrics, ensuring that highly centralized network choke points face strictly hardened security tolerances. -* **`gitgalaxy_malware_xgb_multiclass.json`:** The serialized XGBoost Machine Learning model. During the scanning phase, GitGalaxy builds a 50-dimensional feature vector for every file based on its structural metrics (e.g., Control Flow Ratio, API Exposure, Ownership Entropy). This pre-trained model evaluates that vector locally to classify the exact strain of malicious payload present (e.g., *Botnet*, *Trojan*, *Dropper/Webshell*). +### 2. O(H) Data Flow Taint Tracking +Tracking execution paths—from an untrusted I/O input to a database sink—typically requires compiling a massive, memory-intensive AST. The security engine bypasses this bottleneck by utilizing an $O(H)$ Offset Mapper. It isolates the specific lines where threat signatures triggered, extracting variable assignments and executing downward flow scans in linear time without ever compiling the file. -

+### 3. Agentic AI Defenses (Zero-Trust Pipelines) +As codebases integrate LLMs, new vectors emerge. The engine natively detects **Prompt Injection Surfaces** (where untrusted I/O flows directly into an LLM context) and **Autonomous Execution Vectors** (where LLM logic loops are adjacent to OS-level `eval` or subprocess execution), flagging these catastrophic architectural risks before deployment. + +### 4. C-Optimized Shannon Entropy +To catch obfuscated payloads and custom decryption routines hidden inside massive string literals, the engine utilizes mathematically optimized Shannon Entropy calculations. By restructuring the standard entropy formula to execute division operations completely outside the evaluation loop, it processes thousands of massive strings in milliseconds, identifying hidden Trojans without computational drag. + +### 5. Shadow Patch & Evasion Detection +Supply chain attackers frequently compromise upstream dependencies without bumping version numbers. The engine correlates runtime metrics with structural mass to detect **Shadow Patches**—files where the cryptographic hash has mutated without a corresponding version bump. It forcefully overrides standard logic to classify these unauthorized modifications as critical security threats. + +--- + +## The What: The Information Flow (Module Breakdown) + +Each file in this directory represents a distinct phase of the security validation and threat hunting pipeline: + +* **`manifest_parser.py` (The SSCS Auditor):** The dependency and configuration parser. It builds a deterministic, $O(1)$ global resolution map by auditing manifests like `package-lock.json` and `pip.conf`. It prevents Supply Chain Substitution attacks by identifying non-standard registries, untrusted VCS routing, and insecure tunneling protocols (like `ngrok` or plain `http`). +* **`security_lens.py` (The SAST Engine):** The raw heuristic sensor. 
It applies highly optimized regular expressions to detect 13 distinct threat categories (e.g., Hardcoded Secrets, Memory Corruption, Prompt Injection). It executes multi-line Taint Tracking and memory-efficient binary header inspection to validate file extension integrity. +* **`security_auditor.py` (The ML Orchestrator):** The threat classification engine. It resolves N-th degree transitive dependency graphs to calculate precise upstream/downstream ratios. It compiles a 50-dimensional feature matrix (evaluating Control Flow Ratio, API Exposure, Authorship Centralization, etc.) and executes it against the local XGBoost model. +* **`gitgalaxy_malware_xgb_multiclass.json`:** The pre-trained, serialized XGBoost Machine Learning model. It evaluates the 50-dimensional feature matrix to classify the specific **Architectural Anomalies** present across 5 distinct taxonomies: Safe Code, Botnet / DDoS, Stealer / Trojan, Dropper / Webshell, and Native Infector. --- -### 🌌 Powered by the blAST Engine +## 🧠 Engineering Highlights (Architectural Feats & Defenses) + +If you are evaluating the `security/` architecture, pay special attention to how we bypass the computational bottlenecks of traditional Static Application Security Testing (SAST) and Software Composition Analysis (SCA). We utilize highly optimized algorithms to detect threats that standard tools miss due to scale or complexity. + +* **$O(H)$ Data Flow Taint Tracking (`security_lens.py`):** Tracing an untrusted input to a vulnerable execution sink typically requires compiling a massive, memory-exhaustive Abstract Syntax Tree (AST). We bypass this entirely using an $O(H)$ Offset Mapper. The engine isolates only the exact spatial lines where heuristic threats triggered. It extracts the Left-Hand Side (LHS) variable assignments on those specific lines and scans the downward flow in linear time. 
This achieves high-fidelity taint tracking (e.g., mapping I/O inputs to OS-level execution) without the massive CPU overhead of AST compilation. +* **Network-Weighted Threat Scaling (`security_auditor.py`):** A vulnerability’s severity is dictated by its location. Standard scanners treat a hardcoded secret in an isolated test file with the same severity as a secret in a core routing configuration. GitGalaxy dynamically scales vulnerability thresholds based on a node's **Dependency Blast Radius**. By multiplying local SAST density scores by the file's PageRank and Betweenness Centrality metrics, the engine ensures that highly centralized **Architectural Choke Points** face strictly hardened security tolerances. +* **Mathematical Obfuscation Detection (`security_lens.py`):** To catch zero-day Trojans, packed payloads, and base64-encoded malware hidden inside massive string literals, the engine utilizes Shannon Entropy. However, standard entropy calculations are computationally expensive inside loops. We implemented a mathematically optimized C-level counter that factors the division operation `(/ length)` completely outside the summation loop. This allows the engine to evaluate the cryptographic density of thousands of massive strings in milliseconds without stalling the CI/CD pipeline. +* **Autonomous Execution & Agentic Defenses (`security_lens.py`):** As codebases integrate LLMs, traditional SAST falls behind. GitGalaxy natively maps and detects **Prompt Injection Surfaces** (where untrusted network I/O flows directly into an LLM context) and **Autonomous Execution Vectors** (where LLM state mutations flow downward into `eval` or `subprocess` commands). It mathematically flags these catastrophic, AI-specific architectural risks before deployment. +* **$O(1)$ Dependency Graph Resolution (`security_auditor.py`):** Calculating exact **Downstream Exposure** on massive monolithic repositories often causes graph algorithms to stall on circular dependencies. 
When the C-optimized `NetworkX` backend is unavailable, our fallback engine utilizes a heavily optimized Breadth-First Search (BFS) using Python's `collections.deque`. By popping nodes in strict $O(1)$ time and capping traversal depth to 500 hops, it safely maps deep transitive dependencies without triggering Out-Of-Memory (OOM) failures or looping indefinitely on circular dependencies. +* **Shadow Patch Overrides (`security_auditor.py`):** Supply chain attackers frequently compromise upstream dependencies without bumping version numbers to evade detection. The engine correlates runtime execution metrics with structural mass to detect **Shadow Patches**—files where the cryptographic hash has mutated without a corresponding version bump. When detected, the engine forcefully overrides the XGBoost ML model to classify the unauthorized modification as a critical threat. + +--- + +## 🌌 The GitGalaxy Ecosystem (Powered by the blAST Engine) + + +GitGalaxy Security is the threat inference layer of the broader **GitGalaxy Ecosystem**—a high-velocity, AST-free, LLM-free heuristic knowledge graph engine designed for planetary-scale repositories. -This documentation is part of the [GitGalaxy Ecosystem](https://github.com/squid-protocol/gitgalaxy), an AST-free, LLM-free heuristic knowledge graph engine. +Explore the ecosystem: -* 🪐 **[Explore the GitHub Repository](https://github.com/squid-protocol/gitgalaxy)** for code, tools, and updates. -* 🔭 **[Visualize your own repository at GitGalaxy.io](https://gitgalaxy.io/)** using our interactive 3D WebGPU dashboard. \ No newline at end of file +* 🪐 **[Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** — Comprehensive deep dives into the engine's mathematics, pipeline architecture, and DevSecOps integration protocols. +* 🔭 **[GitGalaxy Visualizer](https://gitgalaxy.io/)** — Render your codebase's topological network locally in interactive 3D using hardware-accelerated WebGPU.
+* 📖 **[The blAST Paradigm](https://squid-protocol.github.io/gitgalaxy/docs/wiki/01-03-the-blast-paradigm/)** — The architectural thesis, academic research, and structural math that makes AST-free parsing possible at scale. +* ⚙️ **[Language Calibration Standards](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md)** — The definitive engineering guide to extending our comparative lexical taxonomy for custom enterprise dialects. \ No newline at end of file diff --git a/gitgalaxy/security/manifest_parser.py b/gitgalaxy/security/manifest_parser.py index ac499032..9d15fee4 100644 --- a/gitgalaxy/security/manifest_parser.py +++ b/gitgalaxy/security/manifest_parser.py @@ -21,18 +21,14 @@ class ManifestParser: def __init__(self, parent_logger=None): self.logger = ( - parent_logger.getChild("manifest_parser") - if parent_logger - else logging.getLogger("manifest_parser") + parent_logger.getChild("manifest_parser") if parent_logger else logging.getLogger("manifest_parser") ) # Matches standard Python packages, extracting the base name and dropping version constraints (==, >=, ~) self.python_pkg_regex = re.compile(r"^([a-zA-Z0-9_\-]+)(?:[=><~].*)?$") # Matches direct URI references (git, file, http) that bypass PyPI registry verification - self.python_direct_uri_regex = re.compile( - r"^(?:git\+|file:|https?:|hg\+|svn\+|bzr\+)(.*)$" - ) + self.python_direct_uri_regex = re.compile(r"^(?:git\+|file:|https?:|hg\+|svn\+|bzr\+)(.*)$") def build_resolution_map(self, manifest_paths: list) -> dict: """ @@ -58,9 +54,7 @@ def build_resolution_map(self, manifest_paths: list) -> dict: elif filename in ["pip.conf", ".pypirc", "pip.ini"]: self._parse_pip_conf(manifest_path, resolution_map) except Exception as e: - self.logger.warning( - f"Manifest Parser: Failed to parse structural definition {filename} - {e}" - ) + self.logger.warning(f"Manifest Parser: Failed to parse structural definition {filename} - {e}") return resolution_map @@ -85,9 
+79,7 @@ def _parse_package_json(self, filepath: Path, resolution_map: dict): if version_string.startswith("npm:"): raw_pkg = version_string[4:] if raw_pkg.startswith("@"): - real_pkg = ( - raw_pkg.rsplit("@", 1)[0] if "@" in raw_pkg[1:] else raw_pkg - ) + real_pkg = raw_pkg.rsplit("@", 1)[0] if "@" in raw_pkg[1:] else raw_pkg else: real_pkg = raw_pkg.split("@")[0] resolution_map[alias] = real_pkg @@ -96,9 +88,7 @@ def _parse_package_json(self, filepath: Path, resolution_map: dict): # These dependencies are not fetched from the registry and lack cryptographic hash guarantees. elif version_string.startswith(("file:", "github:", "git+", "http")): resolution_map[alias] = version_string - self.logger.warning( - f"Manifest Parser: Flagged Direct URI resolution for '{alias}' -> {version_string}" - ) + self.logger.warning(f"Manifest Parser: Flagged Direct URI resolution for '{alias}' -> {version_string}") def _parse_package_lock(self, filepath: Path, resolution_map: dict): """ @@ -117,11 +107,9 @@ def _parse_package_lock(self, filepath: Path, resolution_map: dict): resolved_url = info.get("resolved", "") # DEFENSIVE GUARD: Registry Spoofing - # If the resolved URL points to a non-standard domain or a direct Git link, map it + # If the resolved URL points to a non-standard domain or a direct Git link, map it # so the downstream firewall can flag it as an untrusted source. 
- if resolved_url and not resolved_url.startswith( - "https://registry.npmjs.org/" - ): + if resolved_url and not resolved_url.startswith("https://registry.npmjs.org/"): resolution_map[pkg_name] = resolved_url self.logger.info( f"Manifest Parser: Flagged non-standard registry resolution for '{pkg_name}' -> {resolved_url}" @@ -142,9 +130,7 @@ def _parse_requirements_txt(self, filepath: Path, resolution_map: dict): uri_match = self.python_direct_uri_regex.match(line) if uri_match: resolution_map[line] = line - self.logger.warning( - f"Manifest Parser: Flagged direct URI reference -> {line}" - ) + self.logger.warning(f"Manifest Parser: Flagged direct URI reference -> {line}") continue # 2. Standard package capture @@ -167,11 +153,7 @@ def _parse_pip_conf(self, filepath: Path, resolution_map: dict): continue # Look for custom registry routing definitions - if ( - "index-url" in line - or "extra-index-url" in line - or "repository" in line - ): + if "index-url" in line or "extra-index-url" in line or "repository" in line: parts = line.split("=") if len(parts) == 2: url = parts[1].strip() @@ -179,13 +161,7 @@ def _parse_pip_conf(self, filepath: Path, resolution_map: dict): # DEFENSIVE GUARD: Insecure Protocols & Tunneling # HTTP connections allow Man-in-the-Middle (MitM) package injection. # Tunneling services (ngrok) in production configs indicate severe architectural risk. 
- if ( - url.startswith("http://") - or "ngrok" in url - or "localtunnel" in url - ): - self.logger.warning( - f"🚨 Manifest Parser: INSECURE REGISTRY PROTOCOL DETECTED -> {url}" - ) + if url.startswith("http://") or "ngrok" in url or "localtunnel" in url: + self.logger.warning(f"🚨 Manifest Parser: INSECURE REGISTRY PROTOCOL DETECTED -> {url}") # Prefix with INSECURE_REGISTRY so the Supply Chain Firewall can instantly block it - resolution_map[f"INSECURE_REGISTRY_{filepath.name}"] = url \ No newline at end of file + resolution_map[f"INSECURE_REGISTRY_{filepath.name}"] = url diff --git a/gitgalaxy/security/security_auditor.py b/gitgalaxy/security/security_auditor.py index be59560c..2157c05d 100644 --- a/gitgalaxy/security/security_auditor.py +++ b/gitgalaxy/security/security_auditor.py @@ -29,9 +29,9 @@ class SecurityAuditor: """ Machine Learning Threat Inference Engine. - Calculates deep N-th degree dependency graphs to map the systemic blast radius - of every artifact. Passes the fused structural and topological context through - a multi-class XGBoost classifier to identify behavioral signatures of malware + Calculates deep N-th degree dependency graphs to map the systemic Dependency Impact + of every artifact. Passes the fused structural and topological context through + a multi-class XGBoost classifier to identify behavioral signatures of malware (e.g., Trojans, Stealers, Droppers) that evade traditional static analysis. 
""" @@ -45,14 +45,8 @@ class SecurityAuditor: } # Updated default to the new multiclass model - def __init__( - self, model_path="gitgalaxy_malware_xgb_multiclass.json", parent_logger=None - ): - self.logger = ( - parent_logger.getChild("ml_auditor") - if parent_logger - else logging.getLogger("ml_auditor") - ) + def __init__(self, model_path="gitgalaxy_malware_xgb_multiclass.json", parent_logger=None): + self.logger = parent_logger.getChild("ml_auditor") if parent_logger else logging.getLogger("ml_auditor") # Load the Universal Schemas to map the raw vectors back to names self.SIGNAL_SCHEMA = RECORDING_SCHEMAS.get("SIGNAL_SCHEMA", []) @@ -87,25 +81,19 @@ def __init__( ) self.model = None else: - self.logger.info( - f"🧠 XGBoost Threat Model loaded successfully from: {model_file.resolve()}" - ) + self.logger.info(f"🧠 XGBoost Threat Model loaded successfully from: {model_file.resolve()}") except Exception as e: - self.logger.error( - f"❌ Failed to load XGBoost model. File exists but threw an error: {e}" - ) + self.logger.error(f"❌ Failed to load XGBoost model. File exists but threw an error: {e}") else: self.logger.warning( f"⚠️ XGBoost model not found at {local_model} OR {util_model}. Running graph resolution only." ) else: - self.logger.warning( - "⚠️ Pandas or XGBoost not installed in this environment. Running graph resolution only." - ) + self.logger.warning("⚠️ Pandas or XGBoost not installed in this environment. Running graph resolution only.") def audit_repository(self, artifacts, is_shadow_patch=False): """ - Orchestrates the resolution of transitive dependency graphs and + Orchestrates the resolution of transitive dependency graphs and executes the XGBoost model against the generated feature matrix. """ if not artifacts: @@ -131,9 +119,7 @@ def audit_repository(self, artifacts, is_shadow_patch=False): df = self._construct_feature_matrix(artifacts) if df.empty: - self.logger.warning( - "Feature matrix is empty after extraction. Aborting inference." 
- ) + self.logger.warning("Feature matrix is empty after extraction. Aborting inference.") return artifacts # 2. DEFENSIVE GUARD: Schema Alignment @@ -181,35 +167,25 @@ def audit_repository(self, artifacts, is_shadow_patch=False): artifact["telemetry"]["domain_context"] = {} if is_threat: - threat_name = self.CLASS_NAMES.get( - predicted_class, "Unknown Threat" - ) + threat_name = self.CLASS_NAMES.get(predicted_class, "Unknown Threat") artifact["telemetry"]["domain_context"]["AI Threat Class"] = threat_name - artifact["telemetry"]["domain_context"]["AI Threat Confidence"] = ( - f"{ml_score}%" - ) + artifact["telemetry"]["domain_context"]["AI Threat Confidence"] = f"{ml_score}%" artifact["is_ml_threat"] = True threats_found += 1 - self.logger.warning( - f"🚨 AI THREAT DETECTED: {artifact.get('path')} ({threat_name} | {ml_score}%)" - ) + self.logger.warning(f"🚨 AI THREAT DETECTED: {artifact.get('path')} ({threat_name} | {ml_score}%)") else: artifact["is_ml_threat"] = False - self.logger.info( - f"XGBoost Inference Complete. Found {threats_found} potential threats." - ) + self.logger.info(f"XGBoost Inference Complete. Found {threats_found} potential threats.") except Exception as e: - self.logger.error( - f"❌ Fatal error during XGBoost Inference: {e}", exc_info=True - ) + self.logger.error(f"❌ Fatal error during XGBoost Inference: {e}", exc_info=True) return artifacts def _resolve_dependency_graph(self, artifacts): """ - Resolves transitive fragility and blast radius using C-optimized traversals (NetworkX) + Resolves transitive fragility and Downstream Exposure using C-optimized traversals (NetworkX) if available, falling back to a pure Python BFS deque if missing. 
""" resolution_map = {} @@ -349,18 +325,10 @@ def _construct_feature_matrix(self, artifacts): safe_denom = max(logic_loc, coding_loc, 1) functions = artifact.get("functions", []) - max_func_comp = max( - [func.get("branch", 0) for func in functions] if functions else [0] - ) - avg_func_args = sum([func.get("args", 0) for func in functions]) / max( - len(functions), 1 - ) + max_func_comp = max([func.get("branch", 0) for func in functions] if functions else [0]) + avg_func_args = sum([func.get("args", 0) for func in functions]) / max(len(functions), 1) - hit_dict = { - self.SIGNAL_SCHEMA[i]: hits[i] - for i in range(len(self.SIGNAL_SCHEMA)) - if i < len(hits) - } + hit_dict = {self.SIGNAL_SCHEMA[i]: hits[i] for i in range(len(self.SIGNAL_SCHEMA)) if i < len(hits)} # 2. Build the Row Dictionary row = { @@ -377,27 +345,13 @@ def _construct_feature_matrix(self, artifacts): "log_max_func_complexity": np.log1p(np.maximum(max_func_comp, 0)), "log_avg_func_args": np.log1p(np.maximum(avg_func_args, 0)), "func_complexity_gini": float(tel.get("func_complexity_gini", 0.0)), - "func_internal_density": float( - tel.get("func_internal_density", 0.0) - ), - "design_slop_orphans": float( - hit_dict.get("design_slop_orphans", 0) - ), - "design_slop_duplicates": float( - hit_dict.get("design_slop_duplicates", 0) - ), - "log_direct_upstream": np.log1p( - np.maximum(dep.get("direct_upstream", 0), 0) - ), - "log_direct_downstream": np.log1p( - np.maximum(dep.get("direct_downstream", 0), 0) - ), - "log_total_upstream": np.log1p( - np.maximum(dep.get("total_upstream", 0), 0) - ), - "log_total_downstream": np.log1p( - np.maximum(dep.get("total_downstream", 0), 0) - ), + "func_internal_density": float(tel.get("func_internal_density", 0.0)), + "orphaned_logic": float(hit_dict.get("orphaned_logic", 0)), + "duplicate_logic": float(hit_dict.get("duplicate_logic", 0)), + "log_direct_upstream": np.log1p(np.maximum(dep.get("direct_upstream", 0), 0)), + "log_direct_downstream": 
np.log1p(np.maximum(dep.get("direct_downstream", 0), 0)), + "log_total_upstream": np.log1p(np.maximum(dep.get("total_upstream", 0), 0)), + "log_total_downstream": np.log1p(np.maximum(dep.get("total_downstream", 0), 0)), } # 3. Reconstruct Density Signatures @@ -405,17 +359,15 @@ def _construct_feature_matrix(self, artifacts): col_name = f"hit_{key}" if col_name not in exclusion_list: raw_density = (val / safe_denom) * 100.0 - row[f"log_density_{col_name}"] = np.log1p( - np.maximum(raw_density, 0) - ) + row[f"log_density_{col_name}"] = np.log1p(np.maximum(raw_density, 0)) # 4. Contextual/Mitigation Columns contextual = [ ( "raw_danger", - hit_dict.get("danger", 0) + hit_dict.get("sec_danger", 0), + hit_dict.get("high_risk_execution", 0) + hit_dict.get("sec_high_risk_execution", 0), ), - ("raw_sec_private_info", hit_dict.get("sec_private_info", 0)), + ("raw_sec_private_info", hit_dict.get("sec_hardcoded_secrets", 0)), ( "raw_sec_tainted_injection", hit_dict.get("sec_tainted_injection", 0), @@ -423,14 +375,12 @@ def _construct_feature_matrix(self, artifacts): ] for col_name, val in contextual: raw_density = (val / safe_denom) * 100.0 - row[f"log_density_{col_name}"] = np.log1p( - np.maximum(raw_density, 0) - ) + row[f"log_density_{col_name}"] = np.log1p(np.maximum(raw_density, 0)) # Bind to the new Ecosystem Baseline variables established in the Statistical Auditor row["assigned_macro_species"] = tel.get("ecosystem_baseline_cluster", 0) row["primary_z_score"] = float(tel.get("ecosystem_z_score", 0.0)) - + for i in range(11): row[f"dist_to_{i}"] = float(tel.get(f"dist_to_{i}", 0.0)) diff --git a/gitgalaxy/security/security_lens.py b/gitgalaxy/security/security_lens.py index 77667061..19325d6f 100644 --- a/gitgalaxy/security/security_lens.py +++ b/gitgalaxy/security/security_lens.py @@ -16,9 +16,9 @@ class SecurityLens: """ Static Application Security Testing (SAST) Engine. 
- - Identifies raw structural vulnerabilities (Regex Signatures, Shannon Entropy, - and Data Flow Taint) and evaluates them against dynamically injected Policy Thresholds + + Identifies raw structural vulnerabilities (Regex Signatures, Shannon Entropy, + and Data Flow Taint) and evaluates them against dynamically injected Policy Thresholds augmented by Network Centrality metrics. """ @@ -35,7 +35,7 @@ def __init__(self, policy=None): } # DEFENSIVE GUARD: ReDoS Prevention - # Extracts string literals for entropy scanning. Bounded to 64-1024 chars + # Extracts string literals for entropy scanning. Bounded to 64-1024 chars # using a non-greedy matcher to prevent catastrophic backtracking on minified files. self.string_extractor = re.compile(r'(["\'])([^\n]{64,1024}?)\1') @@ -59,11 +59,11 @@ def __init__(self, policy=None): } self.THREAT_HEADERS = [ - b"\x7fELF", # Linux Executable - b"MZ", # Windows Executable - b"#!/bin/", # Shell Script - b"\x00asm", # WebAssembly - b"\xcf\xfa\xed\xfe", # macOS Mach-O + b"\x7fELF", # Linux Executable + b"MZ", # Windows Executable + b"#!/bin/", # Shell Script + b"\x00asm", # WebAssembly + b"\xcf\xfa\xed\xfe", # macOS Mach-O ] # ------------------------------------------------------------------ @@ -71,7 +71,7 @@ def __init__(self, policy=None): # ------------------------------------------------------------------ self.THREAT_SIGNATURES = { # 1. Obfuscation & Encoding Signatures - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(?:atob|btoa|base64_decode|base64_encode|gzuncompress|str_rot13)\b|" r"\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|" r'(?:\w{15,}[ \t]*=[ \t]*["\'][A-Za-z0-9+/]{40,}={0,2}["\'])|' @@ -79,7 +79,7 @@ def __init__(self, policy=None): re.I, ), # 2. 
Security Control & Safety Bypasses (e.g., Disabling SSL Verification) - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(?:atob|btoa|base64_decode|gzinflate|gzuncompress|str_rot13|urldecode)[ \t]*\([ \t]*(?:atob|btoa|base64_decode|gzinflate|gzuncompress|str_rot13|urldecode)\b|" r"\b(?:auto_prepend_file|auto_append_file)\b|" r'\b(?:ini_set|ini_restore|putenv)[ \t]*\([ \t]*["\'](?:disable_functions|safe_mode|open_basedir|allow_url_fopen)["\'][ \t]*,[ \t]*["\']?(?:0|off|false|)["\']?\)|' @@ -97,7 +97,7 @@ def __init__(self, policy=None): re.I | re.X, ), # 4. Dynamic Code Execution (RCE Vectors) - "danger": re.compile( + "high_risk_execution": re.compile( r"\b(?:BPXBATCH|IKJEFT01|IRXJCL)\b|" r"\bEXEC\s+CICS\s+(?:START|LINK\s+PROGRAM|XCTL)\b\s*\(\s*[A-Za-z_-]+\s*\)|" r'\b(?:eval|Function|setTimeout|setInterval)\b\s*\(\s*(?:atob|base64_decode|gzinflate|\$|_[a-zA-Z]|["\']|`)|' @@ -110,7 +110,7 @@ def __init__(self, policy=None): re.I | re.X, ), # 5. Prototype Pollution & Global State Flux - "flux": re.compile( + "state_mutation": re.compile( r"\b[A-Za-z0-9_]+\.prototype\.[A-Za-z0-9_]+[ \t]*=|" r"\.__proto__[ \t]*=[ \t]*[{a-zA-Z]|" r"\b(?:window|global|globalThis|document)\.(?:fetch|eval|setTimeout|setInterval|Promise|console|JSON)[ \t]*=|" @@ -120,14 +120,14 @@ def __init__(self, policy=None): r"\bsys\.modules\[[^\]]+\][ \t]*=", re.I, ), - # 6. Commented-out Executable Logic (Shadow Logic) - "graveyard": re.compile( + # 6. Commented-out Executable Logic (Deprecated Trails) + "dead_code": re.compile( r"(?://|#|--|\*>|^.{6}\*)[^\n]*?\b(?:http|bash|curl|wget|eval|base64|nc\s+-e|/dev/tcp|BPXBATCH)\b|" r"/\*(?:(?!\*/).){0,500}?\b(?:http|bash|curl|wget|eval|base64|nc\s+-e|/dev/tcp)\b", re.I, ), # 7. 
Low-Level Cryptographic & Bitwise Operations - "bitwise_hits": re.compile( + "bitwise_ops": re.compile( r"\b\w+[ \t]*=[ \t]*(?:\w+[ \t]*\^[ \t]*\w+[ \t]*){2,20}|" r"(?:\w+\[[^\]\n]{1,50}\][ \t]*\^[ \t]*=?[ \t]*(?:0x[0-9a-fA-F]+|\d+|\w+)[ \t]*;[ \t]*){3,20}", re.I, @@ -148,7 +148,7 @@ def __init__(self, policy=None): re.I, ), # 10. Hardcoded Secrets & Credentials - "private_info": re.compile( + "hardcoded_secrets": re.compile( r"\b(password|secret|token|api[_-]?key|client[_-]?secret|credentials|private[_-]?key|auth[_-]?token)\b[ \t]*(?:[:=]|=>)[ \t]*[\"'][A-Za-z0-9\-_+/=]{16,}[\"']|" r"\b(PASSWORD|SECRET|TOKEN|KEY|CREDENTIALS)[A-Za-z0-9_-]*\b[ \t]+(?:IS[ \t]+)?(?:PIC[ \t]+[A-Za-z0-9\-\(\)]+[ \t]+)?VALUE[ \t]+['\"][^'\"]+['\"]|" r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP )?(?:PRIVATE KEY|MESSAGE|CERTIFICATE)-----|" @@ -231,7 +231,7 @@ def scan_content(self, content: str, loc: int) -> dict: # Map the exact line indexes of critical threats for the Taint Tracker if not is_auto_gen and key in { "io", - "danger", + "high_risk_execution", "llm_hooks", "db_hooks", }: @@ -266,7 +266,7 @@ def scan_content(self, content: str, loc: int) -> dict: taint_snippets = [] has_global_io = counts.get("io", 0) > 0 - has_global_danger = counts.get("danger", 0) > 0 + has_global_danger = counts.get("high_risk_execution", 0) > 0 has_global_llm = counts.get("llm_hooks", 0) > 0 has_global_db = counts.get("db_hooks", 0) > 0 @@ -301,7 +301,7 @@ def scan_content(self, content: str, loc: int) -> dict: line = safe_lines[line_idx] has_io = "io" in threats - has_danger = "danger" in threats + has_danger = "high_risk_execution" in threats has_llm = "llm_hooks" in threats has_db = "db_hooks" in threats @@ -334,38 +334,24 @@ def scan_content(self, content: str, loc: int) -> dict: # Scenario C: Downward Flow Scan (Check Execution Sink) # Because execution requires a sink, the sink line MUST be in threat_lines! 
- if (has_danger or has_db or has_llm) and ( - tainted_vars or llm_tainted_vars - ): + if (has_danger or has_db or has_llm) and (tainted_vars or llm_tainted_vars): for t_var in tainted_vars: # O(1) string check before running full regex - if t_var in line and re.search( - rf"\b{re.escape(t_var)}\b", line - ): + if t_var in line and re.search(rf"\b{re.escape(t_var)}\b", line): if has_danger or has_db: taint_hits += 1 if len(taint_snippets) < 3: - taint_snippets.append( - f"[Taint -> Exec/DB]: {line[:60]}..." - ) + taint_snippets.append(f"[Taint -> Exec/DB]: {line[:60]}...") if has_llm: prompt_injection_hits += 1 if len(taint_snippets) < 3: - taint_snippets.append( - f"[Taint -> LLM]: {line[:60]}..." - ) + taint_snippets.append(f"[Taint -> LLM]: {line[:60]}...") for l_var in llm_tainted_vars: - if ( - l_var in line - and re.search(rf"\b{re.escape(l_var)}\b", line) - and has_danger - ): + if l_var in line and re.search(rf"\b{re.escape(l_var)}\b", line) and has_danger: agentic_rce_hits += 1 if len(taint_snippets) < 3: - taint_snippets.append( - f"[LLM State -> RCE]: {line[:60]}..." - ) + taint_snippets.append(f"[LLM State -> RCE]: {line[:60]}...") counts["tainted_injection"] = taint_hits counts["prompt_injection"] = prompt_injection_hits @@ -377,7 +363,7 @@ def scan_content(self, content: str, loc: int) -> dict: def evaluate_risk(self, aggregated_hits, total_loc, network_metrics=None): """ Evaluates vulnerability risk with Network Centrality awareness. - Highly central files (e.g., God Nodes with massive blast radiuses) have a + Highly central files (e.g., God Nodes with massive Downstream Exposures) have a drastically lower tolerance for embedded threats, scaling their density multipliers. """ loc_safe = total_loc if total_loc > 0 else 1 @@ -395,8 +381,8 @@ def evaluate_risk(self, aggregated_hits, total_loc, network_metrics=None): # 1. 
Hidden Malware Risk malware_hits = ( - aggregated_hits.get("heat_triggers", 0) - + aggregated_hits.get("bitwise_hits", 0) + aggregated_hits.get("reflection_metaprogramming", 0) + + aggregated_hits.get("bitwise_ops", 0) + aggregated_hits.get("shadow_imports", 0) + aggregated_hits.get("homoglyphs", 0) + aggregated_hits.get("entropy", 0) @@ -406,18 +392,14 @@ def evaluate_risk(self, aggregated_hits, total_loc, network_metrics=None): exposures["Hidden Malware Risk"] = malware_density # 2. Logic Bomb / Sabotage Risk - sabotage_hits = aggregated_hits.get("graveyard", 0) + ( - aggregated_hits.get("danger", 0) * 1.5 - ) + sabotage_hits = aggregated_hits.get("dead_code", 0) + (aggregated_hits.get("high_risk_execution", 0) * 1.5) sabotage_density = (sabotage_hits / loc_safe) * network_multiplier if sabotage_density >= self.policy["logic_bomb_threshold"]: exposures["Logic Bomb Risk"] = sabotage_density # 3. Data Injection Risk injection_hits = ( - aggregated_hits.get("io", 0) - + aggregated_hits.get("danger", 0) - + aggregated_hits.get("flux", 0) + aggregated_hits.get("io", 0) + aggregated_hits.get("high_risk_execution", 0) + aggregated_hits.get("state_mutation", 0) ) injection_density = (injection_hits / loc_safe) * network_multiplier if injection_density >= self.policy["injection_surface_threshold"]: @@ -430,7 +412,7 @@ def evaluate_risk(self, aggregated_hits, total_loc, network_metrics=None): exposures["Memory Corruption Risk"] = memory_density # 5. 
Secrets Risk - secrets_hits = aggregated_hits.get("private_info", 0) + secrets_hits = aggregated_hits.get("hardcoded_secrets", 0) secrets_density = (secrets_hits / loc_safe) * network_multiplier if secrets_density >= self.policy["secrets_risk_threshold"]: exposures["Secrets Leak Risk"] = secrets_density @@ -439,18 +421,16 @@ def evaluate_risk(self, aggregated_hits, total_loc, network_metrics=None): prompt_inj = aggregated_hits.get("prompt_injection", 0) agentic_rce = aggregated_hits.get("agentic_rce", 0) if agentic_rce > 0: - exposures["Agentic RCE Risk (Critical)"] = 100.0 + exposures["Autonomous Execution Vector (Critical)"] = 100.0 elif prompt_inj > 0: - exposures["Prompt Injection Risk"] = min( - (prompt_inj / loc_safe) * network_multiplier * 100.0, 100.0 - ) + exposures["Prompt Injection Surface Risk"] = min((prompt_inj / loc_safe) * network_multiplier * 100.0, 100.0) return exposures def scan_binary(self, raw_bytes: bytes, ext: str) -> dict: """ Binary Magic Byte & Entropy Analyzer. - Validates compiled chunks against expected magic bytes and scans for + Validates compiled chunks against expected magic bytes and scans for embedded execution headers or extreme cryptographic entropy indicating packed malware. 
""" threats = {} @@ -466,7 +446,7 @@ def scan_binary(self, raw_bytes: bytes, ext: str) -> dict: for header in self.THREAT_HEADERS: if header in raw_bytes: - threats["sec_danger"] = 1 + threats["sec_high_risk_execution"] = 1 threats["threat_snippet"] = f"Embedded execution header found: {header}" break @@ -480,11 +460,9 @@ def scan_binary(self, raw_bytes: bytes, ext: str) -> dict: entropy -= probability * math.log2(probability) if entropy > 7.95: - threats["sec_heat_triggers"] = 1 - threats["threat_snippet"] = ( - f"Extreme binary entropy detected: {entropy:.2f}" - ) + threats["sec_reflection_metaprogramming"] = 1 + threats["threat_snippet"] = f"Extreme binary entropy detected: {entropy:.2f}" except Exception: pass - return threats \ No newline at end of file + return threats diff --git a/gitgalaxy/standards/README.md b/gitgalaxy/standards/README.md index 4d5311c4..c088d7cb 100644 --- a/gitgalaxy/standards/README.md +++ b/gitgalaxy/standards/README.md @@ -1,55 +1,75 @@ -# GitGalaxy: Core Heuristics & Standards Registry +# GitGalaxy Standards: Heuristics Registry & Calibration Layer -[![Core](https://img.shields.io/badge/Core-Heuristics_Engine-00BFFF.svg)](#) -[![Coverage](https://img.shields.io/badge/Coverage-50%2B_Languages-00C957.svg)](#) -[![Architecture](https://img.shields.io/badge/Architecture-AST--Free_Regex-8A2BE2.svg)](#) +[![Architecture](https://img.shields.io/badge/Architecture-AST--Free_Heuristics-8A2BE2.svg)](#) +[![Security](https://img.shields.io/badge/Security-Zero--Trust_Baselines-FF4500.svg)](#) +[![Performance](https://img.shields.io/badge/Performance-ReDoS_Immune-00BFFF.svg)](#) -Welcome to the configuration and tuning layer of the **blAST Engine**. +Welcome to **GitGalaxy Standards**. This directory contains the immutable mathematical constants, structural regex dictionaries, security thresholds, and ingestion constraints that govern the entire GitGalaxy engine. 
-This directory contains the immutable mathematical constants, structural regex dictionaries, and security thresholds that dictate how GitGalaxy maps a codebase. No active execution or file reading happens here; these files serve as the universal rulesets and configurations consumed by the central `signal_processor.py`. +No active execution, file I/O, or graph resolution occurs in this directory. Instead, this acts as the **Central Calibration Layer**. It defines the universal rulesets consumed by the Orchestrator, the Prism, the Signal Processor, and the Security Lens to guarantee deterministic analysis across polyglot ecosystems. -If you need to teach GitGalaxy a new language, tune a risk exposure curve, or update the AI AppSec sensors, you do it here. +## Architectural Philosophy & Defensive Engineering -> **💡 Note:** Extending the blAST engine to support a new language does not require writing a brittle AST parser. You simply need to calibrate the thermodynamic physics of the target language using our strict LLM Master Prompt to generate ReDoS-proof, mathematically bounded regular expressions. Review the integration protocol here: **[Architecting a New Language](how_to_add_a_language.md)**. +Engineers accustomed to traditional AST (Abstract Syntax Tree) parsers often view regular expressions with skepticism, assuming they are too brittle or prone to Catastrophic Backtracking (ReDoS) to parse enterprise code. -### 1. The Boundary Shield (`gitgalaxy_config.py`) -Defines global ingestion rules and zero-trust boundaries before static analysis processing begins. +GitGalaxy explicitly bypasses ASTs to **visualize functional intent rather than rigid syntax**, allowing it to map severely fragmented, legacy, or un-compilable code. 
To achieve processing speeds exceeding 100,000 LOC/sec without stalling the Python interpreter, the dictionaries in this directory are engineered with extreme defensive boundaries: -* **Zero-Trust Import Control:** Defines banned supply chain dependencies. -* **Directory Exclusion Rules:** Defines architectural black holes and massive build folders to skip. -* **Hardcoded Secrets Traps:** Instantly traps cryptographic keys and cloud tokens. +### 1. ReDoS Immunity & Strict Bounding +The regex dictionaries defined in `language_standards.py` strictly prohibit unbounded quantifiers (like `.*` or `\s+`) in high-risk zones. To safely leap across multi-line function declarations and modern attribute stacking (e.g., C++23 `[[attributes]]` or Java `@Annotations`), the engine utilizes strict boundary limits. It enforces rigid numeric clamps (e.g., `{0,5}`) and mutually exclusive character sets, guaranteeing O(1) or linear O(N) evaluation time per match. -### 2. Language & Identity Heuristics (`language_lens.py`) -The identification engine responsible for converting raw text into high-fidelity ecosystem locks. +### 2. Bayesian Confidence Hierarchy +Inferring a file's language purely by its extension leads to catastrophic collisions (e.g., `.m` being Objective-C, MATLAB, or Mathematica). `language_lens.py` resolves these collisions natively without AST evaluation. It builds a Bayesian confidence score by cross-referencing sibling files, structural neighborhood context, and package manifests, only falling back to an expensive lexical scan if the file's identity confidence drops below a strict ambiguity threshold. -* **Collision Resolution:** Mathematically resolves ambiguous file extensions. -* **Contextual Ecosystem Resolution:** Uses neighborhood files (like `package.json` or `pom.xml`) to prove identity. -* **Entropy & Anomaly Detection:** Identifies unknown files or obfuscated malware via spectral density and Shannon entropy. +### 3.
Contextual Threat Calibration (Architectural Anomaly Detection) +A vulnerability’s severity is dictated by its environment. Standard OS shell execution is expected in a bash script but highly anomalous in a React frontend component. `analysis_lens.py` defines an **Ecosystem Mismatch Matrix** that dynamically multiplies threat scores when an asset exhibits behaviors hostile to its native architecture (e.g., detecting C-style memory pointers inside a Node.js web layer), instantly flagging it as a high-risk anomaly or potential backdoor. -### 3. The Structural Syntax Dictionary (`language_standards.py`) -The massive heuristic dictionary mapping the syntax of 50+ languages to standard GitGalaxy architectural dimensions. +### 4. Structural Impact Score Normalization +AST parsers typically collapse when analyzing massive machine-generated files (e.g., Swagger JSONs, Webpack chunks, Protobuf definitions). `analysis_lens.py` deploys pre-calculated modifiers to programmatically reduce the calculated Structural Impact Score of generated code. This ensures human-written architecture remains the focal point of the analysis without risking Out-Of-Memory (OOM) exceptions or skewing repository metrics. -* **Complexity Mapping:** Translates text to branch logic, state mutation, and cognitive load. -* **Behavioral Sensors:** Universal regex for mapping AI boundaries, Authentication routing, and IPC calls. -* **Technical Debt Tracking:** Detects formatting discrepancies and mixed architectural paradigms. +--- + +## Data-Driven Configuration: Decoupled Architecture & False-Positive Eradication + +The true flexibility of GitGalaxy lies in its decoupled architecture. The core execution logic is entirely separated from the mathematical constants that govern it. This directory exposes over **175 discrete tuning variables**—from sigmoid curve slopes and anomaly thresholds to architectural mass dampeners. 
+ +* **Risk Equation Tuning (75+ Variables):** Highly specific parameters (sigmoid slopes, offsets, clamps, and threshold floors) for multiple risk dimensions. The math curves for `cognitive_load`, `concurrency`, or `logic_bomb` can be stretched or compressed independently. +* **Path & Impact Modifiers (35+ Variables):** Regex-targeted multipliers that artificially increase or decrease Structural Impact Scores based on domain context (e.g., dampening UI framework cognitive load by 0.50x, or multiplying global state manipulation by 1.15x). +* **Security & Ecosystem Matrices (50+ Variables):** Specific baseline weights for systems vs. web vs. infrastructure, cross-ecosystem mismatch penalties (e.g., `systems_in_web`), and the 10-cluster Archetype Violation Matrix. +* **Hardware & Ingestion Limits (15+ Variables):** Hard operational ceilings like `MAX_FILE_SIZE_MB`, `MAX_LINE_LENGTH`, `HANDSHAKE_LOOKAHEAD_LIMIT`, and `NESTED_PEEL_LIMIT` to tune exactly how hard the engine pushes the CPU before backing off. + +While 175+ variables might sound intimidating, they are your primary weapon against alert fatigue. Every single dial can be surgically tuned to eradicate false positives for your specific environment—giving you a scanner that highlights genuine architectural threats without wasting engineering time on expected structural noise. + +Want to scan an untrusted package? Flip the engine to `paranoid` mode to instantly tighten the `ThreatPolicy` thresholds. Need to support a proprietary, in-house legacy language? Inject a new structural dictionary into `language_standards.py`. The engine dynamically adopts these new constraints at runtime without requiring a single modification to the core parsing algorithms. -### 4. The Scoring & Risk Engine (`analysis_lens.py`) -The mathematical core defining how raw structural signals are converted into 0-100% risk exposures. 
+--- + +## Core Configurations (Module Breakdown) + +Each file in this directory serves a distinct calibration purpose for the downstream engines: -* **Risk Normalization Curves:** Mathematical sigmoid clamps for tuning risk exposures. -* **Contextual Dampeners:** Modifiers that reduce risk weight for test files and documentation. -* **Architectural Anomaly Detection:** Penalizes code acting alien to its ecosystem. -* **Machine Learning Inference:** Houses K-means clustering models for archetype classification. +* **`gitgalaxy_config.py` (Global Ingestion Firewall):** Defines Zero-Trust ingestion boundaries. It houses the Supply Chain Firewall configurations (approved vs. blacklisted imports), global file denylists, X-Ray binary scanner bypasses, and physical file-size clamps. +* **`language_lens.py` (Identity Classifier):** The Bayesian engine that assigns definitive "Identity Locks" to files. It defines the multi-tiered confidence hierarchy, resolving extension collisions by weighing exact filename matches against Contextual Baselines. +* **`language_standards.py` (Structural Signature Registry):** The massive, highly optimized structural mapping registry for 50+ languages. It defines exactly how to slice a language into Branch Logic, State Flux, High-Risk Execution, and Object Declarations using ReDoS-proof regular expressions. +* **`analysis_lens.py` (Mathematical Constants & Threat Policies):** The repository of Threat Policies, Sigmoid Curve tuning, and K-Means clustering medians. It dictates how raw structural signals are mathematically converted into normalized 0-100% risk exposure vectors. +* **`how_to_add_a_language.md` (Extension Protocol):** Contains the strict prompt engineering protocols required to generate ReDoS-proof language dictionaries using advanced LLMs, bypassing the need for manual parser development. --- -

+## Extending the Engine (AST-Free Onboarding) + +Because GitGalaxy is AST-free, adding support for a new language does not require writing a complex, brittle parser. You simply need to calibrate the structural heuristics of the target language by generating a new dictionary entry for `language_standards.py`. + +For strict guidelines and the Structural Signature Generation Prompt required to generate ReDoS-proof structural definitions, review: **[Adding a New Language](how_to_add_a_language.md)**. --- -### 🌌 Powered by the blAST Engine +## 🌌 The GitGalaxy Ecosystem (Powered by the blAST Engine) + +GitGalaxy Standards is the calibration layer of the broader **GitGalaxy Ecosystem**—a high-velocity, AST-free, LLM-free heuristic knowledge graph engine designed for planetary-scale repositories. -This documentation is part of the [GitGalaxy Ecosystem](https://github.com/squid-protocol/gitgalaxy), an AST-free, LLM-free heuristic knowledge graph engine. +Explore the ecosystem: -* 🪐 **[Explore the GitHub Repository](https://github.com/squid-protocol/gitgalaxy)** for code, tools, and updates. -* 🔭 **[Visualize your own repository at GitGalaxy.io](https://gitgalaxy.io/)** using our interactive 3D WebGPU dashboard. \ No newline at end of file +* 🪐 **[Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** — Comprehensive deep dives into the engine's mathematics, pipeline architecture, and DevSecOps integration protocols. +* 🔭 **[GitGalaxy Visualizer](https://gitgalaxy.io/)** — Render your codebase's topological network locally in interactive 3D using hardware-accelerated WebGPU. +* 📖 **[The blAST Paradigm](https://squid-protocol.github.io/gitgalaxy/docs/wiki/01-03-the-blast-paradigm/)** — The architectural thesis, academic research, and structural math that makes AST-free parsing possible at scale.
+* ⚙️ **[Language Calibration Standards](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md)** — The definitive engineering guide to extending our comparative lexical taxonomy for custom enterprise dialects. \ No newline at end of file diff --git a/gitgalaxy/standards/analysis_lens.py b/gitgalaxy/standards/analysis_lens.py index c755fbb5..1d1ba662 100644 --- a/gitgalaxy/standards/analysis_lens.py +++ b/gitgalaxy/standards/analysis_lens.py @@ -11,11 +11,11 @@ """ analysis_lens.py -Phase 4+: The Laws of Physics, Threat Policies, and Mathematical Constants. +Phase 4+: Structural Policies, Threat Policies, and Mathematical Constants. This file contains the immutable mathematical constants, security thresholds, and spatial modifiers used by the Signal Processor to calculate risk exposures -and physical mass. +and structural magnitude. """ # ------------------------------------------------------------------------------ @@ -57,8 +57,8 @@ def get_policy(mode="baseline"): # ------------------------------------------------------------------------------ -# 2. CORE PHYSICS CONSTANTS -# Consumed by: signal_processor.py +# 2. CORE STRUCTURAL CONSTANTS +# Consumed by: signal_processor.py # ------------------------------------------------------------------------------ ENGINE_CONSTANTS = { "WEIGHT_RISK": 2.5, @@ -133,9 +133,9 @@ def get_policy(mode="baseline"): }, # ============================================================================== # DEFENSIVE DESIGN: RECURSIVE TESTING TRAP PREVENTION - # SAST tools frequently generate false-positive "Lack of Coverage" alerts on - # configuration files or the test artifacts themselves. This explicit exclusion - # mask ensures the engine never demands unit tests for inert data, generated + # SAST tools frequently generate false-positive "Lack of Coverage" alerts on + # configuration files or the test artifacts themselves.
This explicit exclusion + # mask ensures the engine never demands unit tests for inert data, generated # scaffolding, or test snapshots, drastically reducing alert fatigue. # ============================================================================== "UNTESTABLE_EXTENSIONS": { @@ -244,8 +244,8 @@ def get_policy(mode="baseline"): }, } # ------------------------------------------------------------------------------ -# 4. ENVIRONMENTAL PHYSICS (Path Modifiers) -# Consumed by: signal_processor.py +# 4. ENVIRONMENTAL MODIFIERS (Path Modifiers) +# Consumed by: signal_processor.py # ------------------------------------------------------------------------------ PATH_MODIFIERS = { "Cognitive Load Exposure": [ @@ -253,9 +253,7 @@ def get_policy(mode="baseline"): # Translation files, constants, and enums have massive line counts but almost # zero logical complexity. Dampen them heavily so they don't look like giant risks. ( - re.compile( - r"(?:^|/)(?:i18n|locales?|translations?|constants?|enums?)/", re.I - ), + re.compile(r"(?:^|/)(?:i18n|locales?|translations?|constants?|enums?)/", re.I), 0.80, ), # 2. The Abstraction (Declarations & Headers) @@ -299,9 +297,7 @@ def get_policy(mode="baseline"): # Highly secure zones dedicated to authentication, authorization, and cryptography. # Massive reduction in risk exposure because this is explicit defensive mass. ( - re.compile( - r"(?:^|/)(?:auth|security|policies|permissions|roles|crypto)/", re.I - ), + re.compile(r"(?:^|/)(?:auth|security|policies|permissions|roles|crypto)/", re.I), 0.80, ), # 2. The Contract (Strong Typing, Models & Schemas) @@ -327,7 +323,7 @@ def get_policy(mode="baseline"): ), # 4. Unchecked Execution Zones (Raw & Unchecked Data) # Areas where memory management or input sanitization is deliberately turned off. - # Added 'danger' (e.g., dangerouslySetInnerHTML) and 'unverified'. + # Added 'danger' (e.g., dangerouslySetInnerHTML) and 'unverified'.
(re.compile(r"(?:^|/)(?:unsafe|raw|danger|escape|unverified)/", re.I), 1.25), # 5. The Override (Explicit Safety Bypasses) # Distinct from Tech Debt. These are files specifically named to bypass @@ -384,9 +380,7 @@ def get_policy(mode="baseline"): # 1. The Blueprint (Standard Directories) # Expanded to catch singular /doc/, /tutorials/, /guides/, and /wiki/ ( - re.compile( - r"(?:^|/)(?:docs?|examples?|tutorials?|guides?|wiki|man)/", re.I - ), + re.compile(r"(?:^|/)(?:docs?|examples?|tutorials?|guides?|wiki|man)/", re.I), 0.0, ), # 2. The Glossary (Core Repository Literature) @@ -404,9 +398,7 @@ def get_policy(mode="baseline"): # 4. The Interactive Spec (API Docs & Notebooks) # Catches Swagger/OpenAPI schemas and Jupyter Notebooks (executable examples) ( - re.compile( - r"(?:^|/)(?:swagger|openapi)\.(?:json|yaml|yml)$|\.ipynb$", re.I - ), + re.compile(r"(?:^|/)(?:swagger|openapi)\.(?:json|yaml|yml)$|\.ipynb$", re.I), 0.90, ), # 5. The Story (UI Component Documentation) @@ -429,9 +421,7 @@ def get_policy(mode="baseline"): # Only drops to 0 if the file inside /spec/ belongs to a spec-heavy language, # OR if it explicitly has '.spec' or '_spec' in the filename. ( - re.compile( - r"(?:^|/)specs?/.*\.(?:rb|js|jsx|ts|tsx|dart)$|\.spec\b|_spec\b", re.I - ), + re.compile(r"(?:^|/)specs?/.*\.(?:rb|js|jsx|ts|tsx|dart)$|\.spec\b|_spec\b", re.I), 0.0, ), # 3. The Perl Sieve @@ -465,9 +455,7 @@ def get_policy(mode="baseline"): # Boilerplates, stubs, and generators intentionally contain commented-out # "example" code. Dampen this heavily so the engine doesn't penalize it. ( - re.compile( - r"(?:^|/)(?:templates?|stubs?|scaffolds?|fixtures?|examples?)/", re.I - ), + re.compile(r"(?:^|/)(?:templates?|stubs?|scaffolds?|fixtures?|examples?)/", re.I), 0.75, ), # 2. The Lab (Experimental Safe Zones) @@ -484,9 +472,7 @@ def get_policy(mode="baseline"): # Expanded to include /src/, /lib/, and /services/. 
Leaving dead code in # the main execution arteries creates structural friction and doubt. ( - re.compile( - r"(?:^|/)(?:core|kernel|main|src|lib|services|providers)/", re.I - ), + re.compile(r"(?:^|/)(?:core|kernel|main|src|lib|services|providers)/", re.I), 1.15, ), # 4. Application Entrypoints (Application Entrypoints) @@ -524,9 +510,7 @@ def get_policy(mode="baseline"): # interconnected data graphs to the client. A mistake here can cause # catastrophic N+1 database queries or data leaks. ( - re.compile( - r"(?:^|/)(?:graphql|resolvers?|mutations?|queries|rpc|grpc|trpc)/", re.I - ), + re.compile(r"(?:^|/)(?:graphql|resolvers?|mutations?|queries|rpc|grpc|trpc)/", re.I), 1.20, ), # 4. The Distribution Contract (SDKs & Public Exports) @@ -600,9 +584,7 @@ def get_policy(mode="baseline"): # constants, or configs should be completely static after boot. If the # engine detects state mutations here, the application is poisoning its own roots. ( - re.compile( - r"(?:^|/)(?:configs?|envs?|globals?|constants?|settings?)/", re.I - ), + re.compile(r"(?:^|/)(?:configs?|envs?|globals?|constants?|settings?)/", re.I), 1.25, ), # The Migration Exemption (Database State Changes) @@ -621,19 +603,17 @@ def get_policy(mode="baseline"): "Structural Mass": [ # ============================================================================== # DEFENSIVE DESIGN: PARSER SATURATION & AST BLOAT PREVENTION - # Massive auto-generated files (e.g., Protobufs, Swagger, Webpack chunks) - # will mathematically crush standard AST parsers and inflate a repository's - # structural mass. These targeted dampeners artificially reduce the gravitational - # weight of generated code, ensuring human-written architecture remains the + # Massive auto-generated files (e.g., Protobufs, Swagger, Webpack chunks) + # will mathematically crush standard AST parsers and inflate a repository's + # structural mass. 
These targeted dampeners artificially reduce the gravitational + # weight of generated code, ensuring human-written architecture remains the # focal point of the analysis without risking OOM (Out of Memory) crashes. # ============================================================================== # The Cryptographic & Test Vector Dampener # Auto-generated data arrays explode parser argument math. Extreme reduction # prevents these static payloads from registering as massive logic hubs. ( - re.compile( - r"(?:^|/)(?:wycheproof_tests|test_vectors|testdata|tests/data)/", re.I - ), + re.compile(r"(?:^|/)(?:wycheproof_tests|test_vectors|testdata|tests/data)/", re.I), 0.001, ), # The Code-Generation Dampener @@ -658,9 +638,7 @@ def get_policy(mode="baseline"): # Neutralizes standard third-party ecosystem folders (vendor, node_modules) # to prevent external dependencies from eclipsing the core repository. ( - re.compile( - r"(?:^|/)(?:resources/lib|vendor|node_modules|third_party)/", re.I - ), + re.compile(r"(?:^|/)(?:resources/lib|vendor|node_modules|third_party)/", re.I), 0.02, ), # The Global Frontend Vendor Dampener @@ -687,18 +665,14 @@ def get_policy(mode="baseline"): # Dampens UI components that are purely exported SVG path data (e.g., Icon.jsx). # Prevents raw vector math from artificially inflating UI framework density. ( - re.compile( - r"(?:^|/)(?:icons?|illustrations?|logos?|assets?)/.*\.jsx?|tsx?$", re.I - ), + re.compile(r"(?:^|/)(?:icons?|illustrations?|logos?|assets?)/.*\.jsx?|tsx?$", re.I), 0.10, ), # ---> NEW: The Test Snapshot & Fixture Dampener <--- # Neutralizes auto-generated UI snapshots and massive mock data payloads (like cryptographic keys) # so they do not artificially inflate the mass of the verification suite. 
( - re.compile( - r"(?:^|/)(?:__snapshots__|__mocks__|fixtures?)/|.*\.snap$", re.I - ), + re.compile(r"(?:^|/)(?:__snapshots__|__mocks__|fixtures?)/|.*\.snap$", re.I), 0.001, ), # ---> NEW: The Academic Test Script Dampener <--- @@ -708,9 +682,7 @@ def get_policy(mode="baseline"): # Reduces the structural weight of CI/CD shell scripts and automation tooling # so deployment pipelines don't mimic core application complexity. ( - re.compile( - r"(?:^|/)(?:scripts?|ci|cd|docker|e2e)/.*\.(?:sh|bash|zsh)$", re.I - ), + re.compile(r"(?:^|/)(?:scripts?|ci|cd|docker|e2e)/.*\.(?:sh|bash|zsh)$", re.I), 0.10, ), # The Declarative & Type Definition Dampener @@ -818,7 +790,7 @@ def get_policy(mode="baseline"): "mass_penalty_max": 40.0, "risk_floor": 15.0, }, - "graveyard": { + "dead_code": { "hit_mult": 3.0, "safe_mass_floor": 50.0, "threshold_base": 10.0, @@ -882,7 +854,7 @@ def get_policy(mode="baseline"): } # ------------------------------------------------------------------------------ -# 6. DOMAIN ONTOLOGIES (Security Profiles & Alien Entity Rules) +# 6. 
DOMAIN ONTOLOGIES (Security Profiles & Architectural Anomaly Rules) # Consumed by: signal_processor.py # ------------------------------------------------------------------------------ LANGUAGE_SECURITY_PROFILES = { @@ -949,34 +921,34 @@ def get_policy(mode="baseline"): "systems": { "memory": 0.1, "logic_bomb": 0.2, - "flux": 1.0, + "state_mutation": 1.0, "injection": 1.0, }, # Pointer math is normal "web": { "memory": 1.0, "logic_bomb": 1.0, - "flux": 0.3, + "state_mutation": 0.3, "injection": 2.0, }, # DOM flux is normal, XSS is deadly "infra": { "memory": 1.0, "logic_bomb": 0.0, - "flux": 1.0, + "state_mutation": 1.0, "injection": 1.0, }, # OS commands are literally the point "backend": { "memory": 1.5, "logic_bomb": 1.0, - "flux": 1.5, + "state_mutation": 1.5, "injection": 1.5, }, # Standard aggressive baseline }, # ============================================================================== # DEFENSIVE DESIGN: POLYGLOT CONTEXTUAL ANOMALY DETECTION - # A vulnerability's severity is dictated by its environment. Standard OS execution - # is expected in a shell script, but highly anomalous in a frontend UI component. - # This matrix multiplies threat scores when an asset exhibits behaviors hostile - # to its native ecosystem (e.g., detecting C-style memory pointers inside a Node.js + # A vulnerability's severity is dictated by its environment. Standard OS execution + # is expected in a shell script, but highly anomalous in a frontend UI component. + # This matrix multiplies threat scores when an asset exhibits behaviors hostile + # to its native ecosystem (e.g., detecting C-style memory pointers inside a Node.js # web layer), flagging potential Trojans or backdoors. 
# ============================================================================== "ECOSYSTEM_MISMATCH_WEIGHTS": { @@ -984,12 +956,8 @@ def get_policy(mode="baseline"): "memory": 5.0, "logic_bomb": 3.0, }, # C code hiding in a JS app = Trojan - "infra_in_web": { - "logic_bomb": 4.0 - }, # Shell script hiding in a JS app = Backdoor - "web_in_systems": { - "flux": 3.0 - }, # JS embedded in C firmware = Bizarre architecture + "infra_in_web": {"logic_bomb": 4.0}, # Shell script hiding in a JS app = Backdoor + "web_in_systems": {"state_mutation": 3.0}, # JS embedded in C firmware = Bizarre architecture }, # ---> THE ARCHETYPE VIOLATION MATRIX (k=10 Edition) <--- # Multiplies threat mass based on how anomalous the behavior is for the file's physical DNA. @@ -1082,12 +1050,12 @@ def get_policy(mode="baseline"): "api_exposure", "concurrency", "state_flux", - "graveyard", + "dead_code", "spec_match", "stability", "churn", "documentation", - "civil_war", + "tabs_vs_spaces", "algorithmic_dos", # --- THE SECURITY & VULNERABILITY LENSES --- "obscured_payload", @@ -1098,17 +1066,17 @@ def get_policy(mode="baseline"): ], "SIGNAL_SCHEMA": [ "branch", - "linear", + "structural_boundaries", "args", "func_start", "class_start", "safety", - "safety_neg", - "danger", + "safety_bypasses", + "high_risk_execution", "io", "api", - "flux", - "graveyard", + "state_mutation", + "dead_code", "doc", "test", "concurrency", @@ -1119,13 +1087,13 @@ def get_policy(mode="baseline"): "generics", "comprehensions", "scientific", - "heat_triggers", + "reflection_metaprogramming", "import", "ownership", "planned_debt", "fragile_debt", "spec_exposure", - "civil_war", + "tabs_vs_spaces", "ssr_boundaries", "events", "dependency_injection", @@ -1134,13 +1102,13 @@ def get_policy(mode="baseline"): "memory_alloc", "inline_asm", "telemetry", - "print_hits", - "cast_hits", - "bailout_hits", - "halt_hits", - "bitwise_hits", + "debug_prints", + "explicit_casts", + "panics_and_aborts", + "thread_sleeps", + 
"bitwise_ops", "sync_locks", - "freeze_hits", + "immutability_locks", "cleanup", "encapsulation", "listeners", @@ -1177,24 +1145,24 @@ def get_policy(mode="baseline"): "design_upper_case", "design_short_vars", "design_long_vars", - "design_slop_duplicates", - "design_slop_orphans", + "duplicate_logic", + "orphaned_logic", # --- NEW: INSTRUCTIONAL PROOF SENSORS (LITERATURE) --- "lit_code_blocks", "lit_diagrams", "lit_headers", "lit_links", # --- NEW: PASSIVE SECURITY LENS OBSERVERS --- - "sec_heat_triggers", - "sec_safety_neg", + "sec_reflection_metaprogramming", + "sec_safety_bypasses", "sec_io", - "sec_danger", - "sec_flux", - "sec_graveyard", - "sec_bitwise_hits", + "sec_high_risk_execution", + "sec_state_mutation", + "sec_dead_code", + "sec_bitwise_ops", "sec_shadow_imports", "sec_homoglyphs", - "sec_private_info", + "sec_hardcoded_secrets", "sec_extension_mismatch", "sec_entropy", "sec_tainted_injection", @@ -1227,14 +1195,14 @@ def get_policy(mode="baseline"): "mutation", "event", "logic", - "danger", + "high_risk_execution", ], "FRIENDLY_MAP": { "m_locs": "Coding Lines of Code (LOC)", "locs": "Total Lines of Code (LOC)", "lang_ids": "Detected Languages", "lang_id": "Primary Language", - "mass": "Structural Complexity Mass", + "mass": "Structural Mass", "author_distribution": "Author Distribution", "control_flow_ratio": "Control Flow Ratio", "verification": "Testing & Verification Exposure", @@ -1247,17 +1215,17 @@ def get_policy(mode="baseline"): "api_exposure": "Public API Surface Area", "state_flux": "State Mutation Exposure", "branch": "Control Flow Branches", - "linear": "Sequential Logic Declarations", + "structural_boundaries": "Sequential Logic Declarations", "args": "Function Parameters", "func_start": "Function/Method Declarations", "class_start": "Class/Entity Declarations", "safety": "Defensive Programming Constructs", - "safety_neg": "Type/Safety Bypasses", - "danger": "High-Risk Execution Commands", + "safety_bypasses": "Type/Safety Bypasses", + 
"high_risk_execution": "High-Risk Execution Commands", "io": "I/O and Network Boundaries", "api": "Exposed API / Public Exports", - "flux": "State Mutations / Variable Reassignments", - "graveyard": "Commented-out Code (Dead Logic)", + "state_mutation": "State Mutations / Variable Reassignments", + "dead_code": "Commented-out Code (Dead Logic)", "doc": "Structured Documentation Blocks", "test": "Unit Test Assertions", "concurrency": "Asynchronous/Concurrent Execution", @@ -1268,13 +1236,13 @@ def get_policy(mode="baseline"): "generics": "Generic Type Abstractions", "comprehensions": "Collection Iterators / Comprehensions", "scientific": "Scientific & Mathematical Operations", - "heat_triggers": "Metaprogramming & Reflection", + "reflection_metaprogramming": "Metaprogramming & Reflection", "import": "Module Dependencies (Imports)", "ownership": "Authorship Metadata", "planned_debt": "Planned Work (TODOs)", "fragile_debt": "Acknowledged Tech Debt (FIXMEs)", "spec_exposure": "Specification Traceability Tags", - "civil_war": "Indentation Faction", + "tabs_vs_spaces": "Indentation Faction", "ssr_boundaries": "Server-Side Rendering Contexts", "events": "Event Publishers / Emitters", "dependency_injection": "Dependency Injection Constructs", @@ -1283,13 +1251,13 @@ def get_policy(mode="baseline"): "memory_alloc": "Manual Memory Allocation", "inline_asm": "Inline Assembly Blocks", "telemetry": "Structured Telemetry & Logging", - "print_hits": "Ad-hoc Print / Debug Statements", - "cast_hits": "Explicit Type Casts", - "bailout_hits": "Fatal Aborts & Exceptions", - "halt_hits": "Thread Sleeps & Blocking Waits", - "bitwise_hits": "Bitwise Operations", + "debug_prints": "Ad-hoc Print / Debug Statements", + "explicit_casts": "Explicit Type Casts", + "panics_and_aborts": "Fatal Aborts & Exceptions", + "thread_sleeps": "Thread Sleeps & Blocking Waits", + "bitwise_ops": "Bitwise Operations", "sync_locks": "Thread Synchronization Locks", - "freeze_hits": "Immutable Data 
Declarations", + "immutability_locks": "Immutable Data Declarations", "cleanup": "Resource Deallocation & Cleanup", "encapsulation": "Private / Encapsulated Scopes", "listeners": "Event Listeners & Subscribers", @@ -1314,16 +1282,16 @@ def get_policy(mode="baseline"): "lit_headers": "Structured Literature Headers", "lit_links": "Hyperlinked Literature References", # --- SECURITY LENS UI MAPPINGS (Plain English) --- - "sec_heat_triggers": "High-Entropy / Obfuscated Logic", - "sec_safety_neg": "Safety & Constraint Bypasses", + "sec_reflection_metaprogramming": "High-Entropy / Obfuscated Logic", + "sec_safety_bypasses": "Safety & Constraint Bypasses", "sec_io": "External Network & I/O Hooks", - "sec_danger": "Dynamic Code Execution (Eval/Exec)", - "sec_flux": "Global Environment Mutation", - "sec_graveyard": "Commented-Out Executable Logic", - "sec_bitwise_hits": "Low-Level Bitwise / Cryptographic Math", + "sec_high_risk_execution": "Dynamic Code Execution (Eval/Exec)", + "sec_state_mutation": "Global Environment Mutation", + "sec_dead_code": "Commented-Out Executable Logic", + "sec_bitwise_ops": "Low-Level Bitwise / Cryptographic Math", "sec_shadow_imports": "Non-Standard / Steganographic Imports", "sec_homoglyphs": "Non-Standard Unicode / Homoglyphs", - "sec_private_info": "Embedded Credentials & Keys", + "sec_hardcoded_secrets": "Embedded Credentials & Keys", # --- VULNERABILITY EXPOSURE MAPPINGS (Plain English) --- "obscured_payload": "Obfuscation & Evasion Surface", "logic_bomb": "Destructive Execution Surface", @@ -1340,12 +1308,12 @@ def get_policy(mode="baseline"): "api_exposure": "API Exposure", "concurrency": "Concurrency Exposure", "state_flux": "State Flux Exposure", - "graveyard": "Graveyard Exposure", + "dead_code": "Commented Logic Exposure", "spec_match": "Specification Exposure", "stability": "Instability Exposure", "churn": "Volatility Exposure", "documentation": "Documentation Exposure", - "civil_war": "Civil War Exposure", + "tabs_vs_spaces": 
"Civil War Exposure", "algorithmic_dos": "Algorithmic DoS Exposure", # --- SECURITY LENS UI LABELS (Plain English) --- "obscured_payload": "Obfuscation & Evasion Surface", diff --git a/gitgalaxy/standards/gitgalaxy_config.py b/gitgalaxy/standards/gitgalaxy_config.py index 0ca7cb5c..12842b78 100644 --- a/gitgalaxy/standards/gitgalaxy_config.py +++ b/gitgalaxy/standards/gitgalaxy_config.py @@ -104,7 +104,7 @@ # ------------------------------------------------------------------------------ APERTURE_CONFIG = { # --- 0. THE SECRETS SHUNT --- - # Files caught here will bypass standard physics math and instantly + # Files caught here will bypass standard structural signature analysis and instantly # register a 100.0 score on the Secrets Risk exposure vector. "SECRETS_EXTENSIONS": { ".pem", @@ -360,7 +360,7 @@ # ------------------------------------------------------------------------------ # These files are granted a high confidence score for importance by the GuideStar Lens. # If found, their Bayesian confidence is boosted (+0.10) to ensure they -# remain visible in the 3D map as high-priority architectural anchors. +# remain visible in the topological map as high-priority contextual baselines. PRIORITY_WHITELIST = [ # --- AI Ecosystem Anchor --- "__gitgalaxy_meta__.json", @@ -439,23 +439,35 @@ # ------------------------------------------------------------------------------ -# 4. LEXICAL SCANNER CONFIG (Comment Delimiters by Family) -# Consumed by: prism.py, language_lens.py +# 4. 
LEXICAL FAMILY HEURISTICS (Optical Delimiter Census) +# Consumed by: language_lens.py (Tier 4 Heuristic Discovery) # ------------------------------------------------------------------------------ -# Defines the structural delimiters for extracting literature (comment_stream) -COMMENT_DEFINITIONS = { - "mechanical_families": { - "c_style_comment": {"delimiters": ["//", "/*", "*/"]}, - "recursive_c_style": {"delimiters": ["//", "/*", "*/"]}, - "multi_style_dash": {"delimiters": ["--", "--[[", "]]", "{-", "-}"]}, - "embedded_syntax": {"delimiters": ["//", "/*", "*/", "#"]}, - "column_sensitive": {"delimiters": ["*>", "!", "C", "*", "D"]}, - "single_line_only": { - "delimiters": [ - "#", "<#", "#>", "=begin", "=end", "=pod", "=cut", - ";", "//", "dnl", "%", "%{", "%}", "#|", "|#" - ] - }, +# NOTE: This dictionary does NOT split the executable code from the non-executable text. +# That separation is handled by the compiled regexes in language_standards.py. +# This dictionary is a heuristic fallback radar. It counts raw tokens to guess +# the structural paradigm of unknown or extensionless files. +LEXICAL_FAMILY_HEURISTICS = { + "lexical_families": { + # 1. Standard Block (Non-Recursive) + # The language uses both line and block delimiters, but blocks CANNOT be nested. + # Examples: C, C++, Java, JavaScript, PHP, SQL, Go, CSS. + "standard_block": {"delimiters": ["//", "/*", "*/", "--", "--[[", "]]", "{-", "-}", "#"]}, + # 2. Recursive Block + # The language allows block comments to be safely nested inside one another. + # Examples: Rust, Swift, Dart, Scala. + "recursive_block": {"delimiters": ["//", "/*", "*/"]}, + # 3. Line Exclusive + # The language possesses no native multi-line block syntax. The engine ignores closing tags. + # Examples: Python, Shell, Makefile, Ruby, PowerShell, Assembly. + "line_exclusive": {"delimiters": ["#", "<#", "#>", "=begin", "=end", ";", "dnl", "%", "#|", "|#"]}, + # 4. 
Block Exclusive + # The language possesses no native single-line comment syntax. All text must be enclosed. + # Examples: HTML, XML. + "block_exclusive": {"delimiters": ["", "--!>"]}, + # 5. Positional Anchored + # The engine must verify the token's physical column placement. + # Examples: Legacy COBOL, Legacy Fortran, ABAP. + "positional_anchored": {"delimiters": ["*>", "!", "C", "*", "D"]}, } } @@ -575,7 +587,7 @@ STATIC_ARCHETYPES = { "literature": "Static: Literature & Documentation", "data": "Static: Declarative Data & Configurations", - "minified": "Static: Minified & Vendor Opaque Mass", + "minified": "Static: Minified & Vendor Opaque Footprint", "unknown": "Static: Unmapped / Unsupported Format", } diff --git a/gitgalaxy/standards/how_to_add_a_language.md b/gitgalaxy/standards/how_to_add_a_language.md index 30c8ae8f..89a5fe63 100644 --- a/gitgalaxy/standards/how_to_add_a_language.md +++ b/gitgalaxy/standards/how_to_add_a_language.md @@ -1,67 +1,63 @@ -# 🌌 Architecting a New Language (Extending the blAST Engine) +# Adding a New Language (Defining Structural Signatures) -GitGalaxy does not use brittle Abstract Syntax Trees (ASTs) or traditional compiler toolchains. Instead, we map planetary-scale codebases using the **blAST Engine** (Bypassing LLMs and ASTs): a polyglot structural physics engine. +GitGalaxy does not use brittle Abstract Syntax Trees (ASTs) or traditional compiler toolchains. Instead, we map enterprise codebases using a **Structural Signature Analysis Engine**: a polyglot structural analyzer. -Instead of writing a custom parser that breaks the moment a repository fails to compile, we teach the engine the "physics" of a new language using high-speed, mathematically bounded, ReDoS-proof regular expressions. This allows GitGalaxy to build a universal, comparative lexical taxonomy across entirely different computing eras (from 1980s COBOL to modern Rust). 
+Rather than writing a custom AST parser that breaks upon encountering syntax errors or incomplete code, we configure the engine with **Structural Signatures**—high-speed, mathematically bounded, ReDoS-proof regular expressions. This allows GitGalaxy to build a universal, comparative structural taxonomy across entirely different computing eras (from 1980s COBOL to modern Rust). For the mathematical proofs backing this architecture, review: -* 🔬 [The blAST Paradigm](../../docs/wiki/01-03-the-blast-paradigm.md) -* ⚖️ [Claim 10: The Heuristic vs. AST Paradigm](../../docs/wiki/03-10-claim-10-ast-vs-heuristic-parsing.md) -* 🛡️ [Claim 8: Empirical Validation of AST-Free Parsing](../../docs/wiki/03-08-claim-8-empirical-validation-of-ast-free-parsing.md) +* [The Heuristic Parsing Paradigm](../../docs/wiki/01-03-the-heuristic-paradigm.md) +* [Claim 10: Heuristic vs. AST Parsing](../../docs/wiki/03-10-claim-10-ast-vs-heuristic-parsing.md) +* [Claim 8: Empirical Validation of AST-Free Parsing](../../docs/wiki/03-08-claim-8-empirical-validation-of-ast-free-parsing.md) -To add a new language to the [Language Lens](../../docs/wiki/02-05-language-lens.md), you will use an advanced LLM (like Claude 3.5 Sonnet, GPT-4o, or Gemini 1.5 Pro) to generate the structural dictionary. +To add a new language to the Language Classifier, you will use an advanced LLM (like Claude 3.5 Sonnet, GPT-4o, or Gemini 1.5 Pro) to generate the Structural Signatures dictionary. --- ### Step 1: Initialize the LLM Context -Before asking the LLM to generate the new language, upload the `gitgalaxy/standards/language_standards.py` file to the chat window. Issue this exact command: -> *"Read this file to understand how the GitGalaxy physics engine uses bounded regex to guarantee ReDoS immunity. 
Pay close attention to how C++ and Python are mapped to prevent Catastrophic Backtracking."* +Before asking the LLM to generate the new language signatures, upload the `gitgalaxy/standards/language_standards.py` file to the chat window. Issue this exact command: +> *"Read this file to understand how the GitGalaxy Structural Signature Analysis Engine uses bounded regex to guarantee ReDoS immunity. Pay close attention to how C++ and Python are mapped to prevent Catastrophic Backtracking."* -### Step 2: Inject the Master Calibration Prompt -Copy the **Master Prompt** below and paste it into the LLM. Replace `[TARGET LANGUAGE]` with the exact language you want to map. +### Step 2: Inject the Structural Signature Prompt +Copy the **Generation Prompt** below and paste it into the LLM. Replace `[TARGET LANGUAGE]` with the exact language you want to map. -### Step 3: Calibrate the Engine +### Step 3: Register the Signatures 1. Open `gitgalaxy/standards/language_standards.py`. -2. Locate the `LANGUAGE_DEFINITIONS` matrix. -3. Paste the generated Python dictionary directly into the registry to instantly grant the engine native support for the new architecture. +2. Locate the `LANGUAGE_DEFINITIONS` registry. +3. Paste the generated Python dictionary directly into the registry to instantly grant the engine native support for the new language architecture.

--- -## ⚙️ The Master Calibration Prompt +## ⚙️ The Structural Signature Generation Prompt *Copy everything below this line and feed it directly to the LLM.* **Prompt:** You are an expert compiler engineer and static analysis specialist. Please generate a GitGalaxy REGISTRY regex dictionary for **[TARGET LANGUAGE]** using the strict Zero-Trust framework defined below. -This dictionary will be used by an AST-free physics engine to create a system of consistent 1:1 cross-language comparisons (a comparative lexical taxonomical map), calculating risk exposures across implicit and explicit language behaviors. The engine uses `re.M` (Multiline) to scan 50,000+ line enterprise files at extreme velocity. +This dictionary defines the **Structural Signatures** used by an AST-free parsing engine to create a system of consistent 1:1 cross-language comparisons. The engine calculates risk exposures across implicit and explicit language behaviors. The engine uses `re.M` (Multiline) to scan 50,000+ line enterprise files at extreme velocity. ### 🚨 CRITICAL ENGINE RULES -1. **The Physical Reality Rule (Implicit vs. Explicit):** Do not just hunt for explicit keywords; capture the physical reality. If defining API Exposure (Key 10), determine if the language is implicitly public (e.g., Python, Fortran). If so, the regex must capture standard function/subroutine definitions, not just the rare use of an explicit public or export tag. -2. **The Paradigm Forgiveness Rule:** Do not punish a language for operating within its standard paradigm. Example: Standard C-style pointer casting is standard operating procedure, not a structural fracture. It must be routed to Phase 5 (Resource Management & Stability) as friction, NOT placed in Phase 2 `safety_neg` where it will trigger the Breach Cap. -3. **The "Commented-out Code" Rule (Contextual Debt):** When assessing Tech Debt or Danger, isolate human commentary from execution flow. Example: `TODO` and `FIXME` are planned debt. 
They must NEVER be placed in execution-blocking keys like `danger`, otherwise a file with high developer documentation will be falsely penalized as a volatile execution risk. -4. **The Comparative Map Rule (Use `None`):** If a dimension does not exist natively in the target language (e.g., pointers in JavaScript, decorators in C), you MUST explicitly set its key to `None`. Do not force a fit. +1. **Semantic Intent Over Keyword Matching (Implicit vs. Explicit):** Do not just hunt for explicit keywords; capture the practical reality of the language. If defining `api` (Public Surface Area), determine if the language is implicitly public (e.g., Python, Fortran). If so, the regex must capture standard function/subroutine definitions, not just the rare use of an explicit public or export tag. +2. **Idiomatic Paradigm Alignment:** Do not penalize a language for operating within its standard paradigm. Example: Standard C-style pointer casting is standard operating procedure, not a structural fracture. It must be routed to `explicit_casts` (Resource Management), NOT placed in `safety_bypasses` where it will artificially trigger risk alerts. +3. **Annotation & Execution Isolation:** When assessing Technical Debt or High-Risk Execution, isolate human commentary from execution flow. Example: `TODO` and `FIXME` are planned debt. They must NEVER be placed in execution-blocking keys like `high_risk_execution`, otherwise a file with high developer documentation will be falsely penalized as a volatile execution risk. +4. **Strict Feature Parity (Use `None`):** If a structural dimension does not exist natively in the target language (e.g., pointers in JavaScript, decorators in C), you MUST explicitly set its key to `None`. Do not force a fit. 5. **Absolute ReDoS Immunity (No Catastrophic Backtracking):** Bound all wildcards. Never use `.*` inside brackets. Always use negation (e.g., `<[^>]*>`). In `re.M` mode, `\s` matches newlines (`\n`). * ❌ NEVER use `^\s*`. 
✅ ALWAYS use `^[ \t]*`. * ❌ NEVER use `\s*$`. ✅ ALWAYS use `[ \t]*$`. * ❌ NEVER use `\s*=`. ✅ ALWAYS use `[ \t]*=`. * ❌ NEVER nest unbounded quantifiers like `(?:[ \t]*\*+)*` or `(?:(?:public|private)\s+)*`. ✅ ALWAYS use strict numeric clamps like `(?:[ \t*&]+){0,10}` or `(?:(?:public|static)[ \t]+){0,3}`. -6. **The Geometry Inflator Bug:** Do NOT put access modifiers (e.g., public, private, static) in the `linear` array. This artificially inflates the math and turns all files into smooth spheres, destroying visual 3D complexity. -7. **Object/Entity Spurious Matches:** `func_start` must ONLY match executable logic blocks (methods/functions/constructors). Do NOT match interfaces, types, or classes here. -8. **Resource Management & Stability:** Pay special attention to Phase 5. Ensure that chaos (e.g., concurrency, events, flux) and order (e.g., sync_locks, listeners, freeze_hits) are cleanly separated into their specific regex keys so the physics engine can balance them. - -### THE LEXICAL FAMILIES -You must assign the language to one of these optical parsing families based on how it handles comments / non-executable text: -* `std_c`: Standard C-style (Line: `//`, Block: `/* ... */`). Examples: C, C++, Java, JS, Go. -* `nested_c`: Supports recursive block nesting (Line: `//`, Block: `/* /* */ */`). Examples: Rust, Swift, Scala. -* `pure_hash`: Hash-style only (Line: `#`, Block: None). Examples: Python, Shell, Makefile. -* `hybrid_hash`: Hash line + custom block (Line: `#`, Block: `<# #>` or `=begin =end`). Examples: PowerShell, Ruby. -* `hybrid_dash`: Dash line + custom block (Line: `--`, Block: `/* */` or `--[[ ]]`). Examples: SQL, Lua, Haskell. -* `polyglot`: Supports multiple line comment tokens (e.g., `//`, `#`, `/* */`). Examples: PHP, LiveCode. -* `positional`: Legacy column-based parsing. Examples: Fortran (C/* in col 1), ABAP. -* `singular`: SGML/XML style block only (`<!-- -->`), or unique line delimiters (Assembly `;`). Examples: HTML, XML, Assembly. 
-* `lisp_semi`: Lisp-style (Line: `;`, Block: `#| |#`). Examples: Scheme, Racket. +6. **The Metric Inflation Anti-Pattern:** Do NOT put access modifiers (e.g., public, private, static) in the `structural_boundaries` array. This artificially inflates structural complexity metrics. +7. **Strict Execution Anchoring:** `func_start` must ONLY match executable logic blocks (methods/functions/constructors). Do NOT match interfaces, types, or classes here. +8. **Resource Management & Synchronization:** Pay special attention to Phase 5. Ensure that asynchronous execution (`concurrency`) and synchronization (`sync_locks`) are cleanly separated into their specific regex keys so the engine can balance them accurately. + +### THE LEXICAL PARSING FAMILIES +You must assign the language to one of these 5 lexical parsing families based on how it handles comments and non-executable text: +* `standard_block`: The language uses both line and block delimiters, but blocks CANNOT be nested. Examples: C, C++, Java, JavaScript, PHP, SQL, Go, Ruby, Lua. +* `recursive_block`: The language allows block comments to be safely nested inside one another. Examples: Rust, Swift, Dart, Scala. +* `line_exclusive`: The language possesses no native multi-line block syntax. The engine ignores closing tags. Examples: Python, Shell, Makefile, Assembly, Scheme. +* `block_exclusive`: The language possesses no native single-line comment syntax. All text must be enclosed. Examples: HTML, XML. +* `positional_anchored`: The engine must verify the token's physical column placement. Examples: Legacy COBOL, Legacy Fortran, ABAP. ### OUTPUT SCHEMA & DEFINITIONS Generate a valid Python dictionary matching this exact structure. @@ -76,124 +72,124 @@ Generate a valid Python dictionary matching this exact structure. }, "extensions": [], # e.g. [".js", ".jsx"] "exact_matches": [], # e.g. ["Makefile"] - "discriminators": [], # Ecosystem anchors (e.g. 
"package.json") + "discriminators": [], # Ecosystem Indicators / Disambiguation Anchors (e.g. "package.json") "shebangs": [], - "lexical_family": "", # See Lexical Families list above + "lexical_family": "", # See Lexical Parsing Families list above "rules": { - # --- OPTICAL SPLITS --- + # --- LEXICAL DELIMITER CONTROLS --- "_line_anchor": re.compile(r""), "_inline_comment": re.compile(r""), "_block_start": re.compile(r""), "_block_end": re.compile(r""), - # --- PHASE 1: GEOMETRY & STRUCTURE --- - # branch (Control Flow / Branching): Control flow that forces the CPU to make a decision or jump. High density creates jagged shapes. Includes: if, else, switch, for, while, catch, try, &&, ||, ternary. EXCLUDES: Exceptions (throw, raise) — these belong in bailout_hits. + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- + # branch: Control flow that forces the CPU to make a decision or jump. Includes: if, else, switch, for, while, catch, try, &&, ||, ternary. EXCLUDES: Exceptions (throw, raise) — these belong in panics_and_aborts. "branch": re.compile(r""), - # args (Parameters / Coupling): Signatures defining input parameters. Drives the physical size/mass of the function. Includes: parameter blocks of functions, methods, and lambdas. Must safely step over type hints. + # args: Signatures defining input parameters. Includes: parameter blocks of functions, methods, and lambdas. Must safely step over type hints. "args": re.compile(r""), - # linear (Sequential Boundaries): Keywords defining structural boundaries and straight-line execution. Smooths the geometry into spheres. Includes: var, return, class, import. EXCLUDES: Access modifiers (public, private) and Immutability keywords (const, final — these belong in freeze_hits). - "linear": re.compile(r""), - # func_start (Executable Logic Anchors): Exact syntax anchoring the start of an executable block of logic. Includes: Method signatures, constructors. EXCLUDES: Interfaces, types, and classes. 
+ # structural_boundaries: Keywords defining structural boundaries and straight-line execution. Includes: var, return, class, import. EXCLUDES: Access modifiers (public, private) and Immutability keywords (const, final — these belong in immutability_locks). + "structural_boundaries": re.compile(r""), + # func_start: Exact syntax anchoring the start of an executable block of logic. Includes: Method signatures, constructors. EXCLUDES: Interfaces, types, and classes. "func_start": re.compile(r""), - # class_start (Object / Entity Declarations): The syntax that defines an object-oriented class, struct, or record. Drives API Surface Area math. + # class_start: The syntax that defines an object-oriented class, struct, or record. "class_start": re.compile(r""), - # --- PHASE 2: RISK & STRUCTURAL INTEGRITY --- - # safety (Defensive Programming / Validation): Defensive programming constructs that prevent crashes at runtime. Includes: try/catch, explicit null checks, guard. EXCLUDES: Immutability. + # --- PHASE 2: SAFETY & EXECUTION RISK --- + # safety: Defensive programming constructs that prevent crashes at runtime. Includes: try/catch, explicit null checks, guard. EXCLUDES: Immutability. "safety": re.compile(r""), - # safety_neg (Safety Bypasses / Unchecked Types): Syntax that actively bypasses type safety, swallows errors, or relies on unpredictable state. Includes: Force unwrapping (!), any, raw memory casting, linter bypasses (@ts-ignore). - "safety_neg": re.compile(r""), - # danger (High-Risk Execution / System Calls): Extreme tech debt, process-killing commands, and catastrophic runtime vulnerabilities. Includes: eval, exec, process.exit. EXCLUDES: TODO/HACK (debt) and print (print_hits). - "danger": re.compile(r""), - # io (I/O & Network Boundaries): Interaction with the disk, network, or external systems. Includes: File writing/reading, HTTP clients, sockets. EXCLUDES: Logging/printing. 
+ # safety_bypasses: Syntax that actively bypasses type safety, swallows errors, or relies on unpredictable state. Includes: Force unwrapping (!), any, raw memory casting, linter bypasses (@ts-ignore). + "safety_bypasses": re.compile(r""), + # high_risk_execution: Process-killing commands and catastrophic runtime vulnerabilities. Includes: eval, exec, process.exit. EXCLUDES: TODO/HACK (planned_debt) and print (debug_prints). + "high_risk_execution": re.compile(r""), + # io: Interaction with the disk, network, or external systems. Includes: File writing/reading, HTTP clients, sockets. EXCLUDES: Logging/printing. "io": re.compile(r""), - # api (Public Surface Area): Code exposed to the outside world. Measures physical surface area (Mitigated by encapsulation). Captures explicit visibility markers (export, public) AND implicit architectural defaults. If the linker can touch it, it possesses surface area. + # api: Code exposed to the outside world. Captures explicit visibility markers (export, public) AND implicit architectural defaults. "api": re.compile(r""), - # flux (State Mutation): Mutation of state. Reassignment of variables or modifying collections. (Mitigated by freeze_hits). Includes: let, mut, volatile, .push(), .set(). - "flux": re.compile(r""), - # graveyard (Dead / Commented-out Code): Commented-out structural code and unused logic trails. Includes: // if (x), /* var y */. - "graveyard": re.compile(r""), - # doc (Structured Documentation): Structured documentation meant to be parsed by IDEs or generators. Includes: JSDoc, Docstrings. + # state_mutation: Reassignment of variables or modifying collections. Includes: let, mut, volatile, .push(), .set(). + "state_mutation": re.compile(r""), + # dead_code (Commented Logic / Deprecated Trails): Commented-out structural code and unused logic trails. Includes: // if (x), /* var y */. + "dead_code": re.compile(r""), + # doc: Structured documentation meant to be parsed by IDEs or generators. 
Includes: JSDoc, Docstrings. "doc": re.compile(r""), - # test (Testing & Assertions): Assertions and unit testing framework keywords. (Mitigates test_skip). Includes: describe, it, assert, expect. + # test: Assertions and unit testing framework keywords. Includes: describe, it, assert, expect. "test": re.compile(r""), # --- PHASE 3: ARCHITECTURE & DOMAIN SENSORS --- - # concurrency (Asynchronous Execution): Time-bending logic and parallel execution. (Mitigated by sync_locks). Includes: async, await, Promise, Thread. + # concurrency: Asynchronous logic and parallel execution. Includes: async, await, Promise, Thread. "concurrency": re.compile(r""), - # ui_framework (UI / View Components): DOM manipulation, UI components. Includes: HTML tags, React hooks. + # ui_framework: DOM manipulation, UI components. Includes: HTML tags, React hooks. "ui_framework": re.compile(r""), - # closures (Closures / Anonymous Functions): Anonymous functions, lambdas, inline callbacks. Includes: Fat arrows (=>). + # closures: Anonymous functions, lambdas, inline callbacks. Includes: Fat arrows (=>). "closures": re.compile(r""), - # globals (Global / Shared State): Accessing global state, environment variables, or system registries. Includes: window., process.env. + # globals: Accessing global state, environment variables, or system registries. Includes: window., process.env. "globals": re.compile(r""), - # decorators (Decorators / Annotations): Annotations applied to classes/methods. Includes: @Injectable, [Obsolete]. + # decorators: Annotations applied to classes/methods. Includes: @Injectable, [Obsolete]. "decorators": re.compile(r""), - # generics (Generics / Type Parameters): Type parameters that make logic reusable but harder to read. Includes: , List. + # generics: Type parameters indicating generic abstractions. Includes: , List. "generics": re.compile(r""), - # comprehensions (Iterators / Comprehensions): Functional array transformations or inline looping. 
Includes: .map(, .filter(. + # comprehensions: Collection iterators or inline looping. Includes: .map(, .filter(. "comprehensions": re.compile(r""), - # scientific (Numerical / Compute Libraries): Math, data science, and complex rendering libraries. Includes: Math., numpy. + # scientific: Math, data science, and complex rendering libraries. Includes: Math., numpy. "scientific": re.compile(r""), - # heat_triggers (Metaprogramming & Reflection): Highly complex, "clever" code that causes cognitive meltdown. Includes: Reflection, Proxy, .bind(). - "heat_triggers": re.compile(r""), - # import (Dependency Inclusions): Dependency resolution and module loading. Includes: import, require, using. + # reflection_metaprogramming (Cognitive Load / Metaprogramming Density): Metaprogramming, reflection, and dynamic property assignment. Includes: Reflection, Proxy, .bind(). + "reflection_metaprogramming": re.compile(r""), + # import: Dependency resolution and module loading. Includes: import, require, using. "import": re.compile(r""), # _dependency_capture: Regex strictly capturing group 1 as the exact dependency path string. "_dependency_capture": re.compile(r""), - # ownership (Authorship Metadata): Authorship metadata. Includes: @author, Created by:. + # ownership: Authorship metadata. Includes: @author, Created by:. "ownership": re.compile(r""), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- - # planned_debt (Annotated Debt / TODOs): Future work that doesn't necessarily imply brokenness. Includes: TODO, WIP, STUB. + # planned_debt: Annotated future work. Includes: TODO, WIP, STUB. "planned_debt": re.compile(r""), - # fragile_debt (Acknowledged Hacks / FIXMEs): Explicit admissions of fragile, dangerous, or ugly logic. Includes: HACK, FIXME, XXX, WTF. + # fragile_debt: Explicit admissions of fragile or dangerous logic. Includes: HACK, FIXME, XXX. 
"fragile_debt": re.compile(r""), - # private_info (Hardcoded Secrets / Credentials): Hardcoded secrets, static credentials, or API keys baked into code. Includes: password, secret, token, api_key. - "private_info": re.compile(r""), - # spec_exposure (Spec / Audit Traceability): Audit tags establishing traceability of intent. Includes: [SPEC-123], [audit]. + # hardcoded_secrets: Static credentials or API keys baked into code. Includes: password, secret, token. + "hardcoded_secrets": re.compile(r""), + # spec_exposure: Audit tags establishing traceability of intent. Includes: [SPEC-123], [audit]. "spec_exposure": re.compile(r""), - # civil_war (Formatting Inconsistencies): Structural formatting markers used to calculate Tabs vs. Spaces ratio. Often None. - "civil_war": None, - # ssr_boundaries (Server-Side Rendering): Server-Side Rendering computation boundaries where backend meets frontend. Includes: getServerSideProps. + # tabs_vs_spaces (Formatting Inconsistencies): Structural formatting markers used to calculate indentation consistency. Often None. + "tabs_vs_spaces": None, + # ssr_boundaries: Server-Side Rendering computation boundaries. Includes: getServerSideProps. "ssr_boundaries": re.compile(r""), - # events (Event Emitters / Pub-Sub): Event-driven architecture signatures and message brokers. (Mitigated by listeners). Includes: emit, EventEmitter, Kafka, Publisher. + # events: Event-driven architecture signatures and message brokers. Includes: emit, EventEmitter, Kafka. "events": re.compile(r""), - # dependency_injection (Dependency Injection / IoC): Inversion of Control (IoC) injection markers. Includes: @Autowired, @Inject. + # dependency_injection: Inversion of Control (IoC) injection markers. Includes: @Autowired, @Inject. "dependency_injection": re.compile(r""), - # macros (Preprocessor Directives / Macros): Compiler pragmas or macro definitions that generate code at compile-time. Includes: #define, macro_rules!. 
+ # macros: Compiler pragmas or macro definitions that generate code at compile-time. Includes: #define, macro_rules!. "macros": re.compile(r""), - # pointers (Pointer Arithmetic / Memory Addressing): Explicit tracking of raw memory addressing and pointer dereferencing. Includes: *const, &mut, IntPtr. + # pointers: Explicit tracking of raw memory addressing and pointer dereferencing. Includes: *const, &mut, IntPtr. "pointers": re.compile(r""), - # memory_alloc (Manual Memory Management): Explicit unmanaged memory allocations and raw heap manipulations. (Mitigated by cleanup). Includes: malloc, new. + # memory_alloc: Explicit unmanaged memory allocations and raw heap manipulations. Includes: malloc, new. "memory_alloc": re.compile(r""), - # inline_asm (Inline Assembly): Direct CPU architecture bridging. Includes: __asm__, asm!. + # inline_asm: Direct CPU architecture bridging. Includes: __asm__, asm!. "inline_asm": re.compile(r""), # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- - # telemetry (Structured Logging / Telemetry): Structured logging and observability frameworks used safely in production. Acts as executable documentation. + # telemetry: Structured logging and observability frameworks. "telemetry": re.compile(r""), - # print_hits (Standard Output / Debug Prints): Ad-hoc, temporary debug statements pushed to production. Includes: print(, console.log(. - "print_hits": re.compile(r""), - # cast_hits (Explicit Type Casting): Explicitly bypassing the compiler's type-checker. Indicates misaligned data structures. Includes: as String, (int), static_cast. - "cast_hits": re.compile(r""), - # bailout_hits (Execution Halts / Panics): Forcefully destroying the current execution context. Includes: throw, raise, panic!, abort(). - "bailout_hits": re.compile(r""), - # halt_hits (Thread Blocking / Sleeps): Forcing a thread to sleep (often an admission of a race condition). Includes: sleep(, delay(. 
- "halt_hits": re.compile(r""), - # bitwise_hits (Bitwise Operations): Manipulating raw bytes and memory registers. Extremely dense, low-level logic. EXCLUDES logical &&/||. - "bitwise_hits": re.compile(r""), - # sync_locks (Thread Synchronization / Locks): Explicitly coordinating threaded logic to prevent race conditions. (Mitigates concurrency). + # debug_prints (Debug Artifacts / Unstructured Outputs): Ad-hoc, temporary debug statements. Includes: print(, console.log(. + "debug_prints": re.compile(r""), + # explicit_casts: Explicitly bypassing the compiler's type-checker. Includes: as String, (int), static_cast. + "explicit_casts": re.compile(r""), + # panics_and_aborts (Execution Interrupts / Fatal Aborts): Forcefully destroying the current execution context. Includes: throw, raise, panic!, abort(). + "panics_and_aborts": re.compile(r""), + # thread_sleeps (Thread Blocking / Synchronous Pauses): Thread blocking or forced timeouts. Includes: sleep(, delay(. + "thread_sleeps": re.compile(r""), + # bitwise_ops: Bitwise operations manipulating raw bytes. EXCLUDES logical &&/||. + "bitwise_ops": re.compile(r""), + # sync_locks: Explicitly coordinating threaded logic to prevent race conditions. "sync_locks": re.compile(r""), - # freeze_hits (Immutability Constraints): Explicitly locking data so it cannot be mutated. (Mitigates flux). Includes: const, final, readonly. - "freeze_hits": re.compile(r""), - # cleanup (Resource Cleanup / Teardown): Explicitly destroying state or releasing resources to prevent leaks. (Mitigates memory_alloc and io). Includes: free(, dispose(), .close(). + # immutability_locks (Immutability Constraints): Explicitly locking data so it cannot be mutated. Includes: const, final, readonly. + "immutability_locks": re.compile(r""), + # cleanup (Resource Cleanup / Teardown): Explicitly destroying state or releasing resources. Includes: free(, dispose(), .close(). 
"cleanup": re.compile(r""), - # encapsulation (Access Modifiers / Encapsulation): Explicitly hiding logic from the rest of the application. (Mitigates api). Includes: private, protected, internal. + # encapsulation (Encapsulation / Access Modifiers): Explicitly hiding logic from the rest of the application. Includes: private, protected, internal. "encapsulation": re.compile(r""), - # listeners (Event Listeners / Observers): Waiting to receive state from an external broadcast. (Mitigates events). Includes: on(, addEventListener, subscribe(. + # listeners: Waiting to receive state from an external broadcast. Includes: on(, addEventListener, subscribe(. "listeners": re.compile(r""), - # test_skip (Bypassed Tests / Ignored Specs): Code that uses the testing framework but explicitly bypasses verification. (Anti-pattern to test). Includes: @Ignore, test.skip(. + # test_skip: Bypassed tests or ignored verification specs. Includes: @Ignore, test.skip(. "test_skip": re.compile(r""), # --- HYBRID DOMAIN SENSORS --- @@ -206,4 +202,5 @@ Generate a valid Python dictionary matching this exact structure. # ipc_rpc_bridges: Inter-process or RPC bridging commands. "ipc_rpc_bridges": re.compile(r"") } -} \ No newline at end of file +} +``` \ No newline at end of file diff --git a/gitgalaxy/standards/language_lens.py b/gitgalaxy/standards/language_lens.py index d3c0d68c..22ea3222 100644 --- a/gitgalaxy/standards/language_lens.py +++ b/gitgalaxy/standards/language_lens.py @@ -50,7 +50,7 @@ class LanguageDetector: Linguistic Classification Engine. PURPOSE: - Converts raw text signals, file metadata, and Bayesian priors into a high-fidelity + Converts raw text signals, file metadata, and Bayesian priors into a high-fidelity language classification ('Identity Lock'). 
ARCHITECTURE (The Confidence Hierarchy): @@ -65,11 +65,11 @@ class LanguageDetector: def __init__( self, language_definitions: Dict[str, Any], - comment_definitions: Dict[str, Any], + lexical_heuristics: Dict[str, Any], parent_logger: Optional[logging.Logger] = None, ): self.languages = language_definitions - self.comment_defs = comment_definitions + self.lexical_heuristics = lexical_heuristics if parent_logger: self.logger = parent_logger.getChild("lens") @@ -83,9 +83,7 @@ def __init__( # --- BAYESIAN TUNING CONSTANTS (Dynamic Fetch) --- self.thresholds = LENS_CONFIG.get("THRESHOLDS", {}) - self.COLLISION_FREQUENCIES = set( - LENS_CONFIG.get("COLLISION_FREQUENCIES", set()) - ) + self.COLLISION_FREQUENCIES = set(LENS_CONFIG.get("COLLISION_FREQUENCIES", set())) self.PROSE_ANCHORS = set(LENS_CONFIG.get("PROSE_ANCHORS", set())) # Compile syntactic disqualifiers on boot to save CPU cycles per file @@ -107,9 +105,7 @@ def __init__( self.logger.debug("Initializing O(1) lookup maps for Linguistic Classifier...") self._calibrate_lookup_maps() - self.logger.debug( - f"Classifier Online | {len(self.extension_map)} Extensions | {len(self.anchor_map)} Anchors" - ) + self.logger.debug(f"Classifier Online | {len(self.extension_map)} Extensions | {len(self.anchor_map)} Anchors") def _calibrate_lookup_maps(self): """Builds O(1) dictionaries mapping extensions and exact filenames to languages.""" @@ -131,9 +127,7 @@ def _calibrate_lookup_maps(self): for anchor in self.PROSE_ANCHORS: if anchor not in self.anchor_map: - self.anchor_map[anchor] = ( - "markdown" if anchor == "README" else "plaintext" - ) + self.anchor_map[anchor] = "markdown" if anchor == "README" else "plaintext" def focus( self, file_path: Union[str, Path], content_sample: str = "", **kwargs @@ -171,7 +165,7 @@ def inspect( ext = "" # 2. Extract hidden true extensions (e.g. script.sh.template -> .sh) - # ONLY extract if the final extension is a known, safe wrapper. 
This prevents + # ONLY extract if the final extension is a known, safe wrapper. This prevents # spoofing attacks like malware.exe.txt else: SAFE_WRAPPERS = { @@ -184,16 +178,12 @@ def inspect( ".gen", ".in", } - if (ext not in self.extension_map or ext in SAFE_WRAPPERS) and len( - path_obj.suffixes - ) > 1: + if (ext not in self.extension_map or ext in SAFE_WRAPPERS) and len(path_obj.suffixes) > 1: if ext in SAFE_WRAPPERS: for middle_ext in reversed(path_obj.suffixes[:-1]): if middle_ext.lower() in self.extension_map: ext = middle_ext.lower() - self.logger.debug( - f"[{name}] Extracted underlying extension '{ext}' from template wrapper" - ) + self.logger.debug(f"[{name}] Extracted underlying extension '{ext}' from template wrapper") break if not intent_vector and has_intent: @@ -203,17 +193,9 @@ def inspect( "source_proof": "Legacy Context Pass", } - prior_lang = ( - intent_vector.get("lang_id", "unknown") if intent_vector else "unknown" - ) - prior_conf = ( - intent_vector.get("prior_confidence", 0.10) if intent_vector else 0.10 - ) - prior_proof = ( - intent_vector.get("source_proof", "Discovery") - if intent_vector - else "Discovery" - ) + prior_lang = intent_vector.get("lang_id", "unknown") if intent_vector else "unknown" + prior_conf = intent_vector.get("prior_confidence", 0.10) if intent_vector else 0.10 + prior_proof = intent_vector.get("source_proof", "Discovery") if intent_vector else "Discovery" result: DetectorResult = { "lang_id": "undeterminable", @@ -331,13 +313,9 @@ def inspect( base_stem = path_obj.stem.lower() if ext == ".h": if f"{base_stem}.c" in ext_tally: - return self._forge_result( - "c", 0.99, 0, "Sibling Anchor (.c)", result, content_sample - ) + return self._forge_result("c", 0.99, 0, "Sibling Anchor (.c)", result, content_sample) elif f"{base_stem}.cpp" in ext_tally or f"{base_stem}.cc" in ext_tally: - return self._forge_result( - "cpp", 0.99, 0, "Sibling Anchor (C++)", result, content_sample - ) + return self._forge_result("cpp", 0.99, 
0, "Sibling Anchor (C++)", result, content_sample) elif f"{base_stem}.m" in ext_tally: return self._forge_result( "objective-c", @@ -374,14 +352,10 @@ def inspect( ) if is_conflict: - self.logger.warning( - f"[{name}] IDENTITY CONFLICT: Ext '{ext_lang}' contradicts Shebang '{shebang_lang}'" - ) + self.logger.warning(f"[{name}] IDENTITY CONFLICT: Ext '{ext_lang}' contradicts Shebang '{shebang_lang}'") # 1. Cache the threat into RAM for the SAST Engine - result["anomaly_flags"].append( - f"Identity Masking: Extension ({ext_lang}) vs Shebang ({shebang_lang})" - ) + result["anomaly_flags"].append(f"Identity Masking: Extension ({ext_lang}) vs Shebang ({shebang_lang})") # 2. Force the file into the Unclassified Baseline return self._forge_result( @@ -415,24 +389,14 @@ def inspect( 0, "Absolute Consensus (Ext + Shebang)", ) - elif ( - ext_lang - and ext_lang != "undeterminable" - and prior_lang == ext_lang - and prior_conf >= 0.75 - ): + elif ext_lang and ext_lang != "undeterminable" and prior_lang == ext_lang and prior_conf >= 0.75: best_lang, best_conf, lock_tier, source_proof = ( ext_lang, 0.999, 0, f"Absolute Consensus (Ext + {prior_proof})", ) - elif ( - shebang_lang - and shebang_lang != "undeterminable" - and prior_lang == shebang_lang - and prior_conf >= 0.75 - ): + elif shebang_lang and shebang_lang != "undeterminable" and prior_lang == shebang_lang and prior_conf >= 0.75: best_lang, best_conf, lock_tier, source_proof = ( shebang_lang, 0.999, @@ -480,21 +444,15 @@ def inspect( gravity_lang = None # Only apply Ecosystem Consensus if we don't already have a strong Tier 2 internal signature if ext in self.COLLISION_FREQUENCIES and ext_tally and lock_tier > 2: - gravity_lang, dominance = self._evaluate_ecosystem_gravity( - file_path, ext, ext_tally - ) + gravity_lang, dominance = self._evaluate_ecosystem_gravity(file_path, ext, ext_tally) if gravity_lang: if dominance >= self.thresholds.get("ECOSYSTEM_DOMINANCE_MIN", 0.70): best_lang = gravity_lang best_conf = 0.95 
lock_tier = 1.5 - source_proof = ( - f"Ecosystem Consensus Lock ({dominance * 100:.0f}% Local Dominance)" - ) - self.logger.debug( - f"[{name}] Fast-tracked via Ecosystem Consensus -> {gravity_lang}" - ) + source_proof = f"Ecosystem Consensus Lock ({dominance * 100:.0f}% Local Dominance)" + self.logger.debug(f"[{name}] Fast-tracked via Ecosystem Consensus -> {gravity_lang}") # ========================================================================= # TIER 1.7: UNKNOWN EXTENSION FALLBACK @@ -525,15 +483,11 @@ def inspect( ) is_true_unknown = ( - lock_tier == 4 - and best_lang in ("undeterminable", "unknown") - and ext not in self.COLLISION_FREQUENCIES + lock_tier == 4 and best_lang in ("undeterminable", "unknown") and ext not in self.COLLISION_FREQUENCIES ) if is_true_unknown: - coding_loc = max( - content_sample.count("\n") + (1 if content_sample else 0), 1 - ) + coding_loc = max(content_sample.count("\n") + (1 if content_sample else 0), 1) spectral_id, spec_intensity = self._tier_4_heuristic_discovery( content_sample, coding_loc, ext, gravity_lang ) @@ -549,12 +503,8 @@ def inspect( if spectral_id == best_lang: best_conf = max(best_conf, 0.95) lock_tier = 0 - source_proof = ( - f"Absolute Consensus (Lexically Verified: {source_proof})" - ) - self.logger.debug( - f"[{name}] Verification Success -> {source_proof}" - ) + source_proof = f"Absolute Consensus (Lexically Verified: {source_proof})" + self.logger.debug(f"[{name}] Verification Success -> {source_proof}") else: if ext in self.COLLISION_FREQUENCIES: best_lang = spectral_id @@ -564,12 +514,12 @@ def inspect( elif lock_tier == 4: if spec_intensity >= self.thresholds.get("FLOOR_TIER_4", 0.92): best_lang, best_conf = spectral_id, spec_intensity - source_proof = f"Heuristic Discovery (Passed {self.thresholds.get('FLOOR_TIER_4', 0.92)} Baseline)" - else: - best_lang, best_conf = "undeterminable", spec_intensity source_proof = ( - f"Failed Discovery Baseline ({spec_intensity:.2f})" + f"Heuristic Discovery (Passed 
{self.thresholds.get('FLOOR_TIER_4', 0.92)} Baseline)" ) + else: + best_lang, best_conf = "undeterminable", spec_intensity + source_proof = f"Failed Discovery Baseline ({spec_intensity:.2f})" elif lock_tier >= 2: if spec_intensity > best_conf: best_lang = spectral_id @@ -591,33 +541,22 @@ def inspect( else: source_proof += " (Unverified Lexical Score)" - self.logger.debug( - f"[{name}] Final Classification -> '{best_lang}' (Tier: {lock_tier} | Conf: {best_conf:.2f})" - ) + self.logger.debug(f"[{name}] Final Classification -> '{best_lang}' (Tier: {lock_tier} | Conf: {best_conf:.2f})") - if ( - best_lang not in ("undeterminable", "plaintext", "unknown") - and content_sample - ): + if best_lang not in ("undeterminable", "plaintext", "unknown") and content_sample: result["lang_mix"] = self._detect_hybrids(content_sample, best_lang) - return self._forge_result( - best_lang, best_conf, lock_tier, source_proof, result, content_sample - ) + return self._forge_result(best_lang, best_conf, lock_tier, source_proof, result, content_sample) def _evaluate_ecosystem_gravity( self, file_path: Union[str, Path], ext: str, global_tally: Dict[str, int] ) -> Tuple[Optional[str], float]: """ - Resolves identical extension collisions (e.g., .h) by surveying the surrounding + Resolves identical extension collisions (e.g., .h) by surveying the surrounding directory neighborhood for dominating implementation languages (C vs C++ vs Obj-C). """ # 1. 
GATHER CANDIDATES - candidates = [ - lid - for lid, data in self.languages.items() - if ext in data.get("extensions", []) - ] + candidates = [lid for lid, data in self.languages.items() if ext in data.get("extensions", [])] if ext == ".h" and "cpp" not in candidates and "cpp" in self.languages: candidates.append("cpp") @@ -631,12 +570,8 @@ def _evaluate_ecosystem_gravity( parent_dir = Path(file_path).parent for child in parent_dir.iterdir(): if child.is_file(): - local_tally[child.suffix.lower()] = ( - local_tally.get(child.suffix.lower(), 0) + 1 - ) - local_tally[child.name.lower()] = ( - local_tally.get(child.name.lower(), 0) + 1 - ) + local_tally[child.suffix.lower()] = local_tally.get(child.suffix.lower(), 0) + 1 + local_tally[child.name.lower()] = local_tally.get(child.name.lower(), 0) + 1 except Exception: pass @@ -652,11 +587,7 @@ def _evaluate_ecosystem_gravity( data = self.languages.get(lid, {}) support_exts = [e for e in data.get("extensions", []) if e != ext] - base_contributors = { - e: tally.get(e.lower(), 0) - for e in support_exts - if tally.get(e.lower(), 0) > 0 - } + base_contributors = {e: tally.get(e.lower(), 0) for e in support_exts if tally.get(e.lower(), 0) > 0} # Single-Extension Ecosystem Support (e.g., MATLAB) if sum(base_contributors.values()) == 0: @@ -666,17 +597,13 @@ def _evaluate_ecosystem_gravity( discriminators = data.get("discriminators", []) discrim_contributors = { - d: tally.get(d.lower(), 0) - for d in discriminators - if tally.get(d.lower(), 0) > 0 + d: tally.get(d.lower(), 0) for d in discriminators if tally.get(d.lower(), 0) > 0 } discrim_mass = sum(discrim_contributors.values()) disqualifiers = data.get("disqualifiers", []) toxic_contributors = { - dq: tally.get(dq.lower(), 0) - for dq in disqualifiers - if tally.get(dq.lower(), 0) > 0 + dq: tally.get(dq.lower(), 0) for dq in disqualifiers if tally.get(dq.lower(), 0) > 0 } toxic_mass = sum(toxic_contributors.values()) @@ -713,16 +640,12 @@ def _evaluate_ecosystem_gravity( 
if ext == ".h" and set(scores.keys()).issubset({"c", "cpp", "objective-c"}): if dominance >= 0.55: - dominance = max( - dominance, self.thresholds.get("ECOSYSTEM_DOMINANCE_MIN", 0.70) - ) + dominance = max(dominance, self.thresholds.get("ECOSYSTEM_DOMINANCE_MIN", 0.70)) # Evaluate if this scope produced a statistical winner threshold = self.thresholds.get("ECOSYSTEM_DOMINANCE_MIN", 0.70) if scope_name == "Local": - threshold = ( - 0.60 # Local folders need slightly less dominance to prove intent - ) + threshold = 0.60 # Local folders need slightly less dominance to prove intent if dominance >= threshold: if self.logger.isEnabledFor(logging.DEBUG): @@ -750,9 +673,7 @@ def _tier_2_fingerprint_check(self, content: str, ext: str) -> Optional[str]: # 1. Standard Executable Shebang Check if content.startswith("#!"): first_line = content.split("\n", 1)[0].lower() - self.logger.debug( - f"Fingerprint Scan: Analyzing shebang line: '{first_line.strip()}'" - ) + self.logger.debug(f"Fingerprint Scan: Analyzing shebang line: '{first_line.strip()}'") for lang_id, data in self.languages.items(): for trigger in data.get("shebangs", []): @@ -760,8 +681,8 @@ def _tier_2_fingerprint_check(self, content: str, ext: str) -> Optional[str]: return lang_id # 2. INTERNAL DISCRIMINATOR (Collision Resolution Only) - # DEFENSIVE GUARD: Internal discriminators are strictly for resolving known - # extension collisions (e.g., Obj-C vs MATLAB .m files). They MUST NOT be used + # DEFENSIVE GUARD: Internal discriminators are strictly for resolving known + # extension collisions (e.g., Obj-C vs MATLAB .m files). They MUST NOT be used # as global scanners for extensionless files, as their regexes are highly specific. if ext: for lang_id, data in self.languages.items(): @@ -785,14 +706,12 @@ def _tier_3_lexical_scan( """ The Strict Boundary Scanner. Evaluates the specific structural syntax of a file to verify a claimed extension. 
- If a file has an extension, it MUST be claimed by one of the known languages + If a file has an extension, it MUST be claimed by one of the known languages for that extension; it is not allowed to randomly match an unrelated schema. """ candidates = [] if ext: - candidates = [ - l for l, d in self.languages.items() if ext in d.get("extensions", []) - ] + candidates = [l for l, d in self.languages.items() if ext in d.get("extensions", [])] # --- DEFENSIVE GUARD: STRICT BOUNDARY --- if not candidates: @@ -817,9 +736,7 @@ def _tier_3_lexical_scan( family = data.get("lexical_family") # Syntax Disqualification Phase - if family in self.DISQUALIFIERS and self.DISQUALIFIERS[family].search( - content - ): + if family in self.DISQUALIFIERS and self.DISQUALIFIERS[family].search(content): continue raw_score = 0.0 @@ -841,12 +758,8 @@ def _tier_3_lexical_scan( raw_score *= 1.25 # Comment Delimiter Bonus - family_key = data.get("lexical_family", "c_style_comment") - delims = ( - self.comment_defs.get("mechanical_families", {}) - .get(family_key, {}) - .get("delimiters", []) - ) + family_key = data.get("lexical_family", "standard_block") + delims = self.lexical_heuristics.get("lexical_families", {}).get(family_key, {}).get("delimiters", []) for d in delims: if d in content: raw_score += 15.0 @@ -881,13 +794,11 @@ def _tier_4_heuristic_discovery( ) -> Tuple[str, float]: """ Heuristic Discovery for unknown or extensionless files. - Prioritizes graceful failure over blind guessing by enforcing a strict 1.5x margin + Prioritizes graceful failure over blind guessing by enforcing a strict 1.5x margin between the leading language candidate and the runner-up. """ if coding_loc < self.thresholds.get("TIER_4_MIN_LINES", 20): - self.logger.debug( - f"Tier 4 Discovery aborted: Insufficient physical mass ({coding_loc} < 20 lines)." 
- ) + self.logger.debug(f"Tier 4 Discovery aborted: Insufficient physical mass ({coding_loc} < 20 lines).") return "plaintext", 0.40 loc = max(coding_loc, 1) @@ -897,49 +808,38 @@ def _tier_4_heuristic_discovery( # PHASE 1: Comment Family Isolation # ========================================================================= family_scores = {} - mechanical_families = self.comment_defs.get("mechanical_families", {}) + lexical_families = self.lexical_heuristics.get("lexical_families", {}) - if mechanical_families: - for fam_key, fam_data in mechanical_families.items(): + if lexical_families: + for fam_key, fam_data in lexical_families.items(): delims = fam_data.get("delimiters", []) family_scores[fam_key] = sum(content.count(d) for d in delims) else: - # Fallback for the 8 standardized mechanical delimiters if not externally defined - # Safely breaking apart the XML delimiter to prevent markdown render crashes + # Fallback for the 5 standardized lexical families if not externally defined xml_delim = "<" + "!--" family_scores = { - "c_style_comment": content.count("//") + content.count("/*"), - "single_line_only": content.count("#") + content.count(";"), - "multi_style_dash": content.count("--"), - "xml_angle": content.count(xml_delim), - "tex_percent": content.count("%"), - "bat_rem": len(re.findall(r"(?im)^REM\b", content)), - "quote_string": content.count('"""') + content.count("'''"), + "standard_block": content.count("//") + content.count("/*") + content.count("--"), + "recursive_block": content.count("//") + content.count("/*"), + "line_exclusive": content.count("#") + content.count(";"), + "block_exclusive": content.count(xml_delim), + "positional_anchored": content.count("*>") + content.count("!"), } winning_family = max(family_scores, key=family_scores.get, default=None) # Fail gracefully if no comments/structure exist to establish a lexical family if not winning_family or family_scores.get(winning_family, 0) == 0: - self.logger.debug( - "Tier 4 [Phase 1]: 
Failed to establish a lexical comment family (No delimiters found)." - ) + self.logger.debug("Tier 4 [Phase 1]: Failed to establish a lexical comment family (No delimiters found).") return "undeterminable", 0.0 self.logger.debug( f"Tier 4 [Phase 1]: Lexical Family Isolated -> '{winning_family}' (Score: {family_scores[winning_family]})" ) - candidates = [ - lid - for lid, data in self.languages.items() - if data.get("lexical_family") == winning_family - ] + candidates = [lid for lid, data in self.languages.items() if data.get("lexical_family") == winning_family] if not candidates: - self.logger.debug( - f"Tier 4 [Phase 1]: No candidate languages found for family '{winning_family}'." - ) + self.logger.debug(f"Tier 4 [Phase 1]: No candidate languages found for family '{winning_family}'.") return "undeterminable", 0.0 # ========================================================================= @@ -948,24 +848,16 @@ def _tier_4_heuristic_discovery( surviving_candidates = [] for lid in candidates: family_key = self.languages.get(lid, {}).get("lexical_family") - if family_key in self.DISQUALIFIERS and self.DISQUALIFIERS[ - family_key - ].search(content): - self.logger.debug( - f"Tier 4 [Phase 2]: Pruning '{lid}' via Heuristic Blacklist." - ) + if family_key in self.DISQUALIFIERS and self.DISQUALIFIERS[family_key].search(content): + self.logger.debug(f"Tier 4 [Phase 2]: Pruning '{lid}' via Heuristic Blacklist.") continue # Pruned by specific anti-patterns surviving_candidates.append(lid) if not surviving_candidates: - self.logger.debug( - "Tier 4 [Phase 2]: All candidates pruned by heuristic blacklist." 
- ) + self.logger.debug("Tier 4 [Phase 2]: All candidates pruned by heuristic blacklist.") return "undeterminable", 0.0 - self.logger.debug( - f"Tier 4 [Phase 2]: Surviving candidates -> {surviving_candidates}" - ) + self.logger.debug(f"Tier 4 [Phase 2]: Surviving candidates -> {surviving_candidates}") # ========================================================================= # PHASE 3: Structural Density Scan @@ -983,15 +875,10 @@ def _tier_4_heuristic_discovery( continue # ---> DEFENSIVE GUARD: REGEX BACKTRACKING PREVENTION <--- - # Aborts execution on extremely greedy, non-terminating patterns + # Aborts execution on extremely greedy, non-terminating patterns # that would lock the CPU during multi-line heuristic scanning. raw_pat = getattr(regex, "pattern", str(regex)) - clean_pat = ( - raw_pat.replace("(?i)", "") - .replace("(?m)", "") - .replace("(?s)", "") - .strip() - ) + clean_pat = raw_pat.replace("(?i)", "").replace("(?m)", "").replace("(?s)", "").strip() if clean_pat in ("", "()", "(?:)", "^", "$"): continue @@ -1031,27 +918,21 @@ def _tier_4_heuristic_discovery( # PHASE 4: The Density Equation (Hits / loc) # ===================================================================== density_scores[lid] = regex_hits / loc - + # Record execution time to penalize extremely slow, backtracking regex evaluations friction_scores[lid] = time.time() - t_start if not density_scores: - self.logger.debug( - "Tier 4 [Phase 3]: No structural signals detected for any candidate." 
- ) + self.logger.debug("Tier 4 [Phase 3]: No structural signals detected for any candidate.") return "undeterminable", 0.0 sorted_scores = sorted(density_scores.items(), key=lambda x: x[1], reverse=True) top_id, top_density = sorted_scores[0] - self.logger.debug( - f"Tier 4 [Phase 3]: Top signals -> {[(k, round(v, 4)) for k, v in sorted_scores[:3]]}" - ) + self.logger.debug(f"Tier 4 [Phase 3]: Top signals -> {[(k, round(v, 4)) for k, v in sorted_scores[:3]]}") if top_density == 0.0: - self.logger.debug( - "Tier 4 [Phase 3]: Top density is 0.0. Failing gracefully." - ) + self.logger.debug("Tier 4 [Phase 3]: Top density is 0.0. Failing gracefully.") return "undeterminable", 0.0 # ========================================================================= @@ -1073,9 +954,7 @@ def _tier_4_heuristic_discovery( if density_margin >= 1.5: # If it's vastly slower, the regex engine is likely thrashing on false positives if friction_ratio > 5.0: - self.logger.warning( - f"Tier 4 [Reconciliation]: TEMPORAL FRICTION ANOMALY on {top_id}..." - ) + self.logger.warning(f"Tier 4 [Reconciliation]: TEMPORAL FRICTION ANOMALY on {top_id}...") return "undeterminable", 0.0 return top_id, top_density @@ -1087,25 +966,19 @@ def _tier_4_heuristic_discovery( f"Tier 4 [Reconciliation]: Collision. {top_id} density margin ({density_margin:.2f}x) was too weak, and friction ratio ({friction_ratio:.2f}x) failed to break the tie." ) return "undeterminable", 0.0 - self.logger.debug( - f"Tier 4 [Reconciliation]: Friction Tie-Breaker utilized for {top_id}." - ) + self.logger.debug(f"Tier 4 [Reconciliation]: Friction Tie-Breaker utilized for {top_id}.") return top_id, top_density # 3. 
Absolute Ambiguity Resolution else: - if ext == ".h" and {top_id, runner_up_id}.issubset( - {"c", "cpp", "objective-c"} - ): + if ext == ".h" and {top_id, runner_up_id}.issubset({"c", "cpp", "objective-c"}): if gravity_lang in {"c", "cpp", "objective-c"}: self.logger.debug( f"Tier 4 [Reconciliation]: C/C++ Tie broken by Ecosystem Consensus -> {gravity_lang}" ) return gravity_lang, top_density # If no consensus exists, default to C as the lowest-level structural base - self.logger.debug( - "Tier 4 [Reconciliation]: C/C++ Tie broken by default architectural base -> c" - ) + self.logger.debug("Tier 4 [Reconciliation]: C/C++ Tie broken by default architectural base -> c") return "c", top_density return "undeterminable", 0.0 @@ -1145,7 +1018,7 @@ def _forge_result( def _capture_raw_signal(self, file_path: Union[str, Path]) -> str: """ DEFENSIVE GUARD: Restricts I/O memory allocation to 50KB. - Prevents Out-Of-Memory (OOM) crashes if the user accidentally points the + Prevents Out-Of-Memory (OOM) crashes if the user accidentally points the analyzer at massive log dumps or multi-gigabyte auto-generated monoliths. 
""" try: @@ -1155,9 +1028,7 @@ def _capture_raw_signal(self, file_path: Union[str, Path]) -> str: self.logger.error(f"Hardware/IO failure reading '{file_path}': {str(e)}") raise FocusingError(f"Failed to focus lens on {file_path}") from e - def _find_balanced_end( - self, text: str, start_pos: int, opener: str, closer: str - ) -> int: + def _find_balanced_end(self, text: str, start_pos: int, opener: str, closer: str) -> int: depth = 0 in_string: Optional[str] = None limit = min( @@ -1217,18 +1088,13 @@ def _detect_hybrids(self, content: str, primary_id: str) -> List[Dict[str, Any]] if t["pair"]: open_char, close_char = t["pair"] - end_idx = self._find_balanced_end( - content, t["start"], open_char, close_char - ) + end_idx = self._find_balanced_end(content, t["start"], open_char, close_char) else: search_limit = min( - t["trigger_end"] - + self.thresholds.get("HANDSHAKE_LOOKAHEAD_LIMIT", 50000), + t["trigger_end"] + self.thresholds.get("HANDSHAKE_LOOKAHEAD_LIMIT", 50000), total_len, ) - end_match = t["end_pattern"].search( - content, pos=t["trigger_end"], endpos=search_limit - ) + end_match = t["end_pattern"].search(content, pos=t["trigger_end"], endpos=search_limit) end_idx = end_match.end() if end_match else total_len segment_len = end_idx - t["start"] @@ -1245,4 +1111,4 @@ def _detect_hybrids(self, content: str, primary_id: str) -> List[Dict[str, Any]] if pct >= 1.0: mix.append({"id": lid, "pct": pct}) - return sorted(mix, key=lambda x: x["pct"], reverse=True) \ No newline at end of file + return sorted(mix, key=lambda x: x["pct"], reverse=True) diff --git a/gitgalaxy/standards/language_standards.py b/gitgalaxy/standards/language_standards.py index 266246e1..80147c26 100644 --- a/gitgalaxy/standards/language_standards.py +++ b/gitgalaxy/standards/language_standards.py @@ -11,7 +11,7 @@ """ language_standards.py -Phase 2 & 3: The Optical Registry & Syntax Dictionaries. +Phase 2 & 3: The Lexical Registry & Syntax Dictionaries. 
This file contains the compiled regular expressions, mechanical delimiters, and language-specific rules used to physically slice, parse, and identify @@ -19,7 +19,7 @@ """ # ------------------------------------------------------------------------------ -# 1. LENS CONFIGURATION (Language Identification & Disambiguation) +# 1. STRUCTURAL SIGNATURE CONFIGURATION (Language Identification & Disambiguation) # Consumed by: language_lens.py # ------------------------------------------------------------------------------ LENS_CONFIG = { @@ -194,7 +194,7 @@ GLOBAL_FRAGILE_DEBT = re.compile(f"{_SPACED_FRAGILE}|{_DENSE_FRAGILE}", re.I) # ------------------------------------------------------------------------------ -# 4. LANGUAGE DEFINITIONS (The Core Optical Matrix) +# 4. LANGUAGE DEFINITIONS (The Structural Signature Matrix) # Consumed by: detector.py, language_lens.py, prism.py # ------------------------------------------------------------------------------ LANGUAGE_DEFINITIONS = { @@ -255,7 +255,7 @@ # UPGRADED: Maps to Family 3 (Pure Hash) # Rationale: Uses '#' for line-level literature; multi-line literature # (docstrings) is handled by the Section 2.3.C.3 Heuristic Pass. - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { # ===================================================================== # [ CRITICAL ROADMAP: JSONC/JSON5 LEXICAL DELIMITERS & THE RE.COMPILE TRAP ] @@ -273,12 +273,10 @@ # JSONC/JSON5 multi-line blocks use standard C-style delimiters. "_block_start": re.compile(r"/\*"), "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Includes match/case (3.10+) and logical short-circuits. EXCLUDES exceptions. - "branch": re.compile( - r"\b(if|elif|else|for|while|with|try|finally|match|case|and|or)\b" - ), + "branch": re.compile(r"\b(if|elif|else|for|while|with|try|finally|match|case|and|or)\b"), # 2. 
args (Parameters / Coupling) # Signatures for def/lambda. Bounded generics [^\]]* and params [^)]*. "args": re.compile( @@ -287,7 +285,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: _private (encapsulation) and Final (freeze_hits). - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(def|class|return|import|from|as|pass|continue|break|await|assert|del|global|nonlocal|type)\b" ), # 4. func_start (Executable Logic Anchors) @@ -308,13 +306,13 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Swallowed errors, wildcard imports, and Any bypasses. - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\bpass\b[ \t]*$|except\s*[:(]|except\s+(?:Base)?Exception|from\s+[\w.]+\s+import\s+\*|#\s*type:\s*ignore|\b(Any|cast)\b|=\s*\[\s*\]|=[ \t]*\{\s*\}", re.M, ), # 8. danger (High-Risk Execution / System Calls) # Process killers and un-sanitized deserialization. EXCLUDES TODO/print. - "danger": re.compile( + "high_risk_execution": re.compile( r"\b(eval|exec|subprocess\.(?:call|Popen|run)|os\.system|pickle\.loads?|yaml\.unsafe_load|shell=True)\b" ), # 9. io (I/O & Network Boundaries) @@ -329,21 +327,17 @@ ), # 11. flux (State Mutation) # State mutation. Includes Walrus operator and collection mutators. - "flux": re.compile( + "state_mutation": re.compile( r"\bglobal\b|\bnonlocal\b|\b(?:self|cls)\.\w+[ \t]*=|:=|(?:\.\w+)?\.(?:append|extend|update|pop|remove|insert|clear)\s*\(" ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"#[ \t]*(?:def|class|import|if|for|while|try|return)\b" - ), + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"#[ \t]*(?:def|class|import|if|for|while|try|return)\b"), # 13. doc (Structured Documentation) "doc": re.compile( r'"""|\'\'\'|:param|:return|:raises|:type|\b(?:Args|Returns|Yields|Raises|Attributes):\b' ), # 14. 
test (Testing & Assertions) - "test": re.compile( - r"\b(unittest|pytest|TestCase|fixture|patch)\b|def[ \t]+test_|\bassert\b|\bMock\b" - ), + "test": re.compile(r"\b(unittest|pytest|TestCase|fixture|patch)\b|def[ \t]+test_|\bassert\b|\bMock\b"), # --- PHASE 3: SPECIALIZED SENSORS (Architecture & Hidden Complexity) --- # 15. concurrency (Asynchronous Execution) "concurrency": re.compile( @@ -356,9 +350,7 @@ # 17. closures (Closures / Anonymous Functions) "closures": re.compile(r"\blambda\b"), # 18. globals (Global / Shared State) - "globals": re.compile( - r"\b(os\.environ|sys\.argv|sys\.path|globals\(\)|locals\(\))\b" - ), + "globals": re.compile(r"\b(os\.environ|sys\.argv|sys\.path|globals\(\)|locals\(\))\b"), # 19. decorators (Decorators / Annotations) "decorators": re.compile(r"^[ \t]*@[\w.]+", re.M), # 20. generics (Generics / Type Parameters) @@ -366,9 +358,7 @@ r"\b(List|Dict|Set|Tuple|Optional|Union|TypeVar|Generic|Any|Callable|Mapping)\b\[[^\]]*\]|\b(list|dict|set|tuple|type)\[[^\]]*\]|->" ), # 21. comprehensions (Iterators / Comprehensions) - "comprehensions": re.compile( - r"\.(?:map|filter|reduce|flatMap|some|every|find|forEach|groupBy)\s*\(" - ), + "comprehensions": re.compile(r"\.(?:map|filter|reduce|flatMap|some|every|find|forEach|groupBy)\s*\("), # Expanded to include LLM orchestration tools for the Agentic Shield "scientific": re.compile( r"\b(?:import|require|from)\b.*?(?:tensorflow|torch|keras|numpy|pandas|scipy|sklearn|matplotlib|opencv|cv2|langchain|openai|anthropic|llama_index|chromadb|pinecone)\b" @@ -381,7 +371,7 @@ ), # 23. heat_triggers (Metaprogramming & Reflection) # Metaprogramming and class-level binding. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"__(?:getattr|setattr|del|call|new|metaclass|dict|dir|import)__|@(?:staticmethod|classmethod|property)|\b(?:getattr|setattr|inspect\.)\b" ), # 24. 
import (Dependency Inclusions) @@ -395,24 +385,18 @@ r"\b(?:__import__|importlib\.import_module)\s*\(\s*['\"]([a-zA-Z0-9_.]+)['\"]", # Group 3: __import__('X') re.M, ), - "_named_token_capture": re.compile( - r"^[ \t]*from\s+[\w.]+\s+import\s+([^({\n]+)", re.M - ), + "_named_token_capture": re.compile(r"^[ \t]*from\s+[\w.]+\s+import\s+([^({\n]+)", re.M), # 25. ownership (Authorship Metadata) - "ownership": re.compile( - r"(?:__author__[ \t]*=|Author:|Created by:)\s*(.*)", re.I - ), + "ownership": re.compile(r"(?:__author__[ \t]*=|Author:|Created by:)\s*(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(render_template|HttpResponse|JSONResponse|TemplateResponse|WSGIApplication|ASGIApplication)\b" @@ -438,41 +422,31 @@ "telemetry": re.compile( r"\b(logging|logger|structlog|sentry_sdk|datadog|loguru)\.(?:info|error|warn|warning|debug|trace|log|exception|critical)\b" ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile(r"\b(print|input)\s*\("), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( - r"\b(int|str|float|list|dict|set|tuple|bool|bytes|cast)\b\s*\(" - ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(raise|quit|exit|sys\.exit|abort)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\b(time\.sleep|asyncio\.sleep|Thread\.join)\b"), - # 43. 
bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>|\^|~"), - # 44. sync_locks (Thread Synchronization / Locks) - "sync_locks": re.compile( - r"\b(Lock|RLock|Semaphore|BoundedSemaphore|Event|Condition|Barrier)\b" - ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile(r"\b(Final|frozenset|mappingproxy|immutable)\b"), + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\b(print|input)\s*\("), + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile(r"\b(int|str|float|list|dict|set|tuple|bool|bytes|cast)\b\s*\("), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(raise|quit|exit|sys\.exit|abort)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(time\.sleep|asyncio\.sleep|Thread\.join)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"<<|>>|\^|~"), + # 44. sync_locks (Resource Management & Stability) + "sync_locks": re.compile(r"\b(Lock|RLock|Semaphore|BoundedSemaphore|Event|Condition|Barrier)\b"), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(Final|frozenset|mappingproxy|immutable)\b"), # 46. cleanup (Resource Cleanup / Teardown) "cleanup": re.compile(r"\b(close|__exit__|del|shutdown|cleanup)\b\s*\("), # 47. encapsulation (Access Modifiers / Encapsulation) # Captures protected/private members via underscore convention. "encapsulation": re.compile(r"\b_[a-zA-Z_]\w*\b"), # 48. listeners (Event Listeners / Observers) - "listeners": re.compile( - r"\b(on_event|add_listener|subscribe|callback|handler)\b" - ), + "listeners": re.compile(r"\b(on_event|add_listener|subscribe|callback|handler)\b"), # 49. 
test_skip (Bypassed Tests / Ignored Specs) - "test_skip": re.compile( - r"\b(pytest\.mark\.skip|unittest\.skip|mock\.|MagicMock)\b" - ), + "test_skip": re.compile(r"\b(pytest\.mark\.skip|unittest\.skip|mock\.|MagicMock)\b"), # --- NEW: ADVANCED ALGORITHMIC SENSORS --- - "lazy_evaluation": re.compile( - r"\b(yield|yield\s+from|Generator|AsyncGenerator|Iterator|AsyncIterator)\b" - ), + "lazy_evaluation": re.compile(r"\b(yield|yield\s+from|Generator|AsyncGenerator|Iterator|AsyncIterator)\b"), "vectorized_math": re.compile( r"\b(einsum|matmul|tensordot|vdot|bmm)\b|\.dot\s*\(|(?<=[a-zA-Z0-9_\]\)])\s*@\s*(?=[a-zA-Z0-9_\[\(])" ), @@ -480,19 +454,11 @@ "serialization_parsing": re.compile( r"\b(pickle\.loads?|pickle\.Unpickler|marshal\.loads?|ast\.literal_eval)\b" ), - "regex_execution": re.compile( - r"\b(re\.compile|re\.search|re\.match|re\.sub|re\.findall|re\.split)\b" - ), - "time_date_logic": re.compile( - r"\b(datetime\.datetime|timedelta|time\.sleep|time\.time|calendar)\b" - ), - "ipc_rpc_bridges": re.compile( - r"\b(multiprocessing|subprocess|xmlrpc|socketserver)\b" - ), + "regex_execution": re.compile(r"\b(re\.compile|re\.search|re\.match|re\.sub|re\.findall|re\.split)\b"), + "time_date_logic": re.compile(r"\b(datetime\.datetime|timedelta|time\.sleep|time\.time|calendar)\b"), + "ipc_rpc_bridges": re.compile(r"\b(multiprocessing|subprocess|xmlrpc|socketserver)\b"), # --- PHASE 4: APPSEC & AI SENSORS (Zero-Trust Pipelines) --- - "memory_scraping": re.compile( - r"['\"]/proc/['\"]\s*\+\s*(?:str\([^)]*\)|f?['\"]\{[^}]*\})|/proc/\w+/mem" - ), + "memory_scraping": re.compile(r"['\"]/proc/['\"]\s*\+\s*(?:str\([^)]*\)|f?['\"]\{[^}]*\})|/proc/\w+/mem"), "exfiltration_camouflage": re.compile( r"\b(requests\.post|urllib\.request|httpx\.post)\s*\([^)]*(?:checkmarx|telemetry|metrics|audit|log)\b", re.I, @@ -542,9 +508,9 @@ # UPGRADED: Maps to Family 1 (Standard C) # Rationale: Uses '//' for line-level literature; multi-line literature # (/* */) is handled by the Section 2.3.C.3 
Heuristic Pass. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # Standard C-family line comment token (Includes JSDoc // style) "_line_anchor": re.compile(r"//"), # Inline comments follow the standard '//' delimiter. @@ -553,8 +519,8 @@ "_block_start": re.compile(r"/\*"), # Block comment end: */ "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Decisions and logical jumps. EXCLUDES throw (bailout_hits). "branch": re.compile( @@ -582,9 +548,7 @@ ), # 3. linear (Sequential Boundaries) # Structural declaration boundaries. EXCLUDES: Access modifiers (encapsulation) and const (freeze_hits). - "linear": re.compile( - r"\b(let|var|import|export|return|class|extends|super|await|delete)\b|=>" - ), + "structural_boundaries": re.compile(r"\b(let|var|import|export|return|class|extends|super|await|delete)\b|=>"), # 4. func_start (Executable Logic Anchors) # Uses positive lookaheads (?=) to stop the match exactly at the identifier name. # Captures standard functions, namespace assignments (foo.bar = function), @@ -619,12 +583,10 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Loose equality and bypasses. - "safety_neg": re.compile( - r"==(?!=)|!=(?!=)|\b(with|void)\b|eslint-disable|@ts-nocheck" - ), + "safety_bypasses": re.compile(r"==(?!=)|!=(?!=)|\b(with|void)\b|eslint-disable|@ts-nocheck"), # 8. danger (High-Risk Execution / System Calls) # Catastrophic vulnerabilities. EXCLUDES console.log (print_hits) and TODO (debt). - "danger": re.compile( + "high_risk_execution": re.compile( r"\b(eval|document\.write|innerHTML|outerHTML|dangerouslySetInnerHTML|debugger|alert|process\.exit)\b" ), # 9. 
io (I/O & Network Boundaries) @@ -633,22 +595,16 @@ ), # 10. api (Public Surface Area) # Exposure surface. Explicit exports + implicit architectural defaults. - "api": re.compile( - r"\b(export|module\.exports|exports\.)\b|@(Controller|Resolver|Get|Post|Put|Delete)\b" - ), + "api": re.compile(r"\b(export|module\.exports|exports\.)\b|@(Controller|Resolver|Get|Post|Put|Delete)\b"), # 11. flux (State Mutation) # Mutation of state. EXCLUDES const (freeze_hits). - "flux": re.compile( + "state_mutation": re.compile( r"\b(let|var|this\.|setState|mut|push|pop|shift|unshift|splice|sort|reverse|\.current[ \t]*=|\.set\(|\.delete\(|\.add\()\b" ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"//[ \t]*(?:if|for|while|function|class|return|var|const|let|import)\b" - ), + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"//[ \t]*(?:if|for|while|function|class|return|var|const|let|import)\b"), # 13. doc (Structured Documentation) - "doc": re.compile( - r"/\*\*|@param|@return|@throws|@deprecated|@typedef|@type|@template" - ), + "doc": re.compile(r"/\*\*|@param|@return|@throws|@deprecated|@typedef|@type|@template"), # 14. test (Testing & Assertions) "test": re.compile( r"\b(describe|expect|assert|beforeEach|afterEach|jest|mocha|vitest|cy\.)\b|\b(?:it|test)\s*\(" @@ -663,22 +619,16 @@ r'<[A-Z]\w+|className=|use(?:State|Effect|Context|Reducer|Ref|Memo|Callback|Transition)|props\.|this\.state|document\.(?:getElementById|querySelector|addEventListener)|["\']use\s+(?:client|server)["\']' ), # 17. closures (Closures / Anonymous Functions) - "closures": re.compile( - r"=>[ \t]*\{|\(\)[ \t]*=>|function\s*\([^)]*\)[ \t]*\{" - ), + "closures": re.compile(r"=>[ \t]*\{|\(\)[ \t]*=>|function\s*\([^)]*\)[ \t]*\{"), # 18. 
globals (Global / Shared State) - "globals": re.compile( - r"\b(window\.|global\.|process\.env|document\.|navigator\.|self\.|globalThis\.)\b" - ), + "globals": re.compile(r"\b(window\.|global\.|process\.env|document\.|navigator\.|self\.|globalThis\.)\b"), # 19. decorators (Decorators / Annotations) "decorators": re.compile(r"@\w+"), # 20. generics (Generics / Type Parameters) # Simulated/JSDoc generics in JS. "generics": re.compile(r"@template\s+\w+|/\*\*\s*@type\s*(?:\{|<\w+)"), # 21. comprehensions (Iterators / Comprehensions) - "comprehensions": re.compile( - r"\.(?:map|filter|reduce|flatMap|some|every|find|forEach|groupBy)\s*\(" - ), + "comprehensions": re.compile(r"\.(?:map|filter|reduce|flatMap|some|every|find|forEach|groupBy)\s*\("), # Expanded to include LLM orchestration tools for the Agentic Shield "scientific": re.compile( r"\b(?:import|require|from)\b.*?(?:tensorflow|torch|keras|numpy|pandas|scipy|sklearn|matplotlib|opencv|cv2|langchain|openai|anthropic|llama_index|chromadb|pinecone)\b" @@ -690,7 +640,7 @@ r"\b(?:import|require|from)\b.*?(?:crypto|bcrypt|x509|tls|ssl|jsonwebtoken|argon2)\b" ), # 23. heat_triggers (Metaprogramming & Reflection) - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(arguments\.|prototype|__proto__|Object\.assign|Reflect|Proxy|Object\.defineProperty|\.bind\(|\.call\(|\.apply\()\b" ), # 24. import (Dependency Inclusions) @@ -721,9 +671,7 @@ r"(?:import|export)\b[^;]*?\bfrom\s*['\"]([^'\"]+)['\"]|\b(?:require|import)\s*\(\s*['\"]([^'\"]+)['\"]", re.M, ), - "_named_token_capture": re.compile( - r"(?:import|export)\s+\{([^}]+)\}", re.M - ), + "_named_token_capture": re.compile(r"(?:import|export)\s+\{([^}]+)\}", re.M), # 25. ownership (Authorship Metadata) "ownership": re.compile(r"(?:@author|Created by)\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- @@ -732,28 +680,22 @@ # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. 
spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(getServerSideProps|getStaticProps|getInitialProps|renderToString|hydrateRoot)\b" ), # 32. events (Event Emitters / Pub-Sub) - "events": re.compile( - r"\b(emit|on|once|off|dispatchEvent|EventEmitter|EventTarget)\b" - ), + "events": re.compile(r"\b(emit|on|once|off|dispatchEvent|EventEmitter|EventTarget)\b"), # 33. dependency_injection (Dependency Injection / IoC) - "dependency_injection": re.compile( - r"\b(Inject|Injectable|Container|resolve|register|inversify)\b" - ), + "dependency_injection": re.compile(r"\b(Inject|Injectable|Container|resolve|register|inversify)\b"), # 34. macros "macros": None, # 35. pointers "pointers": None, - # 36. memory_alloc + # 36. memory_alloc "memory_alloc": re.compile(r"\bnew\s+[A-Z]\w*"), # 37. inline_asm "inline_asm": None, @@ -762,54 +704,38 @@ "telemetry": re.compile( r"\b(logger|winston|pino|morgan|datadog|prometheus|newrelic|sentry)\.(?:info|error|warn|debug|trace|log)\b" ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile( - r"\bconsole\.(?:log|warn|error|dir|trace|info|table|time)\b" - ), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( - r"\b(Number|String|Boolean|BigInt|Symbol|Array\.from)\b\s*\(" - ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(throw|abort|process\.exit)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile( - r"\b(sleep|delay|setTimeout|setInterval|Atomics\.wait)\b" - ), - # 43. 
bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>>?|\^|~"), - # 44. sync_locks (Thread Synchronization / Locks) + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\bconsole\.(?:log|warn|error|dir|trace|info|table|time)\b"), + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile(r"\b(Number|String|Boolean|BigInt|Symbol|Array\.from)\b\s*\("), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|abort|process\.exit)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(sleep|delay|setTimeout|setInterval|Atomics\.wait)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"<<|>>>?|\^|~"), + # 44. sync_locks (Resource Management & Stability) "sync_locks": re.compile( r"\b(mutex|lock|synchronized|Semaphore|Atomics\.lock|Atomics\.wait)\b", re.I, ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile( - r"\b(const|readonly|final|Object\.freeze|Object\.seal)\b" - ), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(const|readonly|final|Object\.freeze|Object\.seal)\b"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r"\b(dispose|close|destroy|clearTimeout|clearInterval|removeEventListener|delete)\b" - ), + "cleanup": re.compile(r"\b(dispose|close|destroy|clearTimeout|clearInterval|removeEventListener|delete)\b"), # 47. encapsulation (Access Modifiers / Encapsulation) # JS private fields and keywords. "encapsulation": re.compile(r"\b(private|protected|internal|#)\b"), # 48. listeners (Event Listeners / Observers) - "listeners": re.compile( - r"\b(on|addEventListener|subscribe|watch|effect)\b" - ), + "listeners": re.compile(r"\b(on|addEventListener|subscribe|watch|effect)\b"), # 49. 
test_skip (Bypassed Tests / Ignored Specs) - "test_skip": re.compile( - r"\b(test\.skip|it\.skip|describe\.skip|xit|xdescribe|mock|stub)\b" - ), + "test_skip": re.compile(r"\b(test\.skip|it\.skip|describe\.skip|xit|xdescribe|mock|stub)\b"), # --- NEW: ADVANCED ALGORITHMIC SENSORS --- "lazy_evaluation": re.compile(r"\b(yield|yield\s*\*|function\s*\*)\b"), "vectorized_math": re.compile(r"\b(matmul|dot|cross|multiply)\s*\("), # --- PHASE 3: HYBRID DOMAIN SENSORS (JS/TS Specifics) --- "serialization_parsing": re.compile(r"\b(JSON\.parse|JSON\.stringify)\b"), - "regex_execution": re.compile( - r"\bnew\s+RegExp\b|\.(match|replace|search|split)\s*\(" - ), + "regex_execution": re.compile(r"\bnew\s+RegExp\b|\.(match|replace|search|split)\s*\("), "time_date_logic": re.compile( r"\b(Date\.now|new\s+Date|setTimeout|setInterval|clearTimeout|clearInterval|performance\.now)\b" ), @@ -860,9 +786,9 @@ # UPGRADED: Maps to Family 1 (Standard C) # Rationale: Uses '//' for line-level literature; multi-line literature # (/* */) is handled by the Section 2.3.C.3 Heuristic Pass. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # Standard C-family line comment token (Includes TSDoc /// references) "_line_anchor": re.compile(r"//"), # Inline comments follow the standard '//' delimiter. @@ -871,8 +797,8 @@ "_block_start": re.compile(r"/\*"), # Block comment end: */ "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # EXCLUDES: Exceptions (throw). Includes control flow and logical short-circuits. "branch": re.compile( @@ -886,7 +812,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: Access modifiers (public/private) and Immutability (const). 
- "linear": re.compile( + "structural_boundaries": re.compile( r"\b(var|return|class|interface|type|enum|import|export|await|satisfies|using|namespace|module|implements|extends|declare)\b|=>" ), # 4. func_start (Executable Logic Anchors) @@ -929,12 +855,12 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Force unwrapping, any, and linter bypasses. - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(any)\b|as\s+any|!\s*[;,\n)\]\.]|!\.|@ts-ignore|@ts-expect-error|@ts-nocheck|eslint-disable|as\s+unknown\s+as|" ), # 8. danger (High-Risk Execution / System Calls) # Process killers and catastrophic vulnerabilities. EXCLUDES TODO (debt) and console.log (print). - "danger": re.compile( + "high_risk_execution": re.compile( r"\b(eval|document\.write|innerHTML|outerHTML|dangerouslySetInnerHTML|debugger|alert|process\.exit)\b" ), # 9. io (I/O & Network Boundaries) @@ -948,17 +874,13 @@ ), # 11. flux (State Mutation) # Mutation of state. EXCLUDES const (freeze_hits). - "flux": re.compile( + "state_mutation": re.compile( r"\b(let|var|this\.|setState|push|pop|shift|unshift|splice|sort|reverse|\.current[ \t]*=|\.set\(|\.delete\(|\.add\()\b" ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"//[ \t]*(?:if|for|while|function|class|return|export|import)\b" - ), + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"//[ \t]*(?:if|for|while|function|class|return|export|import)\b"), # 13. doc (Structured Documentation) - "doc": re.compile( - r"/\*\*|@param|@return|@throws|@deprecated|@typedef|@type|@template|@callback" - ), + "doc": re.compile(r"/\*\*|@param|@return|@throws|@deprecated|@typedef|@type|@template|@callback"), # 14. test (Testing & Assertions) # CRITICAL FIX: Negative lookbehind (?[ \t]*\{|\(\)[ \t]*=>|function\s*\([^)]*\)[ \t]*\{" - ), + "closures": re.compile(r"=>[ \t]*\{|\(\)[ \t]*=>|function\s*\([^)]*\)[ \t]*\{"), # 18. 
globals (Global / Shared State) - "globals": re.compile( - r"\b(window\.|global\.|process\.env|document\.|navigator\.|self\.|globalThis\.)\b" - ), + "globals": re.compile(r"\b(window\.|global\.|process\.env|document\.|navigator\.|self\.|globalThis\.)\b"), # 19. decorators (Decorators / Annotations) "decorators": re.compile(r"@\w+(?:\([^)]*\))?"), # 20. generics (Generics / Type Parameters) @@ -988,15 +906,11 @@ r"<\s*[A-Z][^>]*>|\b(?:keyof|infer|extends|Omit|Pick|Partial|Record|Required|Awaited|ReturnType|Parameters|NonNullable)\b" ), # 21. comprehensions (Iterators / Comprehensions) - "comprehensions": re.compile( - r"\.(?:map|filter|reduce|flatMap|some|every|find|forEach|groupBy)\s*\(" - ), + "comprehensions": re.compile(r"\.(?:map|filter|reduce|flatMap|some|every|find|forEach|groupBy)\s*\("), # 22. scientific (Numerical / Compute Libraries) - "scientific": re.compile( - r"\b(Math\.|tf\.|THREE\.|d3\.|gl-matrix|random)\b" - ), + "scientific": re.compile(r"\b(Math\.|tf\.|THREE\.|d3\.|gl-matrix|random)\b"), # 23. heat_triggers (Metaprogramming & Reflection) - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(arguments\.|prototype|__proto__|Object\.assign|Reflect|Proxy|Object\.defineProperty|\.bind\(|\.call\(|\.apply\()\b" ), # 24. import (Dependency Inclusions) @@ -1040,19 +954,15 @@ # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(getServerSideProps|getStaticProps|generateStaticParams|LoaderFunction|ActionFunction)\b" ), # 32. 
events (Event Emitters / Pub-Sub) - "events": re.compile( - r"\b(emit|on|once|off|dispatchEvent|EventEmitter|EventTarget)\b" - ), + "events": re.compile(r"\b(emit|on|once|off|dispatchEvent|EventEmitter|EventTarget)\b"), # 33. dependency_injection (Dependency Injection / IoC) "dependency_injection": re.compile( r"\b(Inject|Injectable|Container|resolve|register|tsyringe|inversify)\b" @@ -1061,7 +971,7 @@ "macros": None, # TypeScript uses transformer plugins/pre-processors, not standard inline macros. # 35. pointers "pointers": None, # Managed memory environment. - # 36. memory_alloc + # 36. memory_alloc "memory_alloc": re.compile(r"\bnew\s+[A-Z]\w*"), # 37. inline_asm "inline_asm": None, @@ -1070,43 +980,31 @@ "telemetry": re.compile( r"\b(logger|winston|pino|morgan|datadog|prometheus|newrelic|sentry)\.(?:info|error|warn|debug|trace|log)\b" ), - # 39. print_hits (The Amateur) - "print_hits": re.compile( - r"\bconsole\.(?:log|warn|error|dir|trace|info|table|time)\b" - ), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile(r"\bas\s+[A-Z]\w*|<\s*[A-Z]\w*\s*>\s*[a-zA-Z_]"), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(throw|fatalError|abort|process\.exit)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile( - r"\b(sleep|delay|setTimeout|setInterval|Atomics\.wait)\b" - ), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>|\^|~"), - # 44. sync_locks (Thread Synchronization / Locks) + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) + "debug_prints": re.compile(r"\bconsole\.(?:log|warn|error|dir|trace|info|table|time)\b"), + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile(r"\bas\s+[A-Z]\w*|<\s*[A-Z]\w*\s*>\s*[a-zA-Z_]"), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|fatalError|abort|process\.exit)\b"), + # 42. 
thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(sleep|delay|setTimeout|setInterval|Atomics\.wait)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"<<|>>|\^|~"), + # 44. sync_locks (Resource Management & Stability) "sync_locks": re.compile( r"\b(mutex|lock|synchronized|Semaphore|Atomics\.lock|Atomics\.wait)\b", re.I, ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile( - r"\b(const|readonly|final|Object\.freeze|Object\.seal)\b" - ), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(const|readonly|final|Object\.freeze|Object\.seal)\b"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r"\b(dispose|close|destroy|clearTimeout|clearInterval|removeEventListener|delete)\b" - ), + "cleanup": re.compile(r"\b(dispose|close|destroy|clearTimeout|clearInterval|removeEventListener|delete)\b"), # 47. encapsulation (Access Modifiers / Encapsulation) "encapsulation": re.compile(r"\b(private|protected|internal|#)\b"), # 48. listeners (Event Listeners / Observers) - "listeners": re.compile( - r"\b(on|addEventListener|subscribe|watch|effect)\b" - ), + "listeners": re.compile(r"\b(on|addEventListener|subscribe|watch|effect)\b"), # 49. 
test_skip (Bypassed Tests / Ignored Specs) - "test_skip": re.compile( - r"\b(test\.skip|it\.skip|describe\.skip|xit|xdescribe|mock|stub)\b" - ), + "test_skip": re.compile(r"\b(test\.skip|it\.skip|describe\.skip|xit|xdescribe|mock|stub)\b"), # --- NEW: ADVANCED ALGORITHMIC SENSORS --- "lazy_evaluation": re.compile( r"\b(yield|yield\s*\*|function\s*\*|Generator|AsyncGenerator|Iterable|AsyncIterable)\b" @@ -1114,9 +1012,7 @@ "vectorized_math": re.compile(r"\b(matmul|dot|cross|multiply)\s*\("), # --- PHASE 3: HYBRID DOMAIN SENSORS (JS/TS Specifics) --- "serialization_parsing": re.compile(r"\b(JSON\.parse|JSON\.stringify)\b"), - "regex_execution": re.compile( - r"\bnew\s+RegExp\b|\.(match|replace|search|split)\s*\(" - ), + "regex_execution": re.compile(r"\bnew\s+RegExp\b|\.(match|replace|search|split)\s*\("), "time_date_logic": re.compile( r"\b(Date\.now|new\s+Date|setTimeout|setInterval|clearTimeout|clearInterval|performance\.now)\b" ), @@ -1162,9 +1058,9 @@ # UPGRADED: Maps to Family 1 (Standard C) # Rationale: Uses '//' for line-level literature; multi-line literature # (/* */) is handled by the Section 2.3.C.3 Heuristic Pass. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # Standard C-family line comment token (Includes Javadoc /**) "_line_anchor": re.compile(r"//"), # Inline comments follow the standard '//' delimiter. @@ -1173,7 +1069,7 @@ "_block_start": re.compile(r"/\*"), # Block comment end: */ "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Includes modern switch expressions (yield) and pattern guards (when). # EXCLUDES: Exceptions (throw) - moved to bailout_hits. @@ -1203,7 +1099,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: Access modifiers (encapsulation) and final (freeze_hits). 
- "linear": re.compile( + "structural_boundaries": re.compile( r"\b(void|return|import|package|class|interface|enum|record|extends|implements|var|sealed|non-sealed|permits|new|throws|module|requires|exports|opens|provides|uses)\b" ), # 4. func_start (Executable Logic Anchors) @@ -1236,12 +1132,12 @@ r"\b(try|catch|finally|assert|Optional|Objects\.requireNonNull|instanceof)\b|@(Valid|Validated|NotNull|NonNull|NotBlank|Immutable|Transactional)\b" ), # 7. safety_neg (Safety Bypasses / Unchecked Types) - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(null)\b|return\s+null|\([A-Z]\w+\)\s*(?!->)[a-zA-Z_$]|catch\s*\(\s*(?:Exception|Throwable)\b|@SuppressWarnings|@SneakyThrows|\.get\(\)" ), # 8. danger (High-Risk Execution / System Calls) # Process killers and raw memory/execution risks. EXCLUDES prints (Phase 5). - "danger": re.compile( + "high_risk_execution": re.compile( r"\b(Runtime\.getRuntime\(\)\.exec|ProcessBuilder|System\.exit|Thread\.stop|Unsafe)\b" ), # 9. io (I/O & Network Boundaries) @@ -1254,13 +1150,11 @@ ), # 11. flux (State Mutation) # Mutation of state. EXCLUDES final (freeze_hits). - "flux": re.compile( + "state_mutation": re.compile( r"\b(volatile|Atomic\w+)\b|^[ \t]*(?:this\.)?\w+[ \t]*=|@(?:Setter|Data)\b|(?:\w+\.)?(?:set[A-Z]\w+|add|put|remove|clear|addAll|replace|computeIfAbsent)\s*\(" ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"//[ \t]*(?:public|private|protected|class|void|if|for|while|return|import)\b" - ), + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"//[ \t]*(?:public|private|protected|class|void|if|for|while|return|import)\b"), # 13. doc (Structured Documentation) "doc": re.compile( r"/\*\*|@param|@return|@throws|@deprecated|@see|@since|@apiNote|@implSpec|@Operation|@Schema" @@ -1298,14 +1192,12 @@ ), # 23. heat_triggers (Metaprogramming & Reflection) # Reflection and dynamic proxies. 
- "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(reflect\.|native|Class\.forName|Method\.invoke|Field\.setAccessible|Proxy\.newProxyInstance|ClassLoader|MethodHandles|VarHandle|Linker\.nativeLinker)\b|@SneakyThrows" ), # 24. import (Dependency Inclusions) "import": re.compile(r"^[ \t]*import\s+(?:static[ \t]+)?[\w.]+;", re.M), - "_dependency_capture": re.compile( - r"^[ \t]*import[ \t\n]+(?:static[ \t\n]+)?([\w.*]+)[ \t\n]*;", re.M - ), + "_dependency_capture": re.compile(r"^[ \t]*import[ \t\n]+(?:static[ \t\n]+)?([\w.*]+)[ \t\n]*;", re.M), # 25. ownership (Authorship Metadata) "ownership": re.compile(r"@author\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- @@ -1314,11 +1206,9 @@ # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(ModelAndView|FacesServlet|HttpServletRequest|HttpServletResponse|@ResponseBody|@ResponseStatus|JspWriter|ThymeleafViewResolver)\b" @@ -1335,10 +1225,8 @@ "macros": None, # Java lacks preprocessor macros. # 35. pointers (Pointer Arithmetic / Memory Addressing) # Project Panama (Java 22+) bridging to native memory. - "pointers": re.compile( - r"\b(MemorySegment|MemoryLayout|ValueLayout|AddressLayout|SymbolLookup)\b" - ), - # 36. memory_alloc + "pointers": re.compile(r"\b(MemorySegment|MemoryLayout|ValueLayout|AddressLayout|SymbolLookup)\b"), + # 36. 
memory_alloc "memory_alloc": re.compile( r"\b(Arena\.ofConfined|Arena\.ofShared|Arena\.ofAuto|Arena\.global|SegmentAllocator|allocateFrom|ByteBuffer\.allocateDirect)\b" ), @@ -1349,58 +1237,44 @@ "telemetry": re.compile( r"\b(log|logger|LOGGER|LoggerFactory|LogManager|MDC|Tracer|Span)\.(?:info|error|warn|warning|debug|trace|log)\b|@Slf4j|@Log4j2" ), - # 39. print_hits (The Amateur) - "print_hits": re.compile( + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) + "debug_prints": re.compile( r"\b(System\.out\.(?:print|println|printf)|System\.err\.(?:print|println|printf)|\.printStackTrace\(\))\b" ), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile( r"\(\s*(?:int|long|short|byte|char|float|double|boolean|[A-Z][A-Za-z0-9_]*)\s*\)\s*[a-zA-Z_$]" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(throw|abort|System\.exit|halt)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile( - r"\b(Thread\.sleep|TimeUnit\.[A-Z_]+\.sleep|delay|CountDownLatch\.await)\b" - ), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>>?|\^|~"), - # 44. sync_locks (Thread Synchronization / Locks) + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|abort|System\.exit|halt)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(Thread\.sleep|TimeUnit\.[A-Z_]+\.sleep|delay|CountDownLatch\.await)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"<<|>>>?|\^|~"), + # 44. sync_locks (Resource Management & Stability) "sync_locks": re.compile( r"\b(mutex|lock|synchronized|Semaphore|ReentrantLock|ReadWriteLock|Condition)\b", re.I, ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile( - r"\b(final|immutable|unmodifiable[A-Z]\w*|Object\.freeze)\b" - ), + # 45. 
immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(final|immutable|unmodifiable[A-Z]\w*|Object\.freeze)\b"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r"\b(close|dispose|shutdown|free|release|cleaner\.register)\b\s*\(" - ), + "cleanup": re.compile(r"\b(close|dispose|shutdown|free|release|cleaner\.register)\b\s*\("), # 47. encapsulation (Access Modifiers / Encapsulation) "encapsulation": re.compile(r"\b(private|protected|internal)\b"), # 48. listeners (Event Listeners / Observers) - "listeners": re.compile( - r"\b(on[A-Z]\w*|addEventListener|subscribe|@KafkaListener|@RabbitListener)\b" - ), + "listeners": re.compile(r"\b(on[A-Z]\w*|addEventListener|subscribe|@KafkaListener|@RabbitListener)\b"), # 49. test_skip (Bypassed Tests / Ignored Specs) - "test_skip": re.compile( - r"@(?:Ignore|Disabled)|test\.skip\(|mock\(|spy\(|verifyZeroInteractions" - ), + "test_skip": re.compile(r"@(?:Ignore|Disabled)|test\.skip\(|mock\(|spy\(|verifyZeroInteractions"), # --- PHASE 3: HYBRID DOMAIN SENSORS (Java Specifics) --- "serialization_parsing": re.compile( r"\b(ObjectMapper|readValue|readTree|fromJson|ObjectInputStream|DocumentBuilder|SAXParser)\b" ), - "regex_execution": re.compile( - r"\b(Pattern\.compile|Matcher\.find|\.matches\()\b" - ), + "regex_execution": re.compile(r"\b(Pattern\.compile|Matcher\.find|\.matches\()\b"), "time_date_logic": re.compile( r"\b(LocalDate(?:Time)?|ZonedDateTime|Instant|Duration|System\.currentTimeMillis|Calendar\.getInstance)\b" ), - "ipc_rpc_bridges": re.compile( - r"\b(ProcessBuilder|KafkaTemplate|RabbitTemplate|JmsTemplate|java\.rmi)\b" - ), + "ipc_rpc_bridges": re.compile(r"\b(ProcessBuilder|KafkaTemplate|RabbitTemplate|JmsTemplate|java\.rmi)\b"), }, }, "csharp": { @@ -1442,9 +1316,9 @@ # UPGRADED: Maps to Family 1 (Standard C) # Rationale: Uses '//' for line-level literature; multi-line literature # (/* */) is handled by the Section 2.3.C.3 Heuristic Pass. 
- "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # Standard C-family line comment token (Includes XML Doc ///) "_line_anchor": re.compile(r"//"), # Inline comments follow the standard '//' delimiter. @@ -1453,7 +1327,7 @@ "_block_start": re.compile(r"/\*"), # Block comment end: */ "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Decisions and logical jumps. EXCLUDES throw (bailout_hits). # Includes pattern matching (and, or, not) and null-coalescing. @@ -1483,7 +1357,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: Access modifiers (encapsulation) and const/readonly (freeze_hits). - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(var|return|class|interface|struct|record|enum|using|namespace|yield|await|delegate|event|init|required|field|implements|extends|declare)\b|=>" ), # 4. func_start (Executable Logic Anchors) @@ -1561,14 +1435,10 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Null-forgiving operator, dynamic, and unsafe bypasses. - "safety_neg": re.compile( - r"!\.|\bnull!|#pragma\s+warning\s+disable|\.Result\b|\.Wait\(\)|\b(dynamic)\b" - ), + "safety_bypasses": re.compile(r"!\.|\bnull!|#pragma\s+warning\s+disable|\.Result\b|\.Wait\(\)|\b(dynamic)\b"), # 8. danger (High-Risk Execution / System Calls) # Extreme tech debt/vulnerabilities. EXCLUDES TODO (debt) and Console (print). - "danger": re.compile( - r"\b(Thread\.Abort|Process\.Start|Environment\.FailFast|Environment\.Exit|goto)\b" - ), + "high_risk_execution": re.compile(r"\b(Thread\.Abort|Process\.Start|Environment\.FailFast|Environment\.Exit|goto)\b"), # 9. 
io (I/O & Network Boundaries) "io": re.compile( r"\b(File|Directory|Stream|HttpClient|Path|SqlConnection|SqlCommand|DbContext|DbSet|HttpRequest|HttpResponse)\b\.|\[Table\(" @@ -1580,17 +1450,15 @@ ), # 11. flux (State Mutation) # Mutation of state. EXCLUDES const/readonly (freeze_hits). - "flux": re.compile( + "state_mutation": re.compile( r"\b(set|field)\s*[{;]|volatile|ref\s|out\s|^[ \t]*(?:this\.)?\w+[ \t]*=|(?:\w+\.)?(?:Add|Remove|Clear|Insert|Push|Pop|Update)\s*\(" ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile( r"//[ \t]*(?:public|private|protected|internal|class|void|if|for|foreach|while|return|using)\b" ), # 13. doc (Structured Documentation) - "doc": re.compile( - r"///|///\s*|///\s*|///\s*" - ), + "doc": re.compile(r"///|///\s*|///\s*|///\s*"), # 14. test (Testing & Assertions) "test": re.compile( r"\[(?:Test|Fact|Theory|TestMethod|TestClass|SetUp|TearDown)\]|\b(?:Assert\.|Should\(\)|Mock\.|Substitute\.For)\b" @@ -1625,13 +1493,11 @@ ), # 23. heat_triggers (Metaprogramming & Reflection) # Reflection and dynamic Emit. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(System\.Reflection|DllImport|LibraryImport|MethodInfo|Activator|Marshal\.|Emit|ILGenerator)\b" ), # 24. import (Dependency Inclusions) - "import": re.compile( - r"^[ \t]*(?:global[ \t]+)?using\s+(?:static[ \t]+)?[\w.]+;", re.M - ), + "import": re.compile(r"^[ \t]*(?:global[ \t]+)?using\s+(?:static[ \t]+)?[\w.]+;", re.M), "_dependency_capture": re.compile( r"^[ \t]*(?:global[ \t\n]+)?using[ \t\n]+(?:static[ \t\n]+)?([\w.]+)[ \t\n]*;", re.M, @@ -1644,11 +1510,9 @@ # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. 
civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (The Blazor/Razor Horizon) "ssr_boundaries": re.compile( r"@(?:page|rendermode|code|layout)|\[(?:Route|CascadingParameter)\]|\b(RenderFragment|ComponentBase|IViewComponentResult)\b" @@ -1668,9 +1532,7 @@ ), # 35. pointers (Pointer Arithmetic / Memory Addressing) # Native pointers and modern memory structures (Span/Memory). - "pointers": re.compile( - r"\b(?:fixed|stackalloc|Unsafe\.AsPointer|IntPtr|UIntPtr|nint|nuint)\b|->" - ), + "pointers": re.compile(r"\b(?:fixed|stackalloc|Unsafe\.AsPointer|IntPtr|UIntPtr|nint|nuint)\b|->"), # 36. memory_alloc (Manual Memory Management) "memory_alloc": re.compile( r"\b(Marshal\.AllocHGlobal|GC\.AllocateArray|MemoryPool|ArrayPool<[^>]*>\.Shared\.Rent|ref\s+struct|scoped\s+ref)\b" @@ -1682,57 +1544,43 @@ "telemetry": re.compile( r"\b(?:ILogger|_logger|Log|TelemetryClient|ActivitySource)\.(?:LogInformation|LogError|LogWarning|LogDebug|StartActivity|TrackEvent)\b|\[LoggerMessage" ), - # 39. print_hits (The Amateur) - "print_hits": re.compile( - r"\b(Console\.(?:Write|WriteLine|Error)|Debug\.(?:Write|WriteLine|Print))\b" - ), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) + "debug_prints": re.compile(r"\b(Console\.(?:Write|WriteLine|Error)|Debug\.(?:Write|WriteLine|Print))\b"), + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile( r"\bas\s+[A-Z]\w*|\(\s*(?:int|long|short|byte|char|float|double|decimal|bool|string|[A-Z][A-Za-z0-9_]*)\s*\)\s*[a-zA-Z_$]" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(throw|abort|FailFast|Environment\.Exit)\b"), - # 42. 
halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile( - r"\b(sleep|delay|Wait\(\)|Task\.Delay|Thread\.Sleep)\b" - ), - # 43. bitwise_hits (Bitwise Operations) + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|abort|FailFast|Environment\.Exit)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(sleep|delay|Wait\(\)|Task\.Delay|Thread\.Sleep)\b"), + # 43. bitwise_ops (Bitwise Operations) # Low-level byte manipulation. Safely maps to C# bitwise operators without overlapping language-specific pipelines. - "bitwise_hits": re.compile(r"<<|>>|\^|~"), - # 44. sync_locks (Thread Synchronization / Locks) + "bitwise_ops": re.compile(r"<<|>>|\^|~"), + # 44. sync_locks (Resource Management & Stability) "sync_locks": re.compile( r"\b(mutex|lock|Monitor|Semaphore|Interlocked|SpinLock|ReaderWriterLockSlim)\b", re.I, ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile(r"\b(const|readonly|init|Immutable[A-Z]\w*)\b"), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(const|readonly|init|Immutable[A-Z]\w*)\b"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r"\b(dispose|close|free|delete|GC\.Collect|GC\.SuppressFinalize)\b\s*\(" - ), + "cleanup": re.compile(r"\b(dispose|close|free|delete|GC\.Collect|GC\.SuppressFinalize)\b\s*\("), # 47. encapsulation (Access Modifiers / Encapsulation) "encapsulation": re.compile(r"\b(private|protected|internal|file)\b"), # 48. listeners (Event Listeners / Observers) - "listeners": re.compile( - r"\b(on|addEventListener|subscribe|EventHandler)\b|\+=" - ), + "listeners": re.compile(r"\b(on|addEventListener|subscribe|EventHandler)\b|\+="), # 49. 
test_skip (Bypassed Tests / Ignored Specs) - "test_skip": re.compile( - r"\[(?:Ignore|Skipped)\]|test\.skip\(|mock\(|stub\(|Substitute\.For" - ), + "test_skip": re.compile(r"\[(?:Ignore|Skipped)\]|test\.skip\(|mock\(|stub\(|Substitute\.For"), # --- PHASE 3: HYBRID DOMAIN SENSORS (C# Specifics) --- "serialization_parsing": re.compile( r"\b(JsonSerializer\.Deserialize|JsonConvert\.DeserializeObject|XmlSerializer|BinaryFormatter)\b" ), - "regex_execution": re.compile( - r"\b(Regex\.Match(?:es)?|Regex\.Replace|Regex\.IsMatch|new\s+Regex)\b" - ), + "regex_execution": re.compile(r"\b(Regex\.Match(?:es)?|Regex\.Replace|Regex\.IsMatch|new\s+Regex)\b"), "time_date_logic": re.compile( r"\b(DateTime\.Now|DateTime\.UtcNow|DateTimeOffset|TimeSpan|Stopwatch\.StartNew)\b" ), - "ipc_rpc_bridges": re.compile( - r"\b(Process\.Start|NamedPipeServerStream|ChannelFactory|GrpcChannel)\b" - ), + "ipc_rpc_bridges": re.compile(r"\b(Process\.Start|NamedPipeServerStream|ChannelFactory|GrpcChannel)\b"), }, }, "go": { @@ -1755,9 +1603,9 @@ "vendor/modules.txt", ], "shebangs": ["go", "gorun", "yaegi"], - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # Standard C-family line comment token. "_line_anchor": re.compile(r"//"), # Inline comments follow the standard '//' delimiter. @@ -1766,7 +1614,7 @@ "_block_start": re.compile(r"/\*"), # Block comment end: */ "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Includes select/case and range-based loops. EXCLUDES panic (bailout_hits). "branch": re.compile( @@ -1774,14 +1622,10 @@ ), # 2. args (Parameters / Coupling) # Parameter blocks for functions and methods. Bounded generics [^\]]* and params [^)]*. 
- "args": re.compile( - r"func\s+(?:\([^)]*\)[ \t]+)?\w*(?:\[[^\]]*\])?\s*\([^)]*\)", re.M - ), + "args": re.compile(r"func\s+(?:\([^)]*\)[ \t]+)?\w*(?:\[[^\]]*\])?\s*\([^)]*\)", re.M), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: const/var (freeze_hits) and Capitalization (encapsulation). - "linear": re.compile( - r"\b(package|import|return|type|go|defer|chan|map|interface|struct)\b" - ), + "structural_boundaries": re.compile(r"\b(package|import|return|type|go|defer|chan|map|interface|struct)\b"), # 4. func_start (Executable Logic Anchors) # ONLY executable logic blocks. # Bypasses the 'func' keyword, skips optional method receivers (e.g. (s *Server)), @@ -1817,14 +1661,10 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Explicitly ignoring errors via blank identifier. - "safety_neg": re.compile( - r'_\s*,\s*err[ \t]*=|_[ \t]*=\s*\w+|\bimport\s+(?:\.[ \t]+)?"' - ), + "safety_bypasses": re.compile(r'_\s*,\s*err[ \t]*=|_[ \t]*=\s*\w+|\bimport\s+(?:\.[ \t]+)?"'), # 8. danger (High-Risk Execution / System Calls) # Process-killing commands and direct syscalls. EXCLUDES TODO (debt) and fmt.Print (print_hits). - "danger": re.compile( - r"\b(os\.Exit|syscall\.Kill|syscall\.RawSyscall|log\.Fatal(?:f|ln)?)\b" - ), + "high_risk_execution": re.compile(r"\b(os\.Exit|syscall\.Kill|syscall\.RawSyscall|log\.Fatal(?:f|ln)?)\b"), # 9. io (I/O & Network Boundaries) "io": re.compile( r"\b(os\.(?:Open|Create|ReadFile)|io\.(?:Reader|Writer|Copy)|net/http|database/sql|bufio\.|grpc\.|sqlx\.|pgx\.)\b" @@ -1837,22 +1677,14 @@ ), # 11. flux (State Mutation) # Mutation of state. Reassignment and channel sends. - "flux": re.compile( - r":=|(?])=(?![=])|<-|\bappend\(|\batomic\.(?:Add|Store|Swap)" - ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"//[ \t]*(?:func|type|var|const|import|if|for|switch|select|return)\b" - ), + "state_mutation": re.compile(r":=|(?])=(?![=])|<-|\bappend\(|\batomic\.(?:Add|Store|Swap)"), + # 12. 
dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"//[ \t]*(?:func|type|var|const|import|if|for|switch|select|return)\b"), # 13. doc (Structured Documentation) # GoDoc standard: comments immediately preceding a declaration. - "doc": re.compile( - r"^[ \t]*//\s+[A-Z][a-zA-Z0-9_]+\s+.*|^[ \t]*//\s*Package\s+", re.M - ), + "doc": re.compile(r"^[ \t]*//\s+[A-Z][a-zA-Z0-9_]+\s+.*|^[ \t]*//\s*Package\s+", re.M), # 14. test (Testing & Assertions) - "test": re.compile( - r"\b(?:Test|Benchmark|Fuzz)[A-Z]\w*\b|t\.Run\b|\b(?:assert|require|mock)\.\w+\(" - ), + "test": re.compile(r"\b(?:Test|Benchmark|Fuzz)[A-Z]\w*\b|t\.Run\b|\b(?:assert|require|mock)\.\w+\("), # --- PHASE 3: ARCHITECTURE & DOMAIN SENSORS --- # 15. concurrency (Asynchronous Execution) "concurrency": re.compile( @@ -1864,9 +1696,7 @@ r"\b(html/template|text/template|http\.HandleFunc|ServeHTTP|gin\.|echo\.|fiber\.)\b" ), # 17. closures (Closures / Anonymous Functions) - "closures": re.compile( - r"func\s*\([^)]*\)\s*(?:\[[^\]]*\])?\s*(?:\([^)]*\))?[ \t]*\{" - ), + "closures": re.compile(r"func\s*\([^)]*\)\s*(?:\[[^\]]*\])?\s*(?:\([^)]*\))?[ \t]*\{"), # 18. globals (Global / Shared State) "globals": re.compile( r"^[ \t]*var\s+[a-zA-Z_]\w*\s*(?:[a-zA-Z_]\w*\s*)?=|os\.Getenv|os\.Environ", @@ -1874,27 +1704,17 @@ ), # 19. decorators (Decorators / Annotations) # Go lacks @decorators; uses Struct Tags and Build Tags. - "decorators": re.compile( - r'`[^`]*?(?:json|xml|yaml|gorm|db|bson):"[^"]*"[^`]*?`|//go:build|//\s*\+build' - ), + "decorators": re.compile(r'`[^`]*?(?:json|xml|yaml|gorm|db|bson):"[^"]*"[^`]*?`|//go:build|//\s*\+build'), # 20. generics (Generics / Type Parameters) - "generics": re.compile( - r"\[[^\]]*\b(?:any|comparable|~[a-zA-Z_]\w*)\b[^\]]*\]|\bany\b" - ), + "generics": re.compile(r"\[[^\]]*\b(?:any|comparable|~[a-zA-Z_]\w*)\b[^\]]*\]|\bany\b"), # 21. comprehensions (Iterators / Comprehensions) # Functional iteration helpers from the slices/maps packages. 
- "comprehensions": re.compile( - r"\b(slices\.(?:Delete|Filter|Sort|Compact)|maps\.(?:Keys|Values))\b" - ), + "comprehensions": re.compile(r"\b(slices\.(?:Delete|Filter|Sort|Compact)|maps\.(?:Keys|Values))\b"), # 22. scientific (Numerical / Compute Libraries) - "scientific": re.compile( - r"\b(math\.|math/cmplx\.|math/rand\.|crypto/rand\.|gonum\.)\b" - ), + "scientific": re.compile(r"\b(math\.|math/cmplx\.|math/rand\.|crypto/rand\.|gonum\.)\b"), # 23. heat_triggers (Metaprogramming & Reflection) # Reflection, CGO, and Unsafe triggers. - "heat_triggers": re.compile( - r'import\s+"C"|\b(reflect\.|unsafe\.|cgo|go:linkname)\b' - ), + "reflection_metaprogramming": re.compile(r'import\s+"C"|\b(reflect\.|unsafe\.|cgo|go:linkname)\b'), # 24. import (Dependency Inclusions) "import": re.compile(r'^[ \t]*import\s*(?:\(|"[^"]+")', re.M), # ---> THE FIX: Strictly bounded to valid Go import path characters <--- @@ -1914,29 +1734,23 @@ # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) # Gofmt mandates tabs; finding spaces at start signals structural friction. - "civil_war": None, + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(html/template|ExecuteTemplate|http\.ResponseWriter|Render|gin\.Context)\b" ), # 32. events (Event Emitters / Pub-Sub) - "events": re.compile( - r"\b(EventBus|Publish|Subscribe|kafka\.|rabbitmq\.|Emit|OnEvent)\b" - ), + "events": re.compile(r"\b(EventBus|Publish|Subscribe|kafka\.|rabbitmq\.|Emit|OnEvent)\b"), # 33. 
dependency_injection (Dependency Injection / IoC) "dependency_injection": re.compile( r"\b(wire\.Build|wire\.NewSet|fx\.New|fx\.Provide|fx\.Invoke|dig\.Provide|do\.Provide)\b" ), # 34. macros (Preprocessor Directives / Macros) # Go lacks a preprocessor; //go: directives act as compile-time hooks. - "macros": re.compile( - r"^//go:(?:generate|build|noinline|nosplit|noescape|linkname)\b", re.M - ), + "macros": re.compile(r"^//go:(?:generate|build|noinline|nosplit|noescape|linkname)\b", re.M), # 35. pointers (Pointer Arithmetic / Memory Addressing) # Explicit pointer addressing and dereferencing. "pointers": re.compile( @@ -1951,31 +1765,25 @@ "telemetry": re.compile( r"\b(slog|logrus|zap|zerolog|log)\.(?:Info|Warn|Error|Debug|Trace)(?:f|ln)?\b|\btrace\.Span\b" ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile( - r"\b(fmt\.Print|fmt\.Println|fmt\.Printf|println|print)\b" - ), - # 40. cast_hits (Explicit Type Casting) + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\b(fmt\.Print|fmt\.Println|fmt\.Printf|println|print)\b"), + # # 40. explicit_casts (Explicit Type Casting) # Type assertions and conversions. - "cast_hits": re.compile( + "explicit_casts": re.compile( r"\.\([a-zA-Z_]\w*\)|\b(?:int|int8|int16|int32|int64|uint|uint8|uint16|uint32|uint64|float32|float64|byte|rune|uintptr|string)\s*\(" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(panic|os\.Exit|log\.Fatal)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\b(time\.Sleep|time\.After|runtime\.Gosched)\b"), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>|\^|&\^"), - # 44. sync_locks (Thread Synchronization / Locks) - "sync_locks": re.compile( - r"\b(Mutex|RWMutex|Lock|Unlock|RLock|RUnlock|atomic\.|sync\.Map|sync\.Pool)\b" - ), - # 45. 
freeze_hits (Immutability Constraints) - "freeze_hits": re.compile(r"\bconst\b"), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(panic|os\.Exit|log\.Fatal)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(time\.Sleep|time\.After|runtime\.Gosched)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"<<|>>|\^|&\^"), + # 44. sync_locks (Resource Management & Stability) + "sync_locks": re.compile(r"\b(Mutex|RWMutex|Lock|Unlock|RLock|RUnlock|atomic\.|sync\.Map|sync\.Pool)\b"), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\bconst\b"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r"\b(defer|Close|Unlock|RUnlock|Stop|Cleanup)\b\s*\(" - ), + "cleanup": re.compile(r"\b(defer|Close|Unlock|RUnlock|Stop|Cleanup)\b\s*\("), # 47. encapsulation (Access Modifiers / Encapsulation) # Unexported identifiers (lowercase) in Go are private/internal. 
"encapsulation": re.compile( @@ -1990,15 +1798,9 @@ "serialization_parsing": re.compile( r"\b(json\.Unmarshal|json\.Marshal|xml\.Unmarshal|xml\.Marshal|gob\.NewEncoder)\b" ), - "regex_execution": re.compile( - r"\b(regexp\.Compile|regexp\.MustCompile|\.MatchString)\b" - ), - "time_date_logic": re.compile( - r"\b(time\.Now\(\)|time\.Parse|time\.Duration|time\.Sleep|time\.Since)\b" - ), - "ipc_rpc_bridges": re.compile( - r"\b(net/rpc|grpc\.Dial|grpc\.NewServer|exec\.Command|syscall)\b" - ), + "regex_execution": re.compile(r"\b(regexp\.Compile|regexp\.MustCompile|\.MatchString)\b"), + "time_date_logic": re.compile(r"\b(time\.Now\(\)|time\.Parse|time\.Duration|time\.Sleep|time\.Since)\b"), + "ipc_rpc_bridges": re.compile(r"\b(net/rpc|grpc\.Dial|grpc\.NewServer|exec\.Command|syscall)\b"), }, }, "rust": { @@ -2027,9 +1829,9 @@ # UPGRADED: Maps to Family 2 (Nested C) # Rationale: Rust explicitly allows nested block comments (/* /* */ */), # unlike standard C/C++. Standard C parsing would prematurely terminate here. - "lexical_family": "recursive_c_style", + "lexical_family": "recursive_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # Standard C-family line comment token (Includes /// and //!) "_line_anchor": re.compile(r"//"), # Inline comments follow the same '//' delimiter. @@ -2038,12 +1840,10 @@ "_block_start": re.compile(r"/\*"), # REQUIRED for Family 2: Recursive logic markers "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Decisions and logical jumps. EXCLUDES panic!/throw (bailout_hits). - "branch": re.compile( - r"\b(if|else|match|for|while|loop|break|continue)\b|\?|&&|\|\|" - ), + "branch": re.compile(r"\b(if|else|match|for|while|loop|break|continue)\b|\?|&&|\|\|"), # 2. args (Parameters / Coupling) # Parameter blocks of functions and closures. Bounded to prevent ReDoS on complex types. 
"args": re.compile( @@ -2060,7 +1860,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: Access modifiers (pub) and Immutability (const/static). - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(let|struct|enum|union|trait|impl|use|mod|type|yield|await|where|mut|ref|move|return)\b" ), # 4. func_start (Executable Logic Anchors) @@ -2092,14 +1892,10 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Actively bypasses type safety (unwraps and forced expectations). - "safety_neg": re.compile( - r"\b(unwrap|expect|unwrap_err|unwrap_unchecked)\b" - ), + "safety_bypasses": re.compile(r"\b(unwrap|expect|unwrap_err|unwrap_unchecked)\b"), # 8. danger (High-Risk Execution / System Calls) # Process-killing commands. EXCLUDES TODO (debt) and println! (print_hits). - "danger": re.compile( - r"\b(panic!|todo!|unimplemented!|process::exit|abort)\b" - ), + "high_risk_execution": re.compile(r"\b(panic!|todo!|unimplemented!|process::exit|abort)\b"), # 9. io (I/O & Network Boundaries) "io": re.compile( r"\b(std::fs|File::|std::net|tokio::net|tokio::fs|reqwest|std::io|hyper::|sqlx::|diesel::|sea_orm::)\b" @@ -2109,13 +1905,9 @@ "api": re.compile(r"\bpub(?:\([^)]*\))?\b"), # 11. flux (State Mutation) # Mutation of state. EXCLUDES const (freeze_hits). - "flux": re.compile( - r"\bmut\b|\.borrow_mut\(\)|\.write\(\)|Cell::|RefCell::|Atomic[A-Za-z0-9]+" - ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"//[ \t]*(?:fn|let|struct|impl|mod|use|match|for|while|loop|if|return)\b" - ), + "state_mutation": re.compile(r"\bmut\b|\.borrow_mut\(\)|\.write\(\)|Cell::|RefCell::|Atomic[A-Za-z0-9]+"), + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"//[ \t]*(?:fn|let|struct|impl|mod|use|match|for|while|loop|if|return)\b"), # 13. doc (Structured Documentation) "doc": re.compile(r"///|//!|#!?\[doc\b[^\]]*\]"), # 14. 
test (Testing & Assertions) @@ -2129,34 +1921,24 @@ r"\b(async|await|std::thread|spawn|tokio::spawn|mpsc::|async_trait|Future|Stream|Send|Sync)\b" ), # 16. ui_framework (UI / View Components) - "ui_framework": re.compile( - r"\b(yew::|dioxus::|iced::|html!|rsx!|view!|slint|leptos::|tauri::)\b" - ), + "ui_framework": re.compile(r"\b(yew::|dioxus::|iced::|html!|rsx!|view!|slint|leptos::|tauri::)\b"), # 17. closures (Closures / Anonymous Functions) "closures": re.compile(r"\|[^|]*\|[ \t]*\{"), # 18. globals (Global / Shared State) - "globals": re.compile( - r"\b(static\s+mut|lazy_static!|OnceCell|OnceLock|LazyLock|std::env::var)\b" - ), + "globals": re.compile(r"\b(static\s+mut|lazy_static!|OnceCell|OnceLock|LazyLock|std::env::var)\b"), # 19. decorators (Decorators / Annotations) "decorators": re.compile(r"^[ \t]*#!?\[[^\]]*\]", re.M), # 20. generics (Generics / Type Parameters) - "generics": re.compile( - r"<\s*[A-Z\'][^>]*>|\bwhere\b|\'[a-z]+\b|\bimpl\s+[A-Z]\w+" - ), + "generics": re.compile(r"<\s*[A-Z\'][^>]*>|\bwhere\b|\'[a-z]+\b|\bimpl\s+[A-Z]\w+"), # 21. comprehensions (Iterators / Comprehensions) "comprehensions": re.compile( r"\.(?:map|filter|fold|collect|flat_map|any|all|reduce|for_each|find|zip)\s*\(" ), # 22. scientific (Numerical / Compute Libraries) - "scientific": re.compile( - r"\b(ndarray::|nalgebra::|num::|f32|f64|std::simd)\b" - ), + "scientific": re.compile(r"\b(ndarray::|nalgebra::|num::|f32|f64|std::simd)\b"), # 23. heat_triggers (Metaprogramming & Reflection) # Metaprogramming and memory transmutation. - "heat_triggers": re.compile( - r"\b(macro_rules!|std::mem::transmute|Pin::|PhantomData|UnsafeCell)\b" - ), + "reflection_metaprogramming": re.compile(r"\b(macro_rules!|std::mem::transmute|Pin::|PhantomData|UnsafeCell)\b"), # 24. import (Dependency Inclusions) "import": re.compile(r"\b(?:pub[ \t]+)?use\s+[^;]+;", re.M), "_dependency_capture": re.compile( @@ -2185,36 +1967,28 @@ re.M, ), # 25. 
ownership (Authorship Metadata) - "ownership": re.compile( - r"//\s*(?:Author|Maintainer|Copyright):\s+(.*)", re.I - ), + "ownership": re.compile(r"//\s*(?:Author|Maintainer|Copyright):\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(actix_web|axum|rocket|HttpResponse|Responder|IntoResponse|Html|askama::|tera::)\b" ), # 32. events (Event Emitters / Pub-Sub) - "events": re.compile( - r"\b(tokio::sync::broadcast|std::sync::mpsc|crossbeam_channel|Sender|Receiver)\b" - ), + "events": re.compile(r"\b(tokio::sync::broadcast|std::sync::mpsc|crossbeam_channel|Sender|Receiver)\b"), # 33. dependency_injection (Dependency Injection / IoC) "dependency_injection": re.compile( r"\b(axum::extract::State|actix_web::web::Data|Extension|Provider|shaku::)\b" ), # 34. macros (Preprocessor Directives / Macros) - "macros": re.compile( - r"\b(macro_rules!|proc_macro|proc_macro_derive|proc_macro_attribute)\b" - ), + "macros": re.compile(r"\b(macro_rules!|proc_macro|proc_macro_derive|proc_macro_attribute)\b"), # 35. pointers (Pointer Arithmetic / Memory Addressing) # Raw memory addressing. Shielded from standard multiplication by explicitly mapping to native Rust unsafe pointer primitives and dereferencing. 
"pointers": re.compile(r"\*const\b|\*mut\b|\bNonNull\b|\bstd::ptr\b|->"), @@ -2223,36 +1997,28 @@ r"\b(Box::new|Rc::new|Arc::new|Vec::with_capacity|String::with_capacity|alloc::|GlobalAlloc)\b" ), # 37. inline_asm (The Bare Metal) - "inline_asm": re.compile( - r"\b(?:core::arch::asm!|std::arch::asm!|asm!|global_asm!)\b" - ), + "inline_asm": re.compile(r"\b(?:core::arch::asm!|std::arch::asm!|asm!|global_asm!)\b"), # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. telemetry (Structured Logging / Telemetry) - "telemetry": re.compile( - r"\b(?:log::|tracing::)?(?:info!|warn!|error!|debug!|trace!|span!|instrument)\b" - ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile(r"\b(println!|print!|eprintln!|eprint!|dbg!)\b"), - # 40. cast_hits (Explicit Type Casting) + "telemetry": re.compile(r"\b(?:log::|tracing::)?(?:info!|warn!|error!|debug!|trace!|span!|instrument)\b"), + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\b(println!|print!|eprintln!|eprint!|dbg!)\b"), + # # 40. explicit_casts (Explicit Type Casting) # Forceful type coercion bypassing the safety engine. Enforces strict mapping to the `as` keyword followed by standard primitive types. - "cast_hits": re.compile( + "explicit_casts": re.compile( r"\bas\s+(?:i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize|f32|f64|bool|char)\b" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(panic!|abort|process::exit|fatalError)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile( - r"\b(std::thread::sleep|tokio::time::sleep|Duration::from)\b" - ), - # 43. bitwise_hits (Bitwise Operations) + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(panic!|abort|process::exit|fatalError)\b"), + # 42. 
thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(std::thread::sleep|tokio::time::sleep|Duration::from)\b"), + # 43. bitwise_ops (Bitwise Operations) # Low-level byte manipulation. CRITICAL: Removed the pipe '|' (used for closures `|x| x+1` and patterns), ampersand '&' (used for references), and exclamation '!' (used for macros and logical NOT). - "bitwise_hits": re.compile(r"<<|>>|\^"), - # 44. sync_locks (Thread Synchronization / Locks) - "sync_locks": re.compile( - r"\b(Mutex|RwLock|lock|barrier|atomic|Semaphore)\b", re.I - ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile(r"\b(const|static|immutable|readonly)\b"), + "bitwise_ops": re.compile(r"<<|>>|\^"), + # 44. sync_locks (Resource Management & Stability) + "sync_locks": re.compile(r"\b(Mutex|RwLock|lock|barrier|atomic|Semaphore)\b", re.I), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(const|static|immutable|readonly)\b"), # 46. cleanup (Resource Cleanup / Teardown) "cleanup": re.compile(r"\b(drop|free|delete|close|shutdown)\b\s*\("), # 47. encapsulation (Access Modifiers / Encapsulation) @@ -2319,9 +2085,9 @@ # UPGRADED: Maps to Family 1 (Standard C) # Rationale: Uses '//' for line-level literature; multi-line literature # (/* */) is handled by the Section 2.3.C.3 Heuristic Pass. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # C++ uses '//' for standard line-level Literature (Commented / Non-Executable Text). "_line_anchor": re.compile(r"//"), # Inline comments follow the standard '//' delimiter. @@ -2343,7 +2109,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: Access modifiers (encapsulation) and const (freeze_hits). 
- "linear": re.compile( + "structural_boundaries": re.compile( r"\b(namespace|using|class|struct|enum|union|template|typename|concept|requires|auto|return|void|inline|virtual|explicit|friend|module|export|import|typedef)\b" ), "func_start": re.compile( @@ -2414,14 +2180,10 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Swallowing errors or bypassing types. EXCLUDES standard casting (Phase 5). - "safety_neg": re.compile( - r"\b(std::any|void\s*\*)\b|catch\s*\(\s*\.\.\.\s*\)" - ), + "safety_bypasses": re.compile(r"\b(std::any|void\s*\*)\b|catch\s*\(\s*\.\.\.\s*\)"), # 8. danger (High-Risk Execution / System Calls) # Process killers and low-level blits. EXCLUDES prints (Phase 5). - "danger": re.compile( - r"\b(system|memcpy|memset|abort|exit|std::terminate|longjmp|setjmp)\b" - ), + "high_risk_execution": re.compile(r"\b(system|memcpy|memset|abort|exit|std::terminate|longjmp|setjmp)\b"), # 9. io (I/O & Network Boundaries) "io": re.compile( r"\b(std::fstream|std::ifstream|std::ofstream|std::filesystem|fopen|fclose|fread|fwrite|socket|recv|send|asio::|curl_easy_perform|std::cin)\b" @@ -2434,12 +2196,12 @@ ), # 11. flux (State Mutation) # Mutation of state. Includes moves and increments. - "flux": re.compile( + "state_mutation": re.compile( r"\b(mutable|std::move|std::exchange|std::swap|std::atomic)\b|(?])=(?![=])|&(?!\s*const)|\+\+|--|(?:\+=|-=|\*=|/=|%=|<<=|>>=|&=|\|=|\^=)" ), - # 12. graveyard (Dead / Commented-out Code) + # 12. dead_code (Commented Logic / Deprecated Trails) # Commented-out execution logic indicating dead features. MUST enforce that the structural keyword immediately follows the comment token. - "graveyard": re.compile( + "dead_code": re.compile( r"//[ \t]*(?:if|for|while|auto|class|struct|std::cout|std::print|printf|void|int|return)\b" ), # 13. 
doc (Structured Documentation) @@ -2457,9 +2219,7 @@ r"\b(std::thread|std::jthread|std::mutex|std::future|std::promise|std::async|std::latch|std::barrier|std::condition_variable|std::semaphore|co_await|std::coroutine_handle)\b" ), # 16. ui_framework (UI / View Components) - "ui_framework": re.compile( - r"\b(Q_OBJECT|slots:|signals:|QWidget|wxFrame|ImGui::|Fl_Window)\b" - ), + "ui_framework": re.compile(r"\b(Q_OBJECT|slots:|signals:|QWidget|wxFrame|ImGui::|Fl_Window)\b"), # 17. closures (Closures / Anonymous Functions) "closures": re.compile( r"\[[^\]]*\]\s*(?:<[^>]*>\s*)?(?:\([^)]*\))?\s*(?:(?:mutable|constexpr|consteval|noexcept)\s+)*(?:mutable|constexpr|consteval|noexcept)?\s*(?:->\s*[\w:<>_]+)?[ \t]*\{" @@ -2484,7 +2244,7 @@ ), # 23. heat_triggers (Metaprogramming & Reflection) # SFINAE, compile-time reflection, and macros. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(if\s+constexpr|if\s+consteval|std::enable_if|std::is_same|std::any_cast|std::bit_cast|decltype|sizeof\.\.\.)\b|#define\s+[a-zA-Z_]" ), # 24. import (Dependency Inclusions) @@ -2497,32 +2257,22 @@ re.M, ), # 25. ownership (Authorship Metadata) - "ownership": re.compile( - r"(?:@author|\\author|Author:|Created by:|Copyright)\s+(.*)", re.I - ), + "ownership": re.compile(r"(?:@author|\\author|Author:|Created by:|Copyright)\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. 
ssr_boundaries (Server-Side Rendering) - "ssr_boundaries": re.compile( - r"\b(FCGI_Accept|render_template|Inja::|ctemplate::)\b" - ), + "ssr_boundaries": re.compile(r"\b(FCGI_Accept|render_template|Inja::|ctemplate::)\b"), # 32. events (Event Emitters / Pub-Sub) - "events": re.compile( - r"\b(emit|signal|slot|notify|publish|subscribe|boost::signals2)\b" - ), + "events": re.compile(r"\b(emit|signal|slot|notify|publish|subscribe|boost::signals2)\b"), # 33. dependency_injection (Dependency Injection / IoC) - "dependency_injection": re.compile( - r"\b(boost\.di|fruit::|[I]nject|IServiceCollection)\b" - ), + "dependency_injection": re.compile(r"\b(boost\.di|fruit::|[I]nject|IServiceCollection)\b"), # 34. macros (Preprocessor Directives / Macros) "macros": re.compile( r"^[ \t]*#(?:define|undef|if|elif|else|endif|pragma|warning|error)\b", @@ -2534,75 +2284,55 @@ r"->|\b(?:uintptr_t|intptr_t|ptrdiff_t|size_t)\b|(?<=[=\s,(])&\w+|(?<=[=\s,(])\*(?:\s*const\s*)?\w+" ), # 36. memory_alloc (Manual Memory Management) - "memory_alloc": re.compile( - r"\b(new|malloc|calloc|realloc|aligned_alloc|mmap|alloca)\b" - ), + "memory_alloc": re.compile(r"\b(new|malloc|calloc|realloc|aligned_alloc|mmap|alloca)\b"), # 37. inline_asm (The Bare Metal) "inline_asm": re.compile( r"\b(?:__asm__|asm|__asm)\b(?:\s+(?:volatile|__volatile__))?\s*\(|\b(?:__asm__|asm|__asm)\b[ \t]*\{" ), # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. telemetry (Structured Logging / Telemetry) - "telemetry": re.compile( - r"\b(log|logger|LOGGER|spdlog|glog|syslog)\.(?:info|error|warn|debug|trace)\b" - ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile( + "telemetry": re.compile(r"\b(log|logger|LOGGER|spdlog|glog|syslog)\.(?:info|error|warn|debug|trace)\b"), + # 39. 
debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile( r"\b(std::cout|std::cerr|std::clog|printf|fprintf|vprintf|puts|putchar|std::print|std::println)\b" ), - # 40. cast_hits (Explicit Type Casting) + # # 40. explicit_casts (Explicit Type Casting) # Forceful type coercion bypassing the safety engine. Captures modern explicitly named casts and strict C-style groupings. - "cast_hits": re.compile( + "explicit_casts": re.compile( r"\b(?:static_cast|dynamic_cast|reinterpret_cast|const_cast|bit_cast)\b|<\s*[A-Za-z_]\w*\s*>|\(\s*(?:int|float|double|char|bool|long|short|unsigned|signed)\s*\)\s*[a-zA-Z_]" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile( - r"\b(throw|abort|exit|_Exit|quick_exit|std::terminate|longjmp)\b" - ), - # 42. halt_hits (Thread Blocking / Sleeps) + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|abort|exit|_Exit|quick_exit|std::terminate|longjmp)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) # Admission of race conditions or lazy polling. - "halt_hits": re.compile( - r"\b(sleep|delay|usleep|nanosleep|std::this_thread::sleep_for)\b" - ), - # 43. bitwise_hits (Bitwise Operations) + "thread_sleeps": re.compile(r"\b(sleep|delay|usleep|nanosleep|std::this_thread::sleep_for)\b"), + # 43. bitwise_ops (Bitwise Operations) # Low-level byte manipulation. CRITICAL: Removed bare `<<` and `>>` to prevent catastrophic false positives on `std::cout` and `std::cin` streams. Explicit bitwise assignments (`<<=`, `&=`) are retained as they are unambiguous. - "bitwise_hits": re.compile(r"\^|(?>=|&=|\|=|\^="), - # 44. sync_locks (Thread Synchronization / Locks) + "bitwise_ops": re.compile(r"\^|(?>=|&=|\|=|\^="), + # 44. 
sync_locks (Resource Management & Stability) "sync_locks": re.compile( r"\b(mutex|lock|synchronized|Semaphore|std::lock_guard|std::scoped_lock|std::unique_lock|mtx_lock)\b", re.I, ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile( - r"\b(const|constexpr|consteval|constinit|final|readonly|Immutable)\b" - ), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(const|constexpr|consteval|constinit|final|readonly|Immutable)\b"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r"\b(delete|free|close|fclose|dispose|shutdown|std::destroy|reset)\b\s*\(" - ), + "cleanup": re.compile(r"\b(delete|free|close|fclose|dispose|shutdown|std::destroy|reset)\b\s*\("), # 47. encapsulation (Access Modifiers / Encapsulation) "encapsulation": re.compile(r"\b(private:|protected:|internal:)\b"), # 48. listeners (Event Listeners / Observers) - "listeners": re.compile( - r"\b(on|addEventListener|subscribe|connect|handler|callback)\b" - ), + "listeners": re.compile(r"\b(on|addEventListener|subscribe|connect|handler|callback)\b"), # 49. 
test_skip (Bypassed Tests / Ignored Specs) - "test_skip": re.compile( - r"\b(GTEST_SKIP|test\.skip|it\.skip|mock\(|fake\()\b" - ), + "test_skip": re.compile(r"\b(GTEST_SKIP|test\.skip|it\.skip|mock\(|fake\()\b"), # --- PHASE 3: HYBRID DOMAIN SENSORS (C++ Specifics) --- "serialization_parsing": re.compile( r"\b(nlohmann::json|rapidjson|boost::archive|ParseFromString|SerializeToString)\b" ), - "regex_execution": re.compile( - r"\b(std::regex|std::regex_match|std::regex_search|std::regex_replace)\b" - ), + "regex_execution": re.compile(r"\b(std::regex|std::regex_match|std::regex_search|std::regex_replace)\b"), "time_date_logic": re.compile( r"\b(std::chrono::(?:system_clock|steady_clock|duration)|std::time_t|std::localtime)\b" ), - "ipc_rpc_bridges": re.compile( - r"\b(boost::interprocess|mmap|shm_open|pipe|fork|grpc::ServerBuilder)\b" - ), + "ipc_rpc_bridges": re.compile(r"\b(boost::interprocess|mmap|shm_open|pipe|fork|grpc::ServerBuilder)\b"), }, }, "c": { @@ -2642,9 +2372,9 @@ # UPGRADED: Maps to Family 1 (Standard C) # Rationale: Uses '//' for line-level literature; multi-line literature # (/* */) is handled by the Section 2.3.C.3 Heuristic Pass. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # Modern C (C99+) uses '//' for standard line-level Commented / Non-Executable Text. "_line_anchor": re.compile(r"//"), # Inline comments follow the standard '//' delimiter. @@ -2653,12 +2383,10 @@ "_block_start": re.compile(r"/\*"), # Block comment end: */ "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Decisions and jumps. EXCLUDES exit/abort (bailout_hits). - "branch": re.compile( - r"\b(if|else|switch|case|default|for|while|do|break|continue|goto)\b|&&|\|\||\?" 
- ), + "branch": re.compile(r"\b(if|else|switch|case|default|for|while|do|break|continue|goto)\b|&&|\|\||\?"), # 2. args (Parameters / Coupling) # Parameter blocks. Bounded negation [^)]* to prevent ReDoS on massive param lists. "args": re.compile( @@ -2675,7 +2403,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: Access modifiers (encapsulation) and const (freeze_hits). - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(struct|union|enum|typedef|return|void|restrict|auto|bool|true|false|_BitInt|alignas|alignof)\b" ), "func_start": re.compile( @@ -2722,9 +2450,7 @@ ), # 5. class_start (Object / Entity Declarations) # C uses structs/unions/enums as the primary entity entities. - "class_start": re.compile( - r"^[ \t]*(?:typedef[ \t]+)?(?:struct|union|enum)\s+[a-zA-Z_]\w*", re.M - ), + "class_start": re.compile(r"^[ \t]*(?:typedef[ \t]+)?(?:struct|union|enum)\s+[a-zA-Z_]\w*", re.M), # --- PHASE 2: RISK & STRUCTURAL INTEGRITY --- # 6. safety (Defensive Programming / Validation) "safety": re.compile( @@ -2732,12 +2458,10 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Dangerous legacy functions and raw void manipulation. - "safety_neg": re.compile( - r"\b(strcpy|strcat|sprintf|gets|alloca)\b|\([a-zA-Z_]\w*\s*\*\)\s*[a-zA-Z_]\w*" - ), + "safety_bypasses": re.compile(r"\b(strcpy|strcat|sprintf|gets|alloca)\b|\([a-zA-Z_]\w*\s*\*\)\s*[a-zA-Z_]\w*"), # 8. danger (High-Risk Execution / System Calls) # Process killers and context switches. EXCLUDES prints (Phase 5). - "danger": re.compile(r"\b(system|popen|execl|execv|fork|longjmp|setjmp)\b"), + "high_risk_execution": re.compile(r"\b(system|popen|execl|execv|fork|longjmp|setjmp)\b"), # 9. io (I/O & Network Boundaries) "io": re.compile( r"\b(fopen|fclose|fread|fwrite|fscanf|sscanf|socket|recv|send|open|read|write|close|stat|fseek|remove|rename)\b" @@ -2760,17 +2484,11 @@ ), # 11. flux (State Mutation) # Mutation of state. EXCLUDES const/constexpr (freeze_hits). 
- "flux": re.compile( - r"(?])=(?![=])|\*(?!\s*const)\w+[ \t]*=|(?:\+\+|--)" - ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"(?://|/\*)[ \t]*(?:if|for|while|struct|union|enum|void|int|return)\b" - ), + "state_mutation": re.compile(r"(?])=(?![=])|\*(?!\s*const)\w+[ \t]*=|(?:\+\+|--)"), + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"(?://|/\*)[ \t]*(?:if|for|while|struct|union|enum|void|int|return)\b"), # 13. doc (Structured Documentation) - "doc": re.compile( - r"///|/\*\*|@param|@return|@brief|@details|\\param|\\return|\\brief|\\details" - ), + "doc": re.compile(r"///|/\*\*|@param|@return|@brief|@details|\\param|\\return|\\brief|\\details"), # 14. test (Testing & Assertions) "test": re.compile( r"\b(?:TEST|TEST_F|TEST_CASE|CU_ASSERT|RUN_TEST|EXPECT_[A-Z_]+|ASSERT_[A-Z_]+)\b|\bassert\s*\(" @@ -2808,43 +2526,27 @@ ), # 23. heat_triggers (Metaprogramming & Reflection) # Macros with args and unstructured jumps. - "heat_triggers": re.compile( - r"^#\s*define\s+[a-zA-Z_]\w*\([^)]*\)|\bgoto\b", re.M - ), + "reflection_metaprogramming": re.compile(r"^#\s*define\s+[a-zA-Z_]\w*\([^)]*\)|\bgoto\b", re.M), # 24. import (Dependency Inclusions) - "import": re.compile( - r'^[ \t]*#[ \t]*(?:include|embed)\s*[<"][^>"]+[>"]', re.M - ), - "_dependency_capture": re.compile( - r'^[ \t]*#[ \t\n]*(?:include|embed)[ \t\n]*[<"]([^>"]+)[>"]', re.M - ), + "import": re.compile(r'^[ \t]*#[ \t]*(?:include|embed)\s*[<"][^>"]+[>"]', re.M), + "_dependency_capture": re.compile(r'^[ \t]*#[ \t\n]*(?:include|embed)[ \t\n]*[<"]([^>"]+)[>"]', re.M), # 25. ownership (Authorship Metadata) - "ownership": re.compile( - r"(?:@author|\\author|Author:|Created by:|Copyright)\s+(.*)", re.I - ), + "ownership": re.compile(r"(?:@author|\\author|Author:|Created by:|Copyright)\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. 
fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) - "ssr_boundaries": re.compile( - r"\b(FCGI_Accept|khttp_parse|MHD_start_daemon|facil\.io)\b" - ), + "ssr_boundaries": re.compile(r"\b(FCGI_Accept|khttp_parse|MHD_start_daemon|facil\.io)\b"), # 32. events (Event Emitters / Pub-Sub) - "events": re.compile( - r"\b(epoll_wait|epoll_ctl|kqueue|kevent|select|poll|libev|libuv)\b" - ), + "events": re.compile(r"\b(epoll_wait|epoll_ctl|kqueue|kevent|select|poll|libev|libuv)\b"), # 33. dependency_injection (Dependency Injection / IoC) - "dependency_injection": re.compile( - r"\b(plugin_register|vtable|struct\s+[a-zA-Z_]\w*_ops)\b" - ), + "dependency_injection": re.compile(r"\b(plugin_register|vtable|struct\s+[a-zA-Z_]\w*_ops)\b"), # 34. macros (Preprocessor Directives / Macros) "macros": re.compile( r"^[ \t]*#[ \t]*(?:define|undef|if|elif|else|endif|pragma|warning|error)\b", @@ -2855,24 +2557,18 @@ r"->|\b(?:uintptr_t|intptr_t|ptrdiff_t|size_t)\b|(?<=[=\s,(])&\w+|(?<=[=\s,(])\*(?:\s*const\s*)?\w+" ), # 36. memory_alloc (Manual Memory Management) - "memory_alloc": re.compile( - r"\b(malloc|calloc|realloc|free|aligned_alloc|mmap|alloca)\b" - ), + "memory_alloc": re.compile(r"\b(malloc|calloc|realloc|free|aligned_alloc|mmap|alloca)\b"), # 37. inline_asm (The Bare Metal) "inline_asm": re.compile( r"\b(?:__asm__|asm|__asm)\b(?:\s+(?:volatile|__volatile__))?\s*\(|\b(?:__asm__|asm|__asm)\b[ \t]*\{" ), # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. 
telemetry (Structured Logging / Telemetry) - "telemetry": re.compile( - r"\b(?:syslog|openlog|log_info|log_error|log_warn|log_debug|vsyslog)\b" - ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile( - r"\b(printf|fprintf|vprintf|puts|putchar|perror)\b" - ), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( + "telemetry": re.compile(r"\b(?:syslog|openlog|log_info|log_error|log_warn|log_debug|vsyslog)\b"), + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\b(printf|fprintf|vprintf|puts|putchar|perror)\b"), + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile( # ===================================================================== # [ROADMAP: NESTED OPTIONAL SPACES (ReDoS TRAP)] # FIX 2: `\s*[*]*\s*` is highly vulnerable to ReDoS if the payload @@ -2881,44 +2577,32 @@ # ===================================================================== r"\(\s*(?:int|float|double|char|bool|long|short|unsigned|signed|void)[ \t\n]*(?:\*[ \t\n]*)*\)\s*[a-zA-Z_]" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile( - r"\b(abort|exit|_Exit|quick_exit|return\s+-1)\b" - ), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\b(sleep|usleep|nanosleep|thrd_sleep)\b"), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>|\^|(?>=|&=|\|=|\^="), - # 44. sync_locks (Thread Synchronization / Locks) + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(abort|exit|_Exit|quick_exit|return\s+-1)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(sleep|usleep|nanosleep|thrd_sleep)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"<<|>>|\^|(?>=|&=|\|=|\^="), + # 44. 
sync_locks (Resource Management & Stability) "sync_locks": re.compile( r"\b(mtx_lock|mtx_unlock|pthread_mutex_lock|atomic_flag_test_and_set|atomic_store)\b" ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile(r"\b(const|constexpr|alignas|restrict)\b"), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(const|constexpr|alignas|restrict)\b"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r"\b(free|fclose|close|munmap|destroy|shutdown)\b\s*\(" - ), + "cleanup": re.compile(r"\b(free|fclose|close|munmap|destroy|shutdown)\b\s*\("), # 47. encapsulation (Access Modifiers / Encapsulation) # Physical Reality: Static functions/variables are internal/private to the translation unit. "encapsulation": re.compile(r"^[ \t]*static\b", re.M), # 48. listeners (Event Listeners / Observers) - "listeners": re.compile( - r"\b(on_event|handler|callback|signal\(|sigaction\()" - ), + "listeners": re.compile(r"\b(on_event|handler|callback|signal\(|sigaction\()"), # 49. 
test_skip (Bypassed Tests / Ignored Specs) "test_skip": re.compile(r"\b(IGNORE_TEST|test\.skip|mock\(|fake\()\b"), # --- PHASE 3: HYBRID DOMAIN SENSORS (C Specifics) --- - "serialization_parsing": re.compile( - r"\b(cJSON_Parse|json_loads|xmlReadMemory|xmlParseFile|jansson)\b" - ), + "serialization_parsing": re.compile(r"\b(cJSON_Parse|json_loads|xmlReadMemory|xmlParseFile|jansson)\b"), "regex_execution": re.compile(r"\b(regcomp|regexec|regfree)\b"), - "time_date_logic": re.compile( - r"\b(time_t|clock_gettime|gettimeofday|localtime_r?|strftime)\b" - ), - "ipc_rpc_bridges": re.compile( - r"\b(fork|pipe|shmget|shmat|mmap|socket|bind|listen|accept)\b" - ), + "time_date_logic": re.compile(r"\b(time_t|clock_gettime|gettimeofday|localtime_r?|strftime)\b"), + "ipc_rpc_bridges": re.compile(r"\b(fork|pipe|shmget|shmat|mmap|socket|bind|listen|accept)\b"), }, }, "php": { @@ -2928,7 +2612,7 @@ "blueprint_version": "v5.0", "status": "production", }, - # COMPREHENSIVE SURFACE AREA: Merged standard suffixes, legacy formats, UI templates (.phtml, .ctp), and CMS "dark matter" (.module, .inc). + # COMPREHENSIVE SURFACE AREA: Merged standard suffixes, legacy formats, UI templates (.phtml, .ctp), and CMS "unparsable artifacts" (.module, .inc). "extensions": [ ".php", ".phtml", @@ -2966,9 +2650,9 @@ # Rationale: PHP fundamentally operates within an HTML context, requiring the parser # to explicitly hunt for |::)[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*[ \t]*=|array_(?:push|pop|shift|unshift|splice)\b|(?:\+\+|--)" ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile( r"//\s*[;{}]|/\*\s*(?:function|class|namespace|use|if|foreach)\s|#\s*\$|//\s*(?:echo|print|\$|return|var_dump)" ), # 13. 
doc (Structured Documentation) - "doc": re.compile( - r"/\*\*|@param|@return|@throws|@var|@deprecated|@property|@method" - ), + "doc": re.compile(r"/\*\*|@param|@return|@throws|@var|@deprecated|@property|@method"), # 14. test (Testing & Assertions) "test": re.compile( r"\b(PHPUnit|TestCase|assertSame|assertEquals|assertTrue|assertFalse|mock|spy|expects|toBe|test|it)\b|#\[Test\]" @@ -3054,13 +2732,9 @@ r'\b(view\s*\(|render\s*\(|renderView|extends\s+Controller|Blade::|Twig\\Environment)\b|@(?:if|foreach|yield|section|extends)\b|<\?=|echo\s+[\'"]<|\{\{[^}]*\}\}|\{%\s*[^%]*\s*%\}' ), # 17. closures (Closures / Anonymous Functions) - "closures": re.compile( - r"\b(?:function\s*\([^)]*\)\s*(?:use\s*\([^)]*\)\s*)?\{|fn\s*\([^)]*\)[ \t]*=>)" - ), + "closures": re.compile(r"\b(?:function\s*\([^)]*\)\s*(?:use\s*\([^)]*\)\s*)?\{|fn\s*\([^)]*\)[ \t]*=>)"), # 18. globals (Global / Shared State) - "globals": re.compile( - r"\b(\$_SERVER|\$_SESSION|\$_ENV|\$GLOBALS)\b|\bglobal\s+\$" - ), + "globals": re.compile(r"\b(\$_SERVER|\$_SESSION|\$_ENV|\$GLOBALS)\b|\bglobal\s+\$"), # 19. decorators (Decorators / Annotations) "decorators": re.compile(r"#\[\s*[a-zA-Z0-9_:\\]+[^\]]*\]", re.M), # 20. generics (Generics / Type Parameters) @@ -3073,12 +2747,10 @@ r"\b(array_map|array_filter|array_reduce|array_walk|array_column|array_find|array_any|array_all)\b" ), # 22. scientific (Numerical / Compute Libraries) - "scientific": re.compile( - r"\b(bcadd|bcsub|bcmul|bcdiv|gmp_add|gmp_mul|abs|cos|sin|tan|sqrt|log|exp|pow)\b" - ), + "scientific": re.compile(r"\b(bcadd|bcsub|bcmul|bcdiv|gmp_add|gmp_mul|abs|cos|sin|tan|sqrt|log|exp|pow)\b"), # 23. heat_triggers (Metaprogramming & Reflection) # Magic methods, reflection, and variable variables. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(__(?:get|set|call|callStatic|invoke|destruct|clone)|Reflection(?:Class|Method|Property)|call_user_func(?:_array)?)\b|\$\$[a-zA-Z_\x80-\xff]" ), # 24. 
import (Dependency Inclusions) @@ -3114,20 +2786,16 @@ re.M, ), # 25. ownership (Authorship Metadata) - "ownership": re.compile( - r"@(?:author|copyright)\s+(.*)|(?:Created by|Maintainer):?\s+(.*)", re.I - ), + "ownership": re.compile(r"@(?:author|copyright)\s+(.*)|(?:Created by|Maintainer):?\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(Response|JsonResponse|HtmlResponse|RedirectResponse|Symfony\\Component\\HttpFoundation|Illuminate\\Http\\Response)\b" @@ -3144,10 +2812,8 @@ "macros": re.compile(r"\b(?:Macroable|macro\s*\(|mixin\s*\()\b"), # 35. pointers (Pointer Arithmetic / Memory Addressing) "pointers": re.compile(r"\b(FFI::cast|FFI::addr|FFI::scope|FFI::new)\b"), - # 36. memory_alloc - "memory_alloc": re.compile( - r"\bnew\s+[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*" - ), + # 36. memory_alloc + "memory_alloc": re.compile(r"\bnew\s+[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*"), # 37. inline_asm "inline_asm": None, # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- @@ -3156,42 +2822,30 @@ r"\b(?:Log::|LoggerInterface|logger\(|Monolog\\|error_log|Psr\\Log)\b.*?(?:info|error|warning|debug|trace|notice|critical|alert|emergency)\b", re.I, ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile( - r"\b(echo|print|var_dump|print_r|printf|vprintf|var_export|die|exit|dd|dump)\b" - ), - # 40. 
cast_hits (Explicit Type Casting) - "cast_hits": re.compile( + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\b(echo|print|var_dump|print_r|printf|vprintf|var_export|die|exit|dd|dump)\b"), + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile( r"\((?:int|integer|bool|boolean|float|double|string|array|object|unset)\)\s*|\bsettype\s*\(" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(throw|die|exit|abort)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile( - r"\b(sleep|usleep|time_nanosleep|time_sleep_until)\b" - ), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>|(?>|(?' syntax for multi-line block comments, requiring hybrid parsing logic. - "lexical_family": "embedded_syntax", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # PowerShell uses '#' for standard line-level literature. "_line_anchor": re.compile(r"#"), # Inline comments are also triggered by the '#' token. @@ -3241,19 +2891,17 @@ "_block_start": re.compile(r"<#"), # Block comment end: #> "_block_end": re.compile(r"#>"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # branch: decisions that split flow. Includes ternary operators (?) and null-coalescing (??). "branch": re.compile( r"\b(if|else|elseif|switch|for|foreach|while|do|until|try|catch|finally|throw|trap|break|continue|return)\b|-and|-or|-not|-xor|\?|\?\?", re.I, ), # args: Parameters / Coupling. Captures the param block mass of functions and script files. - "args": re.compile( - r"\bparam\s*\([^)]*\)|\bfunction\s+[a-zA-Z0-9_-]+\s*\([^)]*\)", re.I - ), + "args": re.compile(r"\bparam\s*\([^)]*\)|\bfunction\s+[a-zA-Z0-9_-]+\s*\([^)]*\)", re.I), # linear: Sequential I/O & Network Boundaries. 
Structural boundaries defining scope (process, begin, end). - # EXCLUDES access modifiers (hidden, static) to prevent Geometry Inflation. - "linear": re.compile( + # EXCLUDES access modifiers (hidden, static) to prevent Structural Complexity Inflation. + "structural_boundaries": re.compile( r"\b(function|filter|workflow|configuration|class|enum|process|begin|end|clean|return|exit|using|namespace)\b", re.I, ), @@ -3264,9 +2912,7 @@ re.I | re.M, ), # class_start: Object / Entity Declarations. Defines OO boundaries (Classes and Enums). - "class_start": re.compile( - r"^[ \t]*(?:class|enum)\s+[a-zA-Z_]\w*", re.I | re.M - ), + "class_start": re.compile(r"^[ \t]*(?:class|enum)\s+[a-zA-Z_]\w*", re.I | re.M), # --- PHASE 2: RISK & STRUCTURAL INTEGRITY --- # safety: Defensive Programming. Strict mode, validation attributes, and null-conditional access (?.). "safety": re.compile( @@ -3274,14 +2920,12 @@ re.I, ), # safety_neg: Safety Bypasses. Actively bypassing errors or type checks (Out-Null, SilentlyContinue). - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"-ErrorAction\s+SilentlyContinue|-WarningAction\s+SilentlyContinue|Out-Null|\[void\]|ExecutionPolicy\s+Bypass|\bIgnore\b", re.I, ), # danger: High-Risk Execution. Dynamic code execution and process terminators. - "danger": re.compile( - r"\b(Invoke-Expression|iex|Stop-Process|kill|Exit)\b", re.I - ), + "high_risk_execution": re.compile(r"\b(Invoke-Expression|iex|Stop-Process|kill|Exit)\b", re.I), # io: I/O & Network Boundaries. Disk, Network, and URL fetching (Includes CERN/TBL legacy emulation triggers). "io": re.compile( r"\b(Get-Content|Set-Content|Out-File|Invoke-WebRequest|iwr|Invoke-RestMethod|irm|TcpClient|HttpListener|HTLoad|HTGet|ENQUIRE)\b", @@ -3294,7 +2938,7 @@ ), # 11. flux (State Mutation) # Mutation of state. Captures assignments, scoped variables, array indexing, and anchored increments. 
- "flux": re.compile( + "state_mutation": re.compile( # PATH A: EXPLICIT CMDLET MUTATION r"\bSet-Variable\b|" # PATH B: STANDARD ASSIGNMENT (Variables, Scopes, Properties, and Arrays) @@ -3312,11 +2956,9 @@ r"\$(?:[a-zA-Z]+:)?[a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*|\[[^\]\n]+\]){0,4}[ \t]*(?:\+\+|--)", re.I, ), - # 12. graveyard (Dead / Commented-out Code) + # 12. dead_code (Commented Logic / Deprecated Trails) # Commented-out execution logic indicating dead features. Supports both `//` and `#` style comments. - "graveyard": re.compile( - r"(?:#|<#)[ \t]*(?:function|class|if|foreach|while|return)\b", re.I - ), + "dead_code": re.compile(r"(?:#|<#)[ \t]*(?:function|class|if|foreach|while|return)\b", re.I), # doc: Structured Documentation. Get-Help comment-based documentation. "doc": re.compile( r"\.(?:SYNOPSIS|DESCRIPTION|PARAMETER|EXAMPLE|NOTES|LINK|INPUTS|OUTPUTS|ROLE)\b", @@ -3364,7 +3006,7 @@ re.I, ), # heat_triggers: Metaprogramming & Reflection. Reflection and on-the-fly C# compilation via Add-Type. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(Add-Type|System\.Reflection|System\.Management\.Automation\.Language|Invoke-Expression|iex)\b|&\s*\$[a-zA-Z_]\w*", re.I, ), @@ -3387,9 +3029,9 @@ "planned_debt": GLOBAL_PLANNED_DEBT, "fragile_debt": GLOBAL_FRAGILE_DEBT, "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)\]", re.I), - # 30. civil_war (Formatting Inconsistencies) + # 30. tabs_vs_spaces (Formatting Inconsistencies) # Structural formatting violating norms. Handled natively by the GitGalaxy Signal Processor. - "civil_war": None, + "tabs_vs_spaces": None, "ssr_boundaries": re.compile( r"\b(New-PodeServer|Add-PodeRoute|Write-PodeHtmlResponse|New-UDEndpoint|New-UDPage)\b", re.I, @@ -3405,9 +3047,7 @@ "macros": None, # PowerShell lacks a preprocessor # 35. pointers (Pointer Arithmetic / Memory Addressing) # PHP natively lacks pointers, but FFI (Foreign Function Interface) memory bounds are safely captured. 
- "pointers": re.compile( - r"\[(?:IntPtr|UIntPtr)\]|\[ref\]\s*\$[a-zA-Z_]\w*", re.I - ), + "pointers": re.compile(r"\[(?:IntPtr|UIntPtr)\]|\[ref\]\s*\$[a-zA-Z_]\w*", re.I), "memory_alloc": re.compile( r"\[System\.Runtime\.InteropServices\.Marshal\]::(?:AllocHGlobal|AllocCoTaskMem)", re.I, @@ -3419,39 +3059,35 @@ r"\b(Write-Verbose|Write-Debug|Write-Information|Write-Warning|Start-Transcript|Write-Log)\b", re.I, ), - # print_hits: Standard output. Raw terminal pollution. - "print_hits": re.compile(r"\b(Write-Host|echo)\b", re.I), - # 40. cast_hits (Explicit Type Casting) + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) + "debug_prints": re.compile(r"\b(Write-Host|echo)\b", re.I), + # # 40. explicit_casts (Explicit Type Casting) # Forceful type coercion. PHP has a strict, built-in casting syntax which prevents false positives naturally. - "cast_hits": re.compile( + "explicit_casts": re.compile( r"\[(?:int|long|string|char|byte|bool|double|float|decimal|array|hashtable)\]\s*[\$\(]", re.I, ), - # bailout_hits: Detonators. Aborting execution context. - "bailout_hits": re.compile(r"\b(throw|Exit)\b|-ErrorAction\s+Stop", re.I), - # halt_hits: Temporal Duct Tape. Forcing threads to sleep. - "halt_hits": re.compile(r"\b(Start-Sleep|sleep)\b", re.I), - # 43. bitwise_hits (Bitwise Operations) + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|Exit)\b|-ErrorAction\s+Stop", re.I), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(Start-Sleep|sleep)\b", re.I), + # 43. bitwise_ops (Bitwise Operations) # Low-level byte manipulation. CRITICAL: Removed the pipe '|' (PHP 8 Union Types), ampersand '&' (Pass-by-reference `&$var`), and used lookarounds for `<<` to prevent triggering on Heredocs (`<<\s*/dev/null(?:\s*2>&1)?|\bcurl\s+[^|\n]{1,200}\|[ \t]*(?:bash|sh|zsh)\b' ), # 8. danger (High-Risk Execution / System Calls) # Destructive commands and privilege elevation. 
EXCLUDES echo (Phase 5). - "danger": re.compile( + "high_risk_execution": re.compile( r"\b(rm\s+-[rR]f|sudo|chmod\s+(?:-R[ \t]+)?777|chown\s+(?:-R[ \t]+)?root|mkfs|dd|kill(?:all)?)\b" ), # 9. io (I/O & Network Boundaries) # Redirections, pipes, and network clients. - "io": re.compile( - r">|>>|<|\|(?:&)?|\b(curl|wget|nc|ssh|scp|ftp|rsync|cat|tail|grep|find|xargs|jq)\b" - ), + "io": re.compile(r">|>>|<|\|(?:&)?|\b(curl|wget|nc|ssh|scp|ftp|rsync|cat|tail|grep|find|xargs|jq)\b"), # 10. api (Public Surface Area) # Exported variables and identifiers modifying the global environment. "api": re.compile(r"^[ \t]*export\s+[a-zA-Z_]\w*", re.M), # 11. flux (State Mutation) # Mutation of state via assignment or arithmetic. - "flux": re.compile( + "state_mutation": re.compile( r"^[ \t]*[a-zA-Z_]\w*(?:\[[^\]]+\])?=(?![=~])|\b(?:let|declare)\s+[a-zA-Z_]\w*=|\[\+\]=|\(\([^)]*(?:\+\+|--|[-+*/%]=)[^)]*\)\)", re.M, ), - # 12. graveyard (Dead / Commented-out Code) + # 12. dead_code (Commented Logic / Deprecated Trails) # Commented-out execution logic indicating dead features. - "graveyard": re.compile( - r"#[ \t]*(?:if|for|while|function|export|echo|printf|cd|rm|sudo|ls)\b" - ), + "dead_code": re.compile(r"#[ \t]*(?:if|for|while|function|export|echo|printf|cd|rm|sudo|ls)\b"), # 13. doc (Structured Documentation) "doc": re.compile( r"^[ \t]*#\s*(?:@param|@return|Usage:|Description:|Examples:|Options:)|#\s*shellcheck\s+disable", @@ -3615,20 +3244,16 @@ "generics": None, # 21. comprehensions (Iterators / Comprehensions) # Brace expansions acting as inline loops. - "comprehensions": re.compile( - r"\{[0-9]+(?:\.\.|,)[0-9]+(?:\.\.[0-9]+)?\}|\{[a-zA-Z]\.\.[a-zA-Z]\}" - ), + "comprehensions": re.compile(r"\{[0-9]+(?:\.\.|,)[0-9]+(?:\.\.[0-9]+)?\}|\{[a-zA-Z]\.\.[a-zA-Z]\}"), # 22. scientific (Numerical / Compute Libraries) "scientific": re.compile(r"\b(bc|awk|dc|expr|jq|RANDOM|SRANDOM)\b|\$\(\("), # 23. heat_triggers (Metaprogramming & Reflection) # Sub-languages and indirect expansion. 
(ReDoS Shielded) - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r'\$\([^)]+\)|`[^`]+`|\b(?:awk|sed|perl|python[23]?|ruby)\s+[\'"][^\'"]{0,500}|\beval\s+\$|\$\{!?[a-zA-Z0-9_]+\}' ), # 24. import (Dependency Inclusions) - "import": re.compile( - r"(?:^|[ \t;|&])(?:source\b|\.(?=[ \t]))[ \t]+[^\s;]+", re.M - ), + "import": re.compile(r"(?:^|[ \t;|&])(?:source\b|\.(?=[ \t]))[ \t]+[^\s;]+", re.M), "_dependency_capture": re.compile( # ===================================================================== # [ FUTURE LLM CONTEXT: THE DYNAMIC EXECUTION SHIFT (SHELL) ] @@ -3667,11 +3292,9 @@ # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"#\s*\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"#\s*\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) # Legacy CGI shell environments. "ssr_boundaries": re.compile( @@ -3683,9 +3306,7 @@ r"\b(mkfifo|mknod|inotifywait|inotifywatch|fswatch|tail\s+-f|kill\s+-(?:SIG)?(?:USR1|USR2|HUP|TERM))\b" ), # 33. dependency_injection (Dependency Injection / IoC) - "dependency_injection": re.compile( - r"\$\{1:-\w+\}|\$\{2:-\w+\}|\b(?:command\s+-v|type\s+-p)\b" - ), + "dependency_injection": re.compile(r"\$\{1:-\w+\}|\$\{2:-\w+\}|\b(?:command\s+-v|type\s+-p)\b"), # 34. macros (Preprocessor Directives / Macros) "macros": re.compile(r"^[ \t]*(?:alias|shopt)\b", re.M), # 35. pointers (Pointer Arithmetic / Memory Addressing) @@ -3700,22 +3321,20 @@ "telemetry": re.compile( r"\b(?:logger|syslog|log_info|log_err|log_warn|log_debug)\b|>\s*/dev/(?:stderr|console)" ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile(r"\b(echo|printf|print|read)\b"), - # 40. 
cast_hits - "cast_hits": None, - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile( - r"\b(exit|kill|abort|halt|return\s+[1-9][0-9]*)\b" - ), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\b(sleep|read\s+-t)\b"), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": None, - # 44. sync_locks (Thread Synchronization / Locks) + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\b(echo|printf|print|read)\b"), + # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": None, + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(exit|kill|abort|halt|return\s+[1-9][0-9]*)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(sleep|read\s+-t)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": None, + # 44. sync_locks (Resource Management & Stability) "sync_locks": re.compile(r"\b(flock|mkdir|mkfifo|lockfile|sem)\b"), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile(r"\b(readonly|declare\s+-r|typeset\s+-r)\b"), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(readonly|declare\s+-r|typeset\s+-r)\b"), # 46. cleanup (Resource Cleanup / Teardown) "cleanup": re.compile(r"\b(rm\s+-f|trap\s+.*EXIT|unset|exit|logout)\b"), # 47. 
encapsulation (Access Modifiers / Encapsulation) @@ -3729,9 +3348,7 @@ "serialization_parsing": re.compile(r"\b(jq|yq|awk|sed|xmlstarlet)\b"), "regex_execution": re.compile(r"\b(grep|egrep|sed|awk)\b|=~"), "time_date_logic": re.compile(r"\b(date\s+|sleep\s+|uptime|times)\b"), - "ipc_rpc_bridges": re.compile( - r"\b(curl|wget|nc|netcat|ssh|scp|xargs|socat)\b" - ), + "ipc_rpc_bridges": re.compile(r"\b(curl|wget|nc|netcat|ssh|scp|xargs|socat)\b"), }, }, "ruby": { @@ -3782,9 +3399,9 @@ # UPGRADED: Maps to Family 4 (Hybrid Hash) # Rationale: Uses '#' for single-line comments, but multi-line literature # utilizes the `=begin ... =end` block syntax, requiring hybrid parsing rules. - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # Ruby uses '#' for standard line-level literature (Commented / Non-Executable Text). "_line_anchor": re.compile(r"#"), # Inline comments are also triggered by the '#' token. @@ -3806,7 +3423,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: Access modifiers (encapsulation) and const (freeze_hits). - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(class|module|def|yield|return|super|alias|undef|require|require_relative|include|extend|prepend|attr_reader|attr_writer|attr_accessor|Data\.define)\b" ), # 4. func_start (Executable Logic Anchors) @@ -3835,14 +3452,12 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Dynamic logic bypasses and Sorbet escape hatches. - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(eval|class_eval|instance_eval|module_eval|send|__send__|public_send|binding|instance_variable_set|unsafe_load|T\.unsafe|T\.untyped)\b" ), # 8. danger (High-Risk Execution / System Calls) # Process killers and shell execution. EXCLUDES puts (Phase 5). 
- "danger": re.compile( - r"\b(abort|exit|exit!|system|exec|spawn|fork)\b|`[^`]+`|IO\.popen" - ), + "high_risk_execution": re.compile(r"\b(abort|exit|exit!|system|exec|spawn|fork)\b|`[^`]+`|IO\.popen"), # 9. io (I/O & Network Boundaries) "io": re.compile( r"\b(File|Dir|IO|Net::HTTP|URI\.open|Socket|TCPSocket|FileUtils|ActiveRecord::Base|find|where|create|update|destroy)\b" @@ -3855,13 +3470,11 @@ ), # 11. flux (State Mutation) # Mutation of state. EXCLUDES const (freeze_hits). - "flux": re.compile( + "state_mutation": re.compile( r"@[a-zA-Z_]\w*\s*(?:\+|-|\*|/)?=|@@[a-zA-Z_]\w*\s*(?:\+|-|\*|/)?=|\b(?:push|pop|shift|unshift|delete|clear|merge!|update!|gsub!|map!|select!|reject!)\b|<<" ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"#[ \t]*(?:def|class|module|if|unless|while|puts|p)\b" - ), + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"#[ \t]*(?:def|class|module|if|unless|while|puts|p)\b"), # 13. doc (Structured Documentation) # Captures YARD tags, Sorbet signatures, RDoc blocks/modifiers, and standard documentation headers. "doc": re.compile( @@ -3882,13 +3495,9 @@ r"\b(ActionView|render|render_to_string|ViewComponent::Base|Phlex::HTML|form_with|form_for|link_to|stylesheet_link_tag|Turbo|Stimulus|Hotwire)\b|<%|%>" ), # 17. closures (Closures / Anonymous Functions) - "closures": re.compile( - r"\b(?:do\s*\|[^|]*\||do\b|\{\s*\|[^|]*\||->\s*(?:\([^)]*\))?[ \t]*\{)" - ), + "closures": re.compile(r"\b(?:do\s*\|[^|]*\||do\b|\{\s*\|[^|]*\||->\s*(?:\([^)]*\))?[ \t]*\{)"), # 18. globals (Global / Shared State) - "globals": re.compile( - r"\$[a-zA-Z_]\w*|\b(ENV|ARGV|ARGF|STDIN|STDOUT|STDERR|RUBY_VERSION)\b" - ), + "globals": re.compile(r"\$[a-zA-Z_]\w*|\b(ENV|ARGV|ARGF|STDIN|STDOUT|STDERR|RUBY_VERSION)\b"), # 19. decorators (Decorators / Annotations) # Rails class macros acting as metadata descriptors. "decorators": re.compile( @@ -3897,26 +3506,20 @@ ), # 20. 
generics (Generics / Type Parameters) # Sorbet parameterized types. - "generics": re.compile( - r"\b(?:T::|::T::)?(?:Array|Hash|Set|Enumerable|Class)\[[^\]]*\]" - ), + "generics": re.compile(r"\b(?:T::|::T::)?(?:Array|Hash|Set|Enumerable|Class)\[[^\]]*\]"), # 21. comprehensions (Iterators / Comprehensions) "comprehensions": re.compile( r"\.(?:map|collect|select|reject|reduce|inject|filter_map|flat_map|each_with_object|partition|group_by)\b(?:[ \t]*\{|\s*do)" ), # 22. scientific (Numerical / Compute Libraries) - "scientific": re.compile( - r"\b(Math|Complex|Rational|Matrix|Vector|Numo::NArray|BigDecimal)\b" - ), + "scientific": re.compile(r"\b(Math|Complex|Rational|Matrix|Vector|Numo::NArray|BigDecimal)\b"), # 23. heat_triggers (Metaprogramming & Reflection) # Metaprogramming and runtime object extensions. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(method_missing|define_method|const_missing|respond_to_missing\?|included|extended|prepended|class\s*<<\s*self)\b" ), # 24. import (Dependency Inclusions) - "import": re.compile( - r"\b(?:require|require_relative|load|autoload)\b[^'\"]*['\"]", re.M - ), + "import": re.compile(r"\b(?:require|require_relative|load|autoload)\b[^'\"]*['\"]", re.M), "_dependency_capture": re.compile( # ===================================================================== # [ FUTURE LLM CONTEXT: THE DYNAMIC EXECUTION SHIFT (RUBY) ] @@ -3943,32 +3546,24 @@ re.M, ), # 25. ownership (Authorship Metadata) - "ownership": re.compile( - r"#\s*(?:Author|Created by|Maintainer|Copyright):\s+(.*)", re.I - ), + "ownership": re.compile(r"#\s*(?:Author|Created by|Maintainer|Copyright):\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. 
spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(ActionController::Base|ActionController::API|Sinatra::Base|Hanami::Action|respond_to|format\.html|format\.json)\b" ), # 32. events (Event Emitters / Pub-Sub) - "events": re.compile( - r"\b(Wisper|broadcast|subscribe|ActiveSupport::Notifications\.instrument|publish)\b" - ), + "events": re.compile(r"\b(Wisper|broadcast|subscribe|ActiveSupport::Notifications\.instrument|publish)\b"), # 33. dependency_injection (Dependency Injection / IoC) - "dependency_injection": re.compile( - r"\b(Dry::Container|Dry::AutoInject|include\s+Import|inject)\b" - ), + "dependency_injection": re.compile(r"\b(Dry::Container|Dry::AutoInject|include\s+Import|inject)\b"), # 34. macros (Preprocessor Directives / Macros) # Ruby DSL macros. "macros": re.compile( @@ -3976,13 +3571,9 @@ re.M, ), # 35. pointers (Pointer Arithmetic / Memory Addressing) - "pointers": re.compile( - r"\b(FFI::Pointer|Fiddle::Pointer|Fiddle::Function)\b" - ), + "pointers": re.compile(r"\b(FFI::Pointer|Fiddle::Pointer|Fiddle::Function)\b"), # 36. memory_alloc (Manual Memory Management) - "memory_alloc": re.compile( - r"\b(ObjectSpace|GC\.start|GC\.disable|GC\.enable|FFI::MemoryPointer)\b" - ), + "memory_alloc": re.compile(r"\b(ObjectSpace|GC\.start|GC\.disable|GC\.enable|FFI::MemoryPointer)\b"), # 37. inline_asm "inline_asm": None, # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- @@ -3990,28 +3581,22 @@ "telemetry": re.compile( r"\b(?:Rails\.logger|Logger\.new|SemanticLogger|[a-zA-Z_]\w*logger)\.(?:debug|info|warn|error|fatal|unknown)\b" ), - # 39. 
print_hits (Standard Output / Debug Prints) - "print_hits": re.compile(r"\b(puts|print|p|pp|warn)\b"), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( - r"\b(Integer|Float|String|Array|Hash|Complex|Rational)\b\s*\(" - ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(raise|fail|abort|exit!)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\bsleep\b\s*[0-9.]+"), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"\^|(?]*>|\bwhere\s+[a-zA-Z_]\w*\s*:|\b(?:some|any|each)\s+[A-Z]\w*" - ), + "generics": re.compile(r"<\s*[A-Z][^>]*>|\bwhere\s+[a-zA-Z_]\w*\s*:|\b(?:some|any|each)\s+[A-Z]\w*"), # 21. comprehensions (Iterators / Comprehensions) "comprehensions": re.compile( r"\.(?:map|compactMap|flatMap|filter|reduce|forEach|allSatisfy|contains)\s*(?:\(|\{)" @@ -4183,32 +3752,26 @@ ), # 23. heat_triggers (Metaprogramming & Reflection) # Reflection and Dynamic Dispatch. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(@objc|dynamic|Mirror\(|unsafeBitCast|withUnsafe\w+|KeyPath|WritableKeyPath)\b|\\\.[\w.]+" ), # 24. import (Dependency Inclusions) - "import": re.compile( - r"^[ \t]*(?:@_exported[ \t]+)?import\s+[a-zA-Z_]\w*", re.M - ), + "import": re.compile(r"^[ \t]*(?:@_exported[ \t]+)?import\s+[a-zA-Z_]\w*", re.M), "_dependency_capture": re.compile( r"^[ \t]*(?:@_exported[ \t]+)?import\s+(?:(?:typealias|struct|class|enum|protocol|let|var|func)\s+)?([a-zA-Z_][\w.]+)", re.M, ), # 25. ownership (Authorship Metadata) - "ownership": re.compile( - r"//\s*(?:Created by|Author:|Copyright):\s+(.*)", re.I - ), + "ownership": re.compile(r"//\s*(?:Created by|Author:|Copyright):\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. 
spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(Vapor|Hummingbird|Request|Response|Route|app\.get|app\.post|EventLoopFuture)\b" @@ -4241,39 +3804,31 @@ r"\b(?:Logger|OSLog|os_log)\b|\bLogger\([^)]*\)\.(?:info|error|warning|debug|trace|notice|critical|fault)\b", re.I, ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile(r"\b(print|debugPrint|dump)\b"), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\b(print|debugPrint|dump)\b"), + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile( r"\bas[!?]?\s+[A-Z]\w*|\bis\s+[A-Z]\w*|\b(?:Int|Double|Float|Float16|CGFloat|String|Bool)\s*\(" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile( - r"\b(throw|fatalError|abort|exit|preconditionFailure)\b" - ), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\b(sleep|delay|Task\.sleep)\b"), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>|\^|(?>=|\^="), - # 44. sync_locks (Thread Synchronization / Locks) + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|fatalError|abort|exit|preconditionFailure)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(sleep|delay|Task\.sleep)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"<<|>>|\^|(?>=|\^="), + # 44. 
sync_locks (Resource Management & Stability) "sync_locks": re.compile( r"\b(mutex|lock|synchronized|Semaphore|OSAllocatedUnfairLock|MainActor|distributed)\b", re.I, ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile( - r"\b(let|final|static|readonly|Immutable|Sendable)\b" - ), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(let|final|static|readonly|Immutable|Sendable)\b"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r"\b(deinit|close|free|dispose|shutdown|removeAll)\b\s*\(" - ), + "cleanup": re.compile(r"\b(deinit|close|free|dispose|shutdown|removeAll)\b\s*\("), # 47. encapsulation (Access Modifiers / Encapsulation) "encapsulation": re.compile(r"\b(private|fileprivate|internal)\b"), # 48. listeners (Event Listeners / Observers) - "listeners": re.compile( - r"\.onAppear\(|\.onChange\(|\.sink\(|addObserver|subscribe" - ), + "listeners": re.compile(r"\.onAppear\(|\.onChange\(|\.sink\(|addObserver|subscribe"), # 49. test_skip (Bypassed Tests / Ignored Specs) "test_skip": re.compile(r"\b(XCTSkip|mock\(|stub\(|fake\(|double\()\b"), # --- PHASE 3: HYBRID DOMAIN SENSORS (Swift Specifics) --- @@ -4314,9 +3869,9 @@ # UPGRADED: Maps to Family 2 (Nested C) # Rationale: (CORRECTION) While Kotlin uses // and /* */, it officially allows nested # block comments (/* /* */ */). Using standard C parsing would cause early termination here. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # Standard C-family line comment token "_line_anchor": re.compile(r"//"), # Inline comments follow the same '//' delimiter. @@ -4325,7 +3880,7 @@ "_block_start": re.compile(r"/\*"), # Block comment end: */ "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. 
branch (Control Flow / Branching) # Decisions and logical jumps. Includes modern 'when' and Elvis operator. # EXCLUDES throw (bailout_hits). @@ -4376,12 +3931,10 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Force unwrapping, unsafe casts, and suppression. - "safety_neg": re.compile(r"!!|as(?!\?)\b|\blateinit\s+var\b|@Suppress\b"), + "safety_bypasses": re.compile(r"!!|as(?!\?)\b|\blateinit\s+var\b|@Suppress\b"), # 8. danger (High-Risk Execution / System Calls) # Process killers and raw system triggers. EXCLUDES println (Phase 5). - "danger": re.compile( - r"\b(System\.exit|exitProcess|Runtime\.getRuntime|Thread\.stop)\b" - ), + "high_risk_execution": re.compile(r"\b(System\.exit|exitProcess|Runtime\.getRuntime|Thread\.stop)\b"), # 9. io (I/O & Network Boundaries) "io": re.compile( r"\b(File|InputStream|OutputStream|Retrofit|OkHttpClient|Ktor|HttpClient|RoomDatabase|Dao|SharedPreferences|DataStore|java\.nio)\b" @@ -4393,18 +3946,14 @@ ), # 11. flux (State Mutation) # CRITICAL FIX: Added re.M so it scans every line, not just the first line of the file! - "flux": re.compile( + "state_mutation": re.compile( r"\b(var|MutableList|MutableMap|MutableSet|MutableState|MutableStateFlow|Atomic[A-Za-z0-9]+)\b|^[ \t]*(?:this\.)?[a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*\s*[-+*/%]?=|\.(?:add|addAll|remove|put|set|update)\(", re.M, ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"//[ \t]*(?:val|var|fun|class|interface|object|if|when|for|return|import)\b" - ), + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"//[ \t]*(?:val|var|fun|class|interface|object|if|when|for|return|import)\b"), # 13. doc (Structured Documentation) - "doc": re.compile( - r"/\*\*|@param|@return|@property|@receiver|@constructor|@throws|@see|@since" - ), + "doc": re.compile(r"/\*\*|@param|@return|@property|@receiver|@constructor|@throws|@see|@since"), # 14. 
test (Testing & Assertions) "test": re.compile( r"@(?:Test|ParameterizedTest|BeforeTest|AfterTest)|\b(?:assert[A-Za-z0-9_]*|mockk|spyk|test)\s*\(|\b(?:shouldBe|shouldNotBe)\b|\b(?:every|verify)\s*\{" @@ -4420,9 +3969,7 @@ ), # 17. closures (Closures / Anonymous Functions) # OPTIMIZED: Removed overlapping whitespace quantifiers to fix ReDoS. - "closures": re.compile( - r"\{[ \t\n]*[a-zA-Z_][a-zA-Z0-9_ \t\n:<>,.?]{0,150}?->" - ), + "closures": re.compile(r"\{[ \t\n]*[a-zA-Z_][a-zA-Z0-9_ \t\n:<>,.?]{0,150}?->"), # 18. globals (Global / Shared State) "globals": re.compile( r"\b(object|companion\s+object)\b|^[ \t]*(?:const[ \t]+)?val\s+[A-Z_0-9]+[ \t]*=", @@ -4430,14 +3977,10 @@ ), # 19. decorators (Decorators / Annotations) # OPTIMIZED: Bounded arguments. - "decorators": re.compile( - r"@[a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*(?:\([^)\{]{0,300}\))?" - ), + "decorators": re.compile(r"@[a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*(?:\([^)\{]{0,300}\))?"), # 20. generics (Generics / Type Parameters) # Prevented catastrophic backtracking across newlines. - "generics": re.compile( - r"<\s*(?:in|out)?\s*[A-Z][^>\n]{0,100}>|\breified\b|\bwhere\b" - ), + "generics": re.compile(r"<\s*(?:in|out)?\s*[A-Z][^>\n]{0,100}>|\breified\b|\bwhere\b"), # 21. comprehensions (Iterators / Comprehensions) # Functional collection transformations. "comprehensions": re.compile( @@ -4449,14 +3992,12 @@ ), # 23. heat_triggers (Metaprogramming & Reflection) # Reflection and optimization hooks. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"::class|javaClass|@JvmOverloads|@JvmStatic|@JvmField|@JvmName|\b(inline|crossinline|noinline|invoke|context|tailrec)\b" ), # 24. import (Dependency Inclusions) "import": re.compile(r"^[ \t]*import\s+(?:static[ \t]+)?[\w.]+;?", re.M), - "_dependency_capture": re.compile( - r"^[ \t]*import[ \t\n]+(?:static[ \t\n]+)?([\w.*]+)", re.M - ), + "_dependency_capture": re.compile(r"^[ \t]*import[ \t\n]+(?:static[ \t\n]+)?([\w.*]+)", re.M), # 25. 
ownership (Authorship Metadata) "ownership": re.compile( r"@(?:author|since)\s+(.*)|//\s*(?:Created by|Maintainer|Copyright):\s+(.*)", @@ -4468,11 +4009,9 @@ # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(ApplicationCall|call\.respond|call\.respondText|call\.respondHtml|ServerResponse|ModelAndView)\b" @@ -4486,18 +4025,12 @@ r"@(?:Inject|Module|Provides|Binds|HiltViewModel|AndroidEntryPoint|Component|Autowired)|(?:koin|get|inject)\(\)" ), # 34. macros (Preprocessor Directives / Macros) - "macros": re.compile( - r"@(?:OptIn|RequiresOptIn|Suppress|SuppressWarnings)\b" - ), + "macros": re.compile(r"@(?:OptIn|RequiresOptIn|Suppress|SuppressWarnings)\b"), # 35. pointers (Pointer Arithmetic / Memory Addressing) # Kotlin/Native FFI boundaries. - "pointers": re.compile( - r"\b(?:CPointer|COpaquePointer|CFunction|CValue|CPointed)\b" - ), + "pointers": re.compile(r"\b(?:CPointer|COpaquePointer|CFunction|CValue|CPointed)\b"), # 36. memory_alloc (Manual Memory Management) - "memory_alloc": re.compile( - r"\b(?:memScoped|alloc|allocArray|nativeHeap\.alloc|nativeHeap\.free)\b" - ), + "memory_alloc": re.compile(r"\b(?:memScoped|alloc|allocArray|nativeHeap\.alloc|nativeHeap\.free)\b"), # 37. inline_asm "inline_asm": None, # Usually bridged via C-headers in Native. # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- @@ -4505,51 +4038,39 @@ "telemetry": re.compile( r"\b(?:Timber|Log|Logger|LoggerFactory)\.(?:i|e|w|d|v|info|error|warn|warning|debug|trace|verbose)\b|@Slf4j" ), - # 39. 
print_hits (Standard Output / Debug Prints) - "print_hits": re.compile(r"\b(println|print)\b\s*\("), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\b(println|print)\b\s*\("), + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile( r"\bas\??\s+[A-Z]\w*|\.to(?:Int|Long|Short|Byte|Double|Float|String|Boolean|UInt|ULong|UShort|UByte)\(\)" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(throw|raise|exitProcess|return|panic)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\b(delay|Thread\.sleep|yield)\b"), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile( - r"\.(?:shl|shr|ushr|and|or|xor|inv)\(|\b(?:shl|shr|ushr|xor)\b" - ), - # 44. sync_locks (Thread Synchronization / Locks) - "sync_locks": re.compile( - r"\b(mutex|lock|synchronized|Semaphore|Atomic[A-Z]\w*)\b", re.I - ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile(r"\b(val|const|immutable|readonly)\b"), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|raise|exitProcess|return|panic)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(delay|Thread\.sleep|yield)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"\.(?:shl|shr|ushr|and|or|xor|inv)\(|\b(?:shl|shr|ushr|xor)\b"), + # 44. sync_locks (Resource Management & Stability) + "sync_locks": re.compile(r"\b(mutex|lock|synchronized|Semaphore|Atomic[A-Z]\w*)\b", re.I), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(val|const|immutable|readonly)\b"), # 46. cleanup (Resource Cleanup / Teardown) "cleanup": re.compile(r"\b(close|dispose|shutdown|use|cleanup)\b\s*\("), # 47. 
encapsulation (Access Modifiers / Encapsulation) "encapsulation": re.compile(r"\b(private|protected|internal)\b"), # 48. listeners (Event Listeners / Observers) - "listeners": re.compile( - r"\.(?:collect|observe|subscribe|on[A-Z]\w*|set[A-Z]\w*Listener)\(" - ), + "listeners": re.compile(r"\.(?:collect|observe|subscribe|on[A-Z]\w*|set[A-Z]\w*Listener)\("), # 49. test_skip (Bypassed Tests / Ignored Specs) - "test_skip": re.compile( - r"@(?:Ignore|Disabled)|test\.skip\(|mockk|spyK|fake\(" - ), + "test_skip": re.compile(r"@(?:Ignore|Disabled)|test\.skip\(|mockk|spyK|fake\("), # --- PHASE 3: HYBRID DOMAIN SENSORS (Kotlin Specifics) --- "serialization_parsing": re.compile( r"\b(Json\.decodeFromString|Json\.encodeToString|Gson\(\)|Moshi|ObjectMapper)\b" ), - "regex_execution": re.compile( - r"\b(Regex\(\)|\.toRegex\(\)|\.matches\(|\.find\()\b" - ), + "regex_execution": re.compile(r"\b(Regex\(\)|\.toRegex\(\)|\.matches\(|\.find\()\b"), "time_date_logic": re.compile( r"\b(Clock\.System\.now|Instant\.now|System\.currentTimeMillis|Duration\.minutes|LocalDate)\b" ), - "ipc_rpc_bridges": re.compile( - r"\b(Intent\(|BroadcastReceiver|HttpClient\(|ProcessBuilder|bindService)\b" - ), + "ipc_rpc_bridges": re.compile(r"\b(Intent\(|BroadcastReceiver|HttpClient\(|ProcessBuilder|bindService)\b"), }, }, "sqlite": { @@ -4577,9 +4098,9 @@ "shebangs": ["sqlite3", "sqlite"], # UPGRADED: Maps to Family 5 (Hybrid Dash) # Rationale: Uses '--' for line-level and '/*' '*/' for block-level Commented / Non-Executable Text. - "lexical_family": "multi_style_dash", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # SQLite uses '--' for standard line-level literature. "_line_anchor": re.compile(r"--"), # Inline comments are also triggered by the '--' token. 
@@ -4588,7 +4109,7 @@ "_block_start": re.compile(r"/\*"), # Block comment end: */ "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Decisions and logical filters. Includes case logic and modern IIF(). "branch": re.compile( @@ -4604,7 +4125,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries defining query execution flow. - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(SELECT|FROM|JOIN|INNER\s+JOIN|LEFT\s+JOIN|CROSS\s+JOIN|GROUP\s+BY|ORDER\s+BY|LIMIT|OFFSET|UNION|INTERSECT|EXCEPT|RETURNING|AS|INTO|WINDOW|STRICT|WITHOUT\s+ROWID|PARTITION\s+BY|PRECEDING|FOLLOWING|UNBOUNDED|CURRENT\s+ROW)\b", re.I, ), @@ -4638,15 +4159,13 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Bypassing safety checks and structural removals. - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(DROP\s+TABLE|DROP\s+VIEW|DROP\s+INDEX|PRAGMA\s+foreign_keys[ \t]*=\s*(?:0|OFF)|PRAGMA\s+writable_schema[ \t]*=\s*(?:1|ON)|PRAGMA\s+ignore_check_constraints[ \t]*=\s*(?:1|ON)|IF\s+EXISTS)\b", re.I, ), # 8. danger (High-Risk Execution / System Calls) # Destructive schema actions and system bypasses. - "danger": re.compile( - r"\b(PRAGMA\s+legacy_alter_table|DROP\s+DATABASE|\.shell|\.system|\.exit|\.quit)\b" - ), + "high_risk_execution": re.compile(r"\b(PRAGMA\s+legacy_alter_table|DROP\s+DATABASE|\.shell|\.system|\.exit|\.quit)\b"), # 9. io (I/O & Network Boundaries) "io": re.compile( r"\b(SELECT|INSERT|UPDATE|DELETE|REPLACE|ATTACH\s+DATABASE|DETACH\s+DATABASE|\.import|\.output|\.dump|\.read|readfile|writefile)\b", @@ -4660,19 +4179,17 @@ ), # 11. flux (State Mutation) # Mutation of state. Includes UPSERT. 
- "flux": re.compile( + "state_mutation": re.compile( r"\b(UPDATE|SET|ALTER\s+TABLE|ADD\s+COLUMN|DROP\s+COLUMN|RENAME\s+TO|UPSERT|ON\s+CONFLICT\s+DO\s+UPDATE|ON\s+CONFLICT\s+DO\s+NOTHING|REPLACE\s+INTO|EXCLUDED\.[a-zA-Z_]\w*|jsonb?_(?:insert|replace|set|remove|patch))\b", re.I, ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile( r"--[ \t]*(?:SELECT|INSERT|UPDATE|DELETE|CREATE|DROP|ALTER|PRAGMA)\b", re.I, ), # 13. doc (Structured Documentation) - "doc": re.compile( - r"--\s*@(?:param|return|brief|table|column)|/\*\*|--\s*Description:" - ), + "doc": re.compile(r"--\s*@(?:param|return|brief|table|column)|/\*\*|--\s*Description:"), # 14. test (Testing & Assertions) "test": re.compile( r"\b(?:EXPLAIN[ \t]+QUERY[ \t]+PLAN|PRAGMA[ \t]+integrity_check|PRAGMA[ \t]+foreign_key_check|\.testcase|\.lint)\b", @@ -4715,7 +4232,7 @@ ), # 23. heat_triggers (Metaprogramming & Reflection) # Recursive logic and JSON paths. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(WITH\s+RECURSIVE|GENERATED\s+ALWAYS\s+AS|STORED|VIRTUAL)\b|->>|->|\b(?:json_extract|jsonb_extract)\b", re.I, ), @@ -4729,20 +4246,16 @@ re.I | re.M, ), # 25. ownership (Authorship Metadata) - "ownership": re.compile( - r"--\s*(?:Author|Created by|Maintainer|Copyright):\s+(.*)", re.I - ), + "ownership": re.compile(r"--\s*(?:Author|Created by|Maintainer|Copyright):\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"--\s*\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. 
civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"--\s*\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries "ssr_boundaries": None, # 32. events (Event Emitters / Pub-Sub) @@ -4775,44 +4288,32 @@ r"\b(?:sqlite_stat1|sqlite_stat4|ANALYZE)\b|^[ \t]*\.(?:trace|log|show|stats|timer)\b", re.I | re.M, ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile( + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile( r"\b(?:disp|warning|fprintf(?![ \t]*\([ \t]*[a-zA-Z_]))\b|^\.print\b|^\.echo\b", re.I | re.M, ), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( - r"\bCAST[ \t]*\([^)]+[ \t]+AS[ \t]+[a-zA-Z_]+\s*\)", re.I - ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile( - r"\b(ABORT|RAISE|EXIT|QUIT)\b|\.exit|\.quit", re.I - ), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\bPRAGMA\s+busy_timeout\b|\.pause", re.I), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>|\^|~|(?>|\^|~|(? and permissive HTML parser form --!>. "_block_end": re.compile(r"--!?>"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # User-driven branching and declarative framework conditionals. "branch": re.compile( @@ -4887,7 +4386,7 @@ ), # 3. linear (Sequential Boundaries) # Structural document flow tags. Includes 1990 CERN tags (,
) alongside modern semantic ones. - "linear": re.compile( + "structural_boundaries": re.compile( r"<(?:html|head|body|main|section|article|header|footer|div|span|p|h[1-6]|ul|ol|li|dl|dt|dd|nav|aside|figure|figcaption|search|address|nextid|hp[1-2]|dir|menu)\b", re.I, ), @@ -4908,13 +4407,13 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Actively bypasses standard browser safety (e.g. target="_blank" without noopener). - "safety_neg": re.compile( + "safety_bypasses": re.compile( r'target="_blank"(?!\s+rel="noopener")|href="javascript:[^"]*"|on[a-z]+="[^"]*(?:eval\(|document\.write\()', re.I, ), # 8. danger (High-Risk Execution / System Calls) # HTML is declarative markup. Execution dangers (eval, setTimeout) belong in JS. - "danger": None, + "high_risk_execution": None, # 9. io (I/O & Network Boundaries) # Hyperlink navigation and resource fetching. (The core of the Web). "io": re.compile( @@ -4929,10 +4428,10 @@ ), # 11. flux (State Mutation) # HTML is declarative markup. State mutation (DOM manipulation) belongs in JS. - "flux": None, - # 12. graveyard (Dead / Commented-out Code) + "state_mutation": None, + # 12. dead_code (Commented Logic / Deprecated Trails) # Commented-out structural logic. - "graveyard": re.compile( + "dead_code": re.compile( r"' "_line_anchor": re.compile(r"--+(?![!#$%&*+./<=>?@\\^|~-])"), @@ -6707,17 +6063,13 @@ "_block_start": re.compile(r"\{-"), # Block comment end: -} "_block_end": re.compile(r"-\}"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # branch: decisions that split flow. Includes guards (|) and modern \cases. - "branch": re.compile( - r"\b(if|then|else|case|of|MultiWayIf)\b|\\cases?|^[ \t]*\|", re.M - ), + "branch": re.compile(r"\b(if|then|else|case|of|MultiWayIf)\b|\\cases?|^[ \t]*\|", re.M), # args: Parameters / Coupling. Captures type signatures, lambda bindings, and explicit @type apps. 
- "args": re.compile( - r"::\s*[^=\n]+(?:->|=>|⊸)|\\[a-zA-Z0-9_\'\s,()\[\]]+->|@[A-Z][a-zA-Z0-9_\']*" - ), + "args": re.compile(r"::\s*[^=\n]+(?:->|=>|⊸)|\\[a-zA-Z0-9_\'\s,()\[\]]+->|@[A-Z][a-zA-Z0-9_\']*"), # linear: Sequential I/O & Network Boundaries. Structural boundaries defining scope and data definitions. - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(module|data|type|newtype|class|instance|let|in|where|do|mdo|deriving|family|pattern)\b|%1\s*->|⊸" ), # 4. func_start: Executable Logic Anchors. Anchors executable logic (Type Signatures). @@ -6745,13 +6097,11 @@ r"\b(Maybe|Either|Just|Nothing|Right|Left|try|catch|bracket|finally|onException|SafeT|mask|pure|return)\b" ), # safety_neg: Safety Bypasses. Bypassing purity (unsafePerformIO) and partial functions. - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(unsafePerformIO|unsafeCoerce|error|undefined|fromJust|head|tail|init|last|throw|unsafeFixIO)\b" ), # danger: High-Risk Execution. Forceful aborts and Debug-trace leaks in production. - "danger": re.compile( - r"\b(die|exitWith|exitFailure|Debug\.Trace|trace|traceShow|traceIO|traceM)\b" - ), + "high_risk_execution": re.compile(r"\b(die|exitWith|exitFailure|Debug\.Trace|trace|traceShow|traceIO|traceM)\b"), # io: I/O & Network Boundaries. IO Monad and hardware interactions. "io": re.compile( r"\b(IO|readFile|writeFile|appendFile|hGetContents|hPutStr|openFile|withFile|getLine|getChar|Socket|Connection|runDB)\b" @@ -6762,11 +6112,11 @@ re.M, ), # flux: State Mutation. State mutation (IORef/MVar) and monadic binds (<-). - "flux": re.compile( + "state_mutation": re.compile( r"\b(IORef|STRef|TVar|MVar|TMVar|modifyIORef\'?|writeIORef|putMVar|modify|put|StateT)\b|<-" ), - # graveyard: Dead / Commented-out Code. Commented out structural code trails. - "graveyard": re.compile( + # 12. 
dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile( r"--\s*(?:data|type|newtype|class|instance|let|where|import|putStrLn)\b", re.M, ), @@ -6778,13 +6128,9 @@ ), # --- PHASE 3: ARCHITECTURE & DOMAIN SENSORS --- # concurrency: Temporal Static. STM, async, and thread forking. - "concurrency": re.compile( - r"\b(forkIO|forkOS|async|wait|cancel|MVar|TVar|STM|atomically|threadDelay)\b" - ), + "concurrency": re.compile(r"\b(forkIO|forkOS|async|wait|cancel|MVar|TVar|STM|atomically|threadDelay)\b"), # ui_framework: UI / View Components. Functional reactive GUI and web components. - "ui_framework": re.compile( - r"\b(Threepenny|Brick|Reflex|Miso|Gtk|widget|vBox|hBox|Lucid|Blaze|Monomer)\b" - ), + "ui_framework": re.compile(r"\b(Threepenny|Brick|Reflex|Miso|Gtk|widget|vBox|hBox|Lucid|Blaze|Monomer)\b"), # closures: Closures / Anonymous Functions. Anonymous lambda depth. "closures": re.compile(r"\\[a-zA-Z0-9_\'\s(),\[\]]+\s*->|\\cases?"), # globals: Global / Shared State. Top-level state hacks (typically MVars using unsafePerformIO). @@ -6793,9 +6139,7 @@ re.M, ), # decorators: Decorators / Annotations. GHC pragmas (INLINE, LANGUAGE). - "decorators": re.compile( - r"\{-#\s*(?:INLINE|NOINLINE|LANGUAGE|OPTIONS_GHC|RULES|MINIMAL)\s+[^#]*#-\}" - ), + "decorators": re.compile(r"\{-#\s*(?:INLINE|NOINLINE|LANGUAGE|OPTIONS_GHC|RULES|MINIMAL)\s+[^#]*#-\}"), # generics: Generics / Type Parameters. forall quantification and constraints. "generics": re.compile( r"\bforall\s+[^.]+\.|\b(?:[A-Z][a-zA-Z0-9_\']*\s+[a-z][a-zA-Z0-9_\']*[ \t]*=>)|\([^)]+\)[ \t]*=>" @@ -6807,92 +6151,68 @@ r"\b(Complex|RealFloat|Floating|Numeric\.LinearAlgebra|Matrix|Vector|ad|grad|jacobian|sin|cos|tan|exp|log|pi)\b" ), # heat_triggers: Metaprogramming & Reflection. QuasiQuotes and Template Haskell. 
- "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(TemplateHaskell|QuasiQuotes|TypeFamilies|GHC\.Generics|Generic)\b|\[[a-z_]+\||\$\([a-zA-Z0-9_\']+\)" ), # import: Dependency Inclusions. Module resolution. - "import": re.compile( - r"^[ \t]*import\s+(?:qualified[ \t]+)?[A-Z][a-zA-Z0-9_.]*", re.M - ), - "_dependency_capture": re.compile( - r"^[ \t]*import\s+(?:qualified\s+)?([A-Z][a-zA-Z0-9_.]*)", re.M - ), + "import": re.compile(r"^[ \t]*import\s+(?:qualified[ \t]+)?[A-Z][a-zA-Z0-9_.]*", re.M), + "_dependency_capture": re.compile(r"^[ \t]*import\s+(?:qualified\s+)?([A-Z][a-zA-Z0-9_.]*)", re.M), # ownership: Authorship indicators in comments. - "ownership": re.compile( - r"--\s*\|?\s*(?:Author|Maintainer|Copyright|License):\s+([^\n]+)", re.I - ), + "ownership": re.compile(r"--\s*\|?\s*(?:Author|Maintainer|Copyright|License):\s+([^\n]+)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- "planned_debt": GLOBAL_PLANNED_DEBT, "fragile_debt": GLOBAL_FRAGILE_DEBT, "spec_exposure": re.compile(r"\[(?:spec-[0-9]+|audit|rfc)\]", re.I), - "civil_war": None, + "tabs_vs_spaces": None, "ssr_boundaries": re.compile( r"\b(Yesod|Servant|ScottyM|ActionM|lucid|blaze-html|ToJSON|FromJSON|Handler|respond)\b" ), "events": re.compile( r"\b(Event|Behavior|Dynamic|reactive-banana|reflex|frp|stepper|accumE|conduit|Pipes|Stream)\b" ), - "dependency_injection": re.compile( - r"\b(ReaderT|MonadReader|Has[A-Z][a-zA-Z0-9_\']+|ask|asks|local)\b" - ), + "dependency_injection": re.compile(r"\b(ReaderT|MonadReader|Has[A-Z][a-zA-Z0-9_\']+|ask|asks|local)\b"), "macros": re.compile( r"\{-#\s*LANGUAGE\s+[^#]*#-\}|\$[(a-z_A-Z0-9\']|^[ \t]*#(?:define|undef|if|ifdef|ifndef|elif|else|endif|include)\b", re.M, ), - "pointers": re.compile( - r"\b(Ptr|ForeignPtr|FunPtr|StablePtr|peek|poke|castPtr|plusPtr|nullPtr|Storable)\b" - ), - "memory_alloc": re.compile( - r"\b(malloc|mallocBytes|alloca|allocaBytes|free|Foreign\.Marshal)\b" - ), - "inline_asm": re.compile( - 
r"\bforeign\s+import\s+(?:ccall|cplusplus|prim|capi)\b" - ), + "pointers": re.compile(r"\b(Ptr|ForeignPtr|FunPtr|StablePtr|peek|poke|castPtr|plusPtr|nullPtr|Storable)\b"), + "memory_alloc": re.compile(r"\b(malloc|mallocBytes|alloca|allocaBytes|free|Foreign\.Marshal)\b"), + "inline_asm": re.compile(r"\bforeign\s+import\s+(?:ccall|cplusplus|prim|capi)\b"), # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # telemetry: Professional structured logging. - "telemetry": re.compile( - r"\b(?:logDebug|logInfo|logWarn|logError|logOther|katip|MonadLogger|LoggerT)\b" - ), - # print_hits: Standard output. - "print_hits": re.compile(r"\b(putStr|putStrLn|print|putChar)\b"), - # cast_hits: "Trust Me" Tax. - "cast_hits": re.compile( - r"\b(unsafeCoerce|coerce|fromIntegral|realToFrac|floor|ceiling|truncate|round)\b" - ), - # bailout_hits: The Detonators. - "bailout_hits": re.compile(r"\b(throw|throwIO|panic|error)\b"), - # halt_hits: Temporal Duct Tape. - "halt_hits": re.compile(r"\b(threadDelay)\b"), - # bitwise_hits: Sub-Atomic Math. - "bitwise_hits": re.compile( + "telemetry": re.compile(r"\b(?:logDebug|logInfo|logWarn|logError|logOther|katip|MonadLogger|LoggerT)\b"), + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) + "debug_prints": re.compile(r"\b(putStr|putStrLn|print|putChar)\b"), + # # # 40. explicit_casts (Explicit Type Casting) "Trust Me" Tax. + "explicit_casts": re.compile(r"\b(unsafeCoerce|coerce|fromIntegral|realToFrac|floor|ceiling|truncate|round)\b"), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|throwIO|panic|error)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(threadDelay)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile( r"\b(?:shift[LR]?|rotate[LR]?|xor|complement|testBit|setBit|clearBit|complementBit)\b|\.&&\.|\|\.\|\|\." ), # sync_locks: Barricades preventing races. 
- "sync_locks": re.compile( - r"\b(takeMVar|putMVar|readMVar|swapMVar|atomically|STM|Mutex|lock|unlock)\b" - ), - # freeze_hits: Data Cryogenics. Implicit in pure Haskell, but explicit in mutable contexts. - "freeze_hits": re.compile(r"\b(pure|return|frozen|immutable|const)\b"), - # cleanup: The Janitor. - "cleanup": re.compile( - r"\b(hClose|close|free|bracket|finally|onException)\b" - ), - # encapsulation: The Vault. + "sync_locks": re.compile(r"\b(takeMVar|putMVar|readMVar|swapMVar|atomically|STM|Mutex|lock|unlock)\b"), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(pure|return|frozen|immutable|const)\b"), + # 46. cleanup (Resource Cleanup / Teardown) + "cleanup": re.compile(r"\b(hClose|close|free|bracket|finally|onException)\b"), + # 47. encapsulation (Encapsulation / Access Modifiers) "encapsulation": re.compile( r"^[ \t]*module\s+[A-Z][a-zA-Z0-9_.]*\s*\([^)]*\)\s*where", re.M - ), # Explicit export list = Encapsulated - # listeners: The Sinks. + ), + # 48. listeners (Event Listeners / Observers) "listeners": re.compile(r"\b(subscribe|onEvent|addEventListener|watch)\b"), - # test_skip: Safety Theater. + # 49. test_skip (Bypassed Tests / Ignored Specs) Safety Theater. 
"test_skip": re.compile(r"\b(ignore|pending|skip|xit|xdescribe)\b"), # --- PHASE 3: HYBRID DOMAIN SENSORS (Haskell Specifics) --- "serialization_parsing": re.compile( r"\b(Data\.Aeson|decode|decodeStrict|fromJSON|Data\.Binary|Data\.Serialize)\b" ), "regex_execution": re.compile(r"\b(Text\.Regex|makeRegex|matchRegex|=~)\b"), - "time_date_logic": re.compile( - r"\b(getCurrentTime|diffUTCTime|addUTCTime|System\.Time|threadDelay)\b" - ), + "time_date_logic": re.compile(r"\b(getCurrentTime|diffUTCTime|addUTCTime|System\.Time|threadDelay)\b"), "ipc_rpc_bridges": re.compile( r"\b(System\.Process|createProcess|callProcess|callCommand|forkIO|Control\.Concurrent)\b" ), @@ -6921,9 +6241,9 @@ # UPGRADED: Maps to Family 3 (Pure Hash) # Rationale: Uses '#' for line-level literature; multi-line literature # (docstrings) is handled by the Section 2.3.C.3 Heuristic Pass. - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- # MicroPython uses '#' for line-level Commented / Non-Executable Text. "_line_anchor": re.compile(r"#"), # Inline comments are also triggered by the '#' token. @@ -6932,12 +6252,10 @@ # (Note: Multi-line strings used as docs are handled by the 2.3.C Python Heuristic). "_block_start": None, "_block_end": None, - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Decisions and logical jumps. EXCLUDES raise (bailout_hits). - "branch": re.compile( - r"\b(if|elif|else|for|while|with|try|finally|match|case|and|or)\b" - ), + "branch": re.compile(r"\b(if|elif|else|for|while|with|try|finally|match|case|and|or)\b"), # 2. args (Parameters / Coupling) # Parameter blocks of functions/lambdas. Bounded negation to prevent ReDoS. "args": re.compile( @@ -6946,7 +6264,7 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. 
EXCLUDES: _private (encapsulation) and Final (freeze_hits). - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(def|class|return|import|from|as|pass|continue|break|yield|await|assert|del|global|nonlocal|type)\b" ), # 4. func_start (Executable Logic Anchors) @@ -6968,13 +6286,13 @@ ), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Bare excepts and blocking the event loop (detrimental in embedded async). - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\bpass\b[ \t]*$|except\s*[:\n]|except\s+(?:Base)?Exception|from\s+[\w.]+\s+import\s+\*|\btime\.sleep(?:_ms|_us)?\b", re.M, ), # 8. danger (High-Risk Execution / System Calls) # Hardware resets and raw memory pokes. EXCLUDES TODO (debt) and print (print_hits). - "danger": re.compile( + "high_risk_execution": re.compile( r"\b(machine\.reset|machine\.deepsleep|machine\.bootloader|machine\.disable_irq|eval|exec|sys\.exit)\b" ), # 9. io (I/O & Network Boundaries) @@ -6990,21 +6308,15 @@ ), # 11. flux (State Mutation) # State mutation including hardware value toggling. - "flux": re.compile( + "state_mutation": re.compile( r"\bglobal\b|\bnonlocal\b|\b(?:self|cls)\.\w+[ \t]*=|:=|(?:\.\w+)?\.(?:append|extend|update|pop|remove|insert|clear)\s*\(|\.(?:value|on|off|high|low|toggle)\s*\(" ), - # 12. graveyard (Dead / Commented-out Code) - "graveyard": re.compile( - r"#[ \t]*(?:def|class|import|if|for|while|try|print|machine\.Pin)\b" - ), + # 12. dead_code (Commented Logic / Deprecated Trails) + "dead_code": re.compile(r"#[ \t]*(?:def|class|import|if|for|while|try|print|machine\.Pin)\b"), # 13. doc (Structured Documentation) - "doc": re.compile( - r'"""|\'\'\'|:param|:return|:raises|:type|#\s*Pin[ \t]*=|#\s*GPIO' - ), + "doc": re.compile(r'"""|\'\'\'|:param|:return|:raises|:type|#\s*Pin[ \t]*=|#\s*GPIO'), # 14. 
test (Testing & Assertions) - "test": re.compile( - r"\b(unittest|pytest|assert|test_|setUp|tearDown|Mock)\b" - ), + "test": re.compile(r"\b(unittest|pytest|assert|test_|setUp|tearDown|Mock)\b"), # --- PHASE 3: ARCHITECTURE & DOMAIN SENSORS --- # 15. concurrency (Asynchronous Execution) "concurrency": re.compile( @@ -7018,9 +6330,7 @@ # 17. closures (Closures / Anonymous Functions) "closures": re.compile(r"\blambda\b"), # 18. globals (Global / Shared State) - "globals": re.compile( - r"\bglobal\b|\bglobals\(\)|\blocals\(\)|\b(sys\.path|sys\.modules|os\.environ)\b" - ), + "globals": re.compile(r"\bglobal\b|\bglobals\(\)|\blocals\(\)|\b(sys\.path|sys\.modules|os\.environ)\b"), # 19. decorators (Decorators / Annotations) # Generic decorators. (Specific ASM/Viper optimizations moved to heat_triggers/inline_asm). "decorators": re.compile( @@ -7032,39 +6342,31 @@ r"\b(?:List|Dict|Set|Tuple|Optional|Union|Any|Callable|Sequence|Iterable)\[[^\]]*\]|->" ), # 21. comprehensions (Iterators / Comprehensions) - "comprehensions": re.compile( - r"\[[^\]]*\bfor\b[^\]]*\]|\{[^}]*\bfor\b[^}]*\}|\([^)]*\bfor\b[^)]*\)" - ), + "comprehensions": re.compile(r"\[[^\]]*\bfor\b[^\]]*\]|\{[^}]*\bfor\b[^}]*\}|\([^)]*\bfor\b[^)]*\)"), # 22. scientific (Numerical / Compute Libraries) # Math, complex arrays, and ulab (MicroPython's NumPy). "scientific": re.compile( r"\b(math|cmath|ulab|numpy|ndarray|struct\.pack|struct\.unpack|bin|hex|oct|abs|sin|cos|tan)\b" ), # 23. heat_triggers (Metaprogramming & Reflection) - # Extreme "Logic Heat": Dunder methods and Viper/Native emitters. - "heat_triggers": re.compile( + # High Cognitive Load: Dunder methods and Viper/Native emitters. + "reflection_metaprogramming": re.compile( r"__(?:getattr|setattr|new|call|dict|dir|import)__|@(?:staticmethod|classmethod|property)|@micropython\.(?:viper|native)\b|\b(?:getattr|setattr|hasattr)\b" ), # 24. 
import (Dependency Inclusions) "import": re.compile(r"^[ \t]*(?:import|from)\b\s+[\w.]+", re.M), - "_dependency_capture": re.compile( - r"^[ \t]*(?:import|from)\b\s+([\w.]+)", re.M - ), + "_dependency_capture": re.compile(r"^[ \t]*(?:import|from)\b\s+([\w.]+)", re.M), # 25. ownership (Authorship Metadata) - "ownership": re.compile( - r"(?:__author__[ \t]*=|Author:|Created by:)\s*(.*)", re.I - ), + "ownership": re.compile(r"(?:__author__[ \t]*=|Author:|Created by:)\s*(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) # Lightweight web servers (Microdot, Picoweb). "ssr_boundaries": re.compile( @@ -7085,37 +6387,29 @@ "pointers": re.compile( r"\b(uctypes\.addressof|uctypes\.bytearray_at|ptr8|ptr16|ptr32|machine\.mem8|machine\.mem16|machine\.mem32)\b" ), - # 36. memory_alloc - "memory_alloc": re.compile( - r"\b(bytearray|memoryview|alloc_emergency_exception_buf)\b" - ), + # 36. memory_alloc + "memory_alloc": re.compile(r"\b(bytearray|memoryview|alloc_emergency_exception_buf)\b"), # 37. inline_asm (The Bare Metal) - "inline_asm": re.compile( - r"@(?:micropython\.asm_thumb|micropython\.asm_xtensa|rp2\.asm_pio)\b" - ), + "inline_asm": re.compile(r"@(?:micropython\.asm_thumb|micropython\.asm_xtensa|rp2\.asm_pio)\b"), # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. 
telemetry (Structured Logging / Telemetry) "telemetry": re.compile( r"\b(logging|logger|ulogging|syslog)\.(?:info|error|warn|warning|debug|trace|critical|exception)\b" ), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile(r"\b(print|input)\s*\("), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( - r"\b(int|str|float|list|dict|set|tuple|bool|bytes|cast)\b\s*\(" - ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(raise|quit|exit|sys\.exit|abort)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\b(time\.sleep|asyncio\.sleep|Thread\.join)\b"), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"<<|>>|(?>|(?)[ \t]*(?:MOVE|COMPUTE|IF|PERFORM|CALL|EXEC)\b", re.I | re.M, ), @@ -7292,14 +6576,10 @@ re.I | re.M, ), # 14. test: Testing & Assertions. Unit testing framework markers (ZUnit). - "test": re.compile( - r"\b(ZUNIT|CBLUNIT|ASSERT|TEST-CASE|READY\s+TRACE)\b", re.I - ), + "test": re.compile(r"\b(ZUNIT|CBLUNIT|ASSERT|TEST-CASE|READY\s+TRACE)\b", re.I), # --- PHASE 3: ARCHITECTURE & DOMAIN SENSORS --- # 15. concurrency: Temporal Static. CICS Task and resource coordination. - "concurrency": re.compile( - r"\bEXEC\s+CICS\s+(?:ENQ|DEQ|WAIT|START|DELAY)\b", re.I - ), + "concurrency": re.compile(r"\bEXEC\s+CICS\s+(?:ENQ|DEQ|WAIT|START|DELAY)\b", re.I), # 16. ui_framework: UI / View Components. Screen sections and CICS maps. "ui_framework": re.compile( r"\b(SCREEN\s+SECTION|EXEC\s+CICS\s+SEND\s+MAP|DFHMDF|DFHMDI|DFHMSD)\b", @@ -7308,18 +6588,14 @@ # 17. closures: Closures / Anonymous Functions. (COBOL lacks native lambdas). "closures": None, # 18. globals: Global / Shared State. Global storage and external linkages. - "globals": re.compile( - r"\b(WORKING-STORAGE\s+SECTION|COMMON|GLOBAL|EXTERNAL)\b", re.I - ), + "globals": re.compile(r"\b(WORKING-STORAGE\s+SECTION|COMMON|GLOBAL|EXTERNAL)\b", re.I), # 19. decorators: Decorators / Annotations. 
(COBOL uses compiler directives). "decorators": re.compile( r"^(?:[0-9a-zA-Z \t]{6}[ \-]?)?[ \t]*>>\s*(?:IF|ELSE|END-IF|DEFINE|CALL-CONVENTION)", re.I | re.M, ), # 20. generics: Generics / Type Parameters. Parameterized classes (Modern COBOL). - "generics": re.compile( - r"\bCLASS-ID\.\s+[A-Za-z0-9_-]+\s+USING\s+[A-Za-z0-9_-]+", re.I - ), + "generics": re.compile(r"\bCLASS-ID\.\s+[A-Za-z0-9_-]+\s+USING\s+[A-Za-z0-9_-]+", re.I), # 21. comprehensions: Iterators / Comprehensions. (Not native to COBOL). "comprehensions": None, # 22. scientific: Numerical / Compute Libraries. Intrinsic math functions. @@ -7328,7 +6604,7 @@ re.I, ), # 23. heat_triggers: Metaprogramming & Reflection. Metaprogramming and memory aliasing. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(REDEFINES|RENAMES|OCCURS\s+DEPENDING\s+ON|EVALUATE\s+TRUE|EXEC\s+CICS|EXEC\s+SQL)\b", re.I, ), @@ -7337,9 +6613,7 @@ re.I | re.M, ), # 25. ownership: Authorship indicators. - "ownership": re.compile( - r"^(?:[0-9a-zA-Z \t]{6}[ \-]?)?[ \t]*AUTHOR\.\s+([^\n]+)", re.I | re.M - ), + "ownership": re.compile(r"^(?:[0-9a-zA-Z \t]{6}[ \-]?)?[ \t]*AUTHOR\.\s+([^\n]+)", re.I | re.M), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt: The Promise. Future work markers. "planned_debt": GLOBAL_PLANNED_DEBT, @@ -7347,12 +6621,10 @@ "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure: Map vs. Territory. Audit tags. "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)\]", re.I), - # 30. civil_war: Indentation Tracker. Tabs vs spaces conflict. - "civil_war": None, # COBOL fixed format strictly forbids Tabs. + # 30. tabs_vs_spaces (Formatting Inconsistencies): Indentation Tracker. Tabs vs spaces conflict. + "tabs_vs_spaces": None, # COBOL fixed format strictly forbids Tabs. # 31. ssr_boundaries: View Horizon. CICS web endpoints. 
- "ssr_boundaries": re.compile( - r"\bEXEC\s+CICS\s+(?:WEB\s+SEND|DOCUMENT|WEB\s+READ)\b", re.I - ), + "ssr_boundaries": re.compile(r"\bEXEC\s+CICS\s+(?:WEB\s+SEND|DOCUMENT|WEB\s+READ)\b", re.I), # 32. events: Pub/Sub Network. Signal handlers and MQ bindings. "events": re.compile( r"\b(?:EXEC\s+CICS\s+(?:SIGNAL|HANDLE\s+CONDITION)|CALL\s+\'(?:MQPUT|MQGET)\')\b", @@ -7371,39 +6643,33 @@ re.I, ), # 36. memory_alloc: Manual Memory Management. Heap and CICS allocation. - "memory_alloc": re.compile( - r"\b(?:ALLOCATE|FREE|EXEC\s+CICS\s+(?:GETMAIN|FREEMAIN))\b", re.I - ), + "memory_alloc": re.compile(r"\b(?:ALLOCATE|FREE|EXEC\s+CICS\s+(?:GETMAIN|FREEMAIN))\b", re.I), # 37. inline_asm: Bare Metal. "inline_asm": None, # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. telemetry: Professional diagnostics. - "telemetry": re.compile( - r"\b(?:EXEC\s+CICS\s+WRITEQ\s+TD|CEE3DMP|CEEMOUT|CEEDUMP)\b", re.I - ), - # 39. print_hits: Standard output. - "print_hits": re.compile(r"\b(DISPLAY)\b", re.I), - # 40. cast_hits: Trust Me Tax. REDEFINES is the implicit COBOL cast. - "cast_hits": re.compile(r"\b(REDEFINES)\b", re.I), - # 41. bailout_hits: Detonators. Aborting execution. - "bailout_hits": re.compile(r"\b(STOP\s+RUN|EXIT\s+PROGRAM|GOBACK)\b", re.I), - # 42. halt_hits: Temporal Duct Tape. (Forced waits). - "halt_hits": re.compile(r"\bEXEC\s+CICS\s+DELAY\b", re.I), - # 43. bitwise_hits: Sub-Atomic Math. (Modern intrinsic bitwise). - "bitwise_hits": re.compile( - r"\bFUNCTION\s+(?:BIT-AND|BIT-OR|BIT-XOR|BIT-NOT)\b", re.I - ), - # 44. sync_locks: Barricades. + "telemetry": re.compile(r"\b(?:EXEC\s+CICS\s+WRITEQ\s+TD|CEE3DMP|CEEMOUT|CEEDUMP)\b", re.I), + # 39. debug_prints (Debug Artifacts / Unstructured Outputs): Standard output. + "debug_prints": re.compile(r"\b(DISPLAY)\b", re.I), + # 40. explicit_casts (Explicit Type Casting): Explicit type coercion/casting. + "explicit_casts": re.compile(r"\b(REDEFINES)\b", re.I), + # 41. 
panics_and_aborts (Execution Interrupts / Fatal Aborts) Aborting execution. + "panics_and_aborts": re.compile(r"\b(STOP\s+RUN|EXIT\s+PROGRAM|GOBACK)\b", re.I), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) (Forced waits). + "thread_sleeps": re.compile(r"\bEXEC\s+CICS\s+DELAY\b", re.I), + # 43. bitwise_ops (Bitwise Operations) (Modern intrinsic bitwise). + "bitwise_ops": re.compile(r"\bFUNCTION\s+(?:BIT-AND|BIT-OR|BIT-XOR|BIT-NOT)\b", re.I), + # 44. sync_locks (Resource Management & Stability) "sync_locks": re.compile(r"\bEXEC\s+CICS\s+ENQ\b", re.I), - # 45. freeze_hits: Data Cryogenics. Immutability. - "freeze_hits": re.compile(r"\b(CONSTANT)\b", re.I), - # 46. cleanup: The Janitor. Resource release. + # 45. immutability_locks (Immutability Constraints) Immutability. + "immutability_locks": re.compile(r"\b(CONSTANT)\b", re.I), + # 46. cleanup (Resource Cleanup / Teardown) Resource release. "cleanup": re.compile(r"\b(CLOSE|FREE|END-DECLARATIVES)\b", re.I), - # 47. encapsulation: The Vault. Scope hiding. + # 47. encapsulation (Encapsulation / Access Modifiers) "encapsulation": re.compile(r"\b(LOCAL-STORAGE\s+SECTION|PRIVATE)\b", re.I), - # 48. listeners: The Sinks. + # 48. listeners (Event Listeners / Observers) "listeners": re.compile(r"\b(?:MQGET|EXEC\s+CICS\s+RECEIVE)\b", re.I), - # 49. test_skip: Safety Theater. + # 49. 
test_skip (Bypassed Tests / Ignored Specs) "test_skip": re.compile(r"\b(IGNORE)\b", re.I), # --- PHASE 3: HYBRID DOMAIN SENSORS (COBOL Specifics) --- "serialization_parsing": re.compile( @@ -7415,9 +6681,7 @@ "time_date_logic": re.compile( r"(?i)\b(ACCEPT\s+.*\s+FROM\s+(?:DATE|TIME|DAY)|CURRENT-DATE|WHEN-COMPILED)\b" ), - "ipc_rpc_bridges": re.compile( - r"(?i)\b(CALL\s+|EXEC\s+CICS\s+(?:LINK|XCTL|START|RETURN)|EXEC\s+SQL)\b" - ), + "ipc_rpc_bridges": re.compile(r"(?i)\b(CALL\s+|EXEC\s+CICS\s+(?:LINK|XCTL|START|RETURN)|EXEC\s+SQL)\b"), }, }, "zig": { @@ -7437,22 +6701,20 @@ "shebangs": ["zig"], # UPGRADED: Maps to Family 8 (Singular/Unique) # Rationale: Zig intentionally omits multi-line block comments to keep parsing simple, exclusively using '//'. - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), "_block_start": None, "_block_end": None, - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch: decisions that split flow. Includes unique 'orelse' and 'catch' patterns. - "branch": re.compile( - r"\b(if|else|switch|while|for|try|catch|orelse|break|continue|return)\b|&&|\|\|" - ), + "branch": re.compile(r"\b(if|else|switch|while|for|try|catch|orelse|break|continue|return)\b|&&|\|\|"), # 2. args: Parameters / Coupling. Captures parameters in function signatures. "args": re.compile(r"\bfn\s*(?:[a-zA-Z_]\w*\s*)?\([^)]*\)"), # 3. linear: Sequential I/O & Network Boundaries. Structural boundaries. EXCLUDES access modifiers and const (freeze_hits). - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(var|return|defer|errdefer|unreachable|resume|suspend|await|nosuspend|usingnamespace)\b" ), # 4. func_start: Executable Logic Anchors. Anchors logic blocks (fn). EXCLUDES struct/enum/union headers. 
@@ -7467,27 +6729,21 @@ ), # --- PHASE 2: RISK & STRUCTURAL INTEGRITY --- # 6. safety: Defensive Programming. Error handling, payload capturing (|val|), and debug assertions. - "safety": re.compile( - r"\b(try|catch|orelse|errdefer|std\.debug\.assert)\b|\|[ \t]*[a-zA-Z_]\w*[ \t]*\|" - ), + "safety": re.compile(r"\b(try|catch|orelse|errdefer|std\.debug\.assert)\b|\|[ \t]*[a-zA-Z_]\w*[ \t]*\|"), # 7. safety_neg: Safety Bypasses. Bypassing safety (undefined, unreachable, raw ptr casting). - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(undefined|unreachable|@ptrCast|@intCast|@alignCast|@bitCast|@truncate|@enumFromInt)\b" ), # 8. danger: High-Risk Execution. Forceful panics and process terminations. - "danger": re.compile(r"\b(@panic|panic|std\.process\.exit)\b"), + "high_risk_execution": re.compile(r"\b(@panic|panic|std\.process\.exit)\b"), # 9. io: I/O & Network Boundaries. Standard library IO, Network, and Filesystem interactions. - "io": re.compile( - r"\b(std\.fs|std\.net|std\.io(?!\.getStdOut)|std\.ChildProcess|std\.posix|std\.os)\b" - ), + "io": re.compile(r"\b(std\.fs|std\.net|std\.io(?!\.getStdOut)|std\.ChildProcess|std\.posix|std\.os)\b"), # 10. api: Public Surface Area. Exposed boundaries via 'pub' and 'export' (C ABI). "api": re.compile(r"\b(pub|export)\b"), # 11. flux: State Mutation. State mutation (var) and pointer dereference assignments (.* =). - "flux": re.compile(r"\bvar\b|\.\*[ \t]*=[^=]"), - # 12. graveyard: Dead / Commented-out Code. Commented out structural code. - "graveyard": re.compile( - r"//[ \t]*(?:fn|const|var|pub|if|for|while|try|catch)\b" - ), + "state_mutation": re.compile(r"\bvar\b|\.\*[ \t]*=[^=]"), + # 12. dead_code (Commented Logic / Deprecated Trails) Commented out structural code. + "dead_code": re.compile(r"//[ \t]*(?:fn|const|var|pub|if|for|while|try|catch)\b"), # 13. doc: Structured Documentation. Structured documentation (/// and //!). "doc": re.compile(r"///|//!"), # 14. test: Testing & Assertions. 
Native test framework blocks. @@ -7500,9 +6756,7 @@ r"\b(std\.Thread|std\.Thread\.Mutex|std\.Thread\.RwLock|std\.atomic|@atomicLoad|@atomicStore|@atomicRmw|suspend|resume|await)\b" ), # 16. ui_framework: UI / View Components. (Zig lacks native UI; targets common bindings like Mach/zgui). - "ui_framework": re.compile( - r"\b(mach\.|zgui\.|zopengl\.|capy\.|vaxis\.|raylib\.)\b" - ), + "ui_framework": re.compile(r"\b(mach\.|zgui\.|zopengl\.|capy\.|vaxis\.|raylib\.)\b"), # 17. closures: Closures / Anonymous Functions. (Zig lacks traditional anonymous closures). "closures": None, # 18. globals: Global / Shared State. Top-level file-scoped state. @@ -7513,17 +6767,13 @@ # 19. decorators: Decorators / Annotations. (Zig uses @builtins instead). "decorators": None, # 20. generics: Generics / Type Parameters. Comptime parameters and 'anytype' duck typing. - "generics": re.compile( - r"\b(anytype|type)\b|\bcomptime\s+[a-zA-Z_]\w*\s*:\s*type\b" - ), + "generics": re.compile(r"\b(anytype|type)\b|\bcomptime\s+[a-zA-Z_]\w*\s*:\s*type\b"), # 21. comprehensions: Iterators / Comprehensions. (Not native to Zig). "comprehensions": None, # 22. scientific: Numerical / Compute Libraries. Math intrinsics and SIMD @Vector support. - "scientific": re.compile( - r"\b(std\.math|@Vector|f16|f32|f64|f80|f128|@sqrt|@sin|@cos|@splat|@reduce)\b" - ), + "scientific": re.compile(r"\b(std\.math|@Vector|f16|f32|f64|f80|f128|@sqrt|@sin|@cos|@splat|@reduce)\b"), # 23. heat_triggers: Metaprogramming & Reflection. Comptime metaprogramming and reflection. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(comptime[ \t]*\{|inline\s+for|inline\s+while|@Type|@typeInfo|@compileLog|@hasDecl|@hasField)\b" ), # 24. import: Dependency Inclusions. Module and C-header bridges. @@ -7533,9 +6783,7 @@ re.M, ), # 25. ownership: Authorship indicators in comments. 
- "ownership": re.compile( - r"//\s*(?:Author|Created by|Maintainer|Copyright):\s+([^\n]+)", re.I - ), + "ownership": re.compile(r"//\s*(?:Author|Created by|Maintainer|Copyright):\s+([^\n]+)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt: The Promise. Future work markers. "planned_debt": GLOBAL_PLANNED_DEBT, @@ -7543,16 +6791,12 @@ "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure: Map vs. Territory. Audit tags. "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)\]", re.I), - # 30. civil_war: Indentation Tracker. Tabs vs 4-space standardization. - "civil_war": None, + # 30. tabs_vs_spaces (Formatting Inconsistencies): Indentation Tracker. Tabs vs 4-space standardization. + "tabs_vs_spaces": None, # 31. ssr_boundaries: View Horizon. Zap/httpz response handlers. - "ssr_boundaries": re.compile( - r"\b(zap\.Endpoint|zap\.Request|httpz\.Request|std\.http\.Server\.Request)\b" - ), + "ssr_boundaries": re.compile(r"\b(zap\.Endpoint|zap\.Request|httpz\.Request|std\.http\.Server\.Request)\b"), # 32. events: Pub/Sub Network. OS-level event loops. - "events": re.compile( - r"\b(std\.posix\.epoll_wait|std\.posix\.kevent|xev\.Loop)\b" - ), + "events": re.compile(r"\b(std\.posix\.epoll_wait|std\.posix\.kevent|xev\.Loop)\b"), # 33. dependency_injection: Inversion of Control. "dependency_injection": None, # 34. macros: Preprocessor Hooks. (Zig lacks macros). @@ -7569,32 +6813,28 @@ "inline_asm": re.compile(r"\basm\b(?:\s+volatile)?\s*\([^)]+\)"), # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. telemetry: Professional diagnostics. - "telemetry": re.compile( - r"\b(?:std\.log\.(?:info|err|warn|debug)|std\.log\.scoped)\b" - ), - # 39. print_hits: Standard output. - "print_hits": re.compile(r"\b(std\.debug\.print)\b"), - # 40. cast_hits: "Trust Me" Tax. Explicit casting. - "cast_hits": re.compile(r"\b(@ptrCast|@intCast|@alignCast|@bitCast|@as)\b"), - # 41. bailout_hits: Detonators. Aborting context. 
- "bailout_hits": re.compile(r"\b(@panic|unreachable|return)\b"), - # 42. halt_hits: Temporal Duct Tape. (Forced waits/sleep). - "halt_hits": re.compile(r"\b(std\.time\.sleep)\b"), - # 43. bitwise_hits: Sub-Atomic Math. - "bitwise_hits": re.compile(r"(?>|\^|~"), - # 44. sync_locks: Barricades. Coordinated threading. + "telemetry": re.compile(r"\b(?:std\.log\.(?:info|err|warn|debug)|std\.log\.scoped)\b"), + # 39. debug_prints (Debug Artifacts / Unstructured Outputs): Standard output. + "debug_prints": re.compile(r"\b(std\.debug\.print)\b"), + # 40. explicit_casts (Explicit Type Casting): "Trust Me" Tax. Explicit casting. + "explicit_casts": re.compile(r"\b(@ptrCast|@intCast|@alignCast|@bitCast|@as)\b"), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) Aborting context. + "panics_and_aborts": re.compile(r"\b(@panic|unreachable|return)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) (Forced waits/sleep). + "thread_sleeps": re.compile(r"\b(std\.time\.sleep)\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"(?>|\^|~"), + # 44. sync_locks (Resource Management & Stability) Coordinated threading. "sync_locks": re.compile(r"\b(Mutex|RwLock|Semaphore|lock|unlock)\b"), - # 45. freeze_hits: Data Cryogenics. Immutability. - "freeze_hits": re.compile(r"\bconst\b"), - # 46. cleanup: The Janitor. Resource release. + # 45. immutability_locks (Immutability Constraints) Immutability. + "immutability_locks": re.compile(r"\bconst\b"), + # 46. cleanup (Resource Cleanup / Teardown) Resource release. "cleanup": re.compile(r"\b(deinit|free|destroy|allocator\.free)\b"), - # 47. encapsulation: The Vault. Scope hiding (Lack of pub). - "encapsulation": re.compile( - r"^[ \t]*(?!(?:pub|export|extern)\b)(?:const|var|fn)\s+", re.M - ), - # 48. listeners: The Sinks. + # 47. encapsulation Scope hiding (Lack of pub). + "encapsulation": re.compile(r"^[ \t]*(?!(?:pub|export|extern)\b)(?:const|var|fn)\s+", re.M), + # 48. 
listeners (Event Listeners / Observers) "listeners": None, - # 49. test_skip: Safety Theater. + # 49. test_skip (Bypassed Tests / Ignored Specs) "test_skip": re.compile(r"\b(std\.testing\.expect|assume|expectError)\b"), # --- PHASE 3: HYBRID DOMAIN SENSORS (Zig Specifics) --- "serialization_parsing": re.compile( @@ -7603,9 +6843,7 @@ "regex_execution": re.compile( r"\b(std\.mem\.(?:indexOf|tokenize(?:Any)?|split(?:Sequence|Any)?|replace))\b" ), # Zig has no native regex! - "time_date_logic": re.compile( - r"\b(std\.time\.(?:nanoTimestamp|milliTimestamp|Timer|sleep))\b" - ), + "time_date_logic": re.compile(r"\b(std\.time\.(?:nanoTimestamp|milliTimestamp|Timer|sleep))\b"), "ipc_rpc_bridges": re.compile( r"\b(std\.process\.Child|std\.net\.tcpConnectToHost|std\.Thread\.spawn|std\.posix|std\.os\.execve)\b" ), @@ -7631,16 +6869,15 @@ ], # EXECUTION SIGNATURES: Executed exclusively on the Salesforce platform; no shebangs exist. "shebangs": [], - # UPGRADED: Maps to Family 1 (Standard C-Style) - # Rationale: Uses standard '//' for lines and '/*' '*/' for block-level Ghost Mass. - "lexical_family": "c_style_comment", + # Rationale: Uses standard '//' for lines and '/*' '*/' for block-level Commented / Non-Executable Text. + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), "_block_start": re.compile(r"/\*"), "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch: decisions that split flow. Includes switch on/when and DML try-catch. "branch": re.compile( r"\b(if|else|switch\s+on|when|for|while|do|try|catch|finally|break|continue|return)\b|&&|\|\||\?|\?\?", @@ -7652,7 +6889,7 @@ re.I, ), # 3. linear: Sequential I/O & Network Boundaries. Structural boundaries. EXCLUDES access modifiers and sharing keywords. 
- "linear": re.compile( + "structural_boundaries": re.compile( r"\b(class|interface|trigger|enum|final|transient|implements|extends|virtual|abstract|return)\b", re.I, ), @@ -7680,12 +6917,12 @@ re.I, ), # 7. safety_neg: Safety Bypasses. Actively bypassing safety (without sharing, raw casting). - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(without\s+sharing|Database\.query(?!\s*\(.*?WITH\s+SECURITY_ENFORCED)|@SuppressWarnings)\b|\(\s*[A-Z_]\w*\s*\)\s*[a-z_]\w*", re.I, ), # 8. danger: High-Risk Execution. Dynamic SOQL, mass deletion, and hardcoded IDs. - "danger": re.compile( + "high_risk_execution": re.compile( r"\b(Database\.query|delete|undelete|emptyRecycleBin|purgeOldAsyncJobs)\b|\'[a-z0-9]{15,18}\'", re.I, ), @@ -7700,19 +6937,17 @@ re.I, ), # 11. flux: State Mutation. State mutation (DML operations and standard assignments). - "flux": re.compile( + "state_mutation": re.compile( r"\b(insert|update|upsert|delete|merge)\b|^[ \t]*(?:this\.)?[a-z_]\w*\s*[-+*/%]?=|\.(?:add|addAll|remove|put|clear|set)\s*\(", re.I | re.M, ), - # 12. graveyard: Dead / Commented-out Code. Commented out structural code or queries. - "graveyard": re.compile( + # 12. dead_code (Commented Logic / Deprecated Trails) Commented out structural code or queries. + "dead_code": re.compile( r"//[ \t]*(?:class|trigger|public|private|if|for|while|System\.debug|\[\s*SELECT|insert|update)\b|/\*[ \t]*(?:class|trigger|\[\s*SELECT)", re.I | re.M, ), # 13. doc: Structured Documentation. ApexDoc annotations and metadata blocks. - "doc": re.compile( - r"/\*\*|@description|@param|@return|@author|@date|@example", re.I - ), + "doc": re.compile(r"/\*\*|@description|@param|@return|@author|@date|@example", re.I), # 14. test: Testing & Assertions. Salesforce test execution and assertion markers. "test": re.compile( r"@isTest|@TestSetup|@TestVisible|\b(?:Test\.startTest|Test\.stopTest|System\.assert|Assert\.(?:isTrue|isNotNull|areEqual)|Test\.setMock)\b", @@ -7739,20 +6974,16 @@ # 19. 
decorators: Decorators / Annotations. Execution context annotations. "decorators": re.compile(r"@[a-z_]\w*(?:\([^)]*\))?", re.I), # 20. generics: Generics / Type Parameters. Parameterized collections (List, Map, Set). - "generics": re.compile( - r"\b(?:List|Set|Map|Iterable|Iterator)\s*<\s*[a-z_][^>]*>", re.I - ), + "generics": re.compile(r"\b(?:List|Set|Map|Iterable|Iterator)\s*<\s*[a-z_][^>]*>", re.I), # 21. comprehensions: Iterators / Comprehensions. Inline SOQL for-loops act as mappers. - "comprehensions": re.compile( - r"\bfor\s*\([^)]+:\s*\[\s*SELECT[^\]]+\]\s*\)", re.I - ), + "comprehensions": re.compile(r"\bfor\s*\([^)]+:\s*\[\s*SELECT[^\]]+\]\s*\)", re.I), # 22. scientific: Numerical / Compute Libraries. Standard numerical and currency math. "scientific": re.compile( r"\b(Math\.(?:abs|sin|cos|tan|exp|log|pow|sqrt)|Decimal|setScale|setRoundingMode)\b", re.I, ), # 23. heat_triggers: Metaprogramming & Reflection. Dynamic SOQL, Reflection, and Describe calls. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(Database\.query|Type\.forName|Schema\.getGlobalDescribe|Schema\.describeSObjects|SObject\.put|SObject\.get|JSON\.deserializeUntyped)\b", re.I, ), @@ -7782,8 +7013,8 @@ r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)\]|\b(?:WorldWideWeb|RFC|W3C|CERN|TBL|ENQUIRE)\b", re.I, ), - # 30. civil_war: Indentation Tracker. Tabs vs 4-space standardization. - "civil_war": None, + # 30. tabs_vs_spaces (Formatting Inconsistencies): Indentation Tracker. Tabs vs 4-space standardization. + "tabs_vs_spaces": None, # 31. ssr_boundaries: View Horizon. REST and Visualforce response handlers. "ssr_boundaries": re.compile( r"\b(RestContext\.request|RestContext\.response|RestRequest|RestResponse|renderAs)\b", @@ -7816,36 +7047,30 @@ r"\b(Logger|Log|AppLog|NebulaLogger)\.(?:info|error|warn|debug|trace)\b|\binsert\s+new\s+Log__c\b", re.I, ), - # 39. print_hits: Standard output. - "print_hits": re.compile(r"\b(System\.debug)\b", re.I), - # 40. 
cast_hits: "Trust Me" Tax. Explicit type coercion. - "cast_hits": re.compile( + # 39. debug_prints (Debug Artifacts / Unstructured Outputs): Standard output. + "debug_prints": re.compile(r"\b(System\.debug)\b", re.I), + # 40. explicit_casts (Explicit Type Casting): "Trust Me" Tax. Explicit type coercion. + "explicit_casts": re.compile( r"\(\s*(?:[A-Z]\w*|int|Id|String|Decimal|Boolean|Double|Long|Blob|Date|Datetime|Time)\s*\)\s*[a-zA-Z_$]" ), - # 41. bailout_hits: Detonators. Aborting execution or rollback. - "bailout_hits": re.compile( - r"\b(throw|Database\.rollback|purgeOldAsyncJobs)\b", re.I - ), - # 42. halt_hits: Temporal Duct Tape. (Apex has no native sleep/delay). - "halt_hits": None, - # 43. bitwise_hits: Sub-Atomic Math. - "bitwise_hits": re.compile(r"(?>|\^|~"), - # 44. sync_locks: Barricades. Row-level SOQL locking. + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) Aborting execution or rollback. + "panics_and_aborts": re.compile(r"\b(throw|Database\.rollback|purgeOldAsyncJobs)\b", re.I), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) (Apex has no native sleep/delay). + "thread_sleeps": None, + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"(?>|\^|~"), + # 44. sync_locks (Resource Management & Stability) Row-level SOQL locking. "sync_locks": re.compile(r"\bFOR\s+UPDATE\b", re.I), - # 45. freeze_hits: Data Cryogenics. Immutability (constants). - "freeze_hits": re.compile(r"\b(static\s+final|final|const)\b", re.I), - # 46. cleanup: The Janitor. Recycle bin management. - "cleanup": re.compile( - r"\b(emptyRecycleBin|Database\.rollback|clear)\s*\(", re.I - ), - # 47. encapsulation: The Vault. Scope hiding. + # 45. immutability_locks (Immutability Constraints) Immutability (constants). + "immutability_locks": re.compile(r"\b(static\s+final|final|const)\b", re.I), + # 46. cleanup (Resource Cleanup / Teardown) Recycle bin management. 
+ "cleanup": re.compile(r"\b(emptyRecycleBin|Database\.rollback|clear)\s*\(", re.I), + # 47. encapsulation (Encapsulation / Access Modifiers) "encapsulation": re.compile(r"\b(private|protected)\b", re.I), - # 48. listeners: The Sinks. Triggers listening for events. + # 48. listeners (Event Listeners / Observers) Triggers listening for events. "listeners": re.compile(r"^[ \t]*trigger\s+[a-z_]\w*\s+on\b", re.I | re.M), - # 49. test_skip: Safety Theater. - "test_skip": re.compile( - r"\b(StubProvider|Test\.setMock|@SuppressWarnings)\b", re.I - ), + # 49. test_skip (Bypassed Tests / Ignored Specs) + "test_skip": re.compile(r"\b(StubProvider|Test\.setMock|@SuppressWarnings)\b", re.I), }, }, "dart": { @@ -7872,13 +7097,13 @@ # UPGRADED: Maps to Family 2 (Nested C) # Rationale: (CORRECTION) Like Swift and Rust, Dart officially supports nested multi-line # comments (/* /* */ */). Standard C parsing would prematurely terminate here causing geometry failure. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "_block_start": re.compile(r"/\*"), "_block_end": re.compile(r"\*/"), "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch: decisions that split flow. Includes modern pattern guards (when) and null-coalescing. "branch": re.compile( r"\b(if|else|switch|case|default|for|while|do|try|catch|finally|break|continue|when)\b|&&|\|\||\?|\?\?", @@ -7900,7 +7125,7 @@ re.I | re.M, ), # 3. linear: Sequential I/O & Network Boundaries. Structural boundaries. EXCLUDES access modifiers and const/final. - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(var|late|return|yield|await|class|mixin|extension|enum|typedef|import|export|part|library|base|sealed|interface|macro)\b|=>", re.I, ), @@ -7936,11 +7161,9 @@ re.I, ), # 7. safety_neg: Safety Bypasses. Actively bypassing sound null safety or static analysis. 
- "safety_neg": re.compile( - r"!\s*[;,\n)\.\]]|\bdynamic\b|//\s*ignore(?:_for_file)?:\s*\w+" - ), + "safety_bypasses": re.compile(r"!\s*[;,\n)\.\]]|\bdynamic\b|//\s*ignore(?:_for_file)?:\s*\w+"), # 8. danger: High-Risk Execution. Process killers and catastrophic exit commands. - "danger": re.compile(r"\b(exit|exitCode|Process\.killPid)\b", re.I), + "high_risk_execution": re.compile(r"\b(exit|exitCode|Process\.killPid)\b", re.I), # 9. io: I/O & Network Boundaries. Disk, Network, WebSockets, and Uri parsing (Includes legacy CERN triggers). "io": re.compile( r"\b(File|Directory|HttpClient|HttpServer|ServerSocket|WebSocket|Uri\.parse|HtmlDocument|HttpRequest|HttpResponse|HTRequest|Nexus|ENQUIRE)\b", @@ -7952,12 +7175,12 @@ re.I | re.M, ), # 11. flux: State Mutation. State mutation (setState and reactive collection mutators). - "flux": re.compile( + "state_mutation": re.compile( r"\b(setState|notifyListeners|markNeedsBuild|StreamController\.add)\b|[^!=<>\+\-\*\/%&\|\s]=\s*[^=]|(?:\+\+|--)|\.(?:add|addAll|remove|insert|clear|update)\s*\(", re.I, ), - # 12. graveyard: Dead / Commented-out Code. Commented out structural code or dead widgets. - "graveyard": re.compile( + # 12. dead_code (Commented Logic / Deprecated Trails) Commented out structural code or dead widgets. + "dead_code": re.compile( r"//[ \t]*(?:class|mixin|void|if|for|while|print|Widget|return)\b|/\*[ \t]*(?:class|mixin|void|Widget|if|for)" ), # 13. doc: Structured Documentation. dartdoc annotations and structured comments. @@ -7998,22 +7221,18 @@ re.I, ), # 23. heat_triggers: Metaprogramming & Reflection. Reflection, Native Bridges, and code generation markers. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r'\b(MethodChannel|EventChannel|dart:mirrors|reflect|reflectClass|noSuchMethod|dart:js_interop)\b|part\s+[\'"][^\'"]+\.(?:g|freezed)\.dart[\'"]', re.I, ), # 24. import: Dependency Inclusions. Dependency resolution and library partitions. 
- "import": re.compile( - r'^[ \t]*(?:import|export|part|part\s+of)\b\s*[\'"][^\'"]+[\'"]', re.M - ), + "import": re.compile(r'^[ \t]*(?:import|export|part|part\s+of)\b\s*[\'"][^\'"]+[\'"]', re.M), "_dependency_capture": re.compile( r"^[ \t]*(?:import|export|part(?:[ \t\n]+of)?)\b[ \t\n]*['\"]([^'\"]+)['\"]", re.M, ), # 25. ownership: Authorship indicators. - "ownership": re.compile( - r"//\s*(?:Author|Created by|Maintainer|Copyright):\s+([^\n]+)", re.I - ), + "ownership": re.compile(r"//\s*(?:Author|Created by|Maintainer|Copyright):\s+([^\n]+)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- # 26. planned_debt: The Promise. Future work markers. "planned_debt": GLOBAL_PLANNED_DEBT, @@ -8024,8 +7243,8 @@ r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit|RFC|W3C|CERN|TBL|ENQUIRE)[^\]]*\]|\b(?:Tim\s+Berners-Lee|WorldWideWeb|HyperText\s+Proposal)\b", re.I, ), - # 30. civil_war: Indentation Tracker. Tabs vs 2-space standardization. - "civil_war": None, + # 30. tabs_vs_spaces (Formatting Inconsistencies): Indentation Tracker. Tabs vs 2-space standardization. + "tabs_vs_spaces": None, # 31. ssr_boundaries: View Horizon. shelf/Serverpod response handlers. "ssr_boundaries": re.compile( r"\b(shelf|dart_frog|Serverpod|Response\.(?:ok|internalServerError)|RequestContext|Router\(\)|Handler|Serve|renderHtml)\b", @@ -8064,50 +7283,34 @@ r"\b(developer\.log|Logger|log|FirebaseCrashlytics|Sentry)\.(?:info|error|warn|severe|debug|trace|recordError)\b|\bdart:developer\b", re.I, ), - # 39. print_hits: Standard output. - "print_hits": re.compile(r"\b(print|debugPrint)\s*\(", re.I), - # 40. cast_hits: "Trust Me" Tax. Explicit casting. - "cast_hits": re.compile(r"\bas\s+[A-Z]\w*|\(\s*[A-Z]\w*\s*\)\s*[a-zA-Z_$]"), - # 41. bailout_hits: Detonators. Aborting context. - "bailout_hits": re.compile( - r"\b(throw|rethrow|exit|exitCode|Process\.killPid)\b", re.I - ), - # 42. halt_hits: Temporal Duct Tape. (Forced waits/delays). 
- "halt_hits": re.compile( - r"\b(sleep|delay|setTimeout|setInterval)\s*\(", re.I - ), - # 43. bitwise_hits: Sub-Atomic Math. - "bitwise_hits": re.compile( - r"(?>|\^|~(?!=|/)" - ), - # 44. sync_locks: Barricades. Coordinated threading. - "sync_locks": re.compile( - r"\b(Mutex|Lock|synchronized|Semaphore|Completer)\b", re.I - ), - # 45. freeze_hits: Data Cryogenics. Immutability. - "freeze_hits": re.compile(r"\b(const|final|readonly|@immutable)\b", re.I), - # 46. cleanup: The Janitor. Resource release. - "cleanup": re.compile( - r"\b(dispose|close|cleanup|cancel|drop|free)\s*\(", re.I - ), - # 47. encapsulation: The Vault. Scope hiding (Underscore prefix). + # 39. debug_prints (Debug Artifacts / Unstructured Outputs): Standard output. + "debug_prints": re.compile(r"\b(print|debugPrint)\s*\(", re.I), + # 40. explicit_casts (Explicit Type Casting): "Trust Me" Tax. Explicit casting. + "explicit_casts": re.compile(r"\bas\s+[A-Z]\w*|\(\s*[A-Z]\w*\s*\)\s*[a-zA-Z_$]"), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) Aborting context. + "panics_and_aborts": re.compile(r"\b(throw|rethrow|exit|exitCode|Process\.killPid)\b", re.I), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) (Forced waits/delays). + "thread_sleeps": re.compile(r"\b(sleep|delay|setTimeout|setInterval)\s*\(", re.I), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"(?>|\^|~(?!=|/)"), + # 44. sync_locks (Resource Management & Stability) Coordinated threading. + "sync_locks": re.compile(r"\b(Mutex|Lock|synchronized|Semaphore|Completer)\b", re.I), + # 45. immutability_locks (Immutability Constraints) Immutability. + "immutability_locks": re.compile(r"\b(const|final|readonly|@immutable)\b", re.I), + # 46. cleanup (Resource Cleanup / Teardown) Resource release. + "cleanup": re.compile(r"\b(dispose|close|cleanup|cancel|drop|free)\s*\(", re.I), + # 47. encapsulation Scope hiding (Underscore prefix). 
"encapsulation": re.compile(r"\b(_[a-zA-Z0-9_$]+)\b|@protected|@private"), - # 48. listeners: The Sinks. Waiting for state broadcasts. - "listeners": re.compile( - r"\b(on\(|addEventListener|subscribe|watch|useEffect|listen)\b", re.I - ), - # 49. test_skip: Safety Theater. + # 48. listeners (Event Listeners / Observers) Waiting for state broadcasts. + "listeners": re.compile(r"\b(on\(|addEventListener|subscribe|watch|useEffect|listen)\b", re.I), + # 49. test_skip (Bypassed Tests / Ignored Specs) "test_skip": re.compile(r"\b(@Ignore|test\.skip|t\.Skip|xit|mock)\b", re.I), # --- PHASE 3: HYBRID DOMAIN SENSORS (Dart Specifics) --- "serialization_parsing": re.compile( r"\b(jsonDecode|jsonEncode|json\.decode|json\.encode|Utf8Decoder|Utf8Encoder)\b" ), - "regex_execution": re.compile( - r"\b(RegExp\s*\()|\.(hasMatch|allMatches|stringMatch)\b" - ), - "time_date_logic": re.compile( - r"\b(DateTime\.now|Duration\s*\(|Timer\.run|Timer\.periodic|Stopwatch)\b" - ), + "regex_execution": re.compile(r"\b(RegExp\s*\()|\.(hasMatch|allMatches|stringMatch)\b"), + "time_date_logic": re.compile(r"\b(DateTime\.now|Duration\s*\(|Timer\.run|Timer\.periodic|Stopwatch)\b"), "ipc_rpc_bridges": re.compile( r"\b(Isolate\.spawn|ReceivePort|SendPort|Process\.run|Process\.start|HttpClient)\b" ), @@ -8138,13 +7341,13 @@ # UPGRADED: Maps to Family 2 (Nested C) # Rationale: Scala explicitly supports nested multi-line comments (/* /* */ */), # requiring depth-aware stripping to prevent premature termination. - "lexical_family": "recursive_c_style", + "lexical_family": "recursive_block", "rules": { "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), "_block_start": re.compile(r"/\*"), "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch: decisions that split flow. Includes Scala 3 if-then and match-case. 
"branch": re.compile( r"\b(if|then|else|match|case|try|catch|finally|for|while|do|throw|yield)\b|&&|\|\|", @@ -8155,7 +7358,7 @@ r"\bdef\s+[a-zA-Z_]\w*(?:\[[^\]]*\])?\s*\([^)]*\)|\([^)]*\)[ \t]*=>|\b[a-zA-Z_]\w*[ \t]*=>" ), # 3. linear: Sequential I/O & Network Boundaries. Structural boundaries. EXCLUDES access modifiers and val/var. - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(lazy|type|opaque|class|trait|object|enum|extension|import|export|return|extends|with|derives|new|given|using)\b" ), # 4. func_start: Executable Logic Anchors. Anchors executable logic. EXCLUDES structural headers. @@ -8186,13 +7389,9 @@ r"\b(Option|Some|None|Try|Success|Failure|Either|Left|Right|sealed|require|assert|assume)\b|\|\s*Null\b" ), # 7. safety_neg: Safety Bypasses. Actively bypassing type safety (asInstanceOf, .get). - "safety_neg": re.compile( - r"\b(null|asInstanceOf|isInstanceOf|\.get\b(?!Class)|@unchecked|Any|AnyRef)\b" - ), + "safety_bypasses": re.compile(r"\b(null|asInstanceOf|isInstanceOf|\.get\b(?!Class)|@unchecked|Any|AnyRef)\b"), # 8. danger: High-Risk Execution. Process killers and catastrophic exit commands. - "danger": re.compile( - r"\b(System\.exit|sys\.exit|Thread\.stop|Runtime\.getRuntime\.exec)\b" - ), + "high_risk_execution": re.compile(r"\b(System\.exit|sys\.exit|Thread\.stop|Runtime\.getRuntime\.exec)\b"), # 9. io: I/O & Network Boundaries. Filesystem, Network, and Http Clients (Includes CERN triggers). "io": re.compile( r"\b(Source|java\.io|java\.nio|Files\.|Socket|ServerSocket|sttp|Http|WSClient|HTLoad|HTGet|ENQUIRE)\b" @@ -8203,12 +7402,12 @@ re.M, ), # 11. flux: State Mutation. State mutation (var and mutable collection updates). - "flux": re.compile( + "state_mutation": re.compile( r"\b(var|scala\.collection\.mutable|AtomicReference|AtomicInteger)\b|^[ \t]*[a-zA-Z_]\w*[ \t]*=", re.M, ), - # 12. graveyard: Dead / Commented-out Code. Commented out structural code or logic trails. - "graveyard": re.compile( + # 12. 
dead_code (Commented Logic / Deprecated Trails) Commented out structural code or logic trails. + "dead_code": re.compile( r"//[ \t]*(?:def|val|var|class|object|trait|if|match|println|import)\b|/\*[ \t]*(?:def|val|class|object)" ), # 13. doc: Structured Documentation. Scaladoc documentation (/**) and annotations. @@ -8235,9 +7434,7 @@ # 19. decorators: Decorators / Annotations. Method and class annotations. "decorators": re.compile(r"@[A-Za-z_]\w*(?:\([^)]*\))?"), # 20. generics: Generics / Type Parameters. Type parameterization and HKT constraints. - "generics": re.compile( - r"\[\s*[+-]?[A-Z][^\]]*\]|\bF\[_\]|<:|>:|\[[ \t]*_\s*\]" - ), + "generics": re.compile(r"\[\s*[+-]?[A-Z][^\]]*\]|\bF\[_\]|<:|>:|\[[ \t]*_\s*\]"), # 21. comprehensions: Iterators / Comprehensions. For-comprehensions and monadic chains. "comprehensions": re.compile( r"\bfor\s*(?:\{[^}]*\}|\([^)]*\))\s*yield\b|\.(?:map|flatMap|filter|withFilter|foldLeft|reduce|collect)\s*[\(\{]" @@ -8247,7 +7444,7 @@ r"\b(scala\.math|breeze\.|spire\.|algebird|Math\.|StrictMath\.|DenseMatrix|DenseVector)\b" ), # 23. heat_triggers: Metaprogramming & Reflection. Contextual abstractions and implicit resolution. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(implicit|given|using|inline|extension|TypeTag|ClassTag|scala\.reflect|Typeable|Dynamic|summon|derives)\b" ), # 24. import (Dependency Inclusions) @@ -8288,16 +7485,14 @@ r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)\]|\b(?:WorldWideWeb|HyperText\s+Proposal|NeXTSTEP)\b", re.I, ), - # 30. civil_war: Indentation Tracker. Tabs vs 2-space standardization. - "civil_war": None, + # 30. tabs_vs_spaces (Formatting Inconsistencies): Indentation Tracker. Tabs vs 2-space standardization. + "tabs_vs_spaces": None, # 31. ssr_boundaries: View Horizon. Play Framework and twirl template endpoints. "ssr_boundaries": re.compile( r"\b(Action|Controller|HttpRoutes|ServerEndpoint|twirl|html\.[a-zA-Z_]\w*|Ok\(|BadRequest\()\b" ), # 32. events: Pub/Sub Network. 
Stream processing and event bus signatures. - "events": re.compile( - r"\b(Source|Flow|Sink|fs2\.Stream|ZStream|EventBus|system\.eventStream|Observable)\b" - ), + "events": re.compile(r"\b(Source|Flow|Sink|fs2\.Stream|ZStream|EventBus|system\.eventStream|Observable)\b"), # 33. dependency_injection: Inversion of Control. ZLayer and ReaderT patterns. "dependency_injection": re.compile( r"\b(@Inject|wire\[|ZLayer|ZLayer\.from|provide|provideSome|ReaderT|Kleisli|requires)\b" @@ -8307,9 +7502,7 @@ r"\b(inline\s+def|transparent\s+inline|macro|scala\.quoted|Expr|Type|Quotes)\b|\$\{.*?\}|\'\{" ), # 35. pointers: Memory Map. Scala Native C-Interop pointers. - "pointers": re.compile( - r"\b(Ptr\[[^\]]+\]|scala\.scalanative\.unsafe|!ptr|ptr\.|CFuncPtr|CStruct\d+)\b" - ), + "pointers": re.compile(r"\b(Ptr\[[^\]]+\]|scala\.scalanative\.unsafe|!ptr|ptr\.|CFuncPtr|CStruct\d+)\b"), # 36. memory_alloc: Manual Memory Management. Heap and Native allocations. "memory_alloc": re.compile( r"\b(Zone|zone[ \t]*\{|alloc\[[^\]]+\]|malloc|calloc|free|scala\.scalanative\.libc\.stdlib)\b" @@ -8321,53 +7514,37 @@ "telemetry": re.compile( r"\b(?:logger|log|ZIO\.log|LoggerFactory|log4cats|slf4j)\.(?:info|error|warn|debug|trace)\b|@Slf4j" ), - # 39. print_hits: Standard output. - "print_hits": re.compile(r"\b(println|print|Console\.println)\b"), - # 40. cast_hits: "Trust Me" Tax. Explicit type coercion. - "cast_hits": re.compile( - r"\basInstanceOf\[[^\]]*\]|\.(?:toInt|toLong|toFloat|toDouble|toByte|toShort)\b" - ), - # 41. bailout_hits: Detonators. Aborting context. - "bailout_hits": re.compile(r"\b(throw|panic|abort|sys\.error|exit)\b"), - # 42. halt_hits: Temporal Duct Tape. (Forced waits/sleep). - "halt_hits": re.compile( - r"\b(Thread\.sleep|delay|setTimeout|setInterval)\s*\(" - ), - # 43. bitwise_hits: Sub-Atomic Math. - "bitwise_hits": re.compile(r"(?>|\^|~"), - # 44. sync_locks: Barricades. Coordinated threading. 
- "sync_locks": re.compile( - r"\b(synchronized|volatile|Semaphore|Mutex|lock|unlock)\b" - ), - # 45. freeze_hits: Data Cryogenics. Immutability. - "freeze_hits": re.compile( - r"\b(val|final|sealed|readonly|Object\.freeze|immutable)\b" - ), - # 46. cleanup: The Janitor. Resource release. - "cleanup": re.compile( - r"\b(dispose|close|cleanup|cancel|free|bracket|finally|onException)\b" - ), - # 47. encapsulation: The Vault. Scope hiding. + # 39. debug_prints (Debug Artifacts / Unstructured Outputs): Standard output. + "debug_prints": re.compile(r"\b(println|print|Console\.println)\b"), + # 40. explicit_casts (Explicit Type Casting): "Trust Me" Tax. Explicit type coercion. + "explicit_casts": re.compile(r"\basInstanceOf\[[^\]]*\]|\.(?:toInt|toLong|toFloat|toDouble|toByte|toShort)\b"), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) Aborting context. + "panics_and_aborts": re.compile(r"\b(throw|panic|abort|sys\.error|exit)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) (Forced waits/sleep). + "thread_sleeps": re.compile(r"\b(Thread\.sleep|delay|setTimeout|setInterval)\s*\("), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"(?>|\^|~"), + # 44. sync_locks (Resource Management & Stability) Coordinated threading. + "sync_locks": re.compile(r"\b(synchronized|volatile|Semaphore|Mutex|lock|unlock)\b"), + # 45. immutability_locks (Immutability Constraints) Immutability. + "immutability_locks": re.compile(r"\b(val|final|sealed|readonly|Object\.freeze|immutable)\b"), + # 46. cleanup (Resource Cleanup / Teardown) Resource release. + "cleanup": re.compile(r"\b(dispose|close|cleanup|cancel|free|bracket|finally|onException)\b"), + # 47. encapsulation (Encapsulation / Access Modifiers) "encapsulation": re.compile(r"\b(private|protected)\b|private\[[^\]]+\]"), - # 48. listeners: The Sinks. Waiting for state broadcasts. - "listeners": re.compile( - r"\b(on\(|addEventListener|subscribe|watch|useEffect|listen)\b" - ), - # 49. 
test_skip: Safety Theater. + # 48. listeners (Event Listeners / Observers) Waiting for state broadcasts. + "listeners": re.compile(r"\b(on\(|addEventListener|subscribe|watch|useEffect|listen)\b"), + # 49. test_skip (Bypassed Tests / Ignored Specs) "test_skip": re.compile(r"\b(ignore|pending|skip|xit|xdescribe)\b"), # --- PHASE 3: HYBRID DOMAIN SENSORS (Scala Specifics) --- "serialization_parsing": re.compile( r"\b(io\.circe|decode\[|asJson|Json\.parse|Json\.toJson|upickle\.default)\b" ), - "regex_execution": re.compile( - r'"[^"]+"\.r\b|\bRegex\s*\(|\.(findAllIn|findFirstIn|replaceAllIn)\b' - ), + "regex_execution": re.compile(r'"[^"]+"\.r\b|\bRegex\s*\(|\.(findAllIn|findFirstIn|replaceAllIn)\b'), "time_date_logic": re.compile( r"\b(Duration\s*\(|FiniteDuration|System\.currentTimeMillis|LocalDate\.now)\b" ), - "ipc_rpc_bridges": re.compile( - r"\b(ActorSystem|ActorRef|sys\.process\._|Process\s*\(|Future\.apply)\b" - ), + "ipc_rpc_bridges": re.compile(r"\b(ActorSystem|ActorRef|sys\.process\._|Process\s*\(|Future\.apply)\b"), }, }, "dockerfile": { @@ -8389,7 +7566,7 @@ "Dockerfile.test", "Dockerfile.local", ], - # ECOSYSTEM ANCHORS & DISAMBIGUATION: Compose files and ignore manifests acting as massive gravity anchors. + # ECOSYSTEM ANCHORS & DISAMBIGUATION: Compose files and ignore manifests acting as contextual baselines. "discriminators": [ "docker-compose.yml", "docker-compose.yaml", @@ -8400,13 +7577,13 @@ "shebangs": [], # UPGRADED: Maps to Family 3 (Pure Hash) # Rationale: Docker natively uses '#' exclusively for line-level comments and parser directives. - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { "_line_anchor": re.compile(r"#"), "_inline_comment": re.compile(r"#"), "_block_start": None, "_block_end": None, - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Control flow executing inside RUN shell blocks. 
High density indicates complex embedded shell scripts. "branch": re.compile( @@ -8419,15 +7596,11 @@ # 3. linear (Sequential Boundaries) # Structural boundaries defining straight-line execution and environment contexts. # CRITICAL GUARDRAIL: EXCLUDES `FROM` and `RUN`/`CMD` to maintain geometric stability. - "linear": re.compile( - r"^[ \t]*(?:WORKDIR|USER|VOLUME|STOPSIGNAL|SHELL|LABEL)\b", re.M | re.I - ), + "structural_boundaries": re.compile(r"^[ \t]*(?:WORKDIR|USER|VOLUME|STOPSIGNAL|SHELL|LABEL)\b", re.M | re.I), # 4. func_start (Executable Logic Anchors) # CRITICAL GUARDRAIL: Anchors logic blocks. ONLY executable logic blocks. # In Docker, `RUN`, `CMD`, and `ENTRYPOINT` execute logic, generating discrete intermediate image layers. - "func_start": re.compile( - r"^[ \t]*(RUN|CMD|ENTRYPOINT|HEALTHCHECK)(?=[ \t])", re.M | re.I - ), + "func_start": re.compile(r"^[ \t]*(RUN|CMD|ENTRYPOINT|HEALTHCHECK)(?=[ \t])", re.M | re.I), # 5. class_start (Object / Entity Declarations) # Defines object-oriented and structural boundaries. Drives API Surface Area math. # `FROM` instantiates a discrete build stage/image boundary, acting as a class wrapper. @@ -8444,16 +7617,14 @@ # Actively bypassing isolation or safety logic. # Using `:latest`, running as root, setting permissions to 777, or blindly curling directly into bash. # CRITICAL GUARDRAIL: Safely bounds the curl/wget pipe `[^|\n]{1,200}` to prevent ReDoS on massive RUN chains. - "safety_neg": re.compile( + "safety_bypasses": re.compile( r":latest\b|^[ \t]*USER[ \t]+(?:root|0)\b|chmod[ \t]+777|--privileged|--allow-unauthenticated|\b(?:curl|wget)[ \t]+[^|\n]{1,200}\|[ \t]*(?:bash|sh|zsh)\b", re.M | re.I, ), # 8. danger (High-Risk Execution) # Extreme space debris. Destructive recursive removes targeting root, and dangerous dynamic eval. # CRITICAL GUARDRAIL: Raw terminal prints (`echo`) strictly routed to print_hits. 
- "danger": re.compile( - r"\b(?:rm[ \t]+-rf[ \t]+/(?![A-Za-z])|eval|exec)\b", re.M | re.I - ), + "high_risk_execution": re.compile(r"\b(?:rm[ \t]+-rf[ \t]+/(?![A-Za-z])|eval|exec)\b", re.M | re.I), # 9. io (I/O & Network Boundaries) # Interaction with external networks, copying files from host, or executing package managers. "io": re.compile( @@ -8465,13 +7636,13 @@ "api": re.compile(r"^[ \t]*EXPOSE[ \t]+[0-9]+", re.M | re.I), # 11. flux (State Mutation) # Mutation of state. Setting Environment variables that permanently alter the image layer state. - "flux": re.compile( + "state_mutation": re.compile( r"^[ \t]*ENV[ \t]+[a-zA-Z0-9_]+|export[ \t]+[a-zA-Z0-9_]+[ \t]*=", re.M | re.I, ), - # 12. graveyard (Dead / Commented-out Code) + # 12. dead_code (Commented Logic / Deprecated Trails) # Commented-out logic, commented-out structural Dockerfile commands. - "graveyard": re.compile( + "dead_code": re.compile( r"^[ \t]*#[ \t]*(?:RUN|COPY|ADD|ENV|EXPOSE|FROM|CMD|ENTRYPOINT|WORKDIR)\b", re.M | re.I, ), @@ -8490,14 +7661,10 @@ # --- PHASE 3: ARCHITECTURE & DOMAIN SENSORS --- # 15. concurrency (Asynchronous Execution) # Parallelism executed inside the build shell (e.g. compiling with all cores). - "concurrency": re.compile( - r"&[ \t]*$|\b(?:nohup|parallel|make[ \t]+-j|xargs[ \t]+-P)\b", re.M - ), + "concurrency": re.compile(r"&[ \t]*$|\b(?:nohup|parallel|make[ \t]+-j|xargs[ \t]+-P)\b", re.M), # 16. ui_framework (UI / View Components) # Containerizing GUI applications (X11, Wayland, GTK). - "ui_framework": re.compile( - r"\b(?:xvfb|x11|wayland|gtk|qt5?|libgl1-mesa)\b", re.I - ), + "ui_framework": re.compile(r"\b(?:xvfb|x11|wayland|gtk|qt5?|libgl1-mesa)\b", re.I), # 17. closures (Closures / Anonymous Functions) # Dockerfiles are purely declarative structurally; closures do not exist. "closures": None, @@ -8518,16 +7685,14 @@ re.I, ), # 23. heat_triggers (Metaprogramming & Reflection) - # Extreme "Logic Heat": Advanced BuildKit logic. 
Mounting caches, secrets, cross-platform builds, or `ONBUILD` (which defers execution to downstream images). - "heat_triggers": re.compile( + # High Cognitive Load: Advanced BuildKit logic. Mounting caches, secrets, cross-platform builds, or `ONBUILD` (which defers execution to downstream images). + "reflection_metaprogramming": re.compile( r"^[ \t]*ONBUILD\b|--mount=type=(?:cache|secret|bind|ssh)|--platform=|<|\$_POST|\$_GET|\$_SERVER|\$_COOKIE|\$_SESSION|put\s+header)\b", @@ -9059,46 +8172,36 @@ r"\b(revLog|syslog|logError|logInfo|logWarn|logDebug|mergLog|rreLog|lcLog)\b", re.I, ), - # 39. print_hits: Raw terminal output (puts to message box without target). - "print_hits": re.compile( - r'^[ \t]*put\s+(?:"[^"]*"|[a-zA-Z0-9_]+)[ \t]*$', re.I | re.M - ), - # 40. cast_hits: English-style type checking. - "cast_hits": re.compile(r"\bis\s+(?:not\s+)?a\b|\bis\s+strictly\b", re.I), - # 41. bailout_hits: Hard detonations. - "bailout_hits": re.compile(r"\b(exit\s+to\s+top|quit|throw|abort)\b", re.I), - # 42. halt_hits: Temporal Duct Tape (Blocking wait). - "halt_hits": re.compile( - r"\bwait\s+(?:for[ \t]+)?\d+\s+[^ \t\n]+?(?!\s+with\s+messages)\b", re.I - ), - # 43. bitwise_hits: Sub-Atomic Math. - "bitwise_hits": re.compile( - r"\b(bitAnd|bitOr|bitXor|bitNot|bitShiftLeft|bitShiftRight)\b", re.I - ), - # 44. sync_locks: Barricades. - "sync_locks": re.compile( - r"\b(lock\s+screen|lock\s+messages|lock\s+errordialogs)\b", re.I - ), - # 45. freeze_hits: Data Cryogenics. - "freeze_hits": re.compile(r"\b(constant\s+)\b", re.I), - # 46. cleanup: The Janitor. + # 39. debug_prints (Debug Artifacts / Unstructured Outputs): Raw terminal output (puts to message box without target). + "debug_prints": re.compile(r'^[ \t]*put\s+(?:"[^"]*"|[a-zA-Z0-9_]+)[ \t]*$', re.I | re.M), + # 40. explicit_casts (Explicit Type Casting): English-style type checking. + "explicit_casts": re.compile(r"\bis\s+(?:not\s+)?a\b|\bis\s+strictly\b", re.I), + # 41. 
panics_and_aborts (Execution Interrupts / Fatal Aborts): Hard detonations. + "panics_and_aborts": re.compile(r"\b(exit\s+to\s+top|quit|throw|abort)\b", re.I), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses): Temporal Duct Tape (Blocking wait). + "thread_sleeps": re.compile(r"\bwait\s+(?:for[ \t]+)?\d+\s+[^ \t\n]+?(?!\s+with\s+messages)\b", re.I), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"\b(bitAnd|bitOr|bitXor|bitNot|bitShiftLeft|bitShiftRight)\b", re.I), + # 44. sync_locks (Resource Management & Stability) + "sync_locks": re.compile(r"\b(lock\s+screen|lock\s+messages|lock\s+errordialogs)\b", re.I), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(constant\s+)\b", re.I), + # 46. cleanup (Resource Cleanup / Teardown) "cleanup": re.compile( r"\b(delete\s+variable|close\s+file|stop\s+using|remove\s+script)\b", re.I, ), - # 47. encapsulation: The Vault. + # 47. encapsulation "encapsulation": re.compile(r"\b(private\s+)\b", re.I), - # 48. listeners: The Sinks. + # 48. listeners (Event Listeners / Observers) "listeners": re.compile(r"^[ \t]*on\s+[a-zA-Z0-9_-]+", re.I | re.M), - # 49. test_skip: Safety Theater. + # 49. 
test_skip (Bypassed Tests / Ignored Specs) "test_skip": re.compile(r"\b(skip\s+test)\b", re.I), # --- PHASE 3: HYBRID DOMAIN SENSORS (LiveCode Specifics) --- "serialization_parsing": re.compile( r"(?i)\b(jsonImport|jsonExport|arrayEncode|arrayDecode|revXMLCreateTree)\b" ), - "regex_execution": re.compile( - r"(?i)\b(matchText|matchChunk|replaceText|filter\s+.*\s+with\s+regex)\b" - ), + "regex_execution": re.compile(r"(?i)\b(matchText|matchChunk|replaceText|filter\s+.*\s+with\s+regex)\b"), "time_date_logic": re.compile( r"(?i)\b(the\s+(?:seconds|ticks|time|date|internet date)|wait\s+(?:for|until))\b" ), @@ -9130,25 +8233,23 @@ "shebangs": [], # UPGRADED: Maps to Family 1 (Standard C-Style) # Rationale: Solidity strictly adheres to C-style line (//) and block (/* */) comments. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), "_block_start": re.compile(r"/\*"), "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch: Decisions that split flow. Includes Solidity 0.6+ try/catch. - "branch": re.compile( - r"\b(if|else|for|while|do|break|continue|return|try|catch)\b|\?|:" - ), + "branch": re.compile(r"\b(if|else|for|while|do|break|continue|return|try|catch)\b|\?|:"), # 2. args: Parameters / Coupling. Captures parameters for functions, errors, events, and modifiers. # Bounded `{0,50}` to prevent ReDoS on massive tuple returns or complex signatures. "args": re.compile( r"\b(?:function|modifier|error|event|constructor)\s+(?:[a-zA-Z_]\w*[ \t]*)?\([^)]{0,500}\)" ), # 3. linear: Sequential I/O & Network Boundaries. Structural boundaries defining scope and data definitions. 
- "linear": re.compile( + "structural_boundaries": re.compile( r"\b(pragma|import|contract|interface|library|struct|enum|type|mapping|address|uint\d*|int\d*|bytes\d*|bool|string)\b" ), # 4. func_start: Executable Logic Anchors. Anchors executable logic (Functions, Modifiers, Custom Errors, Events). @@ -9164,29 +8265,21 @@ ), # --- PHASE 2: RISK & STRUCTURAL INTEGRITY --- # 6. safety: Defensive Programming. State reversion, assertions, and defensive modifier usage. - "safety": re.compile( - r"\b(require|assert|revert|modifier|nonReentrant|onlyOwner)\b" - ), + "safety": re.compile(r"\b(require|assert|revert|modifier|nonReentrant|onlyOwner)\b"), # 7. safety_neg: Safety Bypasses. Bypassing overflow checks (0.8+) or dangerous delegation. - "safety_neg": re.compile(r"\b(unchecked|assembly|delegatecall)\b"), + "safety_bypasses": re.compile(r"\b(unchecked|assembly|delegatecall)\b"), # 8. danger: High-Risk Execution. Contract destruction and absolute value termination. - "danger": re.compile(r"\b(selfdestruct|suicide)\b"), + "high_risk_execution": re.compile(r"\b(selfdestruct|suicide)\b"), # 9. io: I/O & Network Boundaries. EVM blockchains are closed systems. (Cross-contract calls are mapped as API/Generics). "io": None, # 10. api: Public Surface Area. Exposed boundaries to external wallets or contracts. "api": re.compile(r"\b(external|public)\b"), # 11. flux: State Mutation. State mutation. Captures array mutators, payable states, and explicit assignment. - "flux": re.compile( - r"\b(payable|push|pop)\b|(?!])=(?![=])|\+\+|--|\+=|-=|\*=|/=" - ), - # 12. graveyard: Dead / Commented-out Code. Commented out execution flow or structural definitions. - "graveyard": re.compile( - r"//[ \t]*(?:function|contract|if|require|uint|address)\b" - ), + "state_mutation": re.compile(r"\b(payable|push|pop)\b|(?!])=(?![=])|\+\+|--|\+=|-=|\*=|/="), + # 12. dead_code (Commented Logic / Deprecated Trails) Commented out execution flow or structural definitions. 
+ "dead_code": re.compile(r"//[ \t]*(?:function|contract|if|require|uint|address)\b"), # 13. doc: Structured Documentation. NatSpec (Ethereum Natural Specification Format). - "doc": re.compile( - r"///|/\*\*|@(?:param|return|dev|notice|custom|title|author)" - ), + "doc": re.compile(r"///|/\*\*|@(?:param|return|dev|notice|custom|title|author)"), # 14. test: Testing & Assertions. Foundry/Forge testing hooks and assertions. "test": re.compile( r"\b(?:setUp|test[A-Za-z0-9_]*|assertEq|assertTrue|assertFalse|assertGt|assertLt|vm\.expectRevert)\b" @@ -9212,40 +8305,26 @@ # 21. comprehensions: Iterators / Comprehensions. Solidity lacks native comprehensions. "comprehensions": None, # 22. scientific: Numerical / Compute Libraries. Cryptographic hashing and elliptic curve recovery. - "scientific": re.compile( - r"\b(keccak256|sha256|ripemd160|ecrecover|addmod|mulmod)\b" - ), + "scientific": re.compile(r"\b(keccak256|sha256|ripemd160|ecrecover|addmod|mulmod)\b"), # 23. heat_triggers: Metaprogramming & Reflection. Low-level assembly injections and fallback routers. - "heat_triggers": re.compile( - r"\b(fallback|receive|assembly|delegatecall|call|staticcall)\b" - ), + "reflection_metaprogramming": re.compile(r"\b(fallback|receive|assembly|delegatecall|call|staticcall)\b"), # 24. import: Dependency Inclusions. Resolving dependencies across files. - "import": re.compile( - r"^[ \t]*import\s+(?:\{[^}]+\}\s+from\s+)?[\"'][^\"']+[\"'];", re.M - ), + "import": re.compile(r"^[ \t]*import\s+(?:\{[^}]+\}\s+from\s+)?[\"'][^\"']+[\"'];", re.M), # 24b. _dependency_capture: Graph resolution extracting exactly ONE path string. - "_dependency_capture": re.compile( - r"^[ \t]*import\s+(?:\{[^}]+\}\s+from\s+)?[\"']([^\"']+)[\"'];", re.M - ), + "_dependency_capture": re.compile(r"^[ \t]*import\s+(?:\{[^}]+\}\s+from\s+)?[\"']([^\"']+)[\"'];", re.M), # 25. ownership: Authorship indicators. Strictly targets SPDX license tags and authorship notes. 
- "ownership": re.compile( - r"//[ \t]*SPDX-License-Identifier:|(?:@author|Created by):\s+(.*)", re.I - ), + "ownership": re.compile(r"//[ \t]*SPDX-License-Identifier:|(?:@author|Created by):\s+(.*)", re.I), # --- 🌌 PHASE 4: EXTENDED DIMENSIONS (Specialized Sub-Equations) --- # 26. planned_debt (Annotated Debt / TODOs) "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 28. private_info: Hardcoded credentials or private keys. Requires assignment. - "private_info": re.compile( - r"\b(private_key|secret|mnemonic|api_key)\b[ \t]*[:=]", re.I - ), + "hardcoded_secrets": re.compile(r"\b(private_key|secret|mnemonic|api_key)\b[ \t]*[:=]", re.I), # 29. spec_exposure: Map vs. Territory. ERC/EIP standards and audit tags. - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|audit)[^\]]*\]|\b(ERC-\d+|EIP-\d+)\b", re.I - ), - # 30. civil_war: Indentation Tracker. Handled natively. - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|audit)[^\]]*\]|\b(ERC-\d+|EIP-\d+)\b", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies): Indentation Tracker. Handled natively. + "tabs_vs_spaces": None, # 31. ssr_boundaries: View Horizon. "ssr_boundaries": None, # 32. events: Pub/Sub Network. Logging state to the blockchain EVM logs. @@ -9263,43 +8342,35 @@ # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. telemetry: Professional diagnostics. (Hardhat console logging). "telemetry": re.compile(r"\b(console\.log[a-zA-Z0-9_]*)\b"), - # 39. print_hits: Standard output. (Solidity lacks native printing outside Hardhat). - "print_hits": None, - # 40. cast_hits: "Trust Me" Tax. Explicit type coercion (e.g., uint256(addr)). - "cast_hits": re.compile( - r"\b(address|uint\d*|int\d*|bytes\d*|uint|int|bytes)\s*\(" - ), - # 41. bailout_hits: Detonators. Aborting transaction state. - "bailout_hits": re.compile(r"\b(revert)\b"), - # 42. halt_hits: Temporal Duct Tape. (EVM cannot sleep). 
- "halt_hits": None, - # 43. bitwise_hits: Sub-Atomic Math. Bitwise operations for gas optimization. - "bitwise_hits": re.compile(r"<<|>>|\^|~|(?>|\^|~|(?]*\s*\*?\s*>"), # 21. comprehensions: Iterators / Comprehensions. Block-based array/set enumeration. @@ -9444,7 +8509,7 @@ r"\b(math\.h|sin|cos|tan|sqrt|exp|log|abs|NSDecimalNumber|CGVector|CGAffineTransform|CGPoint|CGRect|CGSize|NXRect|NXSize)\b" ), # 23. heat_triggers: Metaprogramming & Reflection. Objective-C Runtime Swizzling and dynamic messaging. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(objc_msgSend|performSelector|method_exchangeImplementations|class_addMethod|objc_allocateClassPair|isa|object_setClass)\b|" ), # 24. import: Dependency Inclusions. Module and header inclusion. @@ -9454,9 +8519,7 @@ re.M, ), # 25. ownership: Authorship metadata. - "ownership": re.compile( - r"\b(?:Created by|@author|Author:|Copyright|Tim Berners-Lee)\b", re.I - ), + "ownership": re.compile(r"\b(?:Created by|@author|Author:|Copyright|Tim Berners-Lee)\b", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- "planned_debt": GLOBAL_PLANNED_DEBT, "fragile_debt": GLOBAL_FRAGILE_DEBT, @@ -9464,13 +8527,11 @@ r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit|RFC|W3C|CERN|TBL|ENQUIRE)[^\]]*\]|\b(?:WorldWideWeb|HyperText\s+Proposal|NeXTSTEP\s+Docs)\b", re.I, ), - "civil_war": None, + "tabs_vs_spaces": None, "ssr_boundaries": re.compile( r"\b(WOComponent|WOResponse|WOContext|WOApplication|WODirectAction|WebObjects)\b" ), - "events": re.compile( - r"\b(NSNotificationCenter|addObserver|postNotification|NXApp\s+run|sendEvent)\b" - ), + "events": re.compile(r"\b(NSNotificationCenter|addObserver|postNotification|NXApp\s+run|sendEvent)\b"), "dependency_injection": re.compile( r"\b(TyphoonComponentFactory|TyphoonDefinition|JSObjection|inject:|initWithDependency:)\b" ), @@ -9478,55 +8539,43 @@ r"^[ \t]*#(?:define|undef|ifdef|ifndef|if|elif|else|endif|pragma)\b", re.M, ), - "pointers": re.compile( - 
r"->|&\w+|\b(?:id|Class|SEL|IMP)\b|(?<=[=(,])[ \t]*\*[a-zA-Z_]\w*" - ), + "pointers": re.compile(r"->|&\w+|\b(?:id|Class|SEL|IMP)\b|(?<=[=(,])[ \t]*\*[a-zA-Z_]\w*"), "memory_alloc": re.compile( r"\b(alloc|init|new|copy|mutableCopy|retain|malloc|calloc|NX_MALLOC|NX_ZONEMALLOC|NSZoneMalloc)\b" ), - "inline_asm": re.compile( - r"\b(?:__asm__|asm|__asm)\b(?:\s+volatile)?\s*\([^)]*\)" - ), + "inline_asm": re.compile(r"\b(?:__asm__|asm|__asm)\b(?:\s+volatile)?\s*\([^)]*\)"), # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. telemetry: Professional diagnostics. - "telemetry": re.compile( - r"\b(os_log|OSLog|DDLogInfo|DDLogError|DDLogWarn|DDLogDebug|syslog)\b" - ), - # 39. print_hits: Standard output. - "print_hits": re.compile(r"\b(printf|fprintf|NXPrintf|NSLog)\b"), - # 40. cast_hits: "Trust Me" Tax. Explicit type coercion. - "cast_hits": re.compile( - r"\(\s*[A-Za-z_]\w*\s*\*?\s*\)\s*[a-zA-Z_$]|typeof\b" - ), - # 41. bailout_hits: Detonators. Aborting execution context. - "bailout_hits": re.compile(r"\b(@throw|abort|exit)\b"), - # 42. halt_hits: Temporal Duct Tape. Forcing threads to sleep. - "halt_hits": re.compile(r"\b(sleep|usleep|nanosleep)\s*\("), - # 43. bitwise_hits: Sub-Atomic Math. - "bitwise_hits": re.compile(r"(?>|\^|~"), - # 44. sync_locks: Barricades. Coordinated threading logic. + "telemetry": re.compile(r"\b(os_log|OSLog|DDLogInfo|DDLogError|DDLogWarn|DDLogDebug|syslog)\b"), + # 39. debug_prints (Debug Artifacts / Unstructured Outputs): Standard output. + "debug_prints": re.compile(r"\b(printf|fprintf|NXPrintf|NSLog)\b"), + # 40. explicit_casts (Explicit Type Casting): "Trust Me" Tax. Explicit type coercion. + "explicit_casts": re.compile(r"\(\s*[A-Za-z_]\w*\s*\*?\s*\)\s*[a-zA-Z_$]|typeof\b"), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) Aborting execution context. + "panics_and_aborts": re.compile(r"\b(@throw|abort|exit)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) Forcing threads to sleep. 
+ "thread_sleeps": re.compile(r"\b(sleep|usleep|nanosleep)\s*\("), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"(?>|\^|~"), + # 44. sync_locks (Resource Management & Stability) Coordinated threading logic. "sync_locks": re.compile( r"\b(@synchronized|NSLock|NSRecursiveLock|NSConditionLock|dispatch_semaphore_wait)\b" ), - # 45. freeze_hits: Data Cryogenics. Immutability. - "freeze_hits": re.compile(r"\b(const|readonly|immutable)\b"), - # 46. cleanup: The Janitor. Resource release (Crucial for MRC NeXT era). + # 45. immutability_locks (Immutability Constraints) Immutability. + "immutability_locks": re.compile(r"\b(const|readonly|immutable)\b"), + # 46. cleanup (Resource Cleanup / Teardown) Resource release (Crucial for MRC NeXT era). "cleanup": re.compile(r"\b(dealloc|release|autorelease|free|NX_FREE)\b"), - # 47. encapsulation: The Vault. Hiding logic from the application. + # 47. encapsulation Hiding logic from the application. "encapsulation": re.compile(r"\b(@private|@protected|@package)\b"), - # 48. listeners: The Sinks. Waiting for state broadcasts. - "listeners": re.compile( - r"\b(addObserver:|observeValueForKeyPath:|subscribeNext:)\b" - ), - # 49. test_skip: Safety Theater. + # 48. listeners (Event Listeners / Observers) Waiting for state broadcasts. + "listeners": re.compile(r"\b(addObserver:|observeValueForKeyPath:|subscribeNext:)\b"), + # 49. 
test_skip (Bypassed Tests / Ignored Specs) "test_skip": re.compile(r"\b(XCTSkip|xit|xdescribe)\b"), # --- PHASE 3: HYBRID DOMAIN SENSORS (Objective-C Specifics) --- "serialization_parsing": re.compile( r"\b(NSJSONSerialization|NSKeyedUnarchiver|NSKeyedArchiver|NSXMLParser|NSPropertyListSerialization)\b" ), - "regex_execution": re.compile( - r"\b(NSRegularExpression|NSRegularExpressionSearch)\b" - ), + "regex_execution": re.compile(r"\b(NSRegularExpression|NSRegularExpressionSearch)\b"), "time_date_logic": re.compile( r"\b(NSDate|NSDateFormatter|NSTimer|CFAbsoluteTimeGetCurrent|NSDateComponents)\b" ), @@ -9553,13 +8602,13 @@ "Makeconf", "Makevars", ], - # ECOSYSTEM ANCHORS & DISAMBIGUATION: Sibling configurations acting as gravity anchors to resolve ambiguous .mk includes. + # ECOSYSTEM ANCHORS & DISAMBIGUATION: Sibling configurations acting as disambiguation anchors to resolve ambiguous .mk includes. "discriminators": ["Makefile", "makefile", "configure.ac", "CMakeLists.txt"], # EXECUTION SIGNATURES: Interpreters found on Line 1 for executable make scripts. "shebangs": ["make", "gmake"], # UPGRADED: Maps to Family 3 (Pure Hash) # Rationale: Make natively uses '#' exclusively for line-level comments. - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { # Makefiles natively use '#' for both line and inline comments. "_line_anchor": re.compile(r"#"), @@ -9579,7 +8628,7 @@ "args": re.compile(r"\$\([0-9]+\)|\$[0-9]\b|\$\(call[ \t]+[a-zA-Z0-9_.-]+"), # Smooth structural boundaries: variable assignments (:=, =, ?=) and native structural controls like vpath. # Explicitly excludes the append operator `+=` which belongs in flux. - "linear": re.compile( + "structural_boundaries": re.compile( r"^[ \t]*[a-zA-Z0-9_.-]+[ \t]*(?::|\?|::)?=(?![ \t]*=)|^[ \t]*(?:vpath|undefine)\b", re.M, ), @@ -9602,13 +8651,9 @@ re.M, ), # Bypassing safety: Prefixing recipes with `-` to swallow errors, or forcefully exiting true via shell logic. 
- "safety_neg": re.compile( - r"^\t[ \t]*-[a-zA-Z0-9_./$]|\|\|[ \t]*(?:true|exit[ \t]+0)\b", re.M - ), + "safety_bypasses": re.compile(r"^\t[ \t]*-[a-zA-Z0-9_./$]|\|\|[ \t]*(?:true|exit[ \t]+0)\b", re.M), # Heavily destructive sequence patterns or overriding permissions. (Eval is categorized under heat_triggers). - "danger": re.compile( - r"\bsudo[ \t]+|\brm[ \t]+-[rR]?[fF][ \t]+(?:/|\$[{(])|\bkill[ \t]+-9\b" - ), + "high_risk_execution": re.compile(r"\bsudo[ \t]+|\brm[ \t]+-[rR]?[fF][ \t]+(?:/|\$[{(])|\bkill[ \t]+-9\b"), # Interacting directly with outputs, networks, or the disk filesystem. "io": re.compile( r"\$\((?:file|wildcard)[ \t]+|\b(?:curl|wget|scp|rsync|tar|unzip|mkdir|cp|mv)\b|>>?[ \t]*[^ \t\n/]+" @@ -9619,9 +8664,9 @@ re.M, ), # Mutating variable state by appending (+=) or shell assignment (!=). . - "flux": re.compile(r"^[ \t]*[a-zA-Z0-9_.-]+[ \t]*(?:\+|!)=", re.M), + "state_mutation": re.compile(r"^[ \t]*[a-zA-Z0-9_.-]+[ \t]*(?:\+|!)=", re.M), # Commented-out targets, commented out shell logic, or commented conditional Make directives. - "graveyard": re.compile( + "dead_code": re.compile( r"^[ \t]*#[ \t]*(?:[a-zA-Z0-9_./%-]+[ \t]*::?|[a-zA-Z0-9_.-]+[ \t]*(?::|\?|::)?=|\b(?:ifeq|ifneq|ifdef|ifndef|include)\b)", re.M, ), @@ -9643,9 +8688,7 @@ "ui_framework": None, "closures": None, # Core global state built-in environments spanning the build system. - "globals": re.compile( - r"\$\((?:MAKE|MAKEFLAGS|MAKECMDGOALS|CURDIR|SHELL|PATH|USER|HOME|PWD|\.VARIABLES)\)" - ), + "globals": re.compile(r"\$\((?:MAKE|MAKEFLAGS|MAKECMDGOALS|CURDIR|SHELL|PATH|USER|HOME|PWD|\.VARIABLES)\)"), "decorators": None, "generics": None, # High-density text manipulating algorithms native to GNU Make iterating through variable spaces. @@ -9655,14 +8698,10 @@ # Launching explicit calculation boundaries outside the Make environment natively. 
"scientific": re.compile(r"\b(?:bc|expr|awk)\b|\$\(shell[ \t]+expr[ \t]+"), # Extremely dense meta-programming manipulations drastically raising cognitive load during debugging. - "heat_triggers": re.compile( - r"\$\((?:eval|call|value|origin|flavor|shell)[ \t]+|\.SECONDEXPANSION:" - ), + "reflection_metaprogramming": re.compile(r"\$\((?:eval|call|value|origin|flavor|shell)[ \t]+|\.SECONDEXPANSION:"), # Linking isolated segments of the graph execution via modular file resolution. "import": re.compile(r"^[ \t]*-?(?:include|sinclude)[ \t]+[^ \t\n]+", re.M), - "_dependency_capture": re.compile( - r"^[ \t]*-?(?:include|sinclude)[ \t\n]+([^\s#]+)", re.M - ), + "_dependency_capture": re.compile(r"^[ \t]*-?(?:include|sinclude)[ \t\n]+([^\s#]+)", re.M), # Metadata anchoring authorship and structural domain owners. "ownership": re.compile( r"^[ \t]*#[ \t]*(?:@author\b|author:|maintainer:|created by:)", @@ -9677,7 +8716,7 @@ "fragile_debt": GLOBAL_FRAGILE_DEBT, "spec_exposure": re.compile(r"\[(?:spec-[0-9]+|audit|spec)\]", re.I), # Strict tracking of Indentation structural boundaries. (Make strictly demands Tabs, mapping space usage catches severe fragmentation). - "civil_war": None, + "tabs_vs_spaces": None, "ssr_boundaries": None, "events": None, "dependency_injection": None, @@ -9687,32 +8726,24 @@ "memory_alloc": None, "inline_asm": None, # -------------------------------------------------------------------------- - # 5. THERMODYNAMIC BALANCE (Yin & Yang Forces) + # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # -------------------------------------------------------------------------- # Emitting pure, safe structural observability that does not risk halting or crashing the graph execution. "telemetry": re.compile(r"\$\(info[ \t]+[^)\n]*\)"), # Standard output commands echoing transient debris to the shell execution log. 
- "print_hits": re.compile( - r"^[ \t]*@?(?:echo|printf)[ \t]+|\$\(warning[ \t]+[^)\n]*\)", re.M - ), - "cast_hits": None, + "debug_prints": re.compile(r"^[ \t]*@?(?:echo|printf)[ \t]+|\$\(warning[ \t]+[^)\n]*\)", re.M), + "explicit_casts": None, # System detonators specifically intended to abort the build flow if preconditions are failed natively or via shell. - "bailout_hits": re.compile( - r"\$\(error[ \t]+[^)\n]*\)|\bexit[ \t]+[1-9][0-9]*\b|\bfalse\b" - ), + "panics_and_aborts": re.compile(r"\$\(error[ \t]+[^)\n]*\)|\bexit[ \t]+[1-9][0-9]*\b|\bfalse\b"), # Temporal duct tape strictly applying forced pausing. - "halt_hits": re.compile(r"\bsleep[ \t]+[0-9]+"), - "bitwise_hits": None, # Kept null as Bash pipe IPC limits logic math precision. + "thread_sleeps": re.compile(r"\bsleep[ \t]+[0-9]+"), + "bitwise_ops": None, # Kept null as Bash pipe IPC limits logic math precision. # Explicit locks halting temporal thread races. . - "sync_locks": re.compile( - r"^[ \t]*\.(?:NOTPARALLEL|WAIT)[ \t]*::?|\bflock[ \t]+", re.M - ), + "sync_locks": re.compile(r"^[ \t]*\.(?:NOTPARALLEL|WAIT)[ \t]*::?|\bflock[ \t]+", re.M), # Enforcing strict immutability bounds on state configuration. . - "freeze_hits": re.compile(r"^[ \t]*override[ \t]+[a-zA-Z0-9_.-]+", re.M), + "immutability_locks": re.compile(r"^[ \t]*override[ \t]+[a-zA-Z0-9_.-]+", re.M), # Janitor routines ripping apart build artifacts and cleanly tearing down output paths. . - "cleanup": re.compile( - r"^[ \t]*(?:dist)?clean[ \t]*::?|\brm[ \t]+-[a-zA-Z]*f[a-zA-Z]*\b", re.M - ), + "cleanup": re.compile(r"^[ \t]*(?:dist)?clean[ \t]*::?|\brm[ \t]+-[a-zA-Z]*f[a-zA-Z]*\b", re.M), # The Vault explicitly hiding scope logic away from external API leakage boundaries. . 
"encapsulation": re.compile( r"^[ \t]*(?:unexport[ \t]+[a-zA-Z0-9_.-]+|[a-zA-Z0-9_.-]+[ \t]*:[ \t]*private[ \t]+|\.SILENT[ \t]*:)", @@ -9726,18 +8757,10 @@ re.I, ), # --- PHASE 3: HYBRID DOMAIN SENSORS (Makefile Specifics) --- - "serialization_parsing": re.compile( - r"(?m)^\s*(?:@|-)?(?:tar|unzip|gunzip|jq|sed|awk)\b" - ), - "regex_execution": re.compile( - r"(?m)\$\((?:filter|filter-out|patsubst)\b|^\s*(?:@|-)?(?:grep|egrep|sed)\b" - ), - "time_date_logic": re.compile( - r"(?m)\$\(shell\s+date\b|^\s*(?:@|-)?(?:sleep|date)\b" - ), - "ipc_rpc_bridges": re.compile( - r"(?m)\$\(shell\b|^\s*(?:@|-)?(?:curl|wget|ssh|scp|docker|kubectl)\b" - ), + "serialization_parsing": re.compile(r"(?m)^\s*(?:@|-)?(?:tar|unzip|gunzip|jq|sed|awk)\b"), + "regex_execution": re.compile(r"(?m)\$\((?:filter|filter-out|patsubst)\b|^\s*(?:@|-)?(?:grep|egrep|sed)\b"), + "time_date_logic": re.compile(r"(?m)\$\(shell\s+date\b|^\s*(?:@|-)?(?:sleep|date)\b"), + "ipc_rpc_bridges": re.compile(r"(?m)\$\(shell\b|^\s*(?:@|-)?(?:curl|wget|ssh|scp|docker|kubectl)\b"), }, }, "abap": { @@ -9751,21 +8774,20 @@ "extensions": [".abap", ".asddls"], # ABSOLUTE IDENTITY & EXACT FILENAMES: ABAP is executed within the SAP environment; no extensionless exact configurations exist. "exact_matches": [], - # ECOSYSTEM ANCHORS & DISAMBIGUATION: SAP deployment artifacts acting as gravity anchors. + # ECOSYSTEM ANCHORS & DISAMBIGUATION: SAP deployment artifacts acting as disambiguation anchors. "discriminators": [".abap", "package.devc.xml", ".apc"], # EXECUTION SIGNATURES: Executed exclusively within the SAP NetWeaver/ABAP platform; no shebangs exist. "shebangs": [], # UPGRADED: Maps to Family 7 (The Positional Ancients) # Rationale: Strictly fixed-format legacy constraints. The engine must monitor Column 1 - # for an asterisk '*' to identify line-level Ghost Mass, while allowing '"' for inline. 
- "lexical_family": "column_sensitive", - + # for an asterisk '*' to identify line-level Commented / Non-Executable Text, while allowing '"' for inline. + "lexical_family": "positional_anchored", "rules": { "_line_anchor": re.compile(r"^\*"), "_inline_comment": re.compile(r"\""), "_block_start": None, # ABAP has no standard multi-line block comments "_block_end": None, - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch: decisions that split flow. Includes modern COND/SWITCH expressions. "branch": re.compile( r"^[ \t]*(IF|ELSE|ELSEIF|CASE|WHEN|WHILE|DO|LOOP\s+AT|TRY|CATCH|CLEANUP|CHECK|EXIT|CONTINUE|RETURN|COND|SWITCH)\b", @@ -9777,7 +8799,7 @@ re.I, ), # 3. linear: Sequential I/O & Network Boundaries. Structural boundaries. EXCLUDES access modifiers and constants. - "linear": re.compile( + "structural_boundaries": re.compile( r"^[ \t]*(DATA|TYPES|FIELD-SYMBOLS|CLASS|INTERFACE|METHOD|FORM|FUNCTION|MODULE|REPORT|PROGRAM|IMPORT|EXPORT)\b", re.I | re.M, ), @@ -9798,12 +8820,12 @@ re.I, ), # 7. safety_neg: Safety Bypasses. Actively bypassing safety (casting/unchecked generics). - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(UNASSIGNED|TYPE\s+ANY|TYPE\s+REF\s+TO\s+DATA|IGNORE\s+ERRORS)\b|ASSIGN\s+[^\n;]+\s+TO\s+<[^>]+>\s+CASTING", re.I, ), # 8. danger: High-Risk Execution. Raw SQL/Kernel bypasses and mass deletion. - "danger": re.compile( + "high_risk_execution": re.compile( r"\b(SYSTEM-CALL|EXEC\s+SQL|DELETE\s+FROM|TRUNCATE|GENERATE\s+SUBROUTINE\s+POOL)\b", re.I, ), @@ -9818,12 +8840,12 @@ re.I, ), # 11. flux: State Mutation. State mutation (The core of ABAP data manipulation). - "flux": re.compile( + "state_mutation": re.compile( r"^[ \t]*(MOVE|MOVE-CORRESPONDING|APPEND|MODIFY\s+TABLE|DELETE\s+TABLE)\b|^[ \t]*INSERT\s+[^\n;]+\s+INTO\s+TABLE", re.I | re.M, ), - # 12. graveyard: Dead / Commented-out Code. Commented out structural logic (supports * and "). - "graveyard": re.compile( + # 12. 
dead_code (Commented Logic / Deprecated Trails) Commented out structural logic (supports * and "). + "dead_code": re.compile( r'^[ \t]*\*[ \t]*(?:DATA|METHOD|IF|SELECT|WRITE)\b|"[ \t]*(?:DATA|METHOD|IF|SELECT|WRITE)\b', re.I | re.M, ), @@ -9851,9 +8873,7 @@ # 17. closures: Closures / Anonymous Functions. (ABAP lacks anonymous closures). "closures": None, # 18. globals: Global / Shared State. Global program data and the system registry. - "globals": re.compile( - r"\b(TABLES|STATICS|CLASS-DATA|SY-[A-Z0-9_]+)\b", re.I - ), + "globals": re.compile(r"\b(TABLES|STATICS|CLASS-DATA|SY-[A-Z0-9_]+)\b", re.I), # 19. decorators: Decorators / Annotations. CDS and class annotations. "decorators": re.compile(r"@[A-Za-z0-9_.]+(?:\([^)]*\))?", re.I), # 20. generics: Generics / Type Parameters. Generic data references and field symbols. @@ -9872,15 +8892,13 @@ re.I, ), # 23. heat_triggers: Metaprogramming & Reflection. RTTS and Dynamic assignment logic. - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(CL_ABAP_TYPEDESCR|CL_ABAP_CLASSDESCR|ASSIGN\s+\([a-zA-Z0-9_-]+\)\s+TO|GENERATE\s+SUBROUTINE\s+POOL)\b", re.I, ), # 24. import: Dependency Inclusions. Includes and type pools. "import": re.compile(r"\b(INCLUDE|TYPE-POOLS)\b", re.I), - "_dependency_capture": re.compile( - r"^[ \t]*(?:INCLUDE|TYPE-POOLS)[ \t\n]+([A-Za-z0-9_/]+)", re.I | re.M - ), + "_dependency_capture": re.compile(r"^[ \t]*(?:INCLUDE|TYPE-POOLS)[ \t\n]+([A-Za-z0-9_/]+)", re.I | re.M), # 25. ownership: Authorship indicators. "ownership": re.compile( r"(?:AUTHOR|CREATED\s+BY|MAINTAINER|Tim Berners-Lee):\s+([^\n]+)", @@ -9896,72 +8914,58 @@ r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)\]|\b(?:WorldWideWeb|RFC|W3C|CERN|TBL|ENQUIRE)\b", re.I, ), - # 30. civil_war: Indentation Tracker. Tabs vs 2-space standardization. - "civil_war": None, + # 30. tabs_vs_spaces (Formatting Inconsistencies): Indentation Tracker. Tabs vs 2-space standardization. + "tabs_vs_spaces": None, # 31. ssr_boundaries: View Horizon. 
ICF and BSP request handlers. "ssr_boundaries": re.compile( r"\b(IF_HTTP_EXTENSION~HANDLE_REQUEST|CL_BSP_CONTEXT|CL_BSP_RUNTIME|IF_HTTP_REQUEST|IF_HTTP_RESPONSE|HTML_STRING)\b", re.I, ), # 32. events: Pub/Sub Network. Native OO event architecture. - "events": re.compile( - r"\b(RAISE\s+EVENT|SET\s+HANDLER)\b|FOR\s+EVENT\s+[^\n;]+\s+OF", re.I - ), + "events": re.compile(r"\b(RAISE\s+EVENT|SET\s+HANDLER)\b|FOR\s+EVENT\s+[^\n;]+\s+OF", re.I), # 33. dependency_injection: Inversion of Control. BAdIs and Test Doubles. - "dependency_injection": re.compile( - r"\b(GET\s+BADI|CALL\s+BADI|CL_BADI_BASE|CL_ABAP_TESTDOUBLE)\b", re.I - ), + "dependency_injection": re.compile(r"\b(GET\s+BADI|CALL\s+BADI|CL_BADI_BASE|CL_ABAP_TESTDOUBLE)\b", re.I), # 34. macros: Preprocessor Hooks. ABAP macro definitions. "macros": re.compile( r"^[ \t]*DEFINE\s+[a-zA-Z0-9_-]+\.|^[ \t]*END-OF-DEFINITION\s*\.", re.I | re.M, ), # 35. pointers: Memory Map. Field-Symbols and data references. - "pointers": re.compile( - r"<[A-Za-z0-9_-]+>|->\*|\b(?:GET\s+REFERENCE\s+OF|REF\s+TO)\b", re.I - ), + "pointers": re.compile(r"<[A-Za-z0-9_-]+>|->\*|\b(?:GET\s+REFERENCE\s+OF|REF\s+TO)\b", re.I), # 36. memory_alloc: Manual Memory Management. Heap allocations. - "memory_alloc": re.compile( - r"\b(CREATE\s+OBJECT|CREATE\s+DATA|FREE|CLEAR)\b", re.I - ), + "memory_alloc": re.compile(r"\b(CREATE\s+OBJECT|CREATE\s+DATA|FREE|CLEAR)\b", re.I), # 37. inline_asm: Bare Metal. "inline_asm": None, # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. telemetry: Professional diagnostics. - "telemetry": re.compile( - r"\b(BAL_LOG_CREATE|BAL_DB_SAVE|CL_BALI_LOG|CL_BALI_MSG_SETTER)\b", re.I - ), - # 39. print_hits: Standard output. - "print_hits": re.compile(r"^[ \t]*(WRITE)\b", re.I | re.M), - # 40. cast_hits: "Trust Me" Tax. Explicit casting and conversions. - "cast_hits": re.compile( + "telemetry": re.compile(r"\b(BAL_LOG_CREATE|BAL_DB_SAVE|CL_BALI_LOG|CL_BALI_MSG_SETTER)\b", re.I), + # 39. 
debug_prints (Debug Artifacts / Unstructured Outputs): Standard output. + "debug_prints": re.compile(r"^[ \t]*(WRITE)\b", re.I | re.M), + # 40. explicit_casts (Explicit Type Casting): "Trust Me" Tax. Explicit casting and conversions. + "explicit_casts": re.compile( r"\b(?:CAST|CONV)\s*[a-zA-Z0-9_~-]*\s*#?\s*\(|ASSIGNING\s+<[^>]+>\s+CASTING", re.I, ), - # 41. bailout_hits: Detonators. Aborting execution or error messages. - "bailout_hits": re.compile( + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) Aborting execution or error messages. + "panics_and_aborts": re.compile( r'\b(RAISE\s+EXCEPTION|MESSAGE\s+[^\n;]+\s+TYPE\s+[\'"][EX][\'"]|LEAVE\s+PROGRAM)\b', re.I, ), - # 42. halt_hits: Temporal Duct Tape. Thread sleep. - "halt_hits": re.compile(r"\bWAIT\s+UP\s+TO\b", re.I), - # 43. bitwise_hits: Sub-Atomic Math. - "bitwise_hits": re.compile(r"\b(BIT-AND|BIT-OR|BIT-XOR|BIT-NOT)\b", re.I), - # 44. sync_locks: Barricades. + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) Thread sleep. + "thread_sleeps": re.compile(r"\bWAIT\s+UP\s+TO\b", re.I), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"\b(BIT-AND|BIT-OR|BIT-XOR|BIT-NOT)\b", re.I), + # 44. sync_locks (Resource Management & Stability) "sync_locks": re.compile(r"\b(ENQUEUE_|DEQUEUE_)\b", re.I), - # 45. freeze_hits: Data Cryogenics. Immutability (constants). - "freeze_hits": re.compile(r"\b(CONSTANTS|FINAL|READ-ONLY)\b", re.I), - # 46. cleanup: The Janitor. - "cleanup": re.compile( - r"^[ \t]*(FREE|CLEAR|CLOSE\s+DATASET)\b", re.I | re.M - ), - # 47. encapsulation: The Vault. Scope hiding. - "encapsulation": re.compile( - r"\b(PRIVATE\s+SECTION|PROTECTED\s+SECTION)\b", re.I - ), - # 48. listeners: The Sinks. + # 45. immutability_locks (Immutability Constraints) Immutability (constants). + "immutability_locks": re.compile(r"\b(CONSTANTS|FINAL|READ-ONLY)\b", re.I), + # 46. 
cleanup (Resource Cleanup / Teardown) + "cleanup": re.compile(r"^[ \t]*(FREE|CLEAR|CLOSE\s+DATASET)\b", re.I | re.M), + # 47. encapsulation (Encapsulation / Access Modifiers) + "encapsulation": re.compile(r"\b(PRIVATE\s+SECTION|PROTECTED\s+SECTION)\b", re.I), + # 48. listeners (Event Listeners / Observers) "listeners": re.compile(r"\bFOR\s+EVENT\s+[^\n;]+\s+OF\b", re.I), - # 49. test_skip: Safety Theater. + # 49. test_skip (Bypassed Tests / Ignored Specs) "test_skip": re.compile(r"\b(IGNORE)\b", re.I), }, }, @@ -9992,9 +8996,8 @@ "shebangs": [], # UPGRADED: Maps to Family 8 (Singular/Unique) # Rationale: (CORRECTION) Consolidated 'xml_angle' into 'singular'. Like HTML, XML - # exclusively uses SGML-style block delimiters () for its Ghost Mass. - "lexical_family": "single_line_only", - + # exclusively uses SGML-style block delimiters () for its Commented / Non-Executable Text. + "lexical_family": "block_exclusive", "rules": {}, }, "markdown": { @@ -10014,7 +9017,7 @@ ], # ABSOLUTE IDENTITY & EXACT FILENAMES: The universally recognized, extensionless repository documentation anchors. "exact_matches": ["README", "LICENSE", "CHANGELOG", "CONTRIBUTING", "SECURITY"], - # ECOSYSTEM ANCHORS & DISAMBIGUATION: Static site generators and documentation build configs acting as gravity anchors. + # ECOSYSTEM ANCHORS & DISAMBIGUATION: Static site generators and documentation build configs acting as disambiguation anchors. "discriminators": [ ".md", ".mdx", @@ -10027,7 +9030,7 @@ # UPGRADED: Maps to Family 8 (Singular/Unique) # Rationale: (CORRECTION) Markdown relies entirely on HTML's SGML-style block comments (). # Mapping this to 'hybrid_dash' would cause the engine to miss hidden documentation mass. 
- "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { "lit_code_blocks": re.compile(r"^```[a-zA-Z0-9]*$", re.M), "lit_diagrams": re.compile(r"^```(?:mermaid|plantuml)$", re.M), @@ -10041,14 +9044,14 @@ "extensions": [".csv", ".tsv", ".psv"], # ABSOLUTE IDENTITY & EXACT FILENAMES: Delimited data relies strictly on extensions. "exact_matches": [], - # ECOSYSTEM ANCHORS & DISAMBIGUATION: Sibling datasets and data-science logic files acting as gravity anchors. + # ECOSYSTEM ANCHORS & DISAMBIGUATION: Sibling datasets and data-science logic files acting as disambiguation anchors. "discriminators": [".csv", ".tsv", ".py", ".ipynb", ".R", ".m"], # EXECUTION SIGNATURES: CSV is purely static data; no shebangs exist. "shebangs": [], # UPGRADED: Maps to Family 3 (Pure Hash) # Rationale: While strictly data, when CSVs *do* contain comments (supported by # parsers like Pandas or DuckDB), they almost exclusively use the '#' symbol at the start of a line. - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": {}, }, "yaml": { @@ -10071,22 +9074,16 @@ ".github/workflows", ], "shebangs": [], - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { "_line_anchor": re.compile(r"#"), "_inline_comment": re.compile(r"#"), "_block_start": None, "_block_end": None, - # --- PHASE 1: GEOMETRY & STRUCTURE --- - "branch": re.compile( - r"\b(?:if|else|elif|fi|case|esac|for|while|do|done)\b|&&|\|\|", re.I - ), - "args": re.compile( - r"^[ \t]*with:[ \t]*\n(?:[ \t]+[a-zA-Z0-9_-]+:[ \t]*.*)+", re.M | re.I - ), - "linear": re.compile( - r"^[ \t]*(?:env|needs|runs-on|steps|strategy|matrix):", re.M | re.I - ), + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- + "branch": re.compile(r"\b(?:if|else|elif|fi|case|esac|for|while|do|done)\b|&&|\|\|", re.I), + "args": re.compile(r"^[ \t]*with:[ \t]*\n(?:[ \t]+[a-zA-Z0-9_-]+:[ \t]*.*)+", re.M | re.I), + "structural_boundaries": re.compile(r"^[ 
\t]*(?:env|needs|runs-on|steps|strategy|matrix):", re.M | re.I), # Executable Logic Anchors: Explicit execution blocks "func_start": re.compile( r"^[ \t]*(?:-?[ \t]*run:|script:|before_script:|after_script:)[ \t]*[|>]*", @@ -10102,13 +9099,11 @@ re.M | re.I, ), # Catches the classic curl-to-bash supply chain dropper inside a run block - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"^[ \t]*continue-on-error:[ \t]*true|chmod[ \t]+777|\b(?:curl|wget)[ \t]+[^|\n]{1,200}\|[ \t]*(?:bash|sh|zsh)\b", re.M | re.I, ), - "danger": re.compile( - r"\b(?:rm[ \t]+-rf[ \t]+/(?![A-Za-z])|eval|exec)\b", re.M | re.I - ), + "high_risk_execution": re.compile(r"\b(?:rm[ \t]+-rf[ \t]+/(?![A-Za-z])|eval|exec)\b", re.M | re.I), "io": re.compile( r"\b(?:wget|curl|apt-get|apk|yum|git[ \t]+clone|npm[ \t]+install|pip[ \t]+install)\b", re.M | re.I, @@ -10118,17 +9113,15 @@ r"^[ \t]*on:[ \t]*\n(?:[ \t]+(?:push|pull_request|workflow_dispatch|issues):)", re.M | re.I, ), - "flux": re.compile( + "state_mutation": re.compile( r"^[ \t]*env:[ \t]*\n(?:[ \t]+[a-zA-Z0-9_-]+:[ \t]*.*)+|export[ \t]+[a-zA-Z0-9_]+[ \t]*=", re.M | re.I, ), - "graveyard": re.compile( + "dead_code": re.compile( r"^[ \t]*#[ \t]*(?:-?[ \t]*run:|uses:|jobs:|steps:|script:)", re.M | re.I, ), - "doc": re.compile( - r"^[ \t]*name:[ \t]+.*|^[ \t]*description:[ \t]+.*", re.M | re.I - ), + "doc": re.compile(r"^[ \t]*name:[ \t]+.*|^[ \t]*description:[ \t]+.*", re.M | re.I), "test": re.compile( r"\b(?:npm[ \t]+test|pytest|make[ \t]+test|cargo[ \t]+test|go[ \t]+test)\b", re.M | re.I, @@ -10149,9 +9142,7 @@ "comprehensions": None, "scientific": None, # Catching complex GitHub Expression injection logic - "heat_triggers": re.compile( - r"\$\{\{[ \t]*fromJson\(|to[A-Z][a-zA-Z]+\(", re.M - ), + "reflection_metaprogramming": re.compile(r"\$\{\{[ \t]*fromJson\(|to[A-Z][a-zA-Z]+\(", re.M), # The Gravity Links: External dependencies "import": re.compile( r"^[ \t]*(?:-?[ \t]*uses:|image:)[ \t]+([a-zA-Z0-9_./@:-]+)", @@ -10167,45 
+9158,41 @@ "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, - "private_info": re.compile( + "hardcoded_secrets": re.compile( r"\b(?:password|secret|token|api[_-]?key|client[_-]?secret|private[_-]?key)[ \t]*:[ \t]*[\"'][A-Za-z0-9\-_+/=]{16,}[\"']", re.I, ), "spec_exposure": None, - "civil_war": None, + "tabs_vs_spaces": None, "ssr_boundaries": None, "events": re.compile( r"^[ \t]*repository_dispatch:|schedule:|^[ \t]*-?[ \t]*cron:", re.M | re.I, ), # Secrets injection - "dependency_injection": re.compile( - r"\$\{\{[ \t]*secrets\.[a-zA-Z0-9_]+[ \t]*\}\}", re.M - ), + "dependency_injection": re.compile(r"\$\{\{[ \t]*secrets\.[a-zA-Z0-9_]+[ \t]*\}\}", re.M), "macros": None, "pointers": None, "memory_alloc": None, "inline_asm": None, # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- "telemetry": re.compile(r"^[ \t]*::(?:debug|warning|error)[ \t]+.*", re.M), - "print_hits": re.compile(r"\b(?:echo|printf)\b", re.I), - "cast_hits": None, + "debug_prints": re.compile(r"\b(?:echo|printf)\b", re.I), + "explicit_casts": None, # GitHub action specific bailout outputs - "bailout_hits": re.compile( + "panics_and_aborts": re.compile( r"\b(?:exit[ \t]+[1-9]|kill[ \t]+-[0-9]+)\b|^[ \t]*::error::", re.M | re.I, ), - "halt_hits": re.compile(r"\bsleep[ \t]+[0-9]+\b", re.I), - "bitwise_hits": None, + "thread_sleeps": re.compile(r"\bsleep[ \t]+[0-9]+\b", re.I), + "bitwise_ops": None, "sync_locks": None, # Strict SHA-1 pinning for immutable security - "freeze_hits": re.compile(r"@[a-f0-9]{40}\b", re.I), + "immutability_locks": re.compile(r"@[a-f0-9]{40}\b", re.I), "cleanup": None, "encapsulation": None, "listeners": re.compile(r"^[ \t]*webhook:", re.M | re.I), - "test_skip": re.compile( - r"\|\|[ \t]*true\b|\b(?:--passWithNoTests|skipTests|--no-audit)\b", re.I - ), + "test_skip": re.compile(r"\|\|[ \t]*true\b|\b(?:--passWithNoTests|skipTests|--no-audit)\b", re.I), }, }, "pbtxt": { @@ -10219,14 +9206,14 @@ 
"extensions": [".pbtxt", ".textproto", ".textpb", ".pb"], # ABSOLUTE IDENTITY & EXACT FILENAMES: PBTXT strictly relies on its extensions. "exact_matches": [], - # ECOSYSTEM ANCHORS & DISAMBIGUATION: Standard .proto schema definitions and Bazel build files acting as gravity anchors. + # ECOSYSTEM ANCHORS & DISAMBIGUATION: Standard .proto schema definitions and Bazel build files acting as disambiguation anchors. "discriminators": [".proto", "WORKSPACE", "BUILD.bazel", "BUILD"], # EXECUTION SIGNATURES: PBTXT is purely serialized message data; no shebangs exist. "shebangs": [], # UPGRADED: Maps to Family 3 (Pure Hash) # Rationale: While standard .proto schemas use C-style (//) comments, the instantiated # Text Format (.pbtxt) strictly uses '#' for comments. - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": {}, }, "yacc": { @@ -10240,7 +9227,7 @@ "extensions": [".y", ".yy", ".ypp", ".l", ".ll", ".lpp"], # ABSOLUTE IDENTITY & EXACT FILENAMES: Parser generators rely strictly on extensions. "exact_matches": [], - # ECOSYSTEM ANCHORS & DISAMBIGUATION: The generated C/C++ outputs and standard build systems acting as gravity anchors. + # ECOSYSTEM ANCHORS & DISAMBIGUATION: The generated C/C++ outputs and standard build systems acting as disambiguation anchors. "discriminators": [ ".c", ".cpp", @@ -10254,32 +9241,28 @@ # UPGRADED: Maps to Family 1 (Standard C-Style) # Rationale: Yacc and Lex files interleave grammar definitions with pure C/C++ code # blocks (enclosed in %{ %}), relying entirely on standard '/* */' and '//' comments. 
- "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), "_block_start": re.compile(r"/\*"), "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- "branch": re.compile(r"\b(if|else|switch|case|for|while|do)\b|\|"), "args": re.compile(r"\$\d+|\$\$"), - "linear": re.compile( - r"\b(return|goto|break|continue|%token|%type|%left|%right|%nonassoc)\b" - ), + "structural_boundaries": re.compile(r"\b(return|goto|break|continue|%token|%type|%left|%right|%nonassoc)\b"), # Executable Logic Anchor: Anchors specifically onto Grammar Rules # Matches "rule_name :" or "rule_name:" at the start of a line "func_start": re.compile(r"^[ \t]*([a-zA-Z_]\w*)(?=[ \t]*:)", re.M), "class_start": None, # --- PHASE 2: RISK & STRUCTURAL INTEGRITY --- "safety": re.compile(r"\b(assert|YYABORT|YYACCEPT|YYERROR)\b"), - "safety_neg": re.compile(r"\b(goto|void\s*\*)\b"), - "danger": re.compile(r"\b(abort|exit|YYNOMEM)\b"), + "safety_bypasses": re.compile(r"\b(goto|void\s*\*)\b"), + "high_risk_execution": re.compile(r"\b(abort|exit|YYNOMEM)\b"), "io": re.compile(r"\b(fopen|fclose|fread|fwrite|yyin|yyout|fprintf)\b"), "api": re.compile(r"\b(%define|%code|%provides|%requires)\b"), - "flux": re.compile(r"(?])=(?![=])|\+\+|--"), - "graveyard": re.compile( - r"//[ \t]*(?:if|for|while|return|%token)\b|/\*[ \t]*(?:if|for|while|%token)" - ), + "state_mutation": re.compile(r"(?])=(?![=])|\+\+|--"), + "dead_code": re.compile(r"//[ \t]*(?:if|for|while|return|%token)\b|/\*[ \t]*(?:if|for|while|%token)"), "doc": re.compile(r"/\*\*|@param|@return"), "test": None, # --- PHASE 3: ARCHITECTURE & DOMAIN SENSORS --- @@ -10291,42 +9274,32 @@ "generics": re.compile(r"<[a-zA-Z_][a-zA-Z0-9_]*>"), # Captures %type "comprehensions": None, "scientific": None, - "heat_triggers": re.compile(r"%\{|%\}|%%"), + "reflection_metaprogramming": 
re.compile(r"%\{|%\}|%%"), "import": re.compile(r'^[ \t]*#(?:include)\s*[<"][^>"]+[>"]', re.M), - "ownership": re.compile( - r"(?:@author|Author:|Created by:|Copyright)\s+(.*)", re.I - ), + "ownership": re.compile(r"(?:@author|Author:|Created by:|Copyright)\s+(.*)", re.I), # --- PHASE 4: SPECIALIZED SUB-SYSTEMS --- "planned_debt": GLOBAL_PLANNED_DEBT, "fragile_debt": GLOBAL_FRAGILE_DEBT, - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + "tabs_vs_spaces": None, "ssr_boundaries": None, "events": None, "dependency_injection": None, - "macros": re.compile( - r"^[ \t]*#(?:define|undef|if|elif|else|endif|pragma)\b", re.M - ), - "pointers": re.compile( - r"->|&\w+|(?<=[=(,])[ \t]*\*(?:\s*const\s*)?[a-zA-Z_]\w*" - ), - "memory_alloc": re.compile( - r"\b(malloc|calloc|realloc|free|YYMALLOC|YYFREE)\b" - ), + "macros": re.compile(r"^[ \t]*#(?:define|undef|if|elif|else|endif|pragma)\b", re.M), + "pointers": re.compile(r"->|&\w+|(?<=[=(,])[ \t]*\*(?:\s*const\s*)?[a-zA-Z_]\w*"), + "memory_alloc": re.compile(r"\b(malloc|calloc|realloc|free|YYMALLOC|YYFREE)\b"), "inline_asm": None, # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- "telemetry": re.compile(r"\b(?:syslog|openlog|log_info|YYDPRINTF)\b"), - "print_hits": re.compile(r"\b(printf|fprintf|vprintf|puts|yyerror)\b"), - "cast_hits": re.compile( + "debug_prints": re.compile(r"\b(printf|fprintf|vprintf|puts|yyerror)\b"), + "explicit_casts": re.compile( r"\(\s*(?:int|char|short|long|float|double|void|unsigned|signed|[A-Z]\w*)\s*\*?\s*\)\s*[a-zA-Z_$]" ), - "bailout_hits": re.compile(r"\b(abort|exit|YYABORT)\b"), - "halt_hits": None, - "bitwise_hits": re.compile(r"<<|>>|(?>|(?string|string->number|symbol->string|string->symbol|list->vector|vector->list|char->integer|integer->char)(?=[ \t)\]\n\r])" ), - # 41. 
bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile( - r"(?<=[ \t(\[])(error|abort|exit|emergency-exit)(?=[ \t)\]\n\r])" - ), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile( - r"(?<=[ \t(\[])(sleep|usleep|thread-sleep!)(?=[ \t)\]\n\r])" - ), - # 43. bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile( + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"(?<=[ \t(\[])(error|abort|exit|emergency-exit)(?=[ \t)\]\n\r])"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"(?<=[ \t(\[])(sleep|usleep|thread-sleep!)(?=[ \t)\]\n\r])"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile( r"(?<=[ \t(\[])(bitwise-and|bitwise-ior|bitwise-xor|bitwise-not|arithmetic-shift|ash)(?=[ \t)\]\n\r])" ), - # 44. sync_locks (Thread Synchronization / Locks) - "sync_locks": re.compile( - r"(?<=[ \t(\[])(mutex-lock!|make-mutex)(?=[ \t)\]\n\r])" - ), - # 45. freeze_hits (Immutability Constraints) + # 44. sync_locks (Resource Management & Stability) + "sync_locks": re.compile(r"(?<=[ \t(\[])(mutex-lock!|make-mutex)(?=[ \t)\]\n\r])"), + # 45. immutability_locks (Immutability Constraints) # Immutable strings and explicit quotations (meaning the list cannot be mutated safely). - "freeze_hits": re.compile( - r"(?<=[ \t(\[])(quote|string->immutable-string)(?=[ \t)\]\n\r])|\'(?=\()" - ), + "immutability_locks": re.compile(r"(?<=[ \t(\[])(quote|string->immutable-string)(?=[ \t)\]\n\r])|\'(?=\()"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r"(?<=[ \t(\[])(close-input-port|close-output-port|close-port)(?=[ \t)\]\n\r])" - ), + "cleanup": re.compile(r"(?<=[ \t(\[])(close-input-port|close-output-port|close-port)(?=[ \t)\]\n\r])"), # 47. encapsulation (Access Modifiers / Encapsulation) # Module-internal definitions. 
- "encapsulation": re.compile( - r"^[ \t]*\([ \t]*define-private(?=[ \t)\]\n\r])", re.M - ), + "encapsulation": re.compile(r"^[ \t]*\([ \t]*define-private(?=[ \t)\]\n\r])", re.M), # 48. listeners (Event Listeners / Observers) "listeners": re.compile(r"(?<=[ \t(\[])(add-hook!)(?=[ \t)\]\n\r])"), # 49. test_skip (Bypassed Tests / Ignored Specs) - "test_skip": re.compile( - r"(?<=[ \t(\[])(test-skip|test-expect-fail)(?=[ \t)\]\n\r])" - ), + "test_skip": re.compile(r"(?<=[ \t(\[])(test-skip|test-expect-fail)(?=[ \t)\]\n\r])"), }, }, "mlir": { @@ -10796,7 +9705,7 @@ # UPGRADED: Maps to Family 1 (Standard C-Style) # Rationale: MLIR intentionally adopts standard LLVM assembly syntax conventions, # using '//' exclusively for line comments to maintain C++ ecosystem familiarity. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), @@ -10828,7 +9737,7 @@ "shebangs": [], # UPGRADED: Maps to Family 1 (Standard C-Style) # Rationale: Protobuf schemas strictly use standard '//' and '/* */' comments. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), @@ -10847,13 +9756,13 @@ "extensions": [".hlo"], # ABSOLUTE IDENTITY & EXACT FILENAMES: IR text files strictly rely on their extensions. "exact_matches": [], - # ECOSYSTEM ANCHORS & DISAMBIGUATION: JAX, TensorFlow, and MLIR toolchain markers acting as gravity anchors for ML compilers. + # ECOSYSTEM ANCHORS & DISAMBIGUATION: JAX, TensorFlow, and MLIR toolchain markers acting as disambiguation anchors for ML compilers. "discriminators": [".hlo", ".mlir", ".pbtxt", ".py", "BUILD.bazel", "BUILD"], # EXECUTION SIGNATURES: HLO is compiler intermediate representation; no shebangs exist. 
"shebangs": [], # UPGRADED: Maps to Family 1 (Standard C-Style) # Rationale: HLO text format exclusively utilizes '//' for line-level comments, maintaining C++ ecosystem alignment. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), @@ -10885,7 +9794,7 @@ "shebangs": [], # UPGRADED: Maps to Family 1 (Standard C-Style) # Rationale: TableGen was built to integrate seamlessly into LLVM's C++ codebase, natively supporting '//' and '/* */' comments. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), @@ -10991,19 +9900,17 @@ # UPGRADED: Maps to Family 3 (Pure Hash) # Rationale: Tcl natively uses '#' exclusively for line-level comments. It does not # have native block comments (developers sometimes hack `if 0 { ... }`, but `#` is the standard). - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- "_line_anchor": re.compile(r"#"), "_inline_comment": re.compile(r"#"), "_block_start": None, "_block_end": None, - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) # Tcl control flow keywords. - "branch": re.compile( - r"\b(?:if|elseif|else|switch|while|for|foreach|catch|try|trap|finally)\b" - ), + "branch": re.compile(r"\b(?:if|elseif|else|switch|while|for|foreach|catch|try|trap|finally)\b"), # 2. args (Parameters / Coupling) # Safely captures the parameter list `{...}` immediately following a proc name. "args": re.compile( @@ -11018,15 +9925,11 @@ ), # 3. linear (Sequential Boundaries) # Structural boundaries. EXCLUDES: global/upvar (globals/heat). 
- "linear": re.compile( - r"\b(?:proc|return|break|continue|namespace|variable|yield)\b" - ), + "structural_boundaries": re.compile(r"\b(?:proc|return|break|continue|namespace|variable|yield)\b"), # 4. func_start (Executable Logic Anchors) # MUST HAVE EXACTLY ONE CAPTURE GROUP. # Captures standard procs and namespaced procs (e.g., `proc ::my::func`). - "func_start": re.compile( - r"^[ \t]*proc[ \t]+([a-zA-Z0-9_:]+)(?=[ \t]*\{|[ \t\n]|$)", re.M - ), + "func_start": re.compile(r"^[ \t]*proc[ \t]+([a-zA-Z0-9_:]+)(?=[ \t]*\{|[ \t\n]|$)", re.M), # 5. class_start (Object / Entity Declarations) # Captures TclOO, Snit, and Itcl class definitions. "class_start": re.compile( @@ -11036,35 +9939,25 @@ # --- PHASE 2: RISK & STRUCTURAL INTEGRITY --- # 6. safety (Defensive Programming / Validation) # Safe evaluation and error catching. - "safety": re.compile( - r"\b(?:catch|try|trap|finally|info[ \t]+exists|assert)\b" - ), + "safety": re.compile(r"\b(?:catch|try|trap|finally|info[ \t]+exists|assert)\b"), # 7. safety_neg (Safety Bypasses / Unchecked Types) # Unrestricted evaluation and context manipulation. - "safety_neg": re.compile(r"\b(?:eval|uplevel|upvar)\b"), + "safety_bypasses": re.compile(r"\b(?:eval|uplevel|upvar)\b"), # 8. danger (High-Risk Execution / System Calls) # OS command execution and process termination. - "danger": re.compile(r"\b(?:exec|exit)\b|file[ \t]+delete[ \t]+-force"), + "high_risk_execution": re.compile(r"\b(?:exec|exit)\b|file[ \t]+delete[ \t]+-force"), # 9. io (I/O & Network Boundaries) # File system, sockets, and configuration. (Excludes puts which is mapped to print_hits). - "io": re.compile( - r"\b(?:open|close|read|gets|socket|fconfigure|file|source|vfs::)\b" - ), + "io": re.compile(r"\b(?:open|close|read|gets|socket|fconfigure|file|source|vfs::)\b"), # 10. api (Public Surface Area) # Exposing packages or namespace exports. 
- "api": re.compile( - r"^[ \t]*(?:package[ \t]+provide|namespace[ \t]+export)\b", re.M - ), + "api": re.compile(r"^[ \t]*(?:package[ \t]+provide|namespace[ \t]+export)\b", re.M), # 11. flux (State Mutation) # Variable state mutations. - "flux": re.compile( - r"\b(?:set|lappend|dict[ \t]+set|array[ \t]+set|incr|append)\b[ \t]+[a-zA-Z0-9_:]+" - ), - # 12. graveyard (Dead / Commented-out Code) + "state_mutation": re.compile(r"\b(?:set|lappend|dict[ \t]+set|array[ \t]+set|incr|append)\b[ \t]+[a-zA-Z0-9_:]+"), + # 12. dead_code (Commented Logic / Deprecated Trails) # Commented out structural code. - "graveyard": re.compile( - r"^[ \t]*#[ \t]*(?:proc|set|if|while|foreach|return)\b", re.M - ), + "dead_code": re.compile(r"^[ \t]*#[ \t]*(?:proc|set|if|while|foreach|return)\b", re.M), # 13. doc (Structured Documentation) # Tcl doc blocks. "doc": re.compile(r"^[ \t]*#[ \t]*@(?:param|return|brief|author)", re.M), @@ -11079,9 +9972,7 @@ "concurrency": re.compile(r"\b(?:vwait|after|thread::|coroutine|yield)\b"), # 16. ui_framework (UI / View Components) # Tkinter/Tk graphical elements. - "ui_framework": re.compile( - r"\b(?:button|pack|grid|place|canvas|frame|label|ttk::)\b" - ), + "ui_framework": re.compile(r"\b(?:button|pack|grid|place|canvas|frame|label|ttk::)\b"), # 17. closures (Closures / Anonymous Functions) # Tcl 8.6 anonymous functions. "closures": re.compile(r"\bapply[ \t]+\{"), @@ -11097,19 +9988,13 @@ "comprehensions": re.compile(r"\blmap\b"), # 22. scientific (Numerical / Compute Libraries) # Explicit math invocations via expr. - "scientific": re.compile( - r"\b(?:expr|math::)\b|\b(?:sin|cos|tan|sqrt|exp|log|pow)\b" - ), + "scientific": re.compile(r"\b(?:expr|math::)\b|\b(?:sin|cos|tan|sqrt|exp|log|pow)\b"), # 23. heat_triggers (Metaprogramming & Reflection) - # Massive cognitive heat: Intercepting variables, tracking execution, and runtime aliasing. 
- "heat_triggers": re.compile( - r"\b(?:trace[ \t]+add|rename|interp[ \t]+create|interp[ \t]+alias)\b" - ), + # High Cognitive Load: Intercepting variables, tracking execution, and runtime aliasing. + "reflection_metaprogramming": re.compile(r"\b(?:trace[ \t]+add|rename|interp[ \t]+create|interp[ \t]+alias)\b"), # 24. import (Dependency Inclusions) # Package and module loading. - "import": re.compile( - r"^[ \t]*(?:package[ \t]+require|source|load)\b", re.M - ), + "import": re.compile(r"^[ \t]*(?:package[ \t]+require|source|load)\b", re.M), # 25. ownership (Authorship Metadata) "ownership": re.compile( r"^[ \t]*#[ \t]*(?:Author|Created by|Maintainer|Copyright):\s+(.*)", @@ -11120,12 +10005,10 @@ "planned_debt": GLOBAL_PLANNED_DEBT, # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, - "spec_exposure": re.compile( - r"\[(?:[ \t]*SPEC[ \t]*-[ \t]*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) + "spec_exposure": re.compile(r"\[(?:[ \t]*SPEC[ \t]*-[ \t]*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) # Tcl standardizes on spaces. Tabs indicate formatter friction. - "civil_war": None, + "tabs_vs_spaces": None, "ssr_boundaries": None, # 32. events (Event Emitters / Pub-Sub) # Tcl event bindings and file event handlers. @@ -11138,41 +10021,31 @@ # --- PHASE 5: RESOURCE MANAGEMENT & STABILITY --- # 38. telemetry (Structured Logging / Telemetry) "telemetry": re.compile(r"\b(?:log::log|logger::|syslog)\b"), - # 39. print_hits (Standard Output / Debug Prints) - "print_hits": re.compile(r"\bputs\b"), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile(r"\bexpr[ \t]+(?:int|double|wide)\("), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(?:error|exit)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\bafter[ \t]+[0-9]+\b"), - # 43. 
bitwise_hits (Bitwise Operations) - "bitwise_hits": re.compile(r"(?>"), - # 44. sync_locks (Thread Synchronization / Locks) - "sync_locks": re.compile( - r"\b(?:thread::mutex|thread::rwmutex|thread::cond)\b" - ), - # 45. freeze_hits (Immutability Constraints) + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) (Standard Output / Debug Prints) + "debug_prints": re.compile(r"\bputs\b"), + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile(r"\bexpr[ \t]+(?:int|double|wide)\("), + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(?:error|exit)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\bafter[ \t]+[0-9]+\b"), + # 43. bitwise_ops (Bitwise Operations) + "bitwise_ops": re.compile(r"(?>"), + # 44. sync_locks (Resource Management & Stability) + "sync_locks": re.compile(r"\b(?:thread::mutex|thread::rwmutex|thread::cond)\b"), + # 45. immutability_locks (Immutability Constraints) # Tcl lacks `const`, but setting a trace to prevent writes is the Tcl idiom for freezing. - "freeze_hits": re.compile( - r"\btrace[ \t]+add[ \t]+variable[ \t]+[a-zA-Z0-9_:]+[ \t]+write\b" - ), + "immutability_locks": re.compile(r"\btrace[ \t]+add[ \t]+variable[ \t]+[a-zA-Z0-9_:]+[ \t]+write\b"), # 46. cleanup (Resource Cleanup / Teardown) - "cleanup": re.compile( - r'\b(?:close|unset)\b|rename[ \t]+[a-zA-Z0-9_:]+[ \t]+""' - ), + "cleanup": re.compile(r'\b(?:close|unset)\b|rename[ \t]+[a-zA-Z0-9_:]+[ \t]+""'), # 47. encapsulation (Access Modifiers / Encapsulation) # Internal namespaces and private `_` prefixed procs. - "encapsulation": re.compile( - r"\bnamespace[ \t]+eval\b|^[ \t]*proc[ \t]+_[a-zA-Z0-9_:]+", re.M - ), + "encapsulation": re.compile(r"\bnamespace[ \t]+eval\b|^[ \t]*proc[ \t]+_[a-zA-Z0-9_:]+", re.M), # 48. listeners (Event Listeners / Observers) "listeners": re.compile(r"\b(?:bind|fileevent)\b"), # 49. 
test_skip (Bypassed Tests / Ignored Specs) # Using TclTest constraints to silently skip tests on certain OS environments. - "test_skip": re.compile( - r"-constraints[ \t]+[a-zA-Z0-9_]+\b|\btestConstraint\b" - ), + "test_skip": re.compile(r"-constraints[ \t]+[a-zA-Z0-9_]+\b|\btestConstraint\b"), }, }, "groovy": { @@ -11197,18 +10070,16 @@ # EXECUTION SIGNATURES "shebangs": ["groovy"], # LEXICAL FAMILY - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { - # --- 2.3.C OPTICAL SPLIT CONTROLS --- + # --- LEXICAL DELIMITER CONTROLS --- "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), "_block_start": re.compile(r"/\*"), "_block_end": re.compile(r"\*/"), - # --- PHASE 1: GEOMETRY & STRUCTURE --- + # --- PHASE 1: LOGIC TOPOLOGY & STRUCTURE --- # 1. branch (Control Flow / Branching) - "branch": re.compile( - r"\b(if|else|switch|case|default|for|while|in|try|catch|finally)\b|\?|:" - ), + "branch": re.compile(r"\b(if|else|switch|case|default|for|while|in|try|catch|finally)\b|\?|:"), # 2. args (Parameters / Coupling) # Captures standard method arguments and Groovy closures (x, y ->) # CRITICAL FIX: Anchored the parenthesis capture to method signatures so it @@ -11218,7 +10089,7 @@ re.M, ), # 3. linear (Sequential Boundaries) - "linear": re.compile( + "structural_boundaries": re.compile( r"\b(def|class|interface|trait|enum|record|import|package|extends|implements|return|yield)\b" ), # 4. func_start (Executable Logic Anchors) @@ -11239,13 +10110,11 @@ r"\b(try|catch|finally|assert|instanceof|Optional)\b|@(?:Valid|Validated|NotNull|NonNull|Immutable)" ), # 7. safety_neg (Safety Bypasses / Unchecked Types) - "safety_neg": re.compile( + "safety_bypasses": re.compile( r"\b(null)\b|return\s+null|catch\s*\(\s*(?:Exception|Throwable)\b|@SuppressWarnings|@SneakyThrows|\.get\(\)" ), # 8. 
danger (High-Risk Execution / System Calls) - "danger": re.compile( - r"\b(System\.exit|Runtime\.getRuntime\(\)\.exec|execute)\b" - ), + "high_risk_execution": re.compile(r"\b(System\.exit|Runtime\.getRuntime\(\)\.exec|execute)\b"), # 9. io (I/O & Network Boundaries) "io": re.compile( r"\b(File|Files|Paths|FileReader|FileWriter|file|copy|sync|uri|url|Socket|Connection|ResultSet)\b" @@ -11256,10 +10125,10 @@ r"\b(public)\b|@(RestController|Controller|Service|Component|Bean|RequestMapping|GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping)\b" ), # 11. flux (State Mutation) - "flux": re.compile(r"^[ \t]*\w+(?:\.\w+)*[ \t]*=|@(?:Setter|Data)\b", re.M), - # 12. graveyard (Dead / Commented-out Code) + "state_mutation": re.compile(r"^[ \t]*\w+(?:\.\w+)*[ \t]*=|@(?:Setter|Data)\b", re.M), + # 12. dead_code (Commented Logic / Deprecated Trails) # Tuned to catch dead Gradle definitions and Groovy logic. - "graveyard": re.compile( + "dead_code": re.compile( r"//[ \t]*(?:def|class|void|if|for|while|import|implementation|compile|api|testImplementation)\b" ), # 13. doc (Structured Documentation) @@ -11276,15 +10145,11 @@ r"\b(synchronized|Thread|Runnable|Future|ExecutorService|Promise|Atomic\w+|task)\b|@(?:Async|Scheduled)" ), # 16. ui_framework (UI / View Components) - "ui_framework": re.compile( - r"\b(SwingBuilder|JFrame|JPanel|ModelAndView|ModelMap|Model|UIComponent)\b" - ), + "ui_framework": re.compile(r"\b(SwingBuilder|JFrame|JPanel|ModelAndView|ModelMap|Model|UIComponent)\b"), # 17. closures (Closures / Anonymous Functions) "closures": re.compile(r"->|\{\s*(?:it|[\w\s,]+)\s*->"), # 18. globals (Global / Shared State) - "globals": re.compile( - r"\b(System\.getProperty|System\.getenv|project\.ext)\b|@Value" - ), + "globals": re.compile(r"\b(System\.getProperty|System\.getenv|project\.ext)\b|@Value"), # 19. decorators (Decorators / Annotations) "decorators": re.compile(r"^[ \t]*@[\w.]+(?:\([^)]*\))?", re.M), # 20. 
generics (Generics / Type Parameters) @@ -11294,12 +10159,10 @@ r"\.(?:collect|find|findAll|grep|inject|each|eachWithIndex|map|filter|reduce)\(" ), # 22. scientific (Numerical / Compute Libraries) - "scientific": re.compile( - r"\b(Math\.|BigDecimal|BigInteger|Random|SecureRandom)\b" - ), + "scientific": re.compile(r"\b(Math\.|BigDecimal|BigInteger|Random|SecureRandom)\b"), # 23. heat_triggers (Metaprogramming & Reflection) # Groovy's highly dynamic Meta-Object Protocol (MOP). - "heat_triggers": re.compile( + "reflection_metaprogramming": re.compile( r"\b(invokeMethod|getProperty|setProperty|methodMissing|propertyMissing|ExpandoMetaClass|metaClass)\b" ), # 24. import (Dependency Inclusions) @@ -11312,19 +10175,15 @@ # 27. fragile_debt (Acknowledged Hacks / FIXMEs) "fragile_debt": GLOBAL_FRAGILE_DEBT, # 29. spec_exposure (Spec / Audit Traceability) - "spec_exposure": re.compile( - r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I - ), - # 30. civil_war (Formatting Inconsistencies) - "civil_war": None, + "spec_exposure": re.compile(r"\[(?:\s*SPEC\s*-\s*\d+|spec|audit)[^\]]*\]", re.I), + # 30. tabs_vs_spaces (Formatting Inconsistencies) + "tabs_vs_spaces": None, # 31. ssr_boundaries (Server-Side Rendering) "ssr_boundaries": re.compile( r"\b(MarkupBuilder|StreamingMarkupBuilder|TemplateEngine|HttpServletRequest|HttpServletResponse|@ResponseBody)\b" ), # 32. events (Event Emitters / Pub-Sub) - "events": re.compile( - r"\b(ApplicationEvent|ApplicationListener|@EventListener|publishEvent)\b" - ), + "events": re.compile(r"\b(ApplicationEvent|ApplicationListener|@EventListener|publishEvent)\b"), # 33. dependency_injection (Dependency Injection / IoC) # Heavily captures Gradle plugin and dependency architecture. "dependency_injection": re.compile( @@ -11334,7 +10193,7 @@ "macros": None, # 35. pointers (Pointer Arithmetic / Memory Addressing) "pointers": None, - # 36. memory_alloc + # 36. memory_alloc "memory_alloc": None, # 37. 
inline_asm "inline_asm": None, @@ -11343,27 +10202,25 @@ "telemetry": re.compile( r"\b(log|logger|LOGGER|LoggerFactory)\.(?:info|error|warn|warning|debug|trace)\b|@Slf4j|@Log4j2|@Log" ), - # 39. print_hits (The Amateur) - "print_hits": re.compile( + # 39. debug_prints (Debug Artifacts / Unstructured Outputs) + "debug_prints": re.compile( r"\b(println|print|printf|System\.out\.print|System\.err\.print|\.printStackTrace\(\))\b" ), - # 40. cast_hits (Explicit Type Casting) - "cast_hits": re.compile( + # # 40. explicit_casts (Explicit Type Casting) + "explicit_casts": re.compile( r"\bas\s+[A-Z]\w*|\(\s*(?:int|long|short|byte|char|float|double|boolean|[A-Z][A-Za-z0-9_]*)\s*\)\s*[a-zA-Z_$]" ), - # 41. bailout_hits (Execution Halts / Panics) - "bailout_hits": re.compile(r"\b(throw|System\.exit|GradleException)\b"), - # 42. halt_hits (Thread Blocking / Sleeps) - "halt_hits": re.compile(r"\b(Thread\.sleep|sleep)\b"), - # 43. bitwise_hits (Bitwise Operations) + # 41. panics_and_aborts (Execution Interrupts / Fatal Aborts) + "panics_and_aborts": re.compile(r"\b(throw|System\.exit|GradleException)\b"), + # 42. thread_sleeps (Thread Blocking / Synchronous Pauses) + "thread_sleeps": re.compile(r"\b(Thread\.sleep|sleep)\b"), + # 43. bitwise_ops (Bitwise Operations) # EXCLUDES `<<` and `>>` because Groovy heavily overloads `<<` for list/stream appending. - "bitwise_hits": re.compile(r"\^|~"), - # 44. sync_locks (Thread Synchronization / Locks) - "sync_locks": re.compile( - r"\b(synchronized|ReentrantLock|ReadWriteLock|Semaphore|Lock|Mutex)\b" - ), - # 45. freeze_hits (Immutability Constraints) - "freeze_hits": re.compile(r"\b(final|@Immutable)\b"), + "bitwise_ops": re.compile(r"\^|~"), + # 44. sync_locks (Resource Management & Stability) + "sync_locks": re.compile(r"\b(synchronized|ReentrantLock|ReadWriteLock|Semaphore|Lock|Mutex)\b"), + # 45. immutability_locks (Immutability Constraints) + "immutability_locks": re.compile(r"\b(final|@Immutable)\b"), # 46. 
cleanup (Resource Cleanup / Teardown) "cleanup": re.compile(r"\b(close|dispose|shutdown)\b\s*\("), # 47. encapsulation (Access Modifiers / Encapsulation) @@ -11371,9 +10228,7 @@ # 48. listeners (Event Listeners / Observers) "listeners": re.compile(r"\b(addListener|on[A-Z]\w*|subscribe)\b"), # 49. test_skip (Bypassed Tests / Ignored Specs) - "test_skip": re.compile( - r"@(?:Ignore|Disabled|PendingFeature)\b|mock\s*\(|spy\s*\(" - ), + "test_skip": re.compile(r"@(?:Ignore|Disabled|PendingFeature)\b|mock\s*\(|spy\s*\("), }, }, "json": { @@ -11407,7 +10262,7 @@ "discriminators": [".json", ".jsonc", ".json5", ".arb"], "shebangs": [], # THE FIX: JSON with comments relies on C-style comment structures, not Python/Ruby hashes. - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { # ===================================================================== # [ CRITICAL ROADMAP: JSONC/JSON5 LEXICAL DELIMITERS & THE RE.COMPILE TRAP ] @@ -11433,7 +10288,7 @@ "exact_matches": [], "discriminators": [".glsl", ".vert", ".frag"], "shebangs": [], - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), @@ -11447,7 +10302,7 @@ "exact_matches": [], "discriminators": ["flake.nix", "default.nix", "shell.nix"], "shebangs": [], - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { "_line_anchor": re.compile(r"#"), "_inline_comment": re.compile(r"#"), @@ -11461,7 +10316,7 @@ "exact_matches": [], "discriminators": [".blp", ".ui"], "shebangs": [], - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "_line_anchor": re.compile(r"//"), "_inline_comment": re.compile(r"//"), @@ -11475,7 +10330,7 @@ "exact_matches": [], "discriminators": [], "shebangs": [], - "lexical_family": "single_line_only", + "lexical_family": "line_exclusive", "rules": { # Uses REM or :: for comments. 
No active logic rules needed (Inert Matter Bypass). "_line_anchor": re.compile(r"^[ \t]*(?:REM|::)", re.I | re.M), @@ -11518,9 +10373,7 @@ }, "Apollo-11": { "agc_assembly": { - "_meta_purpose_block": re.compile( - r"^[ \t]*(?:FUNCTIONAL|PROGRAM)\s+DESCRIPTION\b", re.I - ), + "_meta_purpose_block": re.compile(r"^[ \t]*(?:FUNCTIONAL|PROGRAM)\s+DESCRIPTION\b", re.I), "_meta_purpose_line": re.compile(r"^[ \t]*Purpose[\s:\-]*(.*)", re.I), "_meta_boundary": re.compile( r"^[ \t]*(?:Assembler|Filename|Pages|Website|Mod history|Copyright|Reference|PROGRAM NAME)[\s:\-]+", @@ -11576,23 +10429,13 @@ } }, "discourse": { - "_shield_": { - "exclude_paths": ["config/unicorn_launcher", "pnpm-lock.yaml", "yarn.lock"] - }, + "_shield_": {"exclude_paths": ["config/unicorn_launcher", "pnpm-lock.yaml", "yarn.lock"]}, "javascript": {"extensions": [".js", ".jsx", ".mjs", ".cjs", ".gjs"]}, }, - "elasticsearch": { - "plaintext": {"extensions": [".txt", ".text", ".log", ".json", ".yaml", ".yml"]} - }, - "exiftool": { - "plaintext": {"extensions": [".txt", ".text", ".out", ".args", ".fmt", ".xmp"]} - }, + "elasticsearch": {"plaintext": {"extensions": [".txt", ".text", ".log", ".json", ".yaml", ".yml"]}}, + "exiftool": {"plaintext": {"extensions": [".txt", ".text", ".out", ".args", ".fmt", ".xmp"]}}, "express": {"html": {"extensions": [".html", ".htm", ".ejs", ".tmpl"]}}, "fieldtrip": {"_shield_": {"exclude_dirs": ["external"]}}, - "jenkins": { - "_shield_": {"exclude_paths": ["translation-tool.pl", "core/report-l10n.rb"]} - }, - "redis": { - "_shield_": {"exclude_dirs": ["deps/lua", "deps/jemalloc", "deps/hiredis"]} - }, + "jenkins": {"_shield_": {"exclude_paths": ["translation-tool.pl", "core/report-l10n.rb"]}}, + "redis": {"_shield_": {"exclude_dirs": ["deps/lua", "deps/jemalloc", "deps/hiredis"]}}, } diff --git a/gitgalaxy/tools/README.md b/gitgalaxy/tools/README.md index 0411aea5..f453b674 100644 --- a/gitgalaxy/tools/README.md +++ b/gitgalaxy/tools/README.md @@ -1,58 +1,66 @@ -# 🛠️ 
GitGalaxy Tools (The Spokes) +# GitGalaxy Tools: Decoupled Execution Controllers & DevSecOps Suite -Welcome to the GitGalaxy Tools directory. +[![Architecture](https://img.shields.io/badge/Architecture-Decoupled_Controllers-8A2BE2.svg)](#) +[![Performance](https://img.shields.io/badge/Performance-AST--Free_Velocity-00BFFF.svg)](#) +[![Security](https://img.shields.io/badge/Security-Zero--Trust_Pipelines-FF4500.svg)](#) -If `galaxyscope.py` is the core physics engine (the Hub), the tools in this directory are the "Spokes." These are specialized, standalone execution controllers that leverage GitGalaxy's AST-free, high-speed parsing capabilities to solve specific engineering and security challenges. +Welcome to the **GitGalaxy Tools** directory. -Each sub-module is designed to be executed directly from the CLI or wired into CI/CD pipelines. +These tools are specific operational deployments of the blAST engine. These are specialized, standalone execution controllers that leverage GitGalaxy's AST-free, high-speed parsing capabilities to solve specific engineering, compliance, and legacy migration challenges at planetary scale. -## 📂 Tool Suites Directory +Each sub-module is designed to execute strictly in $O(1)$ or linear $O(N)$ time complexity, making them uniquely suited for direct integration into high-velocity CI/CD pipelines without introducing latency or Out-Of-Memory (OOM) crashes. -### 🛡️ [Supply Chain Security](./supply_chain_security/README.md) -Zero-trust DevSecOps tools designed for pre-commit hooks and CI/CD pipeline blocking. -* **Supply Chain Firewall:** Scans `node_modules` and vendor directories for malicious typosquatting and unauthorized network I/O. -* **Vault Sentinel:** Hyper-speed secrets and credential detection. -* **Binary Anomaly Detector:** Triage engine for finding encrypted payloads and parasitic logic hidden in binary artifacts. 
+--- + +## 📂 Ecosystem Suites & Tooling + +### 🛡️ [Supply Chain & CI/CD Defense Suite](./supply_chain_security/README.md) +Zero-Trust DevSecOps tools designed for pre-commit hooks and CI/CD pipeline blocking. +* **Supply Chain Firewall:** Scans the physical execution graph of downloaded dependencies to block unauthorized network I/O, typosquatting, and RCE vulnerabilities during installation. +* **Vault Sentinel:** Hyper-speed, two-tier pre-commit hook for detecting exposed cryptographic keys and SaaS tokens. +* **Binary Anomaly Detector:** A localized triage engine that utilizes Shannon Entropy to detect packed malware and execution headers hidden inside static binary artifacts. -### 📜 [Compliance & Auditing](./compliance/README.md) -Tools for generating forensic and legal records of software architecture. -* **Zero-Trust SBOM Generator:** Builds CycloneDX/SPDX manifests verified by structural code analysis, not just package manifests. +### 📜 [Compliance & Auditing Suite](./compliance/README.md) +Tools for generating forensic, mathematically verified records of software architecture. +* **Zero-Trust SBOM Generator:** Rejects standard manifest assumptions. Physically hunts dependencies on disk and validates their structural identity before signing off on CycloneDX/SPDX manifests. -### 🕵️ [Terabyte Log Scanning](./terabyte_log_scanning/README.md) -High-throughput engines for processing massive data outputs. -* **PII Leak Hunter:** Scans terabytes of raw logs for accidentally exposed PII without choking system memory. -* **Terabyte Log Scanner:** Maps static architecture to dynamic runtime execution logs. +### 🕵️ [High-Velocity Log Streaming & Incident Response](./terabyte_log_scanning/README.md) +Unindexed binary streaming engines for processing massive data outputs without RAM exhaustion. +* **PII Leak Hunter:** Streams terabytes of raw database/server logs to instantly detect and redact accidentally exposed PII (SSNs, Credit Cards, AWS Keys). 
+* **Terabyte Log Scanner:** Maps static architecture to dynamic runtime execution logs to mathematically prove dead code abandonment or isolate brute-force anomalies. -### 🕸️ [Network Auditing](./network_auditing/README.md) -* **API Network Mapper:** Automatically maps the physical outbound and inbound API surface area and compares it against Swagger/OpenAPI docs to find Shadow APIs. +### 🕸️ [API Network Auditing](./network_auditing/README.md) +* **Full API Network Mapper:** Automatically extracts physical outbound and inbound API routing intents across 9+ frameworks (Spring, Express, FastAPI) and compares them against OpenAPI/Swagger docs to expose undocumented **Shadow APIs**. -### 🦕 [Legacy Modernization: COBOL to Java](./cobol_to_java/README.md) & [COBOL to COBOL](./cobol_to_cobol/README.md) -A complete suite of architectural controllers for modernizing legacy mainframe systems. -* **COBOL Refractor:** Slices massive monolithic COBOL programs into isolated microservices. -* **Java Spring Forge:** Translates legacy business logic into compiling Java Spring architectures. +### 🦕 [Mainframe Modernization Suite](./cobol_to_java/README.md) & [Structural Extraction](./cobol_to_cobol/README.md) +A complete suite of deterministic architectural controllers for modernizing monolithic legacy systems without relying on hallucination-prone LLMs. +* **Deprecated Trails Analyzer & DAG Architect:** Identifies dead mainframe memory and mathematically derives execution topologies. +* **Microservice Logic Extractor:** Performs recursive data-flow taint tracking to isolate COBOL business rules. +* **Java Spring Boot Forge:** Deterministically translates COBOL architectures into 100% compiling Java Spring `@Entity` models, `@RestController` endpoints, and Maven build systems. 
-### 🤖 [AI Guardrails](./ai_guardrails/README.md) -* **AppSec Sensor & Dev Agent Firewall:** Middleware sensors that prevent LLMs from being wired to RCE vulnerabilities, and block autonomous AI coding agents from mutating highly complex legacy code. +### 🤖 [Dual-Sided AI Guardrails](./ai_guardrails/README.md) +* **AppSec Sensor & Dev Agent Firewall:** Deep-inspection middleware sensors. They detect **Autonomous Execution Vectors** to prevent LLMs from being wired to RCE vulnerabilities, and mathematically constrain autonomous AI coding agents from corrupting highly complex legacy code. --- ## 🚀 Execution & CI/CD Integration -The GitGalaxy Spoke architecture allows you to run these specialized tools using three distinct methods: +The GitGalaxy decoupled architecture allows you to run these specialized tools using three distinct methods: ### 1. GitHub Actions (The Universal Pipeline) -You can trigger any of the standalone CLI tools in your CI/CD pipeline using our universal composite action. Simply change the `tool` parameter to the spoke you want to execute: +You can trigger any of the standalone CLI tools securely in your CI/CD pipeline using our universal composite action. Simply change the `tool` parameter to the controller you want to execute: ```yaml - name: Run GitGalaxy Tool uses: squid-protocol/gitgalaxy@main with: - tool: 'supply-chain-firewall' # Options: xray-inspector, zero-trust-sbom, api-network-map, etc. + tool: 'supply-chain-firewall' # Options: vault-sentinel, zero-trust-sbom, api-network-map, etc. target: '.' ``` ### 2. Global CLI Execution -If you have GitGalaxy installed via PyPI (`pip install gitgalaxy`), all the standalone tools are registered as global console scripts. You can run them instantly from your terminal: +If you have GitGalaxy installed via PyPI (`pip install gitgalaxy`), all the standalone tools are registered as global console scripts. 
You can run them instantly from your terminal during active incident response or local auditing: + ```bash vault-sentinel . api-network-map ./src @@ -60,4 +68,17 @@ pii-leak-hunter ./logs/dump.sql ``` ### 3. Engine Middleware (AI Guardrails) -Note that the **AI Guardrails** do not operate as standalone CLI tools. They act as deep-inspection middleware. To utilize them, run the primary `galaxyscope` engine, and the sensors will automatically inject their AppSec findings into the final project telemetry. \ No newline at end of file +Note that the **AI Guardrails** do not operate as standalone CLI tools. They act as deep-inspection middleware. To utilize them, run the primary GitGalaxy analysis engine, and the sensors will automatically inject their AppSec findings and Guardrail constraints into the final project telemetry. + +--- + +## 🌌 The GitGalaxy Ecosystem (Powered by the blAST Engine) + +GitGalaxy Tools is the modular deployment layer of the broader **GitGalaxy Ecosystem**—a high-velocity, AST-free, LLM-free heuristic knowledge graph engine designed for planetary-scale repositories. + +Explore the ecosystem: + +* 🪐 **[Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** — Comprehensive deep dives into the engine's mathematics, pipeline architecture, and DevSecOps integration protocols. +* 🔭 **[GitGalaxy Visualizer](http://gitgalaxy.io/)** — Render your codebase's topological network locally in interactive 3D using hardware-accelerated WebGPU. +* 📖 **[The blAST Paradigm](https://squid-protocol.github.io/gitgalaxy/docs/wiki/01-03-the-blast-paradigm/)** — The architectural thesis, academic research, and structural math that makes AST-free parsing possible at scale. +* ⚙️ **[Language Calibration Standards](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md)** — The definitive engineering guide to extending our comparative lexical taxonomy for custom enterprise dialects. 
\ No newline at end of file diff --git a/gitgalaxy/tools/ai_guardrails/README.md b/gitgalaxy/tools/ai_guardrails/README.md index 1723ede4..ce38f149 100644 --- a/gitgalaxy/tools/ai_guardrails/README.md +++ b/gitgalaxy/tools/ai_guardrails/README.md @@ -1,4 +1,4 @@ -# GitGalaxy: Dual-Sided AI Guardrails & AppSec Sensors +# GitGalaxy Security: Dual-Sided AI Guardrails & AppSec Sensors [![Defense](https://img.shields.io/badge/Defense-Dual--Sided_AI_Guardrails-00BFFF.svg)](#) [![Velocity](https://img.shields.io/badge/Velocity-No_Compilation_Required-00C957.svg)](#) @@ -6,51 +6,63 @@ Welcome to the **GitGalaxy AI Guardrails Suite**. -The adoption of Generative AI has created two massive security blind spots for modern enterprise teams. First, developers are building AI features that grant LLMs dangerous levels of system access (The AppSec Threat). Second, developers are utilizing autonomous coding agents that can silently introduce architectural degradation into complex codebases (The DevSec Threat). +The rapid adoption of Generative AI has introduced two critical security and stability blind spots for modern enterprise teams. First, developers are deploying AI features that grant Large Language Models (LLMs) dangerous levels of system and execution access (The AppSec Threat). Second, developers are utilizing autonomous coding agents that can silently introduce architectural degradation into complex codebases (The DevSecOps Threat). -Legacy security scanners ([like SonarQube or Checkmarx](https://squid-protocol.github.io/gitgalaxy/04-00-security_landscape/)) cannot fix this. They look for traditional SQL injection, not Prompt Injection. They rely on slow compilation cycles that fail to keep pace with AI development, leaving you completely blind to Agentic logic loops and context shredders. +Legacy security scanners cannot solve this. They are designed to detect traditional SQL injections, not Prompt Injections or Agentic context exhaustion. 
They rely on slow AST (Abstract Syntax Tree) compilation cycles that fail to map the structural reality of AI-driven state mutation. -GitGalaxy maps the architectural reality of your code in seconds. We use AST-free mathematical heuristics to generate deep, contextual reports, allowing you to block dangerous AI behavior before it ever hits production. +GitGalaxy maps the architectural reality of your code in seconds. We use AST-free mathematical heuristics to generate deep, contextual telemetry, allowing you to block dangerous AI architectures and sandbox autonomous agents before they compromise production. --- -### 🛡️ Side 1: The AI AppSec Sensor (`AIAppSecSensor`) -*Protects your application from the AI features you build.* +## 🧠 Engineering Highlights (Architectural Feats) -Standard AST scanners frequently miss "Weaponized AI Architectures." This sensor acts as a physical boundary, mapping the physical call-path distance between an LLM API execution and your critical system functions. +To protect repositories against non-deterministic AI behavior without slowing down CI/CD pipelines, we engineered these sensors to evaluate the mathematical topology of the codebase rather than relying on brittle semantic analysis: -* **The RCE Funnel:** Detects LLMs wired directly to OS commands or shell executions. This allows you to aggressively [block Prompt-Injection-to-RCE attacks](https://squid-protocol.github.io/gitgalaxy/cookbook/prevent-agentic-rce/) in your CI/CD pipeline. -* **The "God-Mode" Agent:** Flags autonomous tools with raw, unfiltered database access. Blocks autonomous data corruption before it can wipe a production table. -* **The Exfiltration Vector:** Identifies LLMs accessing network sockets and cryptographic secrets, stopping SSRF and key exfiltration vulnerabilities cold. +* **Topological Threat Intersection (`ai_appsec_sensor.py`):** Standard scanners evaluate vulnerabilities in isolation. 
This sensor cross-references multi-dimensional structural topologies. It mathematically proves when an LLM Orchestrator node sits on the same execution path as an OS-level `subprocess` call and a Public API router. By mapping these intersections, it deterministically flags **Autonomous Execution Vectors** without requiring dynamic runtime execution. +* **Algorithmic Context Validation (`dev_agent_firewall.py`):** Autonomous coding agents blindly attempt to refactor files regardless of complexity. This firewall calculates the physical Token Mass of a file and cross-references it against its extracted Big-O Algorithmic Complexity (e.g., $O(N^3)$). If the limits are breached, it flags a **Context Window Exhaustion** risk, mathematically proving the agent is about to hallucinate and corrupt the logic. +* **Blast Radius Sandboxing (`dev_agent_firewall.py`):** We strictly prohibit AI agents from modifying the structural load-bearing pillars of your architecture. By querying the Knowledge Graph for a file's **Dependency Blast Radius** (PageRank / Downstream Exposure), the firewall automatically mandates Human-In-The-Loop (HITL) reviews for any PRs targeting highly centralized nodes with existing Technical Debt. --- -### 🤖 Side 2: The Dev Agent Firewall (`DevAgentFirewall`) -*Protects your codebase from the autonomous AI tools you use.* +## 🛡️ Side 1: The AI AppSec Sensor (`ai_appsec_sensor.py`) +*Protects your application from the AI features your developers build.* -Not all legacy code is safe for an AI coding assistant (like Cursor, Copilot, or Claude) to modify. This firewall evaluates the structural complexity, cognitive load, and entropy of a file to determine if an AI agent will succeed, hallucinate, or silently destroy your system logic. By running this sensor, you can safely [sandbox autonomous agents](https://squid-protocol.github.io/gitgalaxy/cookbook/sandbox-autonomous-agents/) to only work on verified, low-complexity files. 
+**Why It Was Built:** AI agents with unconstrained execution boundaries represent a critical security risk. Traditional Static Analysis (SAST) misses the intersection of LLM logic and system APIs. By analyzing the structural topology of the codebase, this sensor deterministically identifies intersections where LLMs (which are inherently vulnerable to Prompt Injection) are dangerously close to OS commands or database writes. -* **Context Window Shredders:** Identifies massive files with extreme algorithmic complexity. Prevents AI context collapse and logic truncation. -* **The Hallucination Zone:** Highlights heavy metaprogramming with zero documentation, preventing AI method hallucination and fabricated syntax. -* **Silent Mutation Risk:** Flags logic with a high blast radius and zero test coverage. Blocks unverifiable AI modifications. -* **HITL Mandate:** Detects severe technical debt. Forces a strict Human-In-The-Loop (HITL) code review requirement for PRs generated by AI agents. +**What It Detects:** +* **Autonomous Execution Vector:** Detects LLM logic that is adjacent to OS-level execution (`eval`, `subprocess`) and exposed via a public API router. This allows you to aggressively block Prompt-Injection-to-RCE attacks in your CI/CD pipeline. +* **Over-Permissioned Agent Binding:** Flags autonomous tools bound to raw Database/IO write access with critically low defensive programming density (e.g., missing `try/catch` blocks). Blocks autonomous data corruption before it reaches production tables. +* **Agentic Exfiltration Vector:** Identifies LLM logic with access to both unfiltered network sockets and hardcoded environment secrets, neutralizing SSRF and autonomous key exfiltration vectors. --- -### 🚀 Quickstart: CI/CD & Pipeline Integration +## 🤖 Side 2: The Dev Agent Firewall (`dev_agent_firewall.py`) +*Protects your codebase from the autonomous AI coding tools your developers use.* -Currently, the AI Guardrails operate as deep-inspection middleware. 
Instead of running as standalone commands, these sensors seamlessly inject themselves into the primary `galaxyscope` analysis pipeline to evaluate project telemetry in real-time. +**Why It Was Built:** Autonomous coding agents (e.g., Claude, Cursor) excel in isolated, pure-function environments but struggle with highly coupled, poorly documented, or dynamically generated logic. This firewall establishes Zero-Trust guardrails. It prevents AI agents from executing unchecked modifications in volatile sectors, mitigating the risk of cascading failures, context window exhaustion, and silent state mutations. -#### 1. Local CLI Execution +**What It Detects:** +* **Context Window Exhaustion:** Identifies files exceeding standard token limits combined with extreme algorithmic complexity. Prevents the AI from losing context and inducing severe structural hallucinations. +* **Hallucination Risk:** Highlights codebases with heavy dynamic metaprogramming and severe Documentation Risk Exposure (< 20% density). Flags zones where autonomous agents are mathematically highly likely to hallucinate missing methods. +* **Cascading State Flux:** Flags logic with high state mutation and dense downstream dependencies, but zero test coverage. Blocks unverifiable AI modifications where the agent cannot mathematically verify its own structural changes. +* **HITL Mandate:** Detects high **Dependency Blast Radius** combined with severe Technical Debt. Forces a strict Human-In-The-Loop (HITL) architectural review requirement for PRs generated by AI. + +--- + +## 🚀 Quickstart: CI/CD & Pipeline Integration + +Currently, the AI Guardrails operate as deep-inspection middleware. Instead of running as standalone commands, these sensors seamlessly inject themselves into the primary GitGalaxy analysis pipeline to evaluate project telemetry in real-time. + +### 1. Local CLI Execution Run a standard scan using the global PyPI package. 
The guardrails will automatically evaluate the ecosystem and report critical Agentic vulnerabilities. -```bash -galaxyscope /path/to/source/code -``` +```bash +gitgalaxy /path/to/source/code +``` -#### 2. GitHub Actions CI/CD Integration -To block dangerous AI architectures or prevent AI agents from modifying complex code, run the main GalaxyScope engine on your pull requests. Create `.github/workflows/ai-guardrails.yml`: +### 2. GitHub Actions CI/CD Integration +To block dangerous AI architectures or prevent AI agents from modifying complex code, run the main GitGalaxy engine on your pull requests. Create `.github/workflows/ai-guardrails.yml`: -```yaml +```yaml name: GitGalaxy AI Guardrails on: @@ -64,18 +76,22 @@ jobs: - name: Checkout Repository uses: actions/checkout@v4 - - name: Run GalaxyScope Engine - uses: squid-protocol/gitgalaxy@main + - name: Run GitGalaxy Engine + uses: squid-protocol/gitgalaxy@v2.0.7 with: - tool: 'galaxyscope' + tool: 'core-engine' target: '.' -``` +``` --- -### 🌌 Powered by the blAST Engine (Bypassing LLMs and ASTs) -This tool is a modular enterprise integration within the broader GitGalaxy architecture. It is driven by our custom mathematical heuristics engine, capable of mapping multi-dimensional relationships at extreme velocity without requiring rigid ASTs. 
Read the official documentation to see the structural methodologies powering these guardrails: -* 📖 **[AI AppSec Sensor Architecture](https://squid-protocol.github.io/gitgalaxy/02-17-ai-appsec-sensor/)** -* 📖 **[Dev Agent Firewall Mechanics](https://squid-protocol.github.io/gitgalaxy/02-18-dev-agent-firewall/)** -* 📖 **[Logic Bomb & Injection Surface Risk Equations](https://squid-protocol.github.io/gitgalaxy/08-20-logic-bomb-exposure/)** -* 🪐 **[Return to the Main GitGalaxy Hub](https://github.com/squid-protocol/gitgalaxy)** \ No newline at end of file +## 🌌 The GitGalaxy Ecosystem (Powered by the blAST Engine) + +GitGalaxy AI Guardrails is the autonomous defense layer of the broader **GitGalaxy Ecosystem**—a high-velocity, AST-free, LLM-free heuristic knowledge graph engine designed for planetary-scale repositories. + +Explore the ecosystem: + +* 🪐 **[Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** — Comprehensive deep dives into the engine's mathematics, pipeline architecture, and DevSecOps integration protocols. +* 🔭 **[GitGalaxy Visualizer](http://gitgalaxy.io/)** — Render your codebase's topological network locally in interactive 3D using hardware-accelerated WebGPU. +* 📖 **[The blAST Paradigm](https://squid-protocol.github.io/gitgalaxy/docs/wiki/01-03-the-blast-paradigm/)** — The architectural thesis, academic research, and structural math that makes AST-free parsing possible at scale. +* ⚙️ **[Language Calibration Standards](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md)** — The definitive engineering guide to extending our comparative lexical taxonomy for custom enterprise dialects. 
\ No newline at end of file diff --git a/gitgalaxy/tools/ai_guardrails/ai_appsec_sensor.py b/gitgalaxy/tools/ai_guardrails/ai_appsec_sensor.py index 9ba3e40e..fea85cfc 100644 --- a/gitgalaxy/tools/ai_guardrails/ai_appsec_sensor.py +++ b/gitgalaxy/tools/ai_guardrails/ai_appsec_sensor.py @@ -1,5 +1,18 @@ +#!/usr/bin/env python3 # ============================================================================== -# GitGalaxy - AI Application Security (AppSec) Sensor +# GitGalaxy Tool: AI Application Security (AppSec) Sensor +# +# PURPOSE: +# Scans the repository for vulnerable AI architectures built by developers. +# It flags dangerous intersections where LLMs (which are vulnerable to Prompt +# Injection) are given access to OS commands, database writes, or unfiltered +# network sockets. +# +# ARCHITECTURAL DECISION: +# AI agents with unconstrained execution boundaries represent a critical +# security risk. By analyzing the structural topology of the codebase, this +# sensor deterministically identifies Autonomous Execution Vectors and +# Over-Permissioned Agents before they reach production. # ============================================================================== import logging from typing import List, Dict, Any @@ -7,26 +20,17 @@ class AIAppSecSensor: """ - The AppSec Threat Hunter. - - PURPOSE: Scans the ecosystem for weaponized AI architectures built by the - developers. It flags dangerous intersections where LLMs (which are vulnerable - to Prompt Injection) are given access to OS commands, database writes, or - unfiltered network sockets. + AI Application Security (AppSec) Threat Sensor. 
""" def __init__(self, parent_logger=None): - self.logger = ( - parent_logger.getChild("appsec_sensor") - if parent_logger - else logging.getLogger("appsec_sensor") - ) + self.logger = parent_logger.getChild("appsec_sensor") if parent_logger else logging.getLogger("appsec_sensor") def hunt_threats(self, parsed_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - self.logger.info("AI AppSec Sensor: Hunting for Agentic Vulnerabilities...") + self.logger.info("AI AppSec Sensor: Scanning for Agentic Vulnerabilities...") for file_data in parsed_files: - # Extract the raw DNA triggers (assuming they are tallied in 'telemetry') + # Extract the raw structural signatures (assuming they are tallied in 'telemetry') telemetry = file_data.get("telemetry", {}) # Extract specific architectural signals @@ -38,14 +42,10 @@ def hunt_threats(self, parsed_files: List[Dict[str, Any]]) -> List[Dict[str, Any arch_io = telemetry.get("arch_io", 0) > 0 # Network/Disk I/O db_complexity = file_data.get("max_db_complexity", 0) # Data gravity - # Security DNA - sec_danger = telemetry.get("sec_danger", 0) > 0 # eval, exec, subprocess - sec_secrets = ( - telemetry.get("sec_secrets", 0) > 0 - ) # Hardcoded keys/env access - safety_density = telemetry.get( - "safety_density", 1.0 - ) # Defensive programming (try/catch, regex) + # Security Structural Signatures + sec_danger = telemetry.get("sec_high_risk_execution", 0) > 0 # eval, exec, subprocess + sec_secrets = telemetry.get("sec_secrets", 0) > 0 # Hardcoded keys/env access + safety_density = telemetry.get("safety_density", 1.0) # Defensive programming (try/catch, regex) appsec_report = { "is_rce_funnel": False, @@ -54,28 +54,28 @@ def hunt_threats(self, parsed_files: List[Dict[str, Any]]) -> List[Dict[str, Any "critical_warnings": [], } - # 1. The RCE Funnel (Weaponized Prompt Injection) + # 1. 
Autonomous Execution Vector (Weaponized Prompt Injection) # LLM Logic + Public API Router + OS Command Execution if (ai_orchestrator or llm_api) and arch_api and sec_danger: appsec_report["is_rce_funnel"] = True appsec_report["critical_warnings"].append( - "CRITICAL [RCE Funnel]: AI logic is adjacent to OS-level execution (eval/subprocess) and exposed via API. Immediate Prompt Injection -> RCE vulnerability." + "CRITICAL [Autonomous Execution Vector]: AI logic is adjacent to OS-level execution (eval/subprocess) and exposed via API. Immediate Prompt Injection -> Autonomous Execution vulnerability." ) - # 2. The "God-Mode" Tool Binding (Autonomous Escalation) + # 2. Over-Permissioned Agent Binding (Autonomous Escalation) # AI Agent Tools + State Mutation (DB or Disk) + Low Defensive Safety if ai_tools and (db_complexity >= 2 or arch_io) and safety_density < 0.5: appsec_report["over_permissioned_agent"] = True appsec_report["critical_warnings"].append( - "CRITICAL [God-Mode Agent]: AI is bound to tools with raw Database/IO write access and < 50% safety density. High risk of autonomous data corruption." + "CRITICAL [Over-Permissioned Agent]: AI is bound to tools with raw Database/IO write access and < 50% safety density. High risk of autonomous data corruption." ) - # 3. The Exfiltration Vector (Unsandboxed Sockets) + # 3. Agentic Exfiltration Vector (Unsandboxed Sockets) # LLM Logic + Outbound Sockets/Fetch + Access to Secrets if llm_api and arch_io and sec_secrets: appsec_report["agentic_exfiltration_risk"] = True appsec_report["critical_warnings"].append( - "CRITICAL [Exfiltration Vector]: LLM logic has access to network sockets AND environment secrets. High risk of SSRF and key exfiltration via prompt injection." + "CRITICAL [Agentic Exfiltration Vector]: LLM logic has access to network sockets AND environment secrets. High risk of SSRF and key exfiltration via prompt injection." 
) # Inject the AppSec report back into the file's telemetry diff --git a/gitgalaxy/tools/ai_guardrails/dev_agent_firewall.py b/gitgalaxy/tools/ai_guardrails/dev_agent_firewall.py index cf62ce8c..c77bb168 100644 --- a/gitgalaxy/tools/ai_guardrails/dev_agent_firewall.py +++ b/gitgalaxy/tools/ai_guardrails/dev_agent_firewall.py @@ -1,5 +1,17 @@ +#!/usr/bin/env python3 # ============================================================================== -# GitGalaxy - AI Guardrails +# GitGalaxy Tool: Autonomous Agent Firewall +# +# PURPOSE: +# Evaluates the structural and topological constraints of the codebase to +# determine the safety boundaries for autonomous AI agents (e.g., Claude, Cursor). +# +# ARCHITECTURAL DECISION: +# Autonomous coding agents excel in isolated, pure-function environments but +# struggle with highly coupled, poorly documented, or dynamically generated logic. +# This firewall establishes Zero-Trust guardrails to prevent AI agents from +# executing unchecked modifications in volatile sectors, mitigating the risk +# of cascading failures, context window exhaustion, and silent state mutations. # ============================================================================== import logging from typing import List, Dict, Any @@ -7,30 +19,21 @@ class DevAgentFirewall: """ - Evaluates the codebase specifically to determine if it is safe to let - an autonomous AI agent (Claude, Cursor, etc.) modify the code. + Autonomous Agent Guardrail Engine. 
""" def __init__(self, parent_logger=None): - self.logger = ( - parent_logger.getChild("guardrails") - if parent_logger - else logging.getLogger("guardrails") - ) + self.logger = parent_logger.getChild("guardrails") if parent_logger else logging.getLogger("guardrails") - def evaluate_ecosystem( - self, parsed_files: List[Dict[str, Any]] - ) -> List[Dict[str, Any]]: - self.logger.info("Executing Agentic Firewall & Token Physics Checks...") + def evaluate_ecosystem(self, parsed_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + self.logger.info("Executing Autonomous Agent Firewall & Token Density Validation...") for file_data in parsed_files: token_mass = file_data.get("token_mass", 0) network_metrics = file_data.get("telemetry", {}).get("network_metrics", {}) - risk_vector = file_data.get( - "risk_vector", [] - ) # Assuming standard 0-100 risk scores + risk_vector = file_data.get("risk_vector", []) # Assuming standard 0-100 risk scores - # Extract relevant physics safely handling None values from Zero-Dependency Mode + # Extract relevant structural metrics, safely handling None values from Zero-Dependency Mode pagerank = network_metrics.get("normalized_blast_radius") or 0.0 max_big_o = file_data.get("max_big_o") or 1 @@ -41,32 +44,32 @@ def evaluate_ecosystem( "warnings": [], } - # 1. The Context Window Shredder (The Black Hole) - # If it burns > 8k tokens AND has terrible algorithmic complexity, the AI will fail. + # 1. Context Window Exhaustion (Agentic Black Hole) + # If a file exceeds token limits AND has severe algorithmic complexity, the AI will lose context. if token_mass is not None and token_mass > 8000 and max_big_o >= 3: guardrails["is_agentic_black_hole"] = True guardrails["warnings"].append( - f"CRITICAL: Black Hole detected. Token mass ({token_mass}) and O(N^{max_big_o}) complexity will shred agent context." 
+ f"CRITICAL [Context Window Exhaustion]: Token mass ({token_mass}) and O(N^{max_big_o}) complexity will exceed agent context capabilities and induce severe hallucination." ) - # 2. The HITL Mandate (Blast Radius + Danger) + # 2. The HITL Mandate (Downstream Exposure + Severe Risk Debt) if pagerank > 1.0 and sum(risk_vector) > 200: guardrails["requires_hitl"] = True guardrails["warnings"].append( - "WARNING: High Blast Radius with severe risk debt. Human-in-the-Loop required for modifications." + "WARNING [HITL Mandate]: High Downstream Exposure combined with severe risk debt. Human-in-the-Loop required for structural modifications." ) - # 3. The Hallucination Zone (Metaprogramming + Low Docs) - meta_heavy = file_data.get("telemetry", {}).get("heat_triggers", 0) > 2 + # 3. Metaprogramming Hallucination Risk + meta_heavy = file_data.get("telemetry", {}).get("reflection_metaprogramming", 0) > 2 doc_density = file_data.get("telemetry", {}).get("doc_density", 1.0) if meta_heavy and doc_density < 0.2: guardrails["hallucination_zone"] = True guardrails["warnings"].append( - "DANGER: Hallucination Zone. Dynamic metaprogramming detected with < 20% documentation density. AI will likely hallucinate missing methods." + "DANGER [Hallucination Risk]: Dynamic metaprogramming detected combined with severe Documentation Risk Exposure (< 20% density). Autonomous agents are highly likely to hallucinate missing methods." ) - # 4. The Silent Mutation Risk (High Flux + High Blast + No Tests) + # 4. Cascading State Flux (Silent Mutation Risk) state_flux = file_data.get("telemetry", {}).get("state_flux", 0) in_degree = network_metrics.get("in_degree", 0) has_tests = file_data.get("telemetry", {}).get("has_tests", False) @@ -74,7 +77,7 @@ def evaluate_ecosystem( if state_flux > 50 and in_degree > 5 and not has_tests: guardrails["silent_mutation_risk"] = True guardrails["warnings"].append( - f"CRITICAL: Silent Mutation Risk. 
Flux ({state_flux}) and Blast Radius ({in_degree} deps) are high, but zero tests exist. AI cannot verify its own fixes." + f"CRITICAL [Cascading State Flux]: High state mutation ({state_flux}) and dense downstream dependencies ({in_degree}), with zero verification coverage. Autonomous agents cannot mathematically verify their own structural modifications." ) # Inject the firewall report back into the file's telemetry @@ -82,4 +85,4 @@ def evaluate_ecosystem( file_data["telemetry"] = {} file_data["telemetry"]["ai_guardrails"] = guardrails - return parsed_files + return parsed_files \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/README.md b/gitgalaxy/tools/cobol_to_cobol/README.md index 73a4fee5..bb01048b 100644 --- a/gitgalaxy/tools/cobol_to_cobol/README.md +++ b/gitgalaxy/tools/cobol_to_cobol/README.md @@ -1,22 +1,22 @@ -# GitGalaxy: Mainframe Refactoring & COBOL Modernization Suite +# GitGalaxy: Mainframe Modernization & Structural Extraction Suite [![Mainframe Tested](https://img.shields.io/badge/Tested-MVS_3.8j_(1974)-000000.svg?style=flat&logo=ibm)](#) -[![Architecture](https://img.shields.io/badge/Architecture-Deterministic_Regex-00BFFF.svg)](#) +[![Architecture](https://img.shields.io/badge/Architecture-Deterministic_Heuristics-00BFFF.svg)](#) [![State Manager](https://img.shields.io/badge/State-Hybrid_RAM%2FSQLite-8A2BE2.svg)](#) -Welcome to the **GitGalaxy Mainframe Modernization Suite**. This is a deterministic, high-speed static analysis suite designed to safely slice, sanitize, and [map monolithic legacy systems](https://squid-protocol.github.io/gitgalaxy/cookbook/map-cobol-monoliths/). +Welcome to the **GitGalaxy Mainframe Modernization Suite**. This is a deterministic, high-speed static analysis suite designed to safely slice, sanitize, and map monolithic legacy systems prior to cloud migration. 
-**Mainframe Proven:** The outputs of these architectural tools natively compile against raw MVS 3.8j operating systems (1974 Hercules Mainframe), while simultaneously scaffolding modern cloud environments. +**Mainframe Proven:** The architectural scaffolding generated by these tools compiles natively against raw MVS 3.8j operating systems (1974 Hercules Mainframe), while simultaneously generating strict architectural contracts for modern cloud environments (Spring Boot, PostgreSQL). ### 🔄 The Modernization Pipeline You point the [Legacy Refraction Controller](https://squid-protocol.github.io/gitgalaxy/05-01-legacy-refraction-controller/) at a massive, undocumented COBOL repository. It translates a chaotic folder of `.cbl` files into a deterministic execution pipeline: -* **The Assessment:** Dynamically scales between high-speed RAM and disk-backed SQLite3 **to provide absolute OOM (Out-of-Memory) crash protection when processing massive, monolithic legacy repositories.** -* **Dead Code Extraction:** Uses structural heuristics to mathematically map and [extract orphaned memory and dead code bloat](https://squid-protocol.github.io/gitgalaxy/cookbook/identifying-dead-code-in-cobol/). *(AST-Free)* -* **Dependency Mapping:** Maps data lineage to deflect dead dependencies. -* **Context-Aware Synergy:** A unified Intermediate Representation (IR) State Manager ensures tools communicate—the Graveyard Reaper's dead-code math prevents the Schema Forge from migrating dead columns to the cloud, and stops the Microservice Slicer from hallucinating business rules out of dead code. -* **Asset Generation:** Generates pristine PostgreSQL schemas, JSON APIs, and compile-ready JCLs. +* **The Assessment (OOM Protection):** Dynamically scales between high-speed RAM and disk-backed SQLite3 to provide absolute Out-of-Memory (OOM) crash protection when processing massive, monolithic legacy repositories. 
+* **Dead Code Extraction:** Uses structural heuristics to mathematically map and extract orphaned memory and dead code bloat, preventing legacy rot from migrating to the cloud. *(AST-Free)* +* **Dependency Mapping:** Maps data lineage to construct a Directed Acyclic Graph (DAG) for deterministic execution order. +* **Context-Aware Synergy:** A unified Intermediate Representation (IR) State Manager ensures tools communicate—the Deprecated Trails Analyzer's dead-code math prevents the Schema Generator from migrating dead columns to PostgreSQL, and stops the Microservice Extractor from hallucinating business rules out of unreachable code. +* **Asset Generation:** Generates pristine PostgreSQL schemas, JSON APIs, and strict Zero-Trust JCLs. --- @@ -25,37 +25,38 @@ You point the [Legacy Refraction Controller](https://squid-protocol.github.io/gi This suite is built on a modular Hub-and-Spoke architecture. Every Python script acts as an independent CLI tool or is orchestrated centrally. #### 1. Pre-Processors & Sensors -* **[Lexical Patcher](https://squid-protocol.github.io/gitgalaxy/05-13-lexical-patcher/) (`cobol_lexical_patcher.py`):** Safely neutralizes legacy compiler traps. -* **[System Limits Reporter](https://squid-protocol.github.io/gitgalaxy/05-17-system-limits-reporter/) (`cobol_system_limits_reporter.py`):** Flags non-deterministic routing logic and system constraint breaches. -
![System Limits Reporter](../../../docs/wiki/assets/system_limits_reporter.gif) +* **[Lexical Patcher](https://squid-protocol.github.io/gitgalaxy/05-13-lexical-patcher/) (`cobol_lexical_patcher.py`):** Safely neutralizes legacy compiler traps (e.g., converting `NEXT SENTENCE` to explicit `CONTINUE` block scopes). +* **[Architectural Anomaly Detector](https://squid-protocol.github.io/gitgalaxy/05-17-system-limits-reporter/) (`cobol_system_limits_reporter.py`):** Flags non-deterministic routing logic (e.g., `ALTER`, `EXEC CICS HANDLE CONDITION`) that compromises static data lineage. +
![Architectural Anomaly Detector](../../../docs/wiki/assets/system_limits_reporter.gif) #### 2. Extractors & Slicers -* **[Graveyard Reaper](https://squid-protocol.github.io/gitgalaxy/05-10-graveyard-reaper/) (`cobol_graveyard_finder.py`):** Expands copybooks to calculate dead code bloat. -
![Graveyard Reaper](../../../docs/wiki/assets/graveyard_reaper.gif) -* **[DAG Architect](https://squid-protocol.github.io/gitgalaxy/05-08-dag-architect/) (`cobol_dag_architect.py`):** Maps data lineage to [mathematically calculate zero-trust execution topology](https://squid-protocol.github.io/gitgalaxy/cookbook/creating-dag-from-cobol-files/). +* **[Deprecated Trails Analyzer](https://squid-protocol.github.io/gitgalaxy/05-10-graveyard-reaper/) (`cobol_graveyard_finder.py`):** Expands copybooks to calculate dead code bloat and mathematically unreachable execution logic. +
![Deprecated Trails Analyzer](../../../docs/wiki/assets/graveyard_reaper.gif) +* **[DAG Architect](https://squid-protocol.github.io/gitgalaxy/05-08-dag-architect/) (`cobol_dag_architect.py`):** Maps data lineage to mathematically calculate zero-trust topological execution order.
![DAG Architect](../../../docs/wiki/assets/dag_architect.gif) -* **[Microservice Slicer](https://squid-protocol.github.io/gitgalaxy/05-14-microservice-slicer/) (`cobol_microservice_slicer.py`):** Executes 3-pass recursive variable taint-tracking for safe [business logic extraction](https://squid-protocol.github.io/gitgalaxy/cookbook/cobol-microservice-slicing/). -
![Microservice Slicer](../../../docs/wiki/assets/microservice_slicer.gif) -* **[ETL Unpacker](https://squid-protocol.github.io/gitgalaxy/05-09-etl-unpacker/) (`cobol_etl_unpacker.py`):** Translates binary EBCDIC and Packed Decimal to CSVs to [unpack hidden ETL flows](https://squid-protocol.github.io/gitgalaxy/cookbook/unpacking-etl-from-cbl-files/). - -#### 3. Cloud & Mainframe Forges -* **[Compiler Forge](https://squid-protocol.github.io/gitgalaxy/05-07-mainframe-compiler-forge/) (`cobol_compiler_forge.py`):** Flattens copybooks and generates era-aware build JCLs. -
![Compiler Forge](../../../docs/wiki/assets/compiler_forge.gif) -* **[Cloud Schema Forge](https://squid-protocol.github.io/gitgalaxy/05-15-cloud-schema-forge/) (`cobol_schema_forge.py`):** Translates `PIC` clauses to [strict PostgreSQL DDL schemas](https://squid-protocol.github.io/gitgalaxy/cookbook/creating-schema-from-cobol-files/). -
![Cloud Schema Forge](../../../docs/wiki/assets/cloud_schema_forge.gif) -* **[Zero-Trust JCL Forge](https://squid-protocol.github.io/gitgalaxy/05-12-zero-trust-jcl-forge/) (`cobol_jcl_forge.py`):** Extracts `SELECT` mappings to auto-generate strict, least-privilege JCL emulators—**automatically stripping over-permissioned global access (e.g., `DISP=SHR`) and locking physical dataset provisioning to the exact lineage required.** -
![Zero-Trust JCL Forge](../../../docs/wiki/assets/jcl_forge_demo.gif) - +* **[Microservice Logic Extractor](https://squid-protocol.github.io/gitgalaxy/05-14-microservice-slicer/) (`cobol_microservice_slicer.py`):** Executes 3-pass recursive variable taint-tracking for safe, isolated business logic extraction. +
![Microservice Logic Extractor](../../../docs/wiki/assets/microservice_slicer.gif) +* **[ETL EBCDIC Unpacker](https://squid-protocol.github.io/gitgalaxy/05-09-etl-unpacker/) (`cobol_etl_unpacker.py`):** Translates binary EBCDIC and Packed Decimal (`COMP-3`) directly to CSVs to unpack hidden ETL flows. + +#### 3. Cloud & Mainframe Generators +* **[Mainframe Compiler Generator](https://squid-protocol.github.io/gitgalaxy/05-07-mainframe-compiler-forge/) (`cobol_compiler_forge.py`):** Flattens copybooks and dynamically generates era-aware build JCLs (COBOL-74 vs COBOL-85). +
![Mainframe Compiler Generator](../../../docs/wiki/assets/compiler_forge.gif) +* **[Cloud Schema Generator](https://squid-protocol.github.io/gitgalaxy/05-15-cloud-schema-forge/) (`cobol_schema_forge.py`):** Translates complex `PIC` constraints and `REDEFINES` overlays into strict PostgreSQL DDL schemas. +
![Cloud Schema Generator](../../../docs/wiki/assets/cloud_schema_forge.gif) +* **[Zero-Trust JCL Generator](https://squid-protocol.github.io/gitgalaxy/05-12-zero-trust-jcl-forge/) (`cobol_jcl_forge.py`):** Extracts `SELECT` mappings to auto-generate strict, least-privilege JCL emulators—automatically stripping over-permissioned global access (e.g., `DISP=SHR`) and locking physical dataset provisioning to exact lineage bounds. +
![Zero-Trust JCL Generator](../../../docs/wiki/assets/jcl_forge_demo.gif) +* **[Zero-Trust JCL Auditor](https://squid-protocol.github.io/gitgalaxy/cookbook/jcl-auditing/) (`cobol_jcl_auditor.py`):** Mathematically compares original legacy JCLs against the generated equivalents to quantify architectural bloat reduction and over-permissioned I/O shedding. + #### 4. The AI Remediation Boundary -* **[Anomaly Task Forge](https://squid-protocol.github.io/gitgalaxy/05-16-anomaly-agent-task-forge/) (`cobol_agent_task_forge.py`):** Isolates structural anomalies into bounded JSON job tickets for LLM remediation. +* **[Autonomous Agent Task Generator](https://squid-protocol.github.io/gitgalaxy/05-16-anomaly-agent-task-forge/) (`cobol_agent_task_forge.py`):** Converts isolated structural anomalies into highly constrained, structured JSON task tickets designed for automated LLM agent dispatchers. --- ### 🚀 Quickstart: Running the Controller -You don't need to run the tools individually. The central orchestrator handles the execution pipeline. +You do not need to run the tools individually. The central orchestrator handles the execution pipeline sequentially. -**Basic Modernization (Sanitize, Map, and Forge JCL/Schemas):** +**Basic Modernization (Sanitize, Map, and Generate JCL/Schemas):** ```bash python3 cobol_refractor_controller.py /path/to/legacy/repo ``` @@ -67,31 +68,31 @@ python3 cobol_refractor_controller.py /path/to/legacy/repo Below is the live console output of the GitGalaxy orchestrator processing a legacy IBM CICS banking application. Notice the engine identifying over 6,700 lines of dead code, warning about macro substitutions, and automatically routing the compiler based on the detected COBOL dialect (74 vs 85). ```text -=== 1. INITIATING GRAVEYARD REAPER === -🪦 GitGalaxy Reaper scanning cics-banking-sample-application-cbsa for dead code... +=== 1. 
INITIATING DEPRECATED TRAILS ANALYZER === +🔍 GitGalaxy Deprecated Trails Analyzer scanning cics-banking-sample-application-cbsa for obsolete logic... [... File Scans Omitted for Brevity ...] ========================================================== - 📉 DEAD CODE ELIMINATION REPORT + 📉 DEPRECATED TRAILS REDUCTION REPORT ========================================================== Files Flagged for Cleanup : 29 - Unused Memory Addresses : 817 orphaned variables - Unreachable Logic Blocks : 590 phantom paragraphs + Unused Memory Addresses : 817 variables + Unreachable Logic Blocks : 590 paragraphs ✂️ Estimated Bloat Removed : ~6717 Lines of Code ========================================================== === 2. INITIATING DAG ARCHITECT === -🕸️ GitGalaxy DAG Architect mapping data lineage in: cics-banking-sample-application-cbsa... +🕸️ GitGalaxy Data Lineage Architect mapping execution topology in: cics-banking-sample-application-cbsa... ========================================================== - ⚡ ZERO-TRUST EXECUTION PIPELINE (TOPOLOGICAL SORT) + ⚡ DETERMINISTIC EXECUTION PIPELINE (TOPOLOGICAL SORT) ========================================================== STEP 01: Run [BANKDATA] ↳ Reads : None ↳ Writes: VSAM ---------------------------------------------------------- -=== 3. INITIATING SYSTEM LIMITS REPORTER === -📠 Scanning directory for System Limits: cics-banking-sample-application-cbsa... -🔎 GitGalaxy Honesty Protocol scanning 29 files for structural dragons... +=== 3. INITIATING ARCHITECTURAL ANOMALY DETECTOR === +📠 Scanning directory for Architectural Anomalies: cics-banking-sample-application-cbsa... +🔎 GitGalaxy executing architectural integrity scan on 29 files... ========================================================================================== ⚠️ [XFRFUN.cbl : Line 0128] HIGH LIMIT - Macro substitution detected. AST math may drift from actual compiled execution. ⚠️ [CREACC.cbl : Line 0260] HIGH LIMIT - Macro substitution detected. 
AST math may drift from actual compiled execution. @@ -102,8 +103,8 @@ Below is the live console output of the GitGalaxy orchestrator processing a lega 🚨 WARNING: Found 5 structural anomalies requiring human architectural review. ========================================================================================== -=== 4. INITIATING CLOUD SCHEMA FORGE === -🔨 GitGalaxy Schema Forge striking anvil for: BNK1UAC.cbl... +=== 4. INITIATING CLOUD SCHEMA GENERATOR === +🔨 GitGalaxy Cloud Schema Generator processing: BNK1UAC.cbl... ========================================================== 🐘 POSTGRESQL DDL (CLOUD DATABASE SCHEMA) ========================================================== @@ -119,21 +120,21 @@ CREATE TABLE DFHCOMMAREA ( -- [Schema Omitted for Brevity] ); -=== 5. INITIATING MICROSERVICE SLICER === -🔪 GitGalaxy Slicer hunting aliases for [WS-ACCOUNT-BALANCE] in BNK1UAC.cbl... +=== 5. INITIATING MICROSERVICE LOGIC EXTRACTOR === +🔪 GitGalaxy Logic Extractor tracing dependencies for [WS-ACCOUNT-BALANCE] in BNK1UAC.cbl... ========================================================== - 🎯 Sliced 0 distinct business rules. + 🎯 Extracted 0 distinct business rules. ========================================================== -=== 6. INITIATING COMPILER FORGE === +=== 6. 
INITIATING MAINFRAME COMPILER GENERATOR === ====================================================================== - 🏗️ GITGALAXY MAINFRAME COMPILER FORGE (PRE-COMPILER ACTIVE) + 🏗️ GITGALAXY MAINFRAME COMPILER GENERATOR (PRE-COMPILER ACTIVE) ====================================================================== - [+] Forged COBOL-85 Pipeline : BUILD_BNK1UAC.jcl - [+] Forged COBOL-85 Pipeline : BUILD_DBCRFUN.jcl - [+] Forged COBOL-74 Pipeline : BUILD_GETSCODE.jcl - [+] Forged COBOL-85 Pipeline : BUILD_BANKDATA.jcl - [+] Forged COBOL-74 Pipeline : BUILD_GETCOMPY.jcl + [+] Generated COBOL-85 Pipeline : BUILD_BNK1UAC.jcl + [+] Generated COBOL-85 Pipeline : BUILD_DBCRFUN.jcl + [+] Generated COBOL-74 Pipeline : BUILD_GETSCODE.jcl + [+] Generated COBOL-85 Pipeline : BUILD_BANKDATA.jcl + [+] Generated COBOL-74 Pipeline : BUILD_GETCOMPY.jcl ====================================================================== === 7. INITIATING MASTER ORCHESTRATOR (REFRACTOR CONTROLLER) === @@ -144,7 +145,7 @@ CREATE TABLE DFHCOMMAREA ( 🛰️ Scouting repository mass... ↳ Found: 29 executable files (0.83 MB) ↳ OPTIMAL MASS: Engaging High-Speed RAM Dictionary. - Forging Context-Aware Artifacts at: cics-banking-sample-application-cbsa_gitgalaxy_clean_20260422_153624 + Generating Context-Aware Artifacts at: cics-banking-sample-application-cbsa_gitgalaxy_clean_20260422_153624 ---------------------------------------------------------------------- ====================================================================== 🏁 REFRACTION COMPLETE: Hybrid Pipeline execution successful. @@ -161,7 +162,7 @@ The controller generates a timestamped `_gitgalaxy_clean` directory containing: 3. `03_audit_reports/`: The Master Audit (quantifying lines of code saved and excess I/O blocked). 4. `04_ir_state_dumps/`: Relational JSON graphs mapping dead-code state and DAG lineage. 5. `05_microservice_slices/`: Isolated JSON business logic ready for translation. -6. 
`06_ai_agent_jobs/`: Structured JSON tickets for LLM remediation. +6. `06_ai_agent_jobs/`: Structured JSON task tickets for LLM remediation. --- ### 🌌 Powered by the blAST Engine (Bypassing LLMs and ASTs) @@ -169,5 +170,5 @@ This tool is a modular enterprise integration within the broader GitGalaxy archi * 📖 **[The Legacy Refraction Controller](https://squid-protocol.github.io/gitgalaxy/05-01-legacy-refraction-controller/)** * 📖 **[Dead Code Extraction Mathematics](https://squid-protocol.github.io/gitgalaxy/05-10-graveyard-reaper/)** -* 📖 **[Zero-Trust JCL Forge Mechanics](https://squid-protocol.github.io/gitgalaxy/05-12-zero-trust-jcl-forge/)** +* 📖 **[Zero-Trust JCL Generator Mechanics](https://squid-protocol.github.io/gitgalaxy/05-12-zero-trust-jcl-forge/)** * 🪐 **[Return to the Main GitGalaxy Hub](https://github.com/squid-protocol/gitgalaxy)** \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_agent_task_forge.py b/gitgalaxy/tools/cobol_to_cobol/cobol_agent_task_forge.py index f6ce8a7e..bd935f04 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_agent_task_forge.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_agent_task_forge.py @@ -1,26 +1,33 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Autonomous Agent Task Forge -# Purpose: Converts architectural anomalies into structured JSON job tickets -# designed for automated LLM agent dispatchers. +# GitGalaxy Tool: Autonomous Agent Task Generator +# +# PURPOSE: +# Converts Architectural Anomalies and legacy structural dependencies into +# highly constrained, structured JSON task tickets designed for automated +# LLM agent dispatchers. +# +# ARCHITECTURAL DECISION: +# Providing an autonomous AI agent with raw, unconstrained legacy code often +# leads to Context Window Exhaustion and severe hallucinations (e.g., hallucinating +# missing copybooks or external dependencies). 
By structuring the remediation +# tasks into strict JSON tickets with pre-resolved data lineage (inputs/outputs) +# and explicitly identified anomalies, we mathematically bound the agent's scope, +# ensuring deterministic and safe code modifications. # ============================================================================== import json from pathlib import Path -def generate_agent_ticket( - file_name: str, source_file: Path, anomalies: list, ir_state: dict -) -> dict: - """Forges a structured JSON task ticket for an autonomous agent.""" +def generate_agent_ticket(file_name: str, source_file: Path, anomalies: list, ir_state: dict) -> dict: + """Generates a structured JSON task ticket for an autonomous agent.""" - # Extract lineage to give the agent dependency context + # Extract Dependency Graph lineage to provide the agent with strict I/O context lineage = {} if ir_state: lineage = ir_state.get("analysis", {}).get("lineage", {}) - clean_anomalies = [ - a.split("]", 1)[-1].strip() if "]" in a else a for a in anomalies - ] + clean_anomalies = [a.split("]", 1)[-1].strip() if "]" in a else a for a in anomalies] ticket = { "job_id": f"{file_name.split('.')[0]}_REMEDIATION", @@ -35,7 +42,7 @@ def generate_agent_ticket( }, "system_prompt": ( "You are a deterministic legacy systems architect. Your task is to analyze the " - "provided 'target_file' and resolve the issues listed in 'detected_anomalies'. " + "provided 'target_file' and resolve the structural issues listed in 'detected_anomalies'. " "Do not alter the core business logic. Return your proposed solution as a valid JSON " "object containing a 'diagnosis' string and a 'patched_code' string." 
), @@ -44,38 +51,37 @@ def generate_agent_ticket( return ticket -def forge_agent_jobs(clean_room_dir: Path, source_dir: Path, honesty_flags: list): - """Parses global flags and generates individual JSON job tickets per file.""" - if not honesty_flags: +def forge_agent_jobs(staging_dir: Path, source_dir: Path, architectural_anomalies: list): + """ + Parses global architectural anomalies and generates individual JSON task tickets per file. + (Function name preserved for downstream pipeline compatibility). + """ + if not architectural_anomalies: return 0 - out_dir = clean_room_dir / "06_ai_agent_jobs" + out_dir = staging_dir / "06_ai_agent_jobs" out_dir.mkdir(parents=True, exist_ok=True) - ir_dir = clean_room_dir / "04_ir_state_dumps" + ir_dir = staging_dir / "04_ir_state_dumps" - # Group flags by file - file_flags = {} - for flag in honesty_flags: - if flag.startswith("[") and "]" in flag: - file_name = flag[1 : flag.index("]")] - if file_name not in file_flags: - file_flags[file_name] = [] - file_flags[file_name].append(flag) + # Group anomalies by their target file + file_anomalies = {} + for anomaly in architectural_anomalies: + if anomaly.startswith("[") and "]" in anomaly: + file_name = anomaly[1 : anomaly.index("]")] + if file_name not in file_anomalies: + file_anomalies[file_name] = [] + file_anomalies[file_name].append(anomaly) jobs_generated = 0 - for file_name, anomalies in file_flags.items(): + for file_name, anomalies in file_anomalies.items(): source_file = next(source_dir.rglob(file_name), None) if not source_file: continue - # Grab the IR state for extra context if it exists + # Extract the Intermediate Representation (IR) state for dependency context prog_id = file_name.split(".")[0] ir_file = ir_dir / f"{prog_id}_ir.json" - ir_state = ( - json.loads(ir_file.read_text(encoding="utf-8")) - if ir_file.exists() - else None - ) + ir_state = json.loads(ir_file.read_text(encoding="utf-8")) if ir_file.exists() else None ticket = 
generate_agent_ticket(file_name, source_file, anomalies, ir_state) @@ -83,4 +89,4 @@ def forge_agent_jobs(clean_room_dir: Path, source_dir: Path, honesty_flags: list ticket_file.write_text(json.dumps(ticket, indent=2), encoding="utf-8") jobs_generated += 1 - return jobs_generated + return jobs_generated \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_compiler_forge.py b/gitgalaxy/tools/cobol_to_cobol/cobol_compiler_forge.py index c44f37c7..65124e9a 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_compiler_forge.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_compiler_forge.py @@ -1,20 +1,29 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: MVS 3.8j COBOL Compiler Forge (v6 - Dialect Aware) -# Purpose: Dynamically alters the mainframe build JCL based on the detected -# COBOL era (74 vs 85) to prevent catastrophic legacy compiler crashes. +# GitGalaxy Tool: MVS 3.8j COBOL Compiler Scaffolding +# +# PURPOSE: +# Dynamically generates the mainframe build JCL based on the detected COBOL +# dialect (74 vs 85) to ensure deterministic legacy compilation. +# +# ARCHITECTURAL DECISION: +# Legacy compilers are highly sensitive to dialect constraints. Feeding modern +# COBOL-85 structural signatures (like EVALUATE or END-IF) into an OS/VS COBOL-74 +# compiler will trigger catastrophic compilation failures. This module inspects +# the extracted source logic, detects the era-specific dialect, and automatically +# routes the build sequence to the correct enterprise compiler (COBUCL vs IGYWCL). 
# ============================================================================== import argparse import sys import re from pathlib import Path -# Failsafe to prevent infinite RAM loops from cyclic legacy copybooks +# Execution constraint to prevent resource starvation from cyclic copybooks MAX_RECURSION_DEPTH = 10 def detect_cobol_dialect(content: str) -> str: - """Scans for post-1974 structural keywords to determine the compiler era.""" + """Scans for post-1974 structural signatures to determine the compiler era.""" modern_signatures = re.compile( r"\b(EVALUATE|INITIALIZE|END-IF|END-PERFORM|END-READ|END-EVALUATE|CONTINUE)\b|\*>", re.IGNORECASE, @@ -25,14 +34,18 @@ def detect_cobol_dialect(content: str) -> str: def flatten_copybooks(source_text: str, base_dir: Path, current_depth: int = 0) -> str: - """Recursively inlines COPY statements to create a self-contained payload. - Includes a strict depth limit to prevent infinite loops from cyclic copybooks.""" - - # --- 🛡️ FAILSAFE BLOCK --- + """ + Recursively inlines COPY statements to create a self-contained execution payload. + """ + # ========================================================================== + # DEFENSIVE DESIGN (RECURSION LIMITER): + # Legacy architectures frequently contain cyclic dependencies (e.g., Copybook A + # imports Copybook B, which imports Copybook A). Without a strict recursion + # depth limit, this resolution function will trap the CPU in an infinite loop, + # triggering an Out-Of-Memory (OOM) pipeline collapse. + # ========================================================================== if current_depth > MAX_RECURSION_DEPTH: - print( - f" [!] WARNING: Copybook recursion depth ({MAX_RECURSION_DEPTH}) exceeded. Aborting cyclic branch." - ) + print(f" [!] WARNING: Copybook recursion depth ({MAX_RECURSION_DEPTH}) exceeded. 
Aborting cyclic branch.") return source_text lines = source_text.replace("\r", "").split("\n") @@ -52,7 +65,7 @@ def flatten_copybooks(source_text: str, base_dir: Path, current_depth: int = 0) if copy_file: out_lines.append(f" * --- INLINED COPYBOOK: {copy_name} ---") - # ⚠️ CRITICAL: Pass current_depth + 1 into the recursive call! + # Pass current_depth + 1 into the recursive call to advance the safety counter inlined_text = flatten_copybooks( copy_file.read_text(errors="ignore"), base_dir, @@ -63,9 +76,7 @@ def flatten_copybooks(source_text: str, base_dir: Path, current_depth: int = 0) out_lines.append(f" * --- END COPYBOOK: {copy_name} ---") continue else: - out_lines.append( - f" * [!] WARNING: COPYBOOK {copy_name} NOT FOUND LOCALLY" - ) + out_lines.append(f" * [!] WARNING: COPYBOOK {copy_name} NOT LOCATED IN REPOSITORY BOUNDS") out_lines.append(line) @@ -73,6 +84,7 @@ def flatten_copybooks(source_text: str, base_dir: Path, current_depth: int = 0) def extract_intent(source_text: str) -> tuple: + """Extracts the program identity and file assignment boundaries.""" prog_id = "UNKNOWN" id_match = re.search(r"PROGRAM-ID\.\s+([A-Z0-9\-]+)\.", source_text, re.IGNORECASE) if id_match: @@ -87,9 +99,8 @@ def extract_intent(source_text: str) -> tuple: return prog_id, files -def generate_build_jcl( - source_text: str, prog_name: str, files: set, dialect: str -) -> str: +def generate_build_jcl(source_text: str, prog_name: str, files: set, dialect: str) -> str: + """Generates the JCL deployment configuration mapping program files to physical data sets.""" jcl = [] job_name = f"BLD{prog_name[:4].upper()}" jcl.append(f"//{job_name} JOB (12345),'GITGALAXY COMPILER',") @@ -116,16 +127,12 @@ def generate_build_jcl( clean_f = clean_f[-8:] if clean_f: - jcl.append( - f"//{clean_f} DD DSN=HERC01.DATA.{clean_f},DISP=(MOD,CATLG,DELETE)," - ) + jcl.append(f"//{clean_f} DD DSN=HERC01.DATA.{clean_f},DISP=(MOD,CATLG,DELETE),") jcl.append("// UNIT=SYSDA,SPACE=(TRK,(10,10),RLSE),") 
jcl.append("// DCB=(LRECL=80,RECFM=FB,BLKSIZE=800)") if clean_f: - jcl.append( - f"//{clean_f} DD DSN=HERC01.DATA.{clean_f},DISP=(MOD,CATLG,DELETE)," - ) + jcl.append(f"//{clean_f} DD DSN=HERC01.DATA.{clean_f},DISP=(MOD,CATLG,DELETE),") jcl.append("// UNIT=SYSDA,SPACE=(TRK,(10,10),RLSE),") jcl.append("// DCB=(LRECL=80,RECFM=FB,BLKSIZE=800)") @@ -133,7 +140,7 @@ def generate_build_jcl( jcl.append("//* PHASE 2: IBM COMPILER & LINKAGE EDITOR") jcl.append("//* ==========================================================") - # --- THE DIALECT SWITCH --- + # --- DIALECT ROUTING --- if dialect == "COBOL-85": jcl.append("//* 🚨 DIALECT SENSOR: COBOL-85+ DETECTED 🚨") jcl.append("//* ROUTING TO MODERN ENTERPRISE COMPILER (IGYWCL)") @@ -160,9 +167,9 @@ def generate_build_jcl( def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("MVS 3.8j COBOL Compiler Forge") + enforce_licensing_guard("MVS 3.8j COBOL Compiler Scaffolding") - parser = argparse.ArgumentParser(description="GitGalaxy COBOL Compiler Forge") + parser = argparse.ArgumentParser(description="GitGalaxy COBOL Compiler Scaffolding") parser.add_argument("source_dir", help="Path to the original COBOL source files") parser.add_argument("out_dir", help="Path to save the generated Compiler JCLs") args = parser.parse_args() @@ -173,14 +180,10 @@ def main(): sys.exit(1) out_path.mkdir(parents=True, exist_ok=True) - cobol_files = [ - f - for f in src_path.rglob("*.cbl") - if "PROGRAM-ID" in f.read_text(errors="ignore").upper() - ] + cobol_files = [f for f in src_path.rglob("*.cbl") if "PROGRAM-ID" in f.read_text(errors="ignore").upper()] print("\n" + "=" * 70) - print(" 🏗️ GITGALAXY MAINFRAME COMPILER FORGE (PRE-COMPILER ACTIVE)") + print(" 🏗️ GITGALAXY MAINFRAME COMPILER GENERATOR (PRE-COMPILER ACTIVE)") print("=" * 70) for file_path in cobol_files: @@ -188,21 +191,19 @@ def main(): raw_text = file_path.read_text(encoding="utf-8", errors="ignore") # 1. 
Flatten the copybooks - monolith_text = flatten_copybooks(raw_text, src_path) + flattened_source = flatten_copybooks(raw_text, src_path) - # 2. Sense the dialect! - dialect = detect_cobol_dialect(monolith_text) + # 2. Detect the dialect + dialect = detect_cobol_dialect(flattened_source) - # 3. Forge the JCL based on the era - prog_name, expected_files = extract_intent(monolith_text) - jcl_payload = generate_build_jcl( - monolith_text, prog_name, expected_files, dialect - ) + # 3. Generate the JCL based on the detected era + prog_name, expected_files = extract_intent(flattened_source) + jcl_payload = generate_build_jcl(flattened_source, prog_name, expected_files, dialect) output_file = out_path / f"BUILD_{prog_name}.jcl" output_file.write_text(jcl_payload, encoding="utf-8") - print(f" [+] Forged {dialect} Pipeline : {output_file.name}") + print(f" [+] Generated {dialect} Pipeline : {output_file.name}") except Exception as e: print(f" [!] Failed to process {file_path.name}: {e}") @@ -210,4 +211,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_dag_architect.py b/gitgalaxy/tools/cobol_to_cobol/cobol_dag_architect.py index d0908cef..1be0e4f9 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_dag_architect.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_dag_architect.py @@ -1,9 +1,18 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Data Lineage DAG Architect (v3 - IR Context Aware) -# Purpose: Parses COBOL structural intent to map INPUT/OUTPUT data flows -# and calculates the mathematically perfect topological execution order. -# Upgraded to utilize IR RAM to deflect Ghost Dependencies. +# GitGalaxy Tool: Data Lineage DAG Architect +# +# PURPOSE: +# Parses COBOL structural intent to map INPUT/OUTPUT data flows and calculates +# the deterministic topological execution order. 
+# +# ARCHITECTURAL DECISION: +# In legacy mainframe environments, execution order is manually dictated by JCL. +# During cloud modernization, we must programmatically derive this order to +# generate modern orchestration pipelines (e.g., Spring Batch, Airflow). By +# statically analyzing SELECT/ASSIGN clauses and OPEN statements, we build a +# Directed Acyclic Graph (DAG) of data dependencies, ensuring programs execute +# in the exact order required by their physical dataset inputs and outputs. # ============================================================================== import argparse import sys @@ -14,8 +23,8 @@ def extract_lineage(filepath: Path, dead_paras: set = None) -> dict: """ - X-Rays a COBOL program to map internal variables to external physical files. - Utilizes shared IR context to mask out dead code and prevent hallucinated dependencies. + Analyzes a COBOL program to map internal variables to external physical files. + Utilizes shared IR state to mask out unreachable logic and prevent hallucinated dependencies. """ if dead_paras is None: dead_paras = set() @@ -33,9 +42,7 @@ def extract_lineage(filepath: Path, dead_paras: set = None) -> dict: # 2. Map internal file variables to physical external boundaries (DD Names) file_map = {} - for match in re.finditer( - r"SELECT\s+([A-Z0-9-]+)\s+ASSIGN\s+(?:TO\s+)?([A-Z0-9@#$\-]+)", content - ): + for match in re.finditer(r"SELECT\s+([A-Z0-9\-]+)\s+ASSIGN\s+(?:TO\s+)?([A-Z0-9@#$\-]+)", content): raw_dd = match.group(2) clean_dd = re.sub(r"^(?:UT|UR)-S-", "", raw_dd) file_map[match.group(1)] = clean_dd @@ -43,9 +50,14 @@ def extract_lineage(filepath: Path, dead_paras: set = None) -> dict: inputs = set() outputs = set() - # --- SYNERGY: THE GHOST DEFLECTOR (Masking Dead Code) --- - # We split the file and blank out any paragraphs the orchestrator identified as dead. - # This prevents the regex engine from finding 'OPEN' statements that will never execute. 
+ # ========================================================================== + # DEFENSIVE DESIGN (UNREACHABLE LOGIC MASKING): + # COBOL programs often contain legacy, unreachable paragraphs. If we allow + # the regex engine to scan these abandoned blocks, it will extract 'OPEN' + # statements for files that are never actually utilized at runtime, creating + # false dependencies. We mask out known dead paragraphs with spaces to + # preserve the exact logic topology without triggering regex false positives. + # ========================================================================== if "PROCEDURE DIVISION" in content: parts = content.split("PROCEDURE DIVISION") data_div = parts[0] @@ -61,7 +73,6 @@ def extract_lineage(filepath: Path, dead_paras: set = None) -> dict: current_paragraph = para_match.group(1) # If the paragraph is dead, we replace its characters with spaces - # to preserve exact string geometry without triggering the regex. if current_paragraph in dead_paras: active_proc_lines.append(" " * len(line)) else: @@ -72,10 +83,8 @@ def extract_lineage(filepath: Path, dead_paras: set = None) -> dict: safe_content = content # 3. Extract exact Functional Intent (OPEN INPUT vs OPEN OUTPUT) - # We run this on the safe_content where dead code is invisible. - for match in re.finditer( - r"OPEN\s+(INPUT|OUTPUT|I-O|EXTEND)\s+([^.]+)\.", safe_content - ): + # We run this on the safe_content where unreachable logic is invisible. 
+ for match in re.finditer(r"OPEN\s+(INPUT|OUTPUT|I-O|EXTEND)\s+([^.]+)\.", safe_content): mode = match.group(1) # Handle multiple files opened on the same line files_raw = re.sub(r"\s+", " ", match.group(2)).replace(",", " ").split() @@ -89,9 +98,13 @@ def extract_lineage(filepath: Path, dead_paras: set = None) -> dict: if mode in ("OUTPUT", "I-O", "EXTEND"): outputs.add(physical_file) - # --- HONESTY SENSOR: DYNAMIC CALLS --- + # ========================================================================== + # ARCHITECTURAL ANOMALY DETECTION (DYNAMIC CALLS): + # A standard CALL followed by string quotes is a static, deterministic + # dependency. A CALL utilizing a variable is dynamic, making the + # compilation-time DAG incomplete. We flag these for architectural review. + # ========================================================================== dynamic_calls = set() - # A CALL followed by quotes is static. A CALL without quotes is a variable/dynamic jump. for match in re.finditer(r'CALL\s+(?![\'"])([A-Z0-9\-]+)', safe_content): dynamic_calls.add(match.group(1)) @@ -117,7 +130,7 @@ def main(): print(f"Error: Target {target_path} does not exist.") sys.exit(1) - print(f"🕸️ GitGalaxy DAG Architect mapping data lineage in: {target_path.name}...\n") + print(f"🕸️ GitGalaxy Data Lineage Architect mapping execution topology in: {target_path.name}...\n") cobol_files = list(target_path.rglob("*.cbl")) + list(target_path.rglob("*.cob")) @@ -168,7 +181,7 @@ def main(): # --- Presentation --- print("==========================================================") - print(" ⚡ ZERO-TRUST EXECUTION PIPELINE (TOPOLOGICAL SORT)") + print(" ⚡ DETERMINISTIC EXECUTION PIPELINE (TOPOLOGICAL SORT)") print("==========================================================\n") if len(execution_order) != len(programs): @@ -190,4 +203,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_etl_unpacker.py 
b/gitgalaxy/tools/cobol_to_cobol/cobol_etl_unpacker.py index 96160631..088a03ac 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_etl_unpacker.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_etl_unpacker.py @@ -1,8 +1,18 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: ETL Unpacker (The Data Bridge) -# Purpose: Translates binary EBCDIC mainframe files into UTF-8 CSVs, -# unpacking legacy COMP-3 (Packed Decimal) formats on the fly. +# GitGalaxy Tool: ETL EBCDIC Unpacker +# +# PURPOSE: +# Translates binary EBCDIC mainframe datasets into modern UTF-8 CSVs, decoding +# legacy COMP-3 (Packed Decimal) formats dynamically at runtime. +# +# ARCHITECTURAL DECISION: +# Mainframe data migrations typically require expensive, licensed third-party +# tooling to extract datasets from EBCDIC layouts. By leveraging the JSON +# schemas generated from our COBOL copybook extraction, this utility bridges the +# gap natively in Python. It calculates precise byte offsets to decode Zoned +# Decimals and un-packs COMP-3 nibbles directly into floating-point numerics +# for seamless ingestion into modern data lakes. # ============================================================================== import argparse import sys @@ -15,8 +25,8 @@ def calculate_byte_layout(schema_json: dict) -> list: """ - Parses the GitGalaxy JSON Schema to calculate the physical byte length - of each legacy field so we know exactly how to slice the binary file. + Parses the GitGalaxy JSON Schema to calculate the physical byte footprint + of each legacy field, determining exact binary extraction boundaries. 
""" layout = [] @@ -53,7 +63,12 @@ def count_nines(s): decimals = p_right is_numeric = True - # COMP-3 physically compresses the bytes: (digits + 1 for sign) / 2, rounded up + # ================================================================== + # COMP-3 PHYSICAL COMPRESSION: + # Packed decimal stores two digits per byte, plus one half-byte + # (nibble) for the sign at the end. The physical byte length is + # calculated as (digits + 1 for sign) / 2, rounded up to the whole byte. + # ================================================================== physical_bytes = math.ceil((length + 1) / 2) if is_comp3 else length layout.append( @@ -92,10 +107,8 @@ def unpack_comp3(raw_bytes: bytes, decimals: int) -> float: return -value if is_negative else value -def unpack_ebcdic_file( - binary_filepath: Path, schema_filepath: Path, output_filepath: Path -): - """Slices the mainframe binary file according to the calculated layout.""" +def unpack_ebcdic_file(binary_filepath: Path, schema_filepath: Path, output_filepath: Path): + """Parses the mainframe binary file according to the calculated layout.""" try: schema_json = json.loads(schema_filepath.read_text(encoding="utf-8")) except Exception as e: @@ -164,14 +177,10 @@ def unpack_ebcdic_file( def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("ETL Unpacker (The Data Bridge)") + enforce_licensing_guard("ETL EBCDIC Unpacker") - parser = argparse.ArgumentParser( - description="GitGalaxy ETL Unpacker (EBCDIC to CSV)" - ) - parser.add_argument( - "binary_file", help="The raw EBCDIC binary file from the mainframe" - ) + parser = argparse.ArgumentParser(description="GitGalaxy ETL Unpacker (EBCDIC to CSV)") + parser.add_argument("binary_file", help="The raw EBCDIC binary file from the mainframe") parser.add_argument("schema_file", help="The GitGalaxy generated _schema.json file") parser.add_argument("--out", type=str, help="Optional: Custom output CSV path") args = parser.parse_args() @@ 
-186,10 +195,10 @@ def main(): out_path = Path(args.out).resolve() if args.out else binary_path.with_suffix(".csv") print("\n" + "=" * 70) - print(" 🌉 GITGALAXY ETL UNPACKER ENGAGED") + print(" 🌉 ETL UNPACKER ENGAGED") print("=" * 70) print(f" 📦 Ingesting Binary : {binary_path.name}") - print(f" 🗺️ Mapping Schema : {schema_path.name}") + print(f" 🗺️ Mapping Schema : {schema_path.name}") records = unpack_ebcdic_file(binary_path, schema_path, out_path) @@ -199,4 +208,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_graveyard_finder.py b/gitgalaxy/tools/cobol_to_cobol/cobol_graveyard_finder.py index 6f14dc2f..64d9cd27 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_graveyard_finder.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_graveyard_finder.py @@ -1,8 +1,17 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: The Graveyard Reaper (v3 - Context & Copybook Aware) -# Purpose: Static Analysis of COBOL AST to isolate orphaned data and dead code. -# Upgraded with an Inline Copybook Expander for cross-file memory tracking. +# GitGalaxy Tool: Deprecated Trails Analyzer +# +# PURPOSE: +# Static Analysis of COBOL structural signatures to isolate unused memory +# declarations and mathematically unreachable execution logic. +# +# ARCHITECTURAL DECISION: +# Legacy COBOL architectures frequently suffer from "code rot" where memory +# addresses (Data Division) and execution blocks (Procedure Division) are +# abandoned but never removed by cautious developers. This analyzer prevents +# migrating this dead weight to the cloud by statically mapping actual execution +# usage against declarations, shedding unnecessary state flux and cognitive load. 
# ============================================================================== import argparse import sys @@ -13,9 +22,17 @@ def resolve_copybooks(content: str, source_path: Path) -> str: """ Recursively hunts for COBOL 'COPY' statements and injects the contents of the - target .cpy file directly into the memory string to ensure accurate AST scanning. - Handles dynamic variable swapping via the REPLACING clause. + target .cpy file directly into the memory string to ensure accurate structural scanning. """ + # ========================================================================== + # DEFENSIVE DESIGN (INLINE COPYBOOK EXPANSION): + # A COBOL program's data declarations are often hidden inside external copybooks. + # Scanning the source file alone would result in massive false-positives for + # undeclared variables. We recursively expand and inline copybooks directly + # into the memory buffer before analysis to ensure mathematically accurate + # dependency tracking. + # ========================================================================== + # Matches: COPY NAME. or COPY NAME REPLACING ==A== BY ==B==. copy_pattern = re.compile( r'^[ \t]*COPY\s+[\'"]?([A-Z0-9_\-]+)[\'"]?(?:\s+REPLACING\s+(.+?))?\.', @@ -29,12 +46,14 @@ def replacer(match): for ext in [".cpy", ".cbl", ".cob", ".CPY"]: cpy_file = source_path.parent / f"{copy_name}{ext}" if cpy_file.exists(): - cpy_content = cpy_file.read_text( - encoding="utf-8", errors="ignore" - ).upper() - - # --- THE SHAPESHIFTER FIX --- - # If a REPLACING clause exists, parse the ==OLD== BY ==NEW== pairs and apply them + cpy_content = cpy_file.read_text(encoding="utf-8", errors="ignore").upper() + + # ============================================================== + # DEFENSIVE DESIGN (DYNAMIC ALIASING): + # COBOL's 'REPLACING' clause allows dynamic text substitution at + # compile time. We must simulate this substitution in our in-memory + # buffer to prevent missing usage references for aliased variables. 
+ # ============================================================== if replacing_clause: # Extracts pairs, ignoring the optional == delimiters pairs = re.findall( @@ -45,9 +64,7 @@ def replacer(match): for old_val, new_val in pairs: # Use negative lookarounds so we don't accidentally replace partial words with hyphens cpy_content = re.sub( - r"(? dict: except Exception: return None - # --- SYNERGY: THE COPYBOOK EXPANDER --- - # Resolve all external memory layouts into the local string before AST math + # Resolve all external memory layouts into the local string before structural validation content = resolve_copybooks(raw_content, filepath) # COBOL is strictly divided. We need to split the data from the execution. @@ -85,7 +101,7 @@ def x_ray_dead_code(filepath: Path) -> dict: proc_div = parts[1] # ========================================== - # 1. HUNTING ORPHANED DATA + # 1. ISOLATING UNUSED MEMORY ADDRESSES # ========================================== # Look for COBOL variable declarations (Levels 01-49, 77, 88) # Bypassing Area A sequence numbers by allowing up to 11 leading spaces/chars. @@ -109,7 +125,7 @@ def x_ray_dead_code(filepath: Path) -> dict: orphaned_vars = declared_vars - used_vars # ========================================== - # 2. HUNTING PHANTOM PARAGRAPHS (Dead Code) + # 2. ISOLATING UNREACHABLE LOGIC BLOCKS # ========================================== # Paragraphs usually start near the margin and end with a period. 
para_pattern = re.compile(r"^[ \t]{0,11}([A-Z0-9\-]+)\.[ \t]*$", re.MULTILINE) @@ -126,7 +142,7 @@ def x_ray_dead_code(filepath: Path) -> dict: reached_paragraphs = {entry_point}.union(called_targets) declared_paragraphs = set(paragraphs) - # The Math: Dead code is anything declared but never explicitly called + # The Math: Unreachable logic is anything declared but never explicitly called dead_paragraphs = declared_paragraphs - reached_paragraphs # Ignore system paragraphs and generic loop ends (like *-EXIT) @@ -149,9 +165,9 @@ def x_ray_dead_code(filepath: Path) -> dict: def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("The Graveyard Reaper") + enforce_licensing_guard("Deprecated Trails Analyzer") - parser = argparse.ArgumentParser(description="GitGalaxy Graveyard Reaper v3") + parser = argparse.ArgumentParser(description="GitGalaxy Deprecated Trails Analyzer") parser.add_argument("target", help="Directory containing legacy COBOL payloads") args = parser.parse_args() @@ -160,7 +176,7 @@ def main(): print(f"Error: Target {target_path} does not exist.") sys.exit(1) - print(f"🪦 GitGalaxy Reaper scanning {target_path.name} for dead code...\n") + print(f"🔍 GitGalaxy Deprecated Trails Analyzer scanning {target_path.name} for obsolete logic...\n") cobol_files = list(target_path.rglob("*.cbl")) + list(target_path.rglob("*.cob")) @@ -182,26 +198,26 @@ def main(): print(f" 🎯 TARGET: {metrics['program_id']}") if metrics["orphaned_vars"]: print( - f" ↳ Orphaned Variables ({len(metrics['orphaned_vars'])}): {', '.join(list(metrics['orphaned_vars'])[:5])}" + f" ↳ Unused Memory Addresses ({len(metrics['orphaned_vars'])}): {', '.join(list(metrics['orphaned_vars'])[:5])}" + ("..." 
if len(metrics["orphaned_vars"]) > 5 else "") ) if metrics["dead_paras"]: print( - f" ↳ Phantom Paragraphs ({len(metrics['dead_paras'])}): {', '.join(list(metrics['dead_paras'])[:5])}" + f" ↳ Unreachable Logic Blocks ({len(metrics['dead_paras'])}): {', '.join(list(metrics['dead_paras'])[:5])}" + ("..." if len(metrics["dead_paras"]) > 5 else "") ) print("-" * 60) # Presentation print("\n==========================================================") - print(" 📉 DEAD CODE ELIMINATION REPORT") + print(" 📉 DEPRECATED TRAILS REDUCTION REPORT") print("==========================================================") print(f" Files Flagged for Cleanup : {totals['files_with_dead_code']}") - print(f" Unused Memory Addresses : {totals['orphaned_vars']} orphaned variables") - print(f" Unreachable Logic Blocks : {totals['dead_paras']} phantom paragraphs") + print(f" Unused Memory Addresses : {totals['orphaned_vars']} variables") + print(f" Unreachable Logic Blocks : {totals['dead_paras']} paragraphs") print(f" ✂️ Estimated Bloat Removed : ~{totals['loc_saved']} Lines of Code") print("==========================================================\n") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_jcl_auditor.py b/gitgalaxy/tools/cobol_to_cobol/cobol_jcl_auditor.py index 836bb092..33d89cf9 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_jcl_auditor.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_jcl_auditor.py @@ -1,8 +1,18 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Zero-Trust Meta Auditor (v5 - CLI Enabled) -# Purpose: Compares forged JCLs against original IBM legacy JCLs to calculate -# exact code bloat reduction and over-permissioned I/O shedding. 
+# GitGalaxy Tool: Zero-Trust JCL Auditor +# +# PURPOSE: +# Compares generated Cloud JCLs against original IBM legacy JCLs to calculate +# exact code reduction and quantify the shedding of over-permissioned I/O. +# +# ARCHITECTURAL DECISION: +# Over decades, mainframe Job Control Language (JCL) scripts accumulate "ghost" +# Data Definition (DD) statements—files that are allocated to a job step but +# never actually opened or utilized by the compiled COBOL program. This violates +# the principle of least privilege. This auditor mathematically proves the +# security posture of the modernized architecture by comparing the legacy +# footprint against the newly generated, zero-trust equivalents. # ============================================================================== import argparse import json @@ -33,7 +43,7 @@ def parse_jcl_intent(filepath: Path) -> dict: - """Parses a JCL file to extract its raw execution intent.""" + """Parses a JCL file to extract its raw execution and dataset allocation intent.""" metrics = {"lines_of_code": 0, "exec_pgms": set(), "data_definitions": set()} pgm_pattern = re.compile(r"EXEC\s+(?:PGM=)?([A-Z0-9@#$\-]+)", re.IGNORECASE) dd_pattern = re.compile(r"^//([A-Z0-9@#$\-]+)\s+DD\s+", re.IGNORECASE) @@ -62,59 +72,52 @@ def parse_jcl_intent(filepath: Path) -> dict: return metrics -def audit_zero_trust_jcls(forged_dir: Path, original_dir: Path) -> dict: - """Core logic to fetch bloat metrics.""" - legacy_jcls = list(original_dir.rglob("*.[jJ][cC][lL]")) + list( - original_dir.rglob("*.txt") - ) +def audit_zero_trust_jcls(generated_dir: Path, original_dir: Path) -> dict: + """Core logic to calculate architectural bloat and privilege reduction metrics.""" + legacy_jcls = list(original_dir.rglob("*.[jJ][cC][lL]")) + list(original_dir.rglob("*.txt")) legacy_map = {} # 1. 
Map Legacy JCLs by Intent (Handling multi-step monoliths) for lj in legacy_jcls: - if forged_dir in lj.parents: + if generated_dir in lj.parents: continue metrics = parse_jcl_intent(lj) for pgm in metrics["exec_pgms"]: # If multiple legacy JCLs call the same program, keep the biggest one - if pgm not in legacy_map or metrics.get("lines_of_code", 0) > legacy_map[ - pgm - ].get("lines_of_code", 0): + if pgm not in legacy_map or metrics.get("lines_of_code", 0) > legacy_map[pgm].get("lines_of_code", 0): legacy_map[pgm] = {"file": lj, "metrics": metrics} - # 2. Compare against Forged JCLs - forged_files = list(forged_dir.glob("*.jcl")) + # 2. Compare against Generated (Zero-Trust) JCLs + generated_files = list(generated_dir.glob("*.jcl")) report = { "audited": 0, "original_loc": 0, - "forged_loc": 0, + "forged_loc": 0, # Maintained key for downstream DB compatibility "excess_dds_blocked": 0, "program_breakdown": {}, } - for forged_file in forged_files: - forged_metrics = parse_jcl_intent(forged_file) - if not forged_metrics["exec_pgms"]: + for generated_file in generated_files: + generated_metrics = parse_jcl_intent(generated_file) + if not generated_metrics["exec_pgms"]: continue - pgm_name = list(forged_metrics["exec_pgms"])[0] + pgm_name = list(generated_metrics["exec_pgms"])[0] if pgm_name in legacy_map: twin_metrics = legacy_map[pgm_name]["metrics"] - loc_saved = max( - 0, twin_metrics["lines_of_code"] - forged_metrics["lines_of_code"] - ) + loc_saved = max(0, twin_metrics["lines_of_code"] - generated_metrics["lines_of_code"]) + + # The exact number of datasets allocated in legacy but stripped from modern excess_dds = max( 0, - len( - twin_metrics["data_definitions"] - - forged_metrics["data_definitions"] - ), + len(twin_metrics["data_definitions"] - generated_metrics["data_definitions"]), ) report["audited"] += 1 report["original_loc"] += twin_metrics["lines_of_code"] - report["forged_loc"] += forged_metrics["lines_of_code"] + report["forged_loc"] += 
generated_metrics["lines_of_code"] report["excess_dds_blocked"] += excess_dds report["program_breakdown"][pgm_name] = { @@ -125,8 +128,7 @@ def audit_zero_trust_jcls(forged_dir: Path, original_dir: Path) -> dict: if report["original_loc"] > 0: report["bloat_reduction_pct"] = round( - ((report["original_loc"] - report["forged_loc"]) / report["original_loc"]) - * 100, + ((report["original_loc"] - report["forged_loc"]) / report["original_loc"]) * 100, 1, ) else: @@ -138,15 +140,11 @@ def audit_zero_trust_jcls(forged_dir: Path, original_dir: Path) -> dict: def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("Zero-Trust Meta Auditor") + enforce_licensing_guard("Zero-Trust JCL Auditor") - parser = argparse.ArgumentParser( - description="GitGalaxy Zero-Trust Meta Auditor (v5)" - ) - parser.add_argument("forged", help="Directory containing the forged GitGalaxy JCLs") - parser.add_argument( - "legacy", help="Directory containing the original legacy IBM JCLs" - ) + parser = argparse.ArgumentParser(description="GitGalaxy Zero-Trust JCL Auditor") + parser.add_argument("generated", help="Directory containing the modernized GitGalaxy JCLs") + parser.add_argument("legacy", help="Directory containing the original legacy IBM JCLs") parser.add_argument( "--json", action="store_true", @@ -154,53 +152,49 @@ def main(): ) args = parser.parse_args() - forged_path = Path(args.forged).resolve() + generated_path = Path(args.generated).resolve() legacy_path = Path(args.legacy).resolve() - if not forged_path.exists() or not legacy_path.exists(): + if not generated_path.exists() or not legacy_path.exists(): print("\n[!] 
ERROR: One or both directories do not exist.") sys.exit(1) # Run the audit - report = audit_zero_trust_jcls(forged_path, legacy_path) + report = audit_zero_trust_jcls(generated_path, legacy_path) # Output routing if args.json: print(json.dumps(report, indent=2)) sys.exit(0) - # CLI Terminal Vibe + # CLI Terminal Output print("\n==============================================================") - print(" 🛡️ GitGalaxy Spoke: Zero-Trust Meta Auditor (v5)") + print(" 🛡️ GitGalaxy Tool: Zero-Trust JCL Auditor") print("==============================================================") - print(f" [*] Forged Dir : {forged_path.name}") - print(f" [*] Legacy Root : {legacy_path.name}") + print(f" [*] Modernized Dir : {generated_path.name}") + print(f" [*] Legacy Root : {legacy_path.name}") print("--------------------------------------------------------------") if report["audited"] == 0: print(" [!] No matching execution intents found between the directories.") - print(" Ensure your forged JCLs share PROGRAM-IDs with the legacy corpus.") + print(" Ensure your generated JCLs share PROGRAM-IDs with the legacy corpus.") else: print(" PROGRAM BREAKDOWN:") for pgm, data in report["program_breakdown"].items(): loc = str(data["loc_saved"]).rjust(4) io = str(data["io_blocked"]).rjust(2) - print( - f" [+] {pgm.ljust(10)} | LOC Saved: {loc} | I/O Blocked: {io} | Ref: {data['legacy_file']}" - ) + print(f" [+] {pgm.ljust(10)} | LOC Saved: {loc} | I/O Blocked: {io} | Ref: {data['legacy_file']}") print("--------------------------------------------------------------") print(" 📊 FINAL AUDIT METRICS:") print(f" > Programs Audited : {report['audited']}") print(f" > Original Legacy LOC : {report['original_loc']}") - print(f" > GitGalaxy Forged LOC : {report['forged_loc']}") + print(f" > GitGalaxy Zero-Trust LOC: {report['forged_loc']}") print(f" > Bloat Reduction : {report['bloat_reduction_pct']}%") - print( - f" > Over-Permissioned I/O : {report['excess_dds_blocked']} Boundaries Shed" - ) + 
print(f" > Over-Permissioned I/O : {report['excess_dds_blocked']} Dataset Boundaries Shed") print("==============================================================\n") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_jcl_forge.py b/gitgalaxy/tools/cobol_to_cobol/cobol_jcl_forge.py index 0e9f6cdc..70fb5348 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_jcl_forge.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_jcl_forge.py @@ -1,6 +1,17 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Zero-Trust JCL Forge (v5 - Hygienic Defaults) +# GitGalaxy Tool: Zero-Trust JCL Generator +# +# PURPOSE: +# Generates deterministic Job Control Language (JCL) scripts directly from +# extracted COBOL structural intent. +# +# ARCHITECTURAL DECISION: +# Modernizing mainframe workloads requires mapping legacy file assignments +# (SELECT/ASSIGN) to modern cloud infrastructure. This generator enforces +# a Zero-Trust data allocation model by strictly matching the JCL definitions +# to the mathematically verified I/O intent extracted during static analysis, +# preventing over-permissioned datasets. # ============================================================================== import argparse import sys @@ -10,6 +21,7 @@ def analyze_cobol_intent(filepath: Path) -> dict: + """Extracts execution intent and data boundaries from legacy source code.""" intent = { "program_id": "UNKNOWN", "files_requested": [], @@ -22,7 +34,7 @@ def analyze_cobol_intent(filepath: Path) -> dict: try: raw_content = filepath.read_text(encoding="utf-8", errors="ignore") - # 1. THE FLATTENER: Strip punch-card formatting + # 1. 
FORMAT NORMALIZER: Strip legacy margin formatting clean_lines = [] for line in raw_content.splitlines(): if len(line) > 6 and line[6] in ("*", "/"): @@ -32,9 +44,7 @@ def analyze_cobol_intent(filepath: Path) -> dict: monolith_code = " ".join(clean_lines) # 2. EXTRACT PROGRAM-ID (With fallback to file name) - prog_id_match = re.search( - r'PROGRAM-ID\.\s+[\'"]?([A-Z0-9\-]+)[\'"]?', monolith_code, re.IGNORECASE - ) + prog_id_match = re.search(r'PROGRAM-ID\.\s+[\'"]?([A-Z0-9\-]+)[\'"]?', monolith_code, re.IGNORECASE) if prog_id_match: intent["program_id"] = prog_id_match.group(1).strip() else: @@ -49,28 +59,22 @@ def analyze_cobol_intent(filepath: Path) -> dict: internal_name = match.group(1).strip() raw_dd = match.group(2).strip() clean_dd = re.sub(r"^(?:UT|UR)-S-", "", raw_dd) - intent["files_requested"].append( - {"internal": internal_name, "dd_name": clean_dd} - ) + intent["files_requested"].append({"internal": internal_name, "dd_name": clean_dd}) # 4. TRANSACTIONAL I/O: Detect EXEC CICS blocks - cics_matches = re.findall( - r"EXEC\s+CICS.*?END-EXEC\.", monolith_code, re.IGNORECASE - ) + cics_matches = re.findall(r"EXEC\s+CICS.*?END-EXEC\.", monolith_code, re.IGNORECASE) if cics_matches: intent["is_cics"] = True intent["cics_calls"] = len(cics_matches) # 5. DATABASE I/O: Detect EXEC SQL blocks - sql_matches = re.findall( - r"EXEC\s+SQL.*?END-EXEC\.", monolith_code, re.IGNORECASE - ) + sql_matches = re.findall(r"EXEC\s+SQL.*?END-EXEC\.", monolith_code, re.IGNORECASE) if sql_matches: intent["is_db2"] = True intent["sql_calls"] = len(sql_matches) except Exception as e: - print(f" [!] Intent Forge Error on {filepath.name}: {e}") + print(f" [!] 
Intent Extraction Error on {filepath.name}: {e}") return intent @@ -82,17 +86,18 @@ def generate_zero_trust_jcl( lineage: dict = None, corporate_header: str = "", ) -> str: + """Generates a strict, permission-bounded JCL deployment script.""" if lineage is None: lineage = {"inputs": set(), "outputs": set()} prog_name = intent["program_id"] jcl = [] - jcl.append(f"//{job_name} JOB ({account_code}),'GITGALAXY FORGE',") + jcl.append(f"//{job_name} JOB ({account_code}),'GITGALAXY GENERATOR',") jcl.append("// CLASS=A,MSGCLASS=A,MSGLEVEL=(1,1),") jcl.append("// USER=HERC01,PASSWORD=CUL8TR,") jcl.append("// TIME=10,REGION=4M") jcl.append("//* ==========================================================") - jcl.append("//* AUTOGENERATED BY GITGALAXY ZERO-TRUST FORGE") + jcl.append("//* AUTOGENERATED BY GITGALAXY ZERO-TRUST GENERATOR") arch_flags = [] if intent["is_cics"]: @@ -132,6 +137,7 @@ def generate_zero_trust_jcl( jcl.append("// DCB=(LRECL=80,RECFM=FB,BLKSIZE=800)") else: jcl.append(f"// DISP={disp}") + # DEFENSIVE DESIGN: Flag datasets missing explicit initialization logic if "SHR" in disp and raw_dd not in lineage.get("inputs", set()): jcl.append(f"//* WARNING: NO EXPLICIT OPEN INTENT FOR {dd}") jcl.append("//") @@ -139,18 +145,18 @@ def generate_zero_trust_jcl( def main(): - parser = argparse.ArgumentParser(description="GitGalaxy Zero-Trust JCL Forge (v5)") + from gitgalaxy.licensing import enforce_licensing_guard + + enforce_licensing_guard("Zero-Trust JCL Generator") + + parser = argparse.ArgumentParser(description="GitGalaxy Zero-Trust JCL Generator (v5)") parser.add_argument("target", help="Target directory or specific COBOL file") - parser.add_argument( - "--job", default="GITGJOB", help="Job name for the generated JCL" - ) - parser.add_argument( - "--acct", default="12345", help="Account code for the generated JCL" - ) + parser.add_argument("--job", default="GITGJOB", help="Job name for the generated JCL") + parser.add_argument("--acct", default="12345", 
help="Account code for the generated JCL") parser.add_argument( "--out", type=str, - help="Output directory (default: hygienic timestamped folder)", + help="Output directory (default: isolated timestamped folder)", ) args = parser.parse_args() @@ -168,22 +174,20 @@ def main(): print(f"\n[!] ERROR: No COBOL source files found in: {target_path}") sys.exit(1) - # --- THE HYGIENIC DEFAULT LOGIC --- + # --- ISOLATED OUTPUT ROUTING --- if args.out: out_dir = Path(args.out).resolve() else: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if target_path.is_dir(): - # e.g., cics-genapp -> cics-genapp_forged_20260425_161559 - out_dir = target_path.parent / f"{target_path.name}_forged_{timestamp}" + out_dir = target_path.parent / f"{target_path.name}_generated_{timestamp}" else: - out_dir = target_path.parent / f"forged_jcls_{timestamp}" + out_dir = target_path.parent / f"generated_jcls_{timestamp}" - # Create the directory out_dir.mkdir(parents=True, exist_ok=True) print("\n==============================================================") - print(" 🚀 GitGalaxy Spoke: Zero-Trust JCL Forge (v5)") + print(" 🚀 GitGalaxy Tool: Zero-Trust JCL Generator (v5)") print("==============================================================") print(f" [*] Target Path : {target_path}") print(f" [*] Output Dir : {out_dir}") @@ -195,7 +199,7 @@ def main(): intent = analyze_cobol_intent(file_path) jcl_output = generate_zero_trust_jcl(intent, args.job[:8].upper(), args.acct) - # Output strictly locked to the designated hygienic folder + # Output strictly locked to the designated isolated folder out_path = out_dir / f"{intent['program_id']}.jcl" out_path.write_text(jcl_output, encoding="utf-8") @@ -209,15 +213,13 @@ def main(): io_str = f"({', '.join(io_parts)})" if io_parts else "(No I/O)" - print( - f" [+] Forged: {file_path.name.ljust(15)} -> {out_path.name.ljust(15)} {io_str}" - ) + print(f" [+] Generated: {file_path.name.ljust(15)} -> {out_path.name.ljust(15)} {io_str}") success_count += 1 
print("--------------------------------------------------------------") - print(f" [✓] Done! Successfully forged {success_count} JCL(s).") + print(f" [✓] Done! Successfully generated {success_count} JCL(s).") print("==============================================================\n") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_lexical_patcher.py b/gitgalaxy/tools/cobol_to_cobol/cobol_lexical_patcher.py index 3ae58cd4..b6c8ea51 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_lexical_patcher.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_lexical_patcher.py @@ -1,8 +1,17 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Lexical Patcher (Pre-Processor) -# Purpose: Neutralizes legacy COBOL structural traps by restructuring them into -# modern equivalents, protected by a Dialect Sensor to prevent 0C1 crashes. +# GitGalaxy Tool: Lexical Patcher (Pre-Processor) +# +# PURPOSE: +# Neutralizes legacy COBOL flow control anomalies by safely restructuring +# them into deterministic equivalents. +# +# ARCHITECTURAL DECISION: +# Legacy constructs like 'NEXT SENTENCE' create opaque execution jumps that +# break modern Abstract Syntax Trees and topological mapping. This module +# intercepts these anomalies and rewrites them into explicit scope terminators +# (CONTINUE), protected by a Dialect Sensor to ensure backward-compatibility +# with strict COBOL-74 environments. # ============================================================================== import re from pathlib import Path @@ -10,7 +19,7 @@ def detect_cobol_dialect(content: str) -> str: """ - Scans for post-1974 structural keywords to determine the compiler era. + Scans for post-1974 structural signatures to determine the compiler era. 
""" # COBOL-85 introduced explicit scope terminators, EVALUATE, INITIALIZE, and inline comments (*>) modern_signatures = re.compile( @@ -25,8 +34,8 @@ def detect_cobol_dialect(content: str) -> str: def patch_lexical_traps(filepath: Path) -> bool: """ - Scans the file for NEXT SENTENCE. If found, rewrites it safely based on the compiler dialect. - Returns True if the file was modified, False otherwise. + Scans the file for NEXT SENTENCE. If found, rewrites it safely based on the + compiler dialect. Returns True if the file was modified, False otherwise. """ try: content = filepath.read_text(encoding="utf-8", errors="ignore") @@ -34,38 +43,32 @@ def patch_lexical_traps(filepath: Path) -> bool: print(f"Error reading {filepath.name}: {e}") return False - # Fast check before engaging heavy regex + # DEFENSIVE DESIGN: Fast substring check before engaging the heavy regex engine if not re.search(r"\bNEXT\s+SENTENCE\b", content, re.IGNORECASE): return False - # 1. Sense the Environment + # 1. Sense the Execution Environment dialect = detect_cobol_dialect(content) - # 2. Apply Era-Appropriate Patches + # 2. Apply Era-Appropriate Lexical Patches if dialect == "COBOL-85": # Safe to use modern block-scoped CONTINUE and inline comments patched_content = re.sub( r"\bNEXT\s+SENTENCE\b", - "CONTINUE *> GitGalaxy Patch: Neutralized Lexical Trap", + "CONTINUE *> GitGalaxy Patch: Neutralized Flow Control Anomaly", content, flags=re.IGNORECASE, ) - print( - f" ↳ [!] {dialect} Detected: Safely upgraded NEXT SENTENCE to CONTINUE." - ) + print(f" ↳ [!] {dialect} Detected: Safely upgraded NEXT SENTENCE to CONTINUE.") else: - # COBOL-74 Strict Mode: We must leave it as NEXT SENTENCE to prevent compiler strokes. - # We rewrite it cleanly to ensure standard spacing for the AST slicer, but NO modern syntax. - patched_content = re.sub( - r"\bNEXT\s+SENTENCE\b", "NEXT SENTENCE", content, flags=re.IGNORECASE - ) - print( - f" ↳ [!] {dialect} Detected: Engaged ultra-conservative punch-card mode. 
Bypassing modern injection." - ) + # COBOL-74 Strict Mode: We must leave it as NEXT SENTENCE to prevent compilation failures. + # We rewrite it cleanly to ensure standard spacing for the extraction slicer, but avoid modern injection. + patched_content = re.sub(r"\bNEXT\s+SENTENCE\b", "NEXT SENTENCE", content, flags=re.IGNORECASE) + print(f" ↳ [!] {dialect} Detected: Engaged strict legacy compliance mode. Bypassing modern injection.") - # Save the sanitized code back to the file if changes were made + # Save the sanitized code back to the file if structural changes were made if content != patched_content: filepath.write_text(patched_content, encoding="utf-8") return True - return False + return False \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_microservice_slicer.py b/gitgalaxy/tools/cobol_to_cobol/cobol_microservice_slicer.py index 0cebd3ce..0d9119a8 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_microservice_slicer.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_microservice_slicer.py @@ -1,8 +1,19 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: The Microservice Slicer (v3 - IR Context Aware) -# Purpose: Recursive taint-tracking and business rule extraction for legacy COBOL. -# Upgraded to utilize in-memory IR state (RAM) to bypass dead logic. +# GitGalaxy Tool: Microservice Logic Extractor +# +# PURPOSE: +# Recursive taint-tracking and business rule extraction for legacy COBOL. +# Utilizes in-memory IR state (RAM) to bypass unreachable logic blocks. +# +# ARCHITECTURAL DECISION: +# Legacy COBOL programs are typically monolithic, making it difficult to +# extract specific business rules for microservice decomposition. This +# extractor performs data flow taint-tracking starting from a target +# variable, mapping its aliases through MOVE, ADD, and COMPUTE statements. 
+# By integrating with the Deprecated Trails Analyzer's IR state, it +# guarantees that extracted logic only comes from mathematically reachable +# execution paths, eliminating false-positive business rules. # ============================================================================== import argparse import sys @@ -10,12 +21,10 @@ from pathlib import Path -def slice_business_logic( - filepath: Path, initial_var: str, dead_paras: set = None, orphaned_vars: set = None -): +def slice_business_logic(filepath: Path, initial_var: str, dead_paras: set = None, orphaned_vars: set = None): """ Recursively tracks a variable and its aliases through the AST. - Utilizes shared IR context to prevent hallucinating logic inside dead code. + Utilizes shared IR context to prevent mapping logic inside unreachable code. """ if dead_paras is None: dead_paras = set() @@ -24,9 +33,12 @@ def slice_business_logic( initial_var = initial_var.upper() - # --- SYNERGY 1: ORPHANED MEMORY ABORT --- - # If the target variable is already known to be dead memory from the Graveyard Reaper, - # we can abort the slice immediately. It has no business logic. + # ========================================================================== + # DEFENSIVE DESIGN (UNUSED MEMORY ABORT): + # If the target variable is already known to be dead memory from the + # Deprecated Trails Analyzer, we can abort the slice immediately. It has + # no active business logic associated with it. 
+ # ========================================================================== if initial_var in orphaned_vars: return [], {initial_var: "ORPHANED_MEMORY"} @@ -45,7 +57,7 @@ def slice_business_logic( para_pattern = re.compile(r"^[ \t]{0,7}([A-Z0-9\-]+)\.[ \t]*$") # ========================================================================== - # PASS 1: Recursive Taint Mapping (The Alias Engine) + # PASS 1: Recursive Taint Mapping (Data Flow Engine) # ========================================================================== # We loop 3 times to catch chained aliases (e.g., A -> B -> C) for _ in range(3): @@ -61,9 +73,11 @@ def slice_business_logic( current_paragraph = para_match.group(1) continue - # --- SYNERGY 2: THE GHOST DEFLECTOR --- + # ================================================================== + # DEFENSIVE DESIGN (UNREACHABLE LOGIC MASKING): # If the orchestrator's IR state tells us this paragraph is unreachable, - # we skip it. This prevents dead code from creating false-positive taints. + # we skip it. This prevents deprecated code from creating false-positive taints. + # ================================================================== if current_paragraph in dead_paras: continue @@ -90,7 +104,7 @@ def slice_business_logic( tainted_vars.update(vars_in_eq) # ========================================================================== - # PASS 2: Extraction + # PASS 2: Logic Extraction # ========================================================================== extracted_logic = [] current_paragraph = "MAIN-ENTRY" @@ -105,8 +119,10 @@ def slice_business_logic( current_paragraph = para_match.group(1) continue - # --- SYNERGY 3: EXTRACTION SHIELD --- + # ====================================================================== + # DEFENSIVE DESIGN (EXTRACTION SHIELD): # Do not extract text from paragraphs that are mathematically unreachable. 
+ # ====================================================================== if current_paragraph in dead_paras: continue @@ -126,9 +142,9 @@ def slice_business_logic( def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("Microservice Slicer (The Legacy Forge)") + enforce_licensing_guard("Microservice Logic Extractor") - parser = argparse.ArgumentParser(description="GitGalaxy Microservice Slicer v3") + parser = argparse.ArgumentParser(description="GitGalaxy Microservice Logic Extractor v3") parser.add_argument("target", help="Path to a .cbl file to slice") parser.add_argument("--var", required=True, help="The target variable to track") args = parser.parse_args() @@ -138,27 +154,21 @@ def main(): print(f"Error: Target {target_path} does not exist.") sys.exit(1) - print( - f"🔪 GitGalaxy Slicer hunting aliases for [{args.var.upper()}] in {target_path.name}...\n" - ) + print(f"🔪 GitGalaxy Logic Extractor tracing dependencies for [{args.var.upper()}] in {target_path.name}...\n") # When run in standalone CLI mode, it won't have the IR RAM context, # but the function signature safely defaults to empty sets. result = slice_business_logic(target_path, args.var) if not result: - print( - f"⚠️ Variable {args.var.upper()} is never mutated in the PROCEDURE DIVISION." - ) + print(f"⚠️ Variable {args.var.upper()} is never mutated in the PROCEDURE DIVISION.") sys.exit(0) logic_slice, aliases = result if isinstance(aliases, dict) and "ORPHANED_MEMORY" in aliases.values(): print("==========================================================") - print( - f" 🪦 ABORTED: Variable [{args.var.upper()}] is mathematically dead memory." 
- ) + print(f" 🪦 ABORTED: Variable [{args.var.upper()}] is mathematically dead memory.") print("==========================================================") sys.exit(0) @@ -175,9 +185,9 @@ def main(): print(f" Line {item['line_num']:04d} | {item['statement']}") print("\n==========================================================") - print(f" 🎯 Sliced {len(logic_slice)} distinct business rules.") + print(f" 🎯 Extracted {len(logic_slice)} distinct business rules.") print("==========================================================\n") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_schema_forge.py b/gitgalaxy/tools/cobol_to_cobol/cobol_schema_forge.py index b190bb18..18908892 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_schema_forge.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_schema_forge.py @@ -1,8 +1,18 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: The Cloud Schema Forge -# Purpose: Translates legacy COBOL byte-maps (PIC / COMP-3) into -# modern PostgreSQL DDL and JSON schemas. +# GitGalaxy Tool: Cloud Schema Generator +# +# PURPOSE: +# Translates legacy COBOL byte-maps (PIC / COMP-3) into modern PostgreSQL DDL +# and JSON schemas. +# +# ARCHITECTURAL DECISION: +# Mainframe data structures are defined by absolute byte boundaries and packed +# decimal (COMP-3) storage. Cloud databases operate on dynamic types (VARCHAR, +# DECIMAL, BIGINT). This generator maps the legacy PIC clauses to their exact +# modern equivalents. By utilizing the IR state from the Deprecated Trails +# Analyzer, it actively drops abandoned memory declarations, ensuring the new +# cloud schemas are free of legacy bloat. 
# ============================================================================== import argparse import sys @@ -54,8 +64,8 @@ def count_nines(s): def forge_schemas(filepath: Path, ignore_vars: set = None, corporate_header: str = ""): """ - X-Rays a COBOL/Copybook file and forges the modern schemas. - Upgraded to utilize shared IR context to drop dead memory addresses. + Analyzes a COBOL/Copybook file and generates modern schemas. + Upgraded to utilize shared IR context to drop unused memory addresses. """ if ignore_vars is None: ignore_vars = set() @@ -104,26 +114,25 @@ def forge_schemas(filepath: Path, ignore_vars: set = None, corporate_header: str if not pic: continue - # --- SYNERGY: THE BLOAT CUTTER --- - # Instantly drop the variable if the Graveyard Reaper proved it is dead memory. + # ====================================================================== + # DEFENSIVE DESIGN (DEPRECATED TRAILS EXCLUSION): + # Instantly drop the variable if the Deprecated Trails Analyzer proved + # it is dead memory, preventing cloud database bloat. + # ====================================================================== if name in ignore_vars: continue safe_name = name.replace("-", "_") types = parse_cobol_picture(pic) - # --- HONESTY SENSOR: DYNAMIC MEMORY ARRAY --- + # ====================================================================== + # ARCHITECTURAL ANOMALY (DYNAMIC MEMORY ARRAY): # match.group(0) grabs the full matched string from the regex - warning = ( - " -- ⚠️ WARNING: OCCURS DEPENDING ON detected. Use JSONB." - if "DEPENDING ON" in match.group(0) - else "" - ) + # ====================================================================== + warning = " -- ⚠️ WARNING: OCCURS DEPENDING ON detected. Use JSONB." 
if "DEPENDING ON" in match.group(0) else "" # Add notes if it's a legacy packed decimal - comment = ( - " -- Legacy: COMP-3 (Packed Decimal)" if usage and "COMP-3" in usage else "" - ) + comment = " -- Legacy: COMP-3 (Packed Decimal)" if usage and "COMP-3" in usage else "" columns.append(f" {safe_name.ljust(30)} {types['sql']}{comment}{warning}") json_properties[safe_name] = { @@ -140,12 +149,12 @@ def forge_schemas(filepath: Path, ignore_vars: set = None, corporate_header: str lines = corporate_header.strip().split("\n") sql_header = "-- " + ("\n-- ".join(lines)) + "\n\n" - # Forge PostgreSQL DDL + # Generate PostgreSQL DDL sql_ddl = sql_header + f"CREATE TABLE {table_name} (\n" sql_ddl += ",\n".join(columns) sql_ddl += "\n);" - # Forge JSON Schema + # Generate JSON Schema json_schema = { "$schema": "http://json-schema.org/draft-07/schema#", "title": table_name, @@ -159,9 +168,9 @@ def forge_schemas(filepath: Path, ignore_vars: set = None, corporate_header: str def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("Cloud Schema Forge (The Legacy Forge)") + enforce_licensing_guard("Cloud Schema Generator") - parser = argparse.ArgumentParser(description="GitGalaxy Cloud Schema Forge") + parser = argparse.ArgumentParser(description="GitGalaxy Cloud Schema Generator") parser.add_argument("target", help="Path to a .cbl or .cpy file to translate") parser.add_argument( "--format", @@ -175,7 +184,7 @@ def main(): if not target_path.exists(): print(f"Error: Target {target_path} does not exist.") sys.exit(1) - print(f"🔨 GitGalaxy Schema Forge striking anvil for: {target_path.name}...\n") + print(f"🔨 GitGalaxy Cloud Schema Generator processing: {target_path.name}...\n") # In standalone CLI mode, IR context defaults to an empty set. 
schemas = forge_schemas(target_path) @@ -200,4 +209,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_cobol/cobol_system_limits_reporter.py b/gitgalaxy/tools/cobol_to_cobol/cobol_system_limits_reporter.py index 95490589..3164df35 100644 --- a/gitgalaxy/tools/cobol_to_cobol/cobol_system_limits_reporter.py +++ b/gitgalaxy/tools/cobol_to_cobol/cobol_system_limits_reporter.py @@ -1,15 +1,32 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: System Limit Reporter (The Honesty Protocol) -# Purpose: Static Analysis sensor to detect structural anomalies, dynamic routing, -# and legacy "dragons" that compromise deterministic mathematical mapping. +# GitGalaxy Tool: Architectural Anomaly Detector +# +# PURPOSE: +# Static Analysis sensor to detect structural anomalies, dynamic routing, +# and legacy execution patterns that compromise deterministic mathematical mapping. +# +# ARCHITECTURAL DECISION: +# Modern cloud architectures rely on deterministic, traceable data flows (DAGs). +# Certain legacy COBOL commands physically mutate the execution stack at runtime +# (e.g., dynamically rewriting the target of a GO TO). These anomalies make it +# mathematically impossible for standard parsers to guarantee an accurate +# translation. This sensor acts as an architectural safety net, aggressively +# flagging these files so they can be securely evaluated by an AI agent or a +# human architect. # ============================================================================== import argparse import sys import re from pathlib import Path -# Define the "Ancient Dragons" (Structural limiters) +# ============================================================================== +# DEFENSIVE DESIGN (STRUCTURAL ANOMALY SIGNATURES): +# These rules strictly target legacy commands that compromise static analysis. 
+# For example, 'EXEC CICS HANDLE CONDITION' operates as an asynchronous interrupt, +# meaning execution can violently jump outside the defined AST flow at any given +# millisecond, rendering static data lineage maps untrustworthy. +# ============================================================================== SYSTEM_LIMIT_RULES = { "ALTER_STATEMENT": { "regex": re.compile( @@ -20,9 +37,7 @@ "description": "Control flow mathematically compromised. The target of a GO TO is being dynamically rewritten.", }, "COPY_REPLACING": { - "regex": re.compile( - r'\bCOPY\s+[\'"]?[A-Z0-9\-]+[\'"]?\s+REPLACING\b', re.IGNORECASE - ), + "regex": re.compile(r'\bCOPY\s+[\'"]?[A-Z0-9\-]+[\'"]?\s+REPLACING\b', re.IGNORECASE), "severity": "HIGH", "description": "Macro substitution detected. AST math may drift from actual compiled execution.", }, @@ -37,7 +52,7 @@ def scan_system_limits(filepath: Path) -> list: """ Scans a COBOL file for structural anomalies that break deterministic mapping. - Returns a list of formatted warning strings. + Returns a list of formatted warning strings to be consumed by the Agent Task Forge. 
""" anomalies = [] try: @@ -68,11 +83,9 @@ def scan_system_limits(filepath: Path) -> list: def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("System Limit Reporter (The Legacy Forge)") + enforce_licensing_guard("Architectural Anomaly Detector") - parser = argparse.ArgumentParser( - description="GitGalaxy System Limit Reporter (Honesty Protocol)" - ) + parser = argparse.ArgumentParser(description="GitGalaxy Architectural Anomaly Detector") parser.add_argument("target", help="Path to a .cbl file OR a directory to scan") args = parser.parse_args() @@ -85,7 +98,7 @@ def main(): if target_path.is_file(): cobol_files.append(target_path) elif target_path.is_dir(): - print(f"📠 Scanning directory for System Limits: {target_path.name}...") + print(f"📠 Scanning directory for Architectural Anomalies: {target_path.name}...") cobol_files.extend(target_path.rglob("*.cbl")) cobol_files.extend(target_path.rglob("*.cob")) @@ -93,9 +106,7 @@ def main(): print("⚠️ No .cbl or .cob files found in the target location.") sys.exit(0) - print( - f"\n🔎 GitGalaxy Honesty Protocol scanning {len(cobol_files)} files for structural dragons...\n" - ) + print(f"\n🔎 GitGalaxy executing architectural integrity scan on {len(cobol_files)} files...\n") print("=" * 90) total_anomalies = 0 @@ -109,17 +120,11 @@ def main(): print("=" * 90) if total_anomalies == 0: - print( - " ✅ No structural limits detected. DAG is 100% mathematically deterministic." - ) + print(" ✅ No structural anomalies detected. DAG is 100% mathematically deterministic.") else: - print( - f" 🚨 WARNING: Found {total_anomalies} structural anomalies requiring human architectural review." 
- ) - print( - "==========================================================================================\n" - ) + print(f" 🚨 WARNING: Found {total_anomalies} structural anomalies requiring human architectural review.") + print("==========================================================================================\n") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_java/README.md b/gitgalaxy/tools/cobol_to_java/README.md index 36191147..6d7a9dc0 100644 --- a/gitgalaxy/tools/cobol_to_java/README.md +++ b/gitgalaxy/tools/cobol_to_java/README.md @@ -1,108 +1,208 @@ -# GitGalaxy: Automated Legacy Migration & COBOL to Java Spring Boot Pipeline +# GitGalaxy Mainframe: Structural Extraction & Legacy Modernization Suite -[![Tested](https://img.shields.io/badge/Tested-27%2F27_Legacy_Repos-00C957.svg)](#) -[![Architecture](https://img.shields.io/badge/Architecture-Spring_Boot_3.2-00BFFF.svg)](#) -[![Automation](https://img.shields.io/badge/Automation-100%25_Compilable_Shells-8A2BE2.svg)](#) +[![Mainframe Tested](https://img.shields.io/badge/Tested-MVS_3.8j_(1974)-000000.svg?style=flat&logo=ibm)](#) +[![Architecture](https://img.shields.io/badge/Architecture-Deterministic_Extraction-00BFFF.svg)](#) +[![Data](https://img.shields.io/badge/Data-EBCDIC_%26_COMP--3_Native-00C957.svg)](#) -Most legacy modernization efforts fail because they feed raw, monolithic COBOL directly into an LLM. As seen across the [DevSecOps Competitive Landscape](https://squid-protocol.github.io/gitgalaxy/04-00-security_landscape/), relying purely on AI or generic ASTs leads to hallucinations, memory leaks, and broken architectures. +Welcome to the **GitGalaxy Mainframe Modernization Suite**. This directory contains the deterministic, high-speed static analysis tools required to safely slice, sanitize, and map monolithic legacy architectures prior to cloud migration. -GitGalaxy flips the paradigm. 
We use the deterministic, mathematical [blAST Engine](https://squid-protocol.github.io/gitgalaxy/01-03-the-blast-paradigm/) to build a structurally perfect, 100% compiling Java Spring Boot architecture *first*, and only use AI for the final isolated logic. +**Mainframe Proven:** The architectural scaffolding generated by these tools compiles natively against raw MVS 3.8j operating systems (1974 Hercules Mainframe), while simultaneously generating strict architectural contracts for modern cloud environments (Spring Boot, PostgreSQL). -This pipeline has been stress-tested across a randomized corpus of 27 distinct legacy COBOL repositories (including complex IBM CICS applications), generating structurally sound, Maven-compilable Spring Boot systems without human intervention. +## The Why: Bridging the Mainframe-to-Cloud Divide -### 🧪 The Ultimate CI/CD Stress Test +Enterprise mainframe migrations frequently stall because Cloud Architects and COBOL Engineers speak different architectural languages. Cloud environments rely on dynamic scaling, relational databases, and event-driven Directed Acyclic Graphs (DAGs). Mainframes rely on sequential Job Control Language (JCL), absolute memory boundaries (`REDEFINES`), and proprietary hardware data encodings (`EBCDIC`, `COMP-3`). -To prove the viability of this deterministic approach, the Java Forge was subjected to an [Automated Batch Test](https://squid-protocol.github.io/gitgalaxy/05-06-batch-test-harness/) across 27 diverse repositories. +**The Generative AI Trap:** Feeding raw, multi-million-line COBOL monoliths into a Large Language Model (LLM) is a guaranteed failure. AI models cannot securely interpret implicit JCL execution orders, they hallucinate dependencies inside unreachable "dead code" paragraphs, and they lack the mathematical context to safely unpack binary `COMP-3` datasets. -
![Java Forge & Batch Test](../../../docs/wiki/assets/java_forge_and_batch_test.gif) - -**What This Test Proved (100% Success Rate):** -* **Perfect Syntax:** The generator outputs valid Java 17 syntax every single time. -* **Flawless Dependency Injection:** Spring Boot `@Service` and `@RestController` layers were auto-wired correctly, with mock services successfully catching unresolved external calls. -* **Valid Data Modeling:** Legacy `PIC` clauses, `OCCURS` arrays, and `REDEFINES` overlays were successfully translated into valid JPA `@Entity` classes that compile without Hibernate ORM errors. -* **Build System Integrity:** The generated `pom.xml` and `application.yml` files executed a flawless `mvn clean compile` out-of-the-box for all 27 targets. - -**What This Test Did NOT Evaluate:** -* **Business Logic Accuracy:** The compile test proves the *architecture* is perfectly scaffolded. It does not test the runtime execution of internal banking math or business rules, because the GitGalaxy engine intentionally extracts those rules into isolated JSON job tickets for an AI agent to complete later. +**The GitGalaxy Solution:** Before any AI translation or modern scaffolding occurs, we must deterministically map the physical reality of the mainframe. This suite parses the structural intent of the COBOL, strips away decades of dead code rot, extracts the exact I/O data lineage, and translates mainframe binary datasets into cloud-native formats—entirely without compiling Abstract Syntax Trees (ASTs). --- -### The Deterministic Translation Protocol: Knowing Our Limits +## The How: Deterministic Extraction & Cleansing -We do not claim to magically translate entire enterprise systems with zero human or AI input. We automate the heavy architectural grunt work and delegate the complex logic to constrained agents. +We treat the legacy codebase as a mathematical topology. 
By utilizing our **Structural Signature Analysis Engine**, we isolate and untangle the monolith through a multi-phase pipeline: -* **What We Automate:** Exact memory mapping, JPA entities, REST controllers, and complete Maven build systems. -* **What We Delegate:** Highly specific, isolated internal business logic. -* **How We Scale:** We generate [strict JSON Autonomous Agent Tickets](https://squid-protocol.github.io/gitgalaxy/05-05-autonomous-agent-tickets/) containing isolated logic slices. -* **Zero Hallucinations:** AI agents are restricted to filling in the pre-wired methods, preventing architectural hallucinations. +1. **Deprecated Trail Pruning:** We map memory declarations (`DATA DIVISION`) against actual execution calls (`PROCEDURE DIVISION`) to mathematically prove which variables and paragraphs are dead. We mask these out to prevent modern systems from inheriting legacy bloat. +2. **Data Lineage Mapping (DAGs):** By tracking `SELECT/ASSIGN` and `OPEN` statements, we map the exact physical datasets required by each program, generating a strict execution topology that replaces the need for legacy JCL. +3. **Microservice Slicing:** We use recursive data-flow taint tracking to trace a single business variable through `MOVE`, `ADD`, and `COMPUTE` statements. This isolates specific business rules so they can be securely assigned to AI agents for translation, strictly bounding their context windows. +4. **Memory Exhaustion Protection:** The engine dynamically scales between high-speed RAM and disk-backed SQLite to process massive, monolithic legacy repositories without triggering Out-Of-Memory (OOM) crashes. --- -### 🏗️ How It Works: Deterministic Scaffolding +## The What: Core Modules & Tooling + +### 1. 
Architectural Mapping & Triage +* **`cobol_dag_architect.py` (Data Lineage Architect):** Parses COBOL structural intent to map `INPUT/OUTPUT` data flows, calculating the deterministic topological execution order (DAG) required for modern orchestration (e.g., Spring Batch, Airflow). +
![DAG Architect](../../../docs/wiki/assets/dag_architect.gif) +* **`cobol_graveyard_finder.py` (Deprecated Trails Analyzer):** Performs static analysis to isolate unused memory declarations and mathematically unreachable execution logic, preventing the migration of dead weight. +
![Deprecated Trails Analyzer](../../../docs/wiki/assets/graveyard_reaper.gif) +* **`cobol_microservice_slicer.py` (Microservice Logic Extractor):** Executes 3-pass recursive variable taint-tracking for safe, isolated business logic extraction. +
![Microservice Logic Extractor](../../../docs/wiki/assets/microservice_slicer.gif) + +### 2. Data & Schema Modernization +* **`cobol_schema_forge.py` (Cloud Schema Generator):** Translates complex legacy byte-maps (`PIC` constraints) and memory overlays (`REDEFINES`) into strict PostgreSQL DDL and JSON Schema definitions. +
![Cloud Schema Generator](../../../docs/wiki/assets/cloud_schema_forge.gif) +* **`cobol_etl_unpacker.py` (ETL EBCDIC Unpacker):** Translates binary EBCDIC mainframe datasets into modern UTF-8 CSVs, decoding Zoned Decimals and unpacking `COMP-3` nibbles directly into floating-point numerics natively in Python. + +### 3. Zero-Trust Infrastructure +* **`cobol_jcl_forge.py` (Zero-Trust JCL Generator):** Auto-generates strict, least-privilege JCL emulators—automatically stripping over-permissioned global access (e.g., `DISP=SHR`) and locking physical dataset provisioning to exact lineage bounds. +
![Zero-Trust JCL Generator](../../../docs/wiki/assets/jcl_forge_demo.gif) +* **`cobol_jcl_auditor.py` (Zero-Trust JCL Auditor):** Mathematically compares original legacy JCLs against the generated equivalents to quantify architectural bloat reduction and over-permissioned I/O shedding. +* **`cobol_compiler_forge.py` (Mainframe Compiler Generator):** Flattens copybooks and dynamically generates era-aware build JCLs by routing the build sequence to the correct enterprise compiler (COBOL-74 vs COBOL-85). +
![Mainframe Compiler Generator](../../../docs/wiki/assets/compiler_forge.gif) + +### 4. Code Integrity & Pre-Processing +* **`cobol_lexical_patcher.py` (Lexical Patcher):** Safely neutralizes legacy compiler traps (e.g., rewriting `NEXT SENTENCE` as an explicit `CONTINUE` no-op statement when the source is detected as COBOL-85) to restore deterministic topological mapping without breaking strict compiler compliance. +* **`cobol_system_limits_reporter.py` (Architectural Anomaly Detector):** Flags non-deterministic routing logic (e.g., `ALTER`, `EXEC CICS HANDLE CONDITION`) that compromises static data lineage. +
![Architectural Anomaly Detector](../../../docs/wiki/assets/system_limits_reporter.gif) +* **`cobol_agent_task_forge.py` (Autonomous Agent Task Generator):** Converts architectural anomalies and extracted dependencies into highly constrained, structured JSON task tickets designed to safely bound LLM agents during code remediation. -The `cobol_to_java_controller.py` ingests the JSON Intermediate Representation (IR) generated by the Mainframe Modernization Suite. It then orchestrates a suite of specialized architectural forges to seamlessly [Scaffold Java Spring Boot](https://squid-protocol.github.io/gitgalaxy/cookbook/scaffold-spring-boot/). +--- -#### 1. [Entity & Memory Mapping Forge](https://squid-protocol.github.io/gitgalaxy/05-03-entity-and-memory-mapping/) -Translates legacy schema boundaries into strict Spring Boot `@Entity` classes. Read the [Entity Forge Cookbook Recipe](https://squid-protocol.github.io/gitgalaxy/cookbook/cobol-to-java-spring-boot-entity-forge/). -* **Precision Mapping:** Translates complex `PIC` clauses to `BigDecimal`. -* **Array Resolution:** Resolves `OCCURS` arrays into `List` collections. -* **Memory Overlays:** Maps `REDEFINES` memory overlays as transient aliases. +## 🧠 Engineering Highlights (Architectural Defenses) -#### 2. [The API Contract & Service Forge](https://squid-protocol.github.io/gitgalaxy/05-04-api-and-service-contracts/) -Translates the DAG lineage intent into modern REST Controllers and auto-wires the `@Service` layer. View the [REST API](https://squid-protocol.github.io/gitgalaxy/cookbook/cobol-to-java-REST-API-generation/) and [Service Forge](https://squid-protocol.github.io/gitgalaxy/cookbook/cobol-to-java-spring-boot-service-forge/) recipes. -* **Paradigm Detection:** Detects batch vs. transactional data paradigms. -* **Controller Generation:** Builds specific REST controller endpoints. -* **Mock Service Shield:** Generates mock services for missing external dependencies. 
+* **Unreachable Logic Masking (`cobol_dag_architect.py`):** COBOL programs often contain legacy, unreachable paragraphs. If a standard regex engine scans these, it will extract `OPEN` statements for files that are never actually utilized at runtime, creating false dependencies. We dynamically integrate with the Deprecated Trails Analyzer's state to "mask out" dead paragraphs with whitespace, preserving exact topology while eliminating hallucinated I/O dependencies. +* **Cyclic Copybook Shields (`cobol_compiler_forge.py`):** Legacy architectures frequently contain cyclic dependencies (e.g., Copybook A imports Copybook B, which imports Copybook A). To prevent our in-memory expansion from trapping the CPU in an infinite loop and triggering an OOM crash, we enforce strict, deterministic recursion depth limits during copybook flattening. +* **Defensive COMP-3 Unpacking (`cobol_etl_unpacker.py`):** Packed decimal (`COMP-3`) stores two digits per byte, plus one half-byte (nibble) for the sign. The parser mathematically validates the hex-boundaries (verifying the high nibble is `0-9` and the sign nibble is `A-F`). This intercepts corrupted mainframe memory segments before they crash the Python ETL pipeline. +* **Dynamic Aliasing Resolution (`cobol_graveyard_finder.py`):** COBOL's `REPLACING` clause allows dynamic text substitution at compile time. When hunting for unused variables, the analyzer simulates this substitution in its in-memory buffer using safe, negative lookarounds. This prevents the system from accidentally flagging heavily aliased variables as "dead code." -#### 3. [EBCDIC Decoder Forge (Data Serialization)](https://squid-protocol.github.io/gitgalaxy/cookbook/cobol-to-java-mainframe-data-serialization-forge/) -Automatically generates the utility classes necessary to read raw mainframe byte streams. -* **Legacy Unpacking:** Safely unpacks Packed Decimal (`COMP-3`) data. -* **Boundary Validation:** Validates hex-boundaries to prevent runtime crashes. 
-* **Format Translation:** Decodes raw EBCDIC strings to standard UTF-8. +--- -#### 4. [The Build System Forge](https://squid-protocol.github.io/gitgalaxy/cookbook/cobol-to-java-automated-spring-boot-build-system/) -Generates the configuration files required for instant compilation. -* **Dependency Management:** Generates production-ready Maven `pom.xml`. -* **Environment Config:** Configures Spring Boot `application.yml`. -* **Instant Verification:** Ensures instant out-of-the-box compilation. +## ⚡ Performance Showcases (Live CLI Execution) -#### 5. [The AI Boundary (Anomaly Agent Task Forge)](https://squid-protocol.github.io/gitgalaxy/05-16-anomaly-agent-task-forge/) -Packages the remaining logic into strict JSON tickets for LLMs or human engineers. -* **Logic Extraction:** Extracts isolated business rules from the monolith. -* **Ticket Generation:** Packages strict JSON tickets with required inputs/outputs. -* **Prompt Bounding:** Binds the AI to pre-wired interfaces only. +#### Showcase A: Deprecated Trails Analyzer (Graveyard Finder) +Identifying and shedding dead weight prior to a cloud migration saves massive amounts of translation cost and future cloud compute. 
---- +```bash +python3 cobol_graveyard_finder.py /legacy_corpus/accounting +``` -### 🚀 Quickstart: Running the Java Forge +```text +========================================================== + 📉 DEPRECATED TRAILS REDUCTION REPORT +========================================================== + Files Flagged for Cleanup : 14 + Unused Memory Addresses : 142 variables + Unreachable Logic Blocks : 37 paragraphs + ✂️ Estimated Bloat Removed : ~1,790 Lines of Code +========================================================== +``` -To generate the Spring Boot architecture, point the controller at the `_gitgalaxy_clean` directory generated by the Mainframe Modernization Controller: +#### Showcase B: Data Lineage Architect (DAG Generation) +Automatically generating the correct execution order by mapping physical dataset dependencies (Inputs vs. Outputs) across multiple monolithic programs. ```bash -python3 cobol_to_java_controller.py /path/to/legacy_repo_gitgalaxy_clean_20260422_153624 +python3 cobol_dag_architect.py /legacy_corpus/nightly_batch ``` -To run the automated CI/CD batch test on a folder of repositories: -```bash -python3 tools/cobol_to_java/batch_test_harness.py /path/to/corpus_dir +```text +========================================================== + ⚡ DETERMINISTIC EXECUTION PIPELINE (TOPOLOGICAL SORT) +========================================================== + + STEP 01: Run [ACCT-INIT] + ↳ Reads : SYS-CONFIG-FILE + ↳ Writes: DAILY-LEDGER-DB +---------------------------------------------------------- + STEP 02: Run [LEDGER-CALC] + ↳ Reads : DAILY-LEDGER-DB, RATES-TBL + ↳ Writes: PROCESSED-LEDGER-DB +---------------------------------------------------------- + STEP 03: Run [REPORT-GEN] + ↳ Reads : PROCESSED-LEDGER-DB + ↳ Writes: FINAL-REPORT-OUT +---------------------------------------------------------- ``` ---- +#### Showcase C: Master Orchestration (CICS Banking Application) +Below is the live console output of the central orchestrator processing a legacy IBM 
CICS banking application. Notice the engine identifying over 6,700 lines of dead code, warning about macro substitutions, and automatically routing the compiler based on the detected COBOL dialect (74 vs 85). + +```text +=== 1. INITIATING DEPRECATED TRAILS ANALYZER === +🔍 GitGalaxy Deprecated Trails Analyzer scanning cics-banking-sample-application-cbsa for obsolete logic... +[... File Scans Omitted for Brevity ...] +========================================================== + 📉 DEPRECATED TRAILS REDUCTION REPORT +========================================================== + Files Flagged for Cleanup : 29 + Unused Memory Addresses : 817 variables + Unreachable Logic Blocks : 590 paragraphs + ✂️ Estimated Bloat Removed : ~6717 Lines of Code +========================================================== + +=== 2. INITIATING DAG ARCHITECT === +🕸️ GitGalaxy Data Lineage Architect mapping execution topology in: cics-banking-sample-application-cbsa... +========================================================== + ⚡ DETERMINISTIC EXECUTION PIPELINE (TOPOLOGICAL SORT) +========================================================== + STEP 01: Run [BANKDATA] + ↳ Reads : None + ↳ Writes: VSAM +---------------------------------------------------------- + +=== 3. INITIATING ARCHITECTURAL ANOMALY DETECTOR === +📠 Scanning directory for Architectural Anomalies: cics-banking-sample-application-cbsa... +🔎 GitGalaxy executing architectural integrity scan on 29 files... +========================================================================================== + ⚠️ [XFRFUN.cbl : Line 0128] HIGH LIMIT - Macro substitution detected. AST math may drift from actual compiled execution. + ⚠️ [CREACC.cbl : Line 0260] HIGH LIMIT - Macro substitution detected. AST math may drift from actual compiled execution. +========================================================================================== + 🚨 WARNING: Found 2 structural anomalies requiring human architectural review. 
+========================================================================================== + +=== 4. INITIATING CLOUD SCHEMA GENERATOR === +🔨 GitGalaxy Cloud Schema Generator processing: BNK1UAC.cbl... +========================================================== + 🐘 POSTGRESQL DDL (CLOUD DATABASE SCHEMA) +========================================================== +CREATE TABLE DFHCOMMAREA ( + WS_CICS_RESP INTEGER, + WS_CICS_RESP2 INTEGER, + WS_CICS_FAIL_MSG VARCHAR(70), + WS_COMM_EYE VARCHAR(4), + WS_COMM_CUSTNO VARCHAR(10), + WS_COMM_ACCNO DECIMAL(8, 0), + WS_COMM_AVAIL_BAL DECIMAL(12, 2), + WS_COMM_ACTUAL_BAL DECIMAL(12, 2) + -- [Schema Omitted for Brevity] +); + +=== 5. INITIATING MICROSERVICE LOGIC EXTRACTOR === +🔪 GitGalaxy Logic Extractor tracing dependencies for [WS-ACCOUNT-BALANCE] in BNK1UAC.cbl... +========================================================== + 🎯 Extracted 0 distinct business rules. +========================================================== + +=== 6. INITIATING MAINFRAME COMPILER GENERATOR === +====================================================================== + 🏗️ GITGALAXY MAINFRAME COMPILER GENERATOR (PRE-COMPILER ACTIVE) +====================================================================== + [+] Generated COBOL-85 Pipeline : BUILD_BNK1UAC.jcl + [+] Generated COBOL-85 Pipeline : BUILD_DBCRFUN.jcl + [+] Generated COBOL-74 Pipeline : BUILD_GETSCODE.jcl +====================================================================== + +=== 7. 
INITIATING STRUCTURAL EXTRACTION CONTROLLER === +====================================================================== + 🚀 EXTRACTION CONTROLLER ENGAGED + Target: cics-banking-sample-application-cbsa +====================================================================== + Generating Context-Aware Artifacts at: cics-banking-sample-application-cbsa_gitgalaxy_clean_20260422_153624 +---------------------------------------------------------------------- +====================================================================== + 🏁 EXTRACTION COMPLETE: Hybrid Pipeline execution successful. +====================================================================== +``` -### 🔍 Verify for Yourself (Real-World Outputs) +--- -Don't just take our word for it. We have published the raw, unedited artifacts generated from the IBM CICS benchmark test directly in this repository so you can inspect the architecture yourself. +## 🌌 The GitGalaxy Ecosystem (Powered by the blAST Engine) -📂 **[Browse the Live Translation Examples](https://github.com/squid-protocol/cobol_to_java_examples)** -* **The Corpus:** Explore 10 distinct, auto-generated architectures—ranging from standard IBM CICS benchmarks to heavyweight compilers and edge-case testing code. -* **The Proof (Spring Boot Architecture):** Click through the `src/main/java` directories to verify the deterministic extraction of JPA entities, auto-wired REST controllers, and complete Maven build systems (`pom.xml` & `application.yml`). -* **Zero-Touch Compilation:** Every single example in this repository compiles out-of-the-box (`mvn clean compile`), proving structural integrity before any AI logic is injected. +GitGalaxy Mainframe Modernization is the structural extraction layer of the broader **GitGalaxy Ecosystem**—a high-velocity, AST-free, LLM-free heuristic knowledge graph engine designed for planetary-scale repositories. 
---- -### 🌌 Powered by the blAST Engine (Bypassing LLMs and ASTs) -This tool is a modular enterprise integration within the broader GitGalaxy architecture. It is driven by our custom mathematical heuristics engine, capable of mapping multi-dimensional relationships at extreme velocity. Dive into the official wiki to understand the exact translation mechanics and memory-mapping heuristics: +Explore the ecosystem: -* 📖 **[Spring Boot Scaffolding Logic](https://squid-protocol.github.io/gitgalaxy/05-02-spring-boot-scaffolding/)** -* 📖 **[Entity & Memory Mapping Rules](https://squid-protocol.github.io/gitgalaxy/05-03-entity-and-memory-mapping/)** -* 📖 **[API & Service Contract Generation](https://squid-protocol.github.io/gitgalaxy/05-04-api-and-service-contracts/)** -* 🪐 **[Return to the Main GitGalaxy Hub](https://github.com/squid-protocol/gitgalaxy)** \ No newline at end of file +* 🪐 **[Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** — Comprehensive deep dives into the engine's mathematics, pipeline architecture, and DevSecOps integration protocols. +* 🔭 **[GitGalaxy Visualizer](http://gitgalaxy.io/)** — Render your codebase's topological network locally in interactive 3D using hardware-accelerated WebGPU. +* 📖 **[The blAST Paradigm](https://squid-protocol.github.io/gitgalaxy/docs/wiki/01-03-the-blast-paradigm/)** — The architectural thesis, academic research, and structural math that makes AST-free parsing possible at scale. +* ⚙️ **[Language Calibration Standards](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md)** — The definitive engineering guide to extending our comparative lexical taxonomy for custom enterprise dialects. 
\ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_java/batch_test_harness.py b/gitgalaxy/tools/cobol_to_java/batch_test_harness.py index 0beb2c2e..3bea855c 100644 --- a/gitgalaxy/tools/cobol_to_java/batch_test_harness.py +++ b/gitgalaxy/tools/cobol_to_java/batch_test_harness.py @@ -1,8 +1,17 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Prototyping Batch Test Harness -# Purpose: Stress-tests the entire V4 pipeline (Refractor -> Java -> Maven) -# across 'n' legacy repositories. Captures granular debugging logs. +# GitGalaxy Tool: Pipeline Validation Harness +# +# PURPOSE: +# Stress-tests the entire Cloud Modernization Pathway (Structural Extraction -> +# Spring Boot Scaffolding -> Maven Compilation) across 'n' legacy repositories. +# Captures granular debugging logs for CI/CD auditing. +# +# ARCHITECTURAL DECISION: +# In enterprise migrations, translating thousands of COBOL programs introduces +# compounding points of failure. This harness isolates each translation phase, +# enforcing strict dependency bounds and execution timeouts to guarantee +# deterministic batch validation without pipeline stalling. # ============================================================================== import argparse import subprocess @@ -14,7 +23,12 @@ def run_command(command: list, cwd: Path) -> tuple[bool, str, str]: """Executes a shell command and returns success status + logs.""" - # Clone the current environment and force Java 17 + # ========================================================================== + # DEFENSIVE DESIGN (ENVIRONMENT PARITY): + # Compiling generated code across different developer machines or CI/CD runners + # invites "it works on my machine" failures. We clone the environment and + # forcefully inject a specific JDK path to guarantee deterministic compilation. 
+ # ========================================================================== custom_env = os.environ.copy() custom_env["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64" @@ -22,10 +36,12 @@ def run_command(command: list, cwd: Path) -> tuple[bool, str, str]: result = subprocess.run( command, cwd=cwd, - env=custom_env, # <--- Inject the environment here + env=custom_env, capture_output=True, text=True, - timeout=300, # 5-minute timeout per command to prevent infinite hangs + # DEFENSIVE DESIGN: 5-minute timeout per command to prevent zombie + # processes from hanging the entire batch run if a regex loops infinitely. + timeout=300, ) return result.returncode == 0, result.stdout, result.stderr except subprocess.TimeoutExpired as e: @@ -39,13 +55,9 @@ def run_command(command: list, cwd: Path) -> tuple[bool, str, str]: def main(): - parser = argparse.ArgumentParser(description="GitGalaxy Batch Test Harness") - parser.add_argument( - "corpus_dir", help="Path to the directory containing legacy COBOL repos" - ) - parser.add_argument( - "--n", type=int, default=0, help="Number of repositories to process (0 for all)" - ) + parser = argparse.ArgumentParser(description="GitGalaxy Pipeline Validation Harness") + parser.add_argument("corpus_dir", help="Path to the directory containing legacy COBOL repos") + parser.add_argument("--n", type=int, default=0, help="Number of repositories to process (0 for all)") args = parser.parse_args() corpus_path = Path(args.corpus_dir).resolve() @@ -57,47 +69,41 @@ def main(): master_log_path = reports_dir / f"master_batch_run_{timestamp}.txt" all_dirs = [d for d in corpus_path.iterdir() if d.is_dir()] - target_repos = [ - d for d in all_dirs if not ("_gitgalaxy_" in d.name or "batch_test" in d.name) - ] + target_repos = [d for d in all_dirs if not ("_gitgalaxy_" in d.name or "batch_test" in d.name)] if args.n > 0: target_repos = target_repos[: args.n] print("\n" + "=" * 70) - print(" 🧪 GITGALAXY BATCH TEST HARNESS ENGAGED") + print(" 🧪 
PIPELINE VALIDATION HARNESS ENGAGED") print(f" Target Corpus : {corpus_path.name}") print(f" Sample Size : n = {len(target_repos)}") print("=" * 70 + "\n") summary = { "passed": 0, - "failed_refractor": 0, - "failed_java_forge": 0, + "failed_refractor": 0, # Preserved dictionary key + "failed_java_forge": 0, # Preserved dictionary key "failed_maven": 0, } with open(master_log_path, "w", encoding="utf-8") as master_log: - master_log.write( - f"GITGALAXY BATCH RUN - {timestamp}\nSample Size: {len(target_repos)}\n\n" - ) + master_log.write(f"GITGALAXY BATCH RUN - {timestamp}\nSample Size: {len(target_repos)}\n\n") for repo in target_repos: print(f"⚙️ Processing: {repo.name} ... ", end="", flush=True) master_log.write(f"--- REPO: {repo.name} ---\n") repo_error_log = reports_dir / f"{repo.name}_error_{timestamp}.log" - # STEP 1 + # STEP 1: Structural Extraction cmd1 = ["python", str(v6_dir / "cobol_refractor_controller.py"), repo.name] success1, out1, err1 = run_command(cmd1, cwd=corpus_path) if not success1: - print("❌ FAILED (Refractor Phase)") + print("❌ FAILED (Structural Extraction Phase)") print("\n" + "-" * 40 + " LAST 15 LINES OF STDERR " + "-" * 40) print("\n".join(err1.splitlines()[-15:])) print("-" * 105 + "\n") summary["failed_refractor"] += 1 - repo_error_log.write_text( - f"--- REFRACTOR STDERR ---\n{err1}\n\n--- STDOUT ---\n{out1}" - ) + repo_error_log.write_text(f"--- EXTRACTION STDERR ---\n{err1}\n\n--- STDOUT ---\n{out1}") continue clean_dirs = sorted( @@ -109,7 +115,7 @@ def main(): continue clean_room = clean_dirs[0] - # STEP 2 + # STEP 2: Spring Boot Scaffolding cmd2 = [ "python", str(v6_dir / "cobol_to_java_controller.py"), @@ -117,14 +123,12 @@ def main(): ] success2, out2, err2 = run_command(cmd2, cwd=corpus_path) if not success2: - print("❌ FAILED (Java Forge Phase)") + print("❌ FAILED (Spring Boot Scaffolding Phase)") print("\n" + "-" * 40 + " LAST 15 LINES OF STDERR " + "-" * 40) print("\n".join(err2.splitlines()[-15:])) print("-" * 105 + 
"\n") summary["failed_java_forge"] += 1 - repo_error_log.write_text( - f"--- JAVA FORGE STDERR ---\n{err2}\n\n--- STDOUT ---\n{out2}" - ) + repo_error_log.write_text(f"--- SCAFFOLDING STDERR ---\n{err2}\n\n--- STDOUT ---\n{out2}") continue java_dirs = sorted( @@ -136,7 +140,7 @@ def main(): continue java_dir = java_dirs[0] - # STEP 3 + # STEP 3: Maven Compilation cmd3 = ["mvn", "clean", "compile"] success3, out3, err3 = run_command(cmd3, cwd=java_dir) if not success3: @@ -146,9 +150,7 @@ def main(): print("\n".join(out3.splitlines()[-15:])) print("-" * 105 + "\n") summary["failed_maven"] += 1 - repo_error_log.write_text( - f"--- MAVEN STDERR/STDOUT ---\n{out3}\n{err3}" - ) + repo_error_log.write_text(f"--- MAVEN STDERR/STDOUT ---\n{out3}\n{err3}") continue print("✅ SUCCESS (Fully Compiled)") @@ -159,10 +161,8 @@ def main(): print("----------------------------------------------------------------------") print(f" • Perfect Successes : {summary['passed']}/{len(target_repos)}") print(f" 📁 Master Log : {master_log_path}") - print( - "======================================================================\n" - ) + print("======================================================================\n") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_java/cobol_to_java_agent_forge.py b/gitgalaxy/tools/cobol_to_java/cobol_to_java_agent_forge.py index 8a020d51..ee2d9771 100644 --- a/gitgalaxy/tools/cobol_to_java/cobol_to_java_agent_forge.py +++ b/gitgalaxy/tools/cobol_to_java/cobol_to_java_agent_forge.py @@ -1,24 +1,32 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Java Spring AI Agent Task Forge -# Purpose: Packages isolated COBOL slices into strict, hallucination-proof -# JSON task tickets for autonomous LLM agents. 
+# GitGalaxy Tool: Java Agent Task Generator +# +# PURPOSE: +# Packages isolated COBOL logic slices into strict, highly constrained JSON +# task tickets for autonomous LLM agents to translate into Java. +# +# ARCHITECTURAL DECISION: +# Autonomous agents are highly susceptible to "hallucinating" external system +# calls or modifying core business logic when given an entire legacy file at once. +# By pre-slicing the business rules via static analysis and injecting unresolved +# dependencies as strict constraints, we force the LLM to generate pure, +# side-effect-free @Service classes that rely on Spring's Dependency Injection (DI) +# for external integration. # ============================================================================== -def generate_java_agent_ticket( - slice_json: dict, prog_id: str, ir_state: dict = None -) -> dict: - """Forges a structured JSON task ticket for Java service generation.""" +def generate_java_agent_ticket(slice_json: dict, prog_id: str, ir_state: dict = None) -> dict: + """Generates a structured JSON task ticket for Java service generation.""" target_var = slice_json.get("target_var", "UNKNOWN") rules = slice_json.get("business_rules", []) - # Extract Honesty Protocol & Lineage Data + # Extract Architectural Anomalies & Data Lineage honesty_flags = [] unresolved_calls = [] if ir_state: analysis = ir_state.get("analysis", {}) - honesty_flags = analysis.get("honesty_flags", []) + honesty_flags = analysis.get("honesty_flags", []) # Preserved internal variable name lineage = analysis.get("lineage", {}) unresolved_calls = lineage.get("unresolved_calls", []) @@ -36,9 +44,7 @@ def generate_java_agent_ticket( "context": { "business_rules_to_translate": formatted_rules, "external_dependencies": unresolved_calls, - "architectural_warnings": [ - a.split("]", 1)[-1].strip() if "]" in a else a for a in honesty_flags - ], + "architectural_warnings": [a.split("]", 1)[-1].strip() if "]" in a else a for a in honesty_flags], }, "system_prompt": ( 
"You are a strict, deterministic code translator. You must implement the provided " @@ -50,4 +56,4 @@ def generate_java_agent_ticket( ), } - return ticket + return ticket \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_java/cobol_to_java_api_contract_forge.py b/gitgalaxy/tools/cobol_to_java/cobol_to_java_api_contract_forge.py index 9cd23a5b..f2ab04b9 100644 --- a/gitgalaxy/tools/cobol_to_java/cobol_to_java_api_contract_forge.py +++ b/gitgalaxy/tools/cobol_to_java/cobol_to_java_api_contract_forge.py @@ -1,8 +1,18 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Spring Boot API Contract Forge (v3 - Dependency Injection) -# Purpose: Translates DAG intent into modern REST Controller interfaces and -# auto-wires the corresponding @Service layer. +# GitGalaxy Tool: Spring Boot API Contract Generator +# +# PURPOSE: +# Translates structural DAG intent into modern REST Controller interfaces and +# auto-wires the corresponding @Service layer via constructor injection. +# +# ARCHITECTURAL DECISION: +# In an AI-assisted modernization pipeline, allowing an LLM to generate the +# REST API entry points often leads to hallucinated routing and broken Dependency +# Injection (DI) chains. This module deterministically generates the `@RestController` +# layer directly from the COBOL static analysis. It establishes a rigid API contract +# and auto-wires the `@Service` layer, forcing the AI agent to focus exclusively +# on internal business logic without altering the external system boundaries. 
# ============================================================================== import argparse import sys @@ -11,7 +21,7 @@ def generate_rest_controller(ir_state: dict, package_name: str) -> str: - """Forges the API endpoints and auto-wires the Service layer.""" + """Generates the API endpoints and auto-wires the Service layer.""" prog_id = ir_state.get("metadata", {}).get("file_name", "Unknown").split(".")[0] camel_prog = "".join(word.capitalize() for word in prog_id.split("-")) service_var = camel_prog[0].lower() + camel_prog[1:] if camel_prog else "unknown" @@ -48,13 +58,19 @@ def generate_rest_controller(ir_state: dict, package_name: str) -> str: if is_batch: # --- BATCH PARADIGM --- - java.append( - ' @PostMapping(value = "/execute-batch", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)' - ) + java.append(' @PostMapping(value = "/execute-batch", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)') java.append(f" public ResponseEntity execute{camel_prog}Batch(") params = [] - seen_vars = {} # Dictionary to track and deduplicate collisions + # ====================================================================== + # DEFENSIVE DESIGN (SPRING VARIABLE COLLISION PREVENTION): + # Legacy COBOL programs can assign multiple internal files to the same + # external physical DD name. If mapped directly to Java, this causes + # duplicate variable names in the method signature, breaking compilation. + # We track `seen_vars` and dynamically append numerical suffixes to + # ensure perfectly compiling Spring `@RequestParam` annotations. 
+ # ====================================================================== + seen_vars = {} for file_req in files_requested: dd_name_raw = file_req.get("dd_name", "UNKNOWN").lower() @@ -74,9 +90,7 @@ def generate_rest_controller(ir_state: dict, package_name: str) -> str: unique_var_name = base_var_name unique_param_name = f"{dd_name_raw}File" - params.append( - f'@RequestParam("{unique_param_name}") MultipartFile {unique_var_name}' - ) + params.append(f'@RequestParam("{unique_param_name}") MultipartFile {unique_var_name}') if params: java.append(" " + ",\n ".join(params)) @@ -84,11 +98,9 @@ def generate_rest_controller(ir_state: dict, package_name: str) -> str: java.append(' @RequestParam("file") MultipartFile file') java.append(" ) {") - java.append(" // ⚠️ BATCH PARADIGM DETECTED") + java.append(" // BATCH PARADIGM DETECTED") java.append(" // Pass the InputStream directly to the Service layer.") - java.append( - f" {service_var}Service.execute{camel_prog}(/* pass streams here */);\n" - ) + java.append(f" {service_var}Service.execute{camel_prog}(/* pass streams here */);\n") else: # --- TRANSACTIONAL PARADIGM --- @@ -108,10 +120,8 @@ def generate_rest_controller(ir_state: dict, package_name: str) -> str: java.append(" /* No external data dependencies detected */") java.append(" ) {") - java.append(" // ⚡ TRANSACTIONAL PARADIGM DETECTED") - java.append( - f" {service_var}Service.execute{camel_prog}(/* pass DTOs here */);\n" - ) + java.append(" // TRANSACTIONAL PARADIGM DETECTED") + java.append(f" {service_var}Service.execute{camel_prog}(/* pass DTOs here */);\n") if outputs: java.append(f" // Expected Outputs: {', '.join(outputs)}") @@ -126,13 +136,11 @@ def generate_rest_controller(ir_state: dict, package_name: str) -> str: def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("API Contract Forge (The Legacy Forge)") + enforce_licensing_guard("API Contract Generator") - parser = argparse.ArgumentParser(description="GitGalaxy 
API Contract Forge") + parser = argparse.ArgumentParser(description="GitGalaxy API Contract Generator") parser.add_argument("ir_file", help="Path to the GitGalaxy _ir.json state dump") - parser.add_argument( - "--pkg", default="com.gitgalaxy.modernized", help="Base Java package name" - ) + parser.add_argument("--pkg", default="com.gitgalaxy.modernized", help="Base Java package name") args = parser.parse_args() ir_path = Path(args.ir_file).resolve() @@ -142,18 +150,13 @@ def main(): try: ir_state = json.loads(ir_path.read_text(encoding="utf-8")) java_code = generate_rest_controller(ir_state, args.pkg) - prog_id = ( - ir_state.get("metadata", {}) - .get("file_name", "Unknown") - .split(".")[0] - .capitalize() - ) + prog_id = ir_state.get("metadata", {}).get("file_name", "Unknown").split(".")[0].capitalize() out_path = ir_path.parent / f"{prog_id}Controller.java" out_path.write_text(java_code, encoding="utf-8") - print(f"🌐 API Contract Forged: {out_path.name}") + print(f"🌐 API Contract Generated: {out_path.name}") except Exception as e: - print(f"Error forging Controller: {e}") + print(f"Error generating Controller: {e}") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_java/cobol_to_java_build_forge.py b/gitgalaxy/tools/cobol_to_java/cobol_to_java_build_forge.py index e1852c42..a850dd4b 100644 --- a/gitgalaxy/tools/cobol_to_java/cobol_to_java_build_forge.py +++ b/gitgalaxy/tools/cobol_to_java/cobol_to_java_build_forge.py @@ -1,13 +1,23 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Java Spring Build System Forge -# Purpose: Auto-generates the Maven pom.xml and application.yml configuration -# to make the translated Spring Boot architecture instantly compilable. 
+# GitGalaxy Tool: Java Spring Build System Generator +# +# PURPOSE: +# Auto-generates the Maven pom.xml and application.yml configuration to ensure +# the translated Spring Boot architecture is immediately and perfectly compilable. +# +# ARCHITECTURAL DECISION: +# Generative AI frequently hallucinates incompatible library versions, mixes Maven +# and Gradle paradigms randomly, or omits critical runtime drivers (like PostgreSQL). +# By utilizing this static generation module to lay down the build infrastructure +# and properties, we establish a rigid, compilable sandbox. The autonomous agents +# are then restricted entirely to editing the business logic within the bounds +# of this pre-verified dependency graph. # ============================================================================== def generate_pom_xml(group_id: str, artifact_id: str) -> str: - """Forges a production-ready Maven pom.xml for the microservice.""" + """Scaffolds a production-ready Maven pom.xml for the microservice.""" pom = f""" str: {artifact_id} 1.0.0-SNAPSHOT {artifact_id} - GitGalaxy Auto-Forged Microservice + GitGalaxy Auto-Generated Microservice 17 @@ -57,12 +67,6 @@ def generate_pom_xml(group_id: str, artifact_id: str) -> str: true - - org.projectlombok - lombok - true - - org.slf4j slf4j-api @@ -103,7 +107,7 @@ def generate_pom_xml(group_id: str, artifact_id: str) -> str: def generate_application_yml(artifact_id: str) -> str: - """Forges the application.yml with standard Postgres and JPA configs.""" + """Scaffolds the application.yml with standard Postgres and JPA configurations.""" yml = f"""server: port: 8080 @@ -137,7 +141,7 @@ def generate_application_yml(artifact_id: str) -> str: def generate_main_class(package_name: str, class_name: str) -> str: - """Forges the Spring Boot Application entry point.""" + """Scaffolds the Spring Boot Application entry point.""" java = f"""package {package_name}; import org.springframework.boot.SpringApplication; @@ -151,4 +155,4 @@ def 
generate_main_class(package_name: str, class_name: str) -> str: }} }} """ - return java + return java \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_java/cobol_to_java_decoder_forge.py b/gitgalaxy/tools/cobol_to_java/cobol_to_java_decoder_forge.py index 31212036..f63ca96d 100644 --- a/gitgalaxy/tools/cobol_to_java/cobol_to_java_decoder_forge.py +++ b/gitgalaxy/tools/cobol_to_java/cobol_to_java_decoder_forge.py @@ -1,11 +1,23 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Java Spring EBCDIC & COMP-3 Decoder Forge -# Purpose: Auto-generates the utility class necessary to translate raw -# mainframe byte streams into modern Java data structures. +# GitGalaxy Tool: EBCDIC & COMP-3 Decoder Generator +# +# PURPOSE: +# Auto-generates the utility class necessary to translate raw mainframe byte +# streams into modern Java data structures (UTF-8 Strings and BigDecimals). +# +# ARCHITECTURAL DECISION: +# Mainframe datasets do not natively map to modern ASCII/UTF-8 strings or IEEE 754 +# floating-point numbers. IBM's Packed Decimal (COMP-3) and EBCDIC encodings +# require precise, bit-level translation. By auto-generating a dedicated, thoroughly +# tested decoding utility within the Spring Boot architecture, we prevent the AI +# agent from hallucinating flawed byte-shifting logic and ensure enterprise-grade +# data integrity during binary ingestion. 
# ============================================================================== + + def generate_decoder_util(package_name: str) -> str: - """Forges the EBCDIC and Packed Decimal (COMP-3) decoder utility with strict bounds validation.""" + """Generates the EBCDIC and Packed Decimal (COMP-3) decoder utility with strict bounds validation.""" java = f"""package {package_name}.util; import java.math.BigDecimal; @@ -17,7 +29,7 @@ def generate_decoder_util(package_name: str) -> str: private static final Logger log = LoggerFactory.getLogger(EbcdicDecoderUtil.class); - // Cp1047 is the standard IBM EBCDIC character set + // Cp1047 is the standard IBM EBCDIC character set (US/Canada) private static final Charset EBCDIC_CHARSET = Charset.forName("Cp1047"); /** @@ -51,14 +63,15 @@ def generate_decoder_util(package_name: str) -> str: int highNibble = b >>> 4; int lowNibble = b & 0x0F; - // The high nibble MUST be a number (0-9) + // DEFENSIVE DESIGN: The high nibble MUST be a valid base-10 digit (0-9). + // Values above 9 indicate corrupted memory or shifted byte boundaries. if (highNibble > 9) {{ log.warn("Corrupt COMP-3 high nibble '{{}}' at byte index {{}}. 
Defaulting to ZERO.", Integer.toHexString(highNibble), i); return BigDecimal.ZERO; }} sb.append(highNibble); - // The low nibble is a number EXCEPT in the very last byte, where it's the sign + // The low nibble is a number EXCEPT in the very last byte, where it acts as the sign flag if (i == packedBytes.length - 1) {{ boolean isNegative = (lowNibble == 0x0D || lowNibble == 0x0B); if (isNegative) {{ @@ -86,4 +99,4 @@ def generate_decoder_util(package_name: str) -> str: }} }} """ - return java + return java \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_java/cobol_to_java_service_forge.py b/gitgalaxy/tools/cobol_to_java/cobol_to_java_service_forge.py index 2c930215..2d2615ba 100644 --- a/gitgalaxy/tools/cobol_to_java/cobol_to_java_service_forge.py +++ b/gitgalaxy/tools/cobol_to_java/cobol_to_java_service_forge.py @@ -1,8 +1,18 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Java Spring Service Forge -# Purpose: Scaffolds the @Service class and auto-wires cross-service dependencies -# discovered via the global DAG (lineage). +# GitGalaxy Tool: Java Spring Service Scaffolding Generator +# +# PURPOSE: +# Scaffolds the @Service class and stages cross-service dependencies discovered +# via the global DAG (lineage) for the autonomous agent. +# +# ARCHITECTURAL DECISION: +# Following domain-driven design principles, the `@Service` layer encapsulates +# pure business logic, strictly isolated from REST API routing. By scaffolding +# this layer and mapping unresolved dynamic calls (from the DAG lineage) as +# explicit constraints, we guide the autonomous agent to implement the core +# COBOL rules while preventing it from hallucinating missing Spring beans or +# breaking the overall ApplicationContext. 
# ============================================================================== import argparse import sys @@ -11,6 +21,7 @@ def generate_service_skeleton(ir_state: dict, package_name: str) -> str: + """Generates the Spring Boot @Service skeleton and stages DAG dependencies.""" prog_id = ir_state.get("metadata", {}).get("file_name", "Unknown").split(".")[0] camel_prog = "".join(word.capitalize() for word in prog_id.split("-")) @@ -29,24 +40,24 @@ def generate_service_skeleton(ir_state: dict, package_name: str) -> str: java.append("@RequiredArgsConstructor") java.append(f"public class {camel_prog}Service {{\n") - java.append( - f" private static final Logger log = LoggerFactory.getLogger({camel_prog}Service.class);\n" - ) + java.append(f" private static final Logger log = LoggerFactory.getLogger({camel_prog}Service.class);\n") - # Cross-Service Dependency Injection (Commented out to prevent compile errors for missing modules) + # ========================================================================== + # DEFENSIVE DESIGN (APPLICATION CONTEXT SHIELD): + # Cross-Service dependencies are injected as comments/TODOs rather than active + # autowired fields. If we actively inject a dependency that hasn't been fully + # generated yet, the Spring Boot IoC container will throw a + # NoSuchBeanDefinitionException, preventing the pipeline from compiling. 
+ # ========================================================================== if unresolved_calls: - java.append(" // \u26a0\ufe0f UNRESOLVED EXTERNAL DEPENDENCIES (FROM DAG)") + java.append(" // ⚠️ UNRESOLVED EXTERNAL DEPENDENCIES (FROM DAG)") for call in unresolved_calls: call_camel = "".join(word.capitalize() for word in call.split("-")) - java.append( - f" // TODO: AI AGENT - Implement or mock call to: {call_camel}Service" - ) + java.append(f" // TODO: AI AGENT - Implement or mock interface call to: {call_camel}Service") java.append("") - java.append( - f" public void execute{camel_prog}(/* Parameters mapped from Controller */) {{" - ) - java.append(f' log.info("Executing legacy business logic for {prog_id}");') + java.append(f" public void execute{camel_prog}(/* Parameters mapped from Controller */) {{") + java.append(f' log.info("Executing modernized business logic for {prog_id}");') java.append(" // TODO: [AI AGENT] Implement extracted business rules here.") java.append(" }\n}") @@ -56,13 +67,11 @@ def generate_service_skeleton(ir_state: dict, package_name: str) -> str: def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("Service Skeleton Forge (The Legacy Forge)") + enforce_licensing_guard("Service Scaffolding Generator") - parser = argparse.ArgumentParser(description="GitGalaxy Service Skeleton Forge") + parser = argparse.ArgumentParser(description="GitGalaxy Service Scaffolding Generator") parser.add_argument("ir_file", help="Path to the GitGalaxy _ir.json state dump") - parser.add_argument( - "--pkg", default="com.gitgalaxy.modernized", help="Base Java package name" - ) + parser.add_argument("--pkg", default="com.gitgalaxy.modernized", help="Base Java package name") args = parser.parse_args() ir_path = Path(args.ir_file).resolve() @@ -72,18 +81,13 @@ def main(): try: ir_state = json.loads(ir_path.read_text(encoding="utf-8")) java_code = generate_service_skeleton(ir_state, args.pkg) - prog_id = ( - 
ir_state.get("metadata", {}) - .get("file_name", "Unknown") - .split(".")[0] - .capitalize() - ) + prog_id = ir_state.get("metadata", {}).get("file_name", "Unknown").split(".")[0].capitalize() out_path = ir_path.parent / f"{prog_id}Service.java" out_path.write_text(java_code, encoding="utf-8") - print(f"⚙️ Service Skeleton Forged: {out_path.name}") + print(f"⚙️ Service Skeleton Generated: {out_path.name}") except Exception as e: - print(f"Error forging Service: {e}") + print(f"Error generating Service: {e}") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_java/cobol_to_java_spring_forge.py b/gitgalaxy/tools/cobol_to_java/cobol_to_java_spring_forge.py index 3e2aa54a..966ddc7b 100644 --- a/gitgalaxy/tools/cobol_to_java/cobol_to_java_spring_forge.py +++ b/gitgalaxy/tools/cobol_to_java/cobol_to_java_spring_forge.py @@ -1,9 +1,20 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Java Spring Entity Forge (v3 - Advanced Memory Mapping) -# Purpose: Translates JSON schemas into Spring Boot JPA Entities. -# Handles strict financial precision (PIC), Arrays (OCCURS), -# and Memory Overlays (REDEFINES). +# GitGalaxy Tool: Java Spring Entity Generator +# +# PURPOSE: +# Translates JSON schemas into Spring Boot JPA Entities. +# Handles strict financial precision (PIC), Arrays (OCCURS), +# and Memory Overlays (REDEFINES). +# +# ARCHITECTURAL DECISION: +# Relational databases and Java ORMs (Hibernate/JPA) allocate memory entirely +# differently than mainframe COBOL. COBOL utilizes absolute byte boundaries, +# arrays (OCCURS), and memory overlays (REDEFINES) where multiple variables +# point to the exact same physical byte block. 
This generator dynamically maps +# these legacy constraints into modern JPA annotations (e.g., @ElementCollection, +# @Transient) to ensure legacy data structures are safely persisted without +# duplicating columns or corrupting the modern relational schema. # ============================================================================== import argparse import sys @@ -23,15 +34,13 @@ def map_type_to_java(json_type: str, description: str) -> str: def parse_pic_clause(description: str) -> dict: """ - Hunts for COBOL PIC, OCCURS, and REDEFINES clauses in the description - and extracts exact memory boundaries and structural directives. + Analyzes COBOL PIC, OCCURS, and REDEFINES clauses in the description + to extract exact memory boundaries and structural directives. """ constraints = {} # 1. Check for REDEFINES (Memory Overlays) - redefines_match = re.search( - r"REDEFINES\s+([A-Z0-9_\-]+)", description, re.IGNORECASE - ) + redefines_match = re.search(r"REDEFINES\s+([A-Z0-9_\-]+)", description, re.IGNORECASE) if redefines_match: constraints["redefines"] = redefines_match.group(1) @@ -85,7 +94,7 @@ def count_nines(part): def generate_java_entity(schema_json: dict, package_name: str) -> str: - """Forges a JPA Entity enforcing exact COBOL memory constraints & overlaps.""" + """Generates a JPA Entity enforcing exact COBOL memory constraints & overlaps.""" table_name = schema_json.get("title", "UnknownTable") class_name = "".join(word.capitalize() for word in table_name.split("_")) @@ -106,10 +115,7 @@ def generate_java_entity(schema_json: dict, package_name: str) -> str: properties = schema_json.get("properties", {}) # Check if we need List imports for OCCURS clauses - requires_list = any( - "OCCURS" in col_data.get("description", "").upper() - for col_data in properties.values() - ) + requires_list = any("OCCURS" in col_data.get("description", "").upper() for col_data in properties.values()) java = [] java.append(f"package {package_name}.entity;\n") @@ -143,54 +149,39 
@@ def generate_java_entity(schema_json: dict, package_name: str) -> str: parts = clean_col.split("_") camel_name = parts[0] + "".join(word.title() for word in parts[1:]) + # ====================================================================== + # DEFENSIVE DESIGN (JAVA SYNTAX SANITIZATION): + # COBOL variables frequently use names that are protected keywords in Java + # (e.g., CLASS, NEW, DEFAULT) or start with numeric characters. We strictly + # sanitize the target variable names to guarantee the output is 100% compilable + # before the AI agent touches it. + # ====================================================================== + # Java variables cannot start with a number. Prefix with 'v'. if camel_name and camel_name[0].isdigit(): camel_name = "v" + camel_name - # Sanitize Java reserved keywords (now including primitives) reserved_vars = { - "class", - "static", - "public", - "private", - "protected", - "return", - "new", - "system", - "default", - "enum", - "interface", - "void", - "try", - "catch", - "finally", - "import", - "package", - "super", - "this", - "const", - "goto", - "byte", - "int", - "char", - "short", - "long", - "float", - "double", - "boolean", - "null", - "true", - "false", + "class", "static", "public", "private", "protected", "return", + "new", "system", "default", "enum", "interface", "void", "try", + "catch", "finally", "import", "package", "super", "this", "const", + "goto", "byte", "int", "char", "short", "long", "float", "double", + "boolean", "null", "true", "false" } if camel_name in reserved_vars: camel_name += "Val" - # --- SCENARIO 1: MEMORY OVERLAY (REDEFINES) --- + # ====================================================================== + # SCENARIO 1: MEMORY OVERLAY (REDEFINES) + # DEFENSIVE DESIGN: In COBOL, REDEFINES creates an alias pointing to the + # same physical byte address. In JPA, mapping both variables as standard + # columns would duplicate the data in the SQL table. 
We map the alias + # as `@Transient` so it can be used in business logic without persisting + # a duplicate column to the database. + # ====================================================================== if "redefines" in constraints: target_camel = constraints["redefines"].lower().split("_") - target_camel = target_camel[0] + "".join( - w.title() for w in target_camel[1:] - ) + target_camel = target_camel[0] + "".join(w.title() for w in target_camel[1:]) java.append(f" // ⚠️ REDEFINES ALIAS: Maps to {target_camel} in memory") java.append(" @Transient") @@ -222,9 +213,7 @@ def generate_java_entity(schema_json: dict, package_name: str) -> str: # 🛡️ STRICT STATE INITIALIZATION # For network metrics, initialize to "N/A" instead of leaving null or defaulting to 0. - if base_java_type == "String" and any( - keyword in camel_name.lower() for keyword in ["ping", "lag", "latency"] - ): + if base_java_type == "String" and any(keyword in camel_name.lower() for keyword in ["ping", "lag", "latency"]): java.append(f' private {base_java_type} {camel_name} = "N/A";\n') else: java.append(f" private {base_java_type} {camel_name};\n") @@ -236,13 +225,11 @@ def generate_java_entity(schema_json: dict, package_name: str) -> str: def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("Java Entity Forge (The Legacy Forge)") + enforce_licensing_guard("Java Entity Generator") - parser = argparse.ArgumentParser(description="GitGalaxy Java Entity Forge") + parser = argparse.ArgumentParser(description="GitGalaxy Java Entity Generator") parser.add_argument("schema_file", help="Path to the GitGalaxy _schema.json file") - parser.add_argument( - "--pkg", default="com.gitgalaxy.modernized", help="Base Java package name" - ) + parser.add_argument("--pkg", default="com.gitgalaxy.modernized", help="Base Java package name") args = parser.parse_args() schema_path = Path(args.schema_file).resolve() @@ -253,16 +240,14 @@ def main(): schema = 
json.loads(schema_path.read_text(encoding="utf-8")) java_code = generate_java_entity(schema, args.pkg) - class_name = "".join( - word.capitalize() for word in schema.get("title", "Entity").split("_") - ) + class_name = "".join(word.capitalize() for word in schema.get("title", "Entity").split("_")) out_path = schema_path.parent / f"{class_name}.java" out_path.write_text(java_code, encoding="utf-8") - print(f"☕ Spring Entity Forged: {out_path.name}") + print(f"☕ Spring Entity Generated: {out_path.name}") except Exception as e: - print(f"Error forging Java: {e}") + print(f"Error generating Java Entity: {e}") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/cobol_to_java/cobol_to_java_test_forge.py b/gitgalaxy/tools/cobol_to_java/cobol_to_java_test_forge.py index bb6a6fcf..598577f1 100644 --- a/gitgalaxy/tools/cobol_to_java/cobol_to_java_test_forge.py +++ b/gitgalaxy/tools/cobol_to_java/cobol_to_java_test_forge.py @@ -1,8 +1,18 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Java Spring Test Forge (Phase 3) -# Purpose: Auto-generates JUnit 5 and Spring Boot integration tests to -# mathematically prove the translated architecture compiles and runs. +# GitGalaxy Tool: Java Spring Test Generator +# +# PURPOSE: +# Auto-generates JUnit 5 and Spring Boot integration tests to mathematically +# prove the translated architecture compiles, boots, and routes correctly. +# +# ARCHITECTURAL DECISION: +# AI agents are notoriously unreliable at configuring Spring's ApplicationContext +# or properly mocking external dependencies. By statically generating the `@SpringBootTest` +# and `@WebMvcTest` suites *before* the AI touches the business logic, we establish +# a strict Test-Driven Development (TDD) boundary. 
The agent is forced to write code +# that satisfies the rigid architectural contract, immediately exposing hallucinated +# dependencies or broken Dependency Injection (DI) chains during CI/CD compilation. # ============================================================================== import argparse import sys @@ -11,7 +21,7 @@ def generate_context_test(package_name: str, class_name: str) -> str: - """Forges a @SpringBootTest to ensure the entire dependency injection context loads.""" + """Generates a @SpringBootTest to ensure the entire dependency injection context loads.""" return f"""package {package_name}; import org.junit.jupiter.api.Test; @@ -22,17 +32,20 @@ class {class_name}ApplicationTests {{ @Test void contextLoads() {{ - // If the application.yml is broken or a @Service is missing, - // this test will fail immediately, catching structural regressions. + // ===================================================================== + // DEFENSIVE DESIGN (CONTEXT VALIDATION): + // If the application.yml is broken, a database driver is missing, or + // the AI hallucinates a @Service dependency without implementing it, + // this test will fail immediately upon context initialization, catching + // architectural regressions before runtime. 
+ // ===================================================================== }} }} """ -def generate_controller_test( - package_name: str, class_name: str, endpoint_path: str -) -> str: - """Forges a @WebMvcTest to verify REST API mappings without booting the full server.""" +def generate_controller_test(package_name: str, class_name: str, endpoint_path: str) -> str: + """Generates a @WebMvcTest to verify REST API mappings without booting the full server.""" service_var = class_name[0].lower() + class_name[1:] return f"""package {package_name}.controller; @@ -58,11 +71,11 @@ class {class_name}ControllerTest {{ @Test void testEndpointRouting() throws Exception {{ - // Verifies the @RequestMapping and @PostMapping forged by the API Contract script + // Verifies the @RequestMapping and @PostMapping generated by the API Contract script mockMvc.perform(post("/api/v1/{endpoint_path}/execute") .contentType("application/json") .content("{{}}")) // Empty JSON payload for structural routing test - .andExpect(status().isOk()); // Or isNoContent() based on outputs + .andExpect(status().isOk()); // Or isNoContent() based on defined data outputs }} }} """ @@ -71,13 +84,11 @@ class {class_name}ControllerTest {{ def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("Java Test Forge (The Legacy Forge)") + enforce_licensing_guard("Java Test Generator") - parser = argparse.ArgumentParser(description="GitGalaxy Java Test Forge") + parser = argparse.ArgumentParser(description="GitGalaxy Java Test Generator") parser.add_argument("ir_file", help="Path to the GitGalaxy _ir.json state dump") - parser.add_argument( - "--pkg", default="com.gitgalaxy.modernized", help="Base Java package name" - ) + parser.add_argument("--pkg", default="com.gitgalaxy.modernized", help="Base Java package name") args = parser.parse_args() ir_path = Path(args.ir_file).resolve() @@ -96,25 +107,21 @@ def main(): test_dir = ir_path.parent / "src" / "test" / "java" / 
args.pkg.replace(".", "/") test_dir.mkdir(parents=True, exist_ok=True) - # 1. Forge Context Test + # 1. Generate Context Test context_code = generate_context_test(args.pkg, class_name) - (test_dir / f"{class_name}ApplicationTests.java").write_text( - context_code, encoding="utf-8" - ) + (test_dir / f"{class_name}ApplicationTests.java").write_text(context_code, encoding="utf-8") - # 2. Forge Controller Test + # 2. Generate Controller Test controller_test_dir = test_dir / "controller" controller_test_dir.mkdir(exist_ok=True) controller_code = generate_controller_test(args.pkg, class_name, endpoint_path) - (controller_test_dir / f"{class_name}ControllerTest.java").write_text( - controller_code, encoding="utf-8" - ) + (controller_test_dir / f"{class_name}ControllerTest.java").write_text(controller_code, encoding="utf-8") - print(f"🧪 Spring Boot Test Suite Forged for {class_name}") + print(f"🧪 Spring Boot Test Suite Generated for {class_name}") except Exception as e: - print(f"Error forging Java tests: {e}") + print(f"Error generating Java tests: {e}") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/compliance/README.md b/gitgalaxy/tools/compliance/README.md index 208fc974..01c748c5 100644 --- a/gitgalaxy/tools/compliance/README.md +++ b/gitgalaxy/tools/compliance/README.md @@ -1,4 +1,4 @@ -# GitGalaxy: Zero-Trust SBOM Generation & Compliance Auditing +# GitGalaxy Compliance: Zero-Trust SBOM Generation & Auditing [![Standard](https://img.shields.io/badge/Standard-CycloneDX_1.4-00C957.svg)](#) [![Security](https://img.shields.io/badge/Security-Zero__Trust-FF4500.svg)](#) @@ -6,44 +6,48 @@ Welcome to the **GitGalaxy Compliance & SBOM Suite**. -The industry standard for generating a [Software Bill of Materials (SBOM)](https://squid-protocol.github.io/gitgalaxy/04-02-sbom-generator/) is fundamentally flawed. 
Standard tools open your `package.json`, `composer.json`, or `requirements.txt`, read the list of dependencies, and blindly export them to a PDF. **They trust the manifest.** +The industry standard for generating a Software Bill of Materials (SBOM) is fundamentally flawed. Standard Software Composition Analysis (SCA) tools open your `package.json`, `composer.json`, or `requirements.txt`, read the list of dependencies, and blindly export them to a PDF or JSON file. **They trust the manifest.** -But manifests lie. A supply chain attack doesn't announce itself. A package might claim to be a simple text-formatting utility, but its physical files contain high-entropy encrypted payloads, obfuscated malware, or mismatched languages. +But manifests lie. A supply chain attack doesn't announce itself. A package might claim to be a simple text-formatting utility, but its physical files contain high-entropy encrypted payloads, obfuscated malware, or mismatched programming languages. -GitGalaxy takes a **Zero-Trust** approach. We don't just read the manifest; we physically hunt down the package on your disk and scan its internal contents to mathematically verify its identity and safety before signing off on the compliance report. +GitGalaxy takes a strict **Zero-Trust** approach. We don't just read the manifest; we physically hunt down the package on your disk and scan its internal contents to mathematically verify its structural identity and safety before signing off on the compliance report. 
-### 🧠 The Zero-Trust Strategy: Trust Nothing, Verify Everything +--- + +## 🧠 Engineering Highlights (The Zero-Trust Strategy) + +When you run our Universal SBOM Generator, it leverages the full weight of the GitGalaxy static analysis engine to perform physical audits on your supply chain: -When you run our [Universal SBOM Generator](https://squid-protocol.github.io/gitgalaxy/04-02-sbom-generator/), it leverages the full weight of the GitGalaxy static analysis engine to audit your dependencies: +### 1. The Universal Manifest Slicer +It automatically detects your ecosystem (**NPM, PyPI, Composer, Cargo, Go Modules, Maven, and RubyGems**), slices the manifest, and cross-references the declared dependencies against what actually exists on your hard drive. If a dependency is claimed but missing from the local installation paths, it is flagged as `UNVERIFIED_MISSING_ON_DISK`. -#### 1. The Universal Manifest Slicer -It automatically detects your ecosystem (**NPM, PyPI, Composer, Cargo, Go Modules, Maven, and RubyGems**), slices the manifest, and cross-references the declared dependencies against what actually exists on your hard drive. If a dependency is claimed but missing, it is flagged as `UNVERIFIED_MISSING_ON_DISK`. +### 2. Deep File Inspection & Structural Verification +For every package found on disk, the engine opens the core source files and runs them through our **Structural Signature Analysis Engine** to confirm the file's true identity. +* **Identity Spoofing Detection:** If an attacker hides a malicious bash script by naming it `index.js`, the engine cross-references the extension against internal structural signatures. It triggers an **Architectural Anomaly** alert and flags the package as `SPOOF_DETECTED`. +* **Zero-RAM Entropy Auditing:** We calculate the Shannon Entropy of the raw code. 
If the structural density exceeds standard human programming bounds (e.g., an entropy score > 4.8), we mathematically guarantee it contains encrypted or packed payloads without ever executing the binary. -#### 2. Deep File Inspection & Structural Verification -For every package found on disk, we open the core source files and run them through our **Structural Profiler** to confirm the file's true identity. -* **Identity Spoofing:** If an attacker hides a malicious bash script by naming it `index.js`, the profiler cross-references the extension against the internal file shebangs and structural markers. It triggers an [Identity Crisis](https://squid-protocol.github.io/gitgalaxy/02-05-language-lens/) and flags the package as `SPOOF_DETECTED`. -* **Entropy Auditing:** We calculate the [Shannon Entropy](https://squid-protocol.github.io/gitgalaxy/04-05-binary-anomaly-detector/) of the raw code. If the structural density exceeds standard human programming bounds (e.g., an entropy score > 4.8), we flag it for containing encrypted or packed payloads. +--- -### 🛡️ The Full GitGalaxy Defense Pipeline +## 🛡️ The Full GitGalaxy Defense Pipeline Our compliance auditing isn't just a simple script; it is backed by a multi-tiered, battle-tested heuristic pipeline: -* **[Pre-Process Analyzers (Binary Detection)](https://squid-protocol.github.io/gitgalaxy/02-03-aperture-filter/):** Acts as the frontline perimeter. It detects embedded hex arrays, opaque binary debris, and machine-generated monoliths before they can overwhelm the system. -* **[Metadata & Evasion Sensors](https://squid-protocol.github.io/gitgalaxy/02-06-security-lens/):** Scans your metadata (`.gitattributes`, `Makefile`). Crucially, it hunts for evasion tactics—like an attacker using `.gitignore` to secretly force-include a malicious `.so` binary while hiding it from standard directory scans. 
-* **[Language Verification Engine](https://squid-protocol.github.io/gitgalaxy/02-05-language-lens/):** Bypasses LLM hallucinations by using 60+ strict keyword regex profiles to definitively lock in a file's language family based on structural evidence, not just its extension. -* **[Statistical Outlier Detection](https://squid-protocol.github.io/gitgalaxy/02-09-signal-processing/):** Applies Z-Score math across the codebase. If a file claims to be a specific language but its structural logic density is a mathematical outlier compared to the rest of the ecosystem, it drops into **Quarantine**. We catch malware trying to disguise itself as inert data dumps. +* **Ingestion Firewall (`aperture.py`):** Acts as the frontline perimeter. It detects embedded hex arrays, opaque binary debris, and machine-generated monoliths before they can overwhelm the system. +* **Identity Classifier (`language_lens.py`):** Bypasses LLM hallucinations by using strict regex profiles to definitively lock in a file's language family based on structural evidence, not just its extension. +* **Threat Inference Engine (`security_lens.py`):** Scans the package metadata. Crucially, it hunts for evasion tactics—like an attacker using steganographic imports or safety bypasses hidden within the dependency tree. +* **Statistical Auditor (`statistical_auditor.py`):** Applies Z-Score math across the codebase. If a file claims to be a specific language but its logic density is a mathematical outlier compared to the rest of the ecosystem, it drops into Quarantine. --- -### ⚡ Performance Showcase: The Kubernetes Audit +## ⚡ Performance Showcase: The Kubernetes Audit To prove the engine scales to massive enterprise architectures, we ran the SBOM Generator against the **Kubernetes** repository. The tool instantly parsed the Go modules, located 170 physical third-party dependencies within the local `vendor/` directory, and mathematically verified their source code. 
Crucially, it correctly identified the 30 "missing" packages as internal monorepo workspace modules rather than failing the scan, proving its deep architectural awareness. -![Kubernetes SBOM Demo](../../../docs/wiki/assets/kubernetes_sbom_gen.gif) +![Kubernetes SBOM Demo](https://raw.githubusercontent.com/squid-protocol/gitgalaxy/main/docs/wiki/assets/kubernetes_sbom_gen.gif) -```text +```text 🔎 Auditing 200 GOLANG dependencies from go.mod... ⚠️ [MISSING] k8s.io/api@v0.0.0 ⚠️ [MISSING] k8s.io/apimachinery@v0.0.0 @@ -61,55 +65,20 @@ The tool instantly parsed the Go modules, located 170 physical third-party depen Spoofed / Infected : 0 --------------------------------------------------------------------------- ✅ SUCCESS: Mathematical verification complete. SBOM sealed. -``` +``` 📂 **Inspect the Output:** [Click here to view the actual `kubernetes_bom.json` generated by this scan.](kubernetes_bom.json) --- -### 🚀 Quickstart: Generate a Zero-Trust SBOM +## 🚀 CI/CD & Pre-Commit Integration -#### 1. Local CLI Execution -If you have installed GitGalaxy globally via PyPI (`pip install gitgalaxy`), you can generate a CycloneDX JSON file instantly from your terminal: +### 1. Local CLI Execution +If you have installed GitGalaxy globally, you can generate a CycloneDX JSON file instantly from your terminal: -```bash +```bash zero-trust-sbom /path/to/your/project -``` - -#### 2. GitHub Actions CI/CD Integration -Automate your compliance by generating and saving a mathematically verified SBOM on every release (see our [Cookbook Recipe](https://squid-protocol.github.io/gitgalaxy/cookbook/generate-zero-trust-sbom/)). Create `.github/workflows/generate-sbom.yml`: - -```yaml -name: Generate Zero-Trust SBOM - -on: - release: - types: [published] - -jobs: - build-sbom: - runs-on: ubuntu-latest - steps: - - name: Checkout Repository - uses: actions/checkout@v4 - - - name: Build SBOM via GitGalaxy - uses: squid-protocol/gitgalaxy@main - with: - tool: 'zero-trust-sbom' - target: '.' 
- args: '--out bom.json' - - - name: Upload SBOM Artifact - uses: actions/upload-artifact@v4 - with: - name: cyclonedx-sbom - path: ./*_bom.json -``` - ---- -### 🌌 Powered by the blAST Engine -This tool is a modular enterprise integration within the broader GitGalaxy architecture. It is powered by the **[blAST Engine](https://squid-protocol.github.io/gitgalaxy/01-03-the-blast-paradigm/)**, an AST-free, mathematical heuristics engine capable of mapping repositories at 100,000 LOC/sec. +``` -* 📖 **[Read the Official Wiki](https://squid-protocol.github.io/gitgalaxy/)** for deep dives into the engine's static analysis methodologies, architecture blueprints, and the **[Taxonomical Equivalence Map](https://squid-protocol.github.io/gitgalaxy/03-03-claim-3-taxonomy-map/)**. -* 🪐 **[Return to the Main GitGalaxy Hub](https://github.com/squid-protocol/gitgalaxy)** to explore other enterprise tools like **[Supply Chain Firewalls](https://squid-protocol.github.io/gitgalaxy/04-03-supply-chain-firewall/)** and **[Terabyte Log Scanners](https://squid-protocol.github.io/gitgalaxy/04-07-terabyte-log-scanner/)**. \ No newline at end of file +### 2. 
GitHub Actions CI/CD Integration +Automate your compliance by generating and saving a mathematically verified SBOM on every release \ No newline at end of file diff --git a/gitgalaxy/tools/compliance/sbom_generator.py b/gitgalaxy/tools/compliance/sbom_generator.py index e6b15d4a..d1f450e6 100644 --- a/gitgalaxy/tools/compliance/sbom_generator.py +++ b/gitgalaxy/tools/compliance/sbom_generator.py @@ -18,7 +18,7 @@ from gitgalaxy.security.security_lens import SecurityLens from gitgalaxy.standards.analysis_lens import ThreatPolicy from gitgalaxy.standards.language_standards import LANGUAGE_DEFINITIONS -from gitgalaxy.standards.gitgalaxy_config import COMMENT_DEFINITIONS +from gitgalaxy.standards.gitgalaxy_config import LEXICAL_FAMILY_HEURISTICS from gitgalaxy.standards.language_lens import LanguageDetector @@ -56,11 +56,7 @@ def slice_manifest(manifest_path: Path) -> Tuple[str, Dict[str, str]]: if line and not line.startswith("#"): clean_name = re.split(r"[=><~]", line)[0].strip() # Get version if explicitly defined, else 'latest' - version = ( - line.split("==")[1].strip() - if "==" in line - else "latest" - ) + version = line.split("==")[1].strip() if "==" in line else "latest" deps[clean_name] = version elif filename == "Cargo.toml": @@ -68,18 +64,14 @@ def slice_manifest(manifest_path: Path) -> Tuple[str, Dict[str, str]]: with open(manifest_path, "r", encoding="utf-8") as f: content = f.read() # Universal regex to grab [dependencies] blocks - dep_blocks = re.findall( - r"\[(?:dev-)?dependencies\](.*?)(\n\[|$)", content, re.DOTALL - ) + dep_blocks = re.findall(r"\[(?:dev-)?dependencies\](.*?)(\n\[|$)", content, re.DOTALL) for block, _ in dep_blocks: for line in block.splitlines(): line = line.strip() if line and not line.startswith("#") and "=" in line: pkg_name = line.split("=")[0].strip() pkg_name = line.split("=")[0].strip() - deps[pkg_name] = ( - "latest" # Simplified version extraction - ) + deps[pkg_name] = "latest" # Simplified version extraction elif 
filename == "go.mod": ecosystem = "golang" @@ -111,9 +103,7 @@ def slice_manifest(manifest_path: Path) -> Tuple[str, Dict[str, str]]: if line.startswith("gem "): parts = line.split(",") pkg_name = parts[0].replace("gem", "").strip(" '\"") - version = ( - parts[1].strip(" '\"") if len(parts) > 1 else "latest" - ) + version = parts[1].strip(" '\"") if len(parts) > 1 else "latest" deps[pkg_name] = version elif filename == "pom.xml": @@ -135,9 +125,7 @@ def slice_manifest(manifest_path: Path) -> Tuple[str, Dict[str, str]]: return ecosystem, deps @staticmethod - def locate_physical_package( - target_path: Path, pkg_name: str, ecosystem: str - ) -> Path: + def locate_physical_package(target_path: Path, pkg_name: str, ecosystem: str) -> Path: """Hunts for the physical location of a package within the project bounds.""" if ecosystem == "npm": target = target_path / "node_modules" / pkg_name @@ -158,9 +146,7 @@ def locate_physical_package( if "site-packages" in root: # Case-insensitive match for the package folder for d in dirs: - if d.lower() == safe_pkg_name or d.lower().startswith( - f"{safe_pkg_name}-" - ): + if d.lower() == safe_pkg_name or d.lower().startswith(f"{safe_pkg_name}-"): return Path(root) / d elif ecosystem == "golang": @@ -212,7 +198,7 @@ def main(): print(f"📦 GitGalaxy Universal SBOM Generator engaging on {target_path.name}...") security = SecurityLens(policy=ThreatPolicy.get_policy("paranoid")) - detector = LanguageDetector(LANGUAGE_DEFINITIONS, COMMENT_DEFINITIONS) + detector = LanguageDetector(LANGUAGE_DEFINITIONS, LEXICAL_FAMILY_HEURISTICS) slicer = UniversalManifestSlicer() components = [] @@ -230,9 +216,7 @@ def main(): "Gemfile", "pom.xml", ] - found_manifests = [ - target_path / m for m in manifest_targets if (target_path / m).exists() - ] + found_manifests = [target_path / m for m in manifest_targets if (target_path / m).exists()] if not found_manifests: print("⚠️ No supported manifests found in the root directory.") @@ -244,9 +228,7 @@ def 
main(): if not packages: continue - print( - f"\n🔎 Auditing {len(packages)} {ecosystem.upper()} dependencies from {manifest.name}..." - ) + print(f"\n🔎 Auditing {len(packages)} {ecosystem.upper()} dependencies from {manifest.name}...") for pkg_name, pkg_version in packages.items(): trust_status = "VERIFIED_SAFE" @@ -256,9 +238,7 @@ def main(): if not pkg_path: trust_status = "UNVERIFIED_MISSING_ON_DISK" - anomaly_notes.append( - "Package declared in manifest but not found locally." - ) + anomaly_notes.append("Package declared in manifest but not found locally.") total_missing += 1 else: # Physical Audit (Max 5 core files to maintain velocity) @@ -273,9 +253,7 @@ def main(): file_path = Path(root) / file try: - with open( - file_path, "r", encoding="utf-8", errors="ignore" - ) as f: + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: content = f.read(8192) sec_results = security.scan_content(content, 100) @@ -311,9 +289,7 @@ def main(): {"name": "gitgalaxy:trust_status", "value": trust_status}, { "name": "gitgalaxy:anomaly_notes", - "value": ( - " | ".join(anomaly_notes) if anomaly_notes else "None" - ), + "value": (" | ".join(anomaly_notes) if anomaly_notes else "None"), }, ], } @@ -361,9 +337,7 @@ def main(): print(f" Spoofed / Infected : {total_anomalies}") print("-" * 75) if total_anomalies > 0: - print( - f" ❌ ALERT: {total_anomalies} dependencies failed physical structural verification." - ) + print(f" ❌ ALERT: {total_anomalies} dependencies failed physical structural verification.") else: print(" ✅ SUCCESS: Mathematical verification complete. 
SBOM sealed.") print("=" * 75 + "\n") diff --git a/gitgalaxy/tools/network_auditing/README.md b/gitgalaxy/tools/network_auditing/README.md index e7b2362c..a24c4554 100644 --- a/gitgalaxy/tools/network_auditing/README.md +++ b/gitgalaxy/tools/network_auditing/README.md @@ -1,77 +1,98 @@ -# GitGalaxy: API Security & Shadow API Detection +# GitGalaxy: API Network Map & Shadow API Hunter [![Frameworks](https://img.shields.io/badge/Supported-Python_|_Node_|_Java_|_Go_|_C%23_|_Rust_|_Ruby_|_PHP-00C957.svg)](#) -[![Architecture](https://img.shields.io/badge/Architecture-AST--Free_Regex-00BFFF.svg)](#) -[![Security](https://img.shields.io/badge/Security-Shadow_API_Hunter-FF4500.svg)](#) +[![Architecture](https://img.shields.io/badge/Architecture-AST--Free_Heuristics-00BFFF.svg)](#) +[![Security](https://img.shields.io/badge/Security-Shadow_API_Detection-FF4500.svg)](#) -Welcome to the **GitGalaxy API Security & Attack Surface Mapping Suite**. +Welcome to the **GitGalaxy Full API Network Map**. Security documentation is often strictly theoretical, whereas compiled source code represents physical reality. Attackers do not exploit the APIs you have documented; they hunt for the forgotten, undocumented endpoints left exposed in your codebase. -Standard DevSecOps scanners ([like Checkmarx, SonarQube, or Semgrep](https://squid-protocol.github.io/gitgalaxy/04-00-security_landscape/)) rely on approved Swagger or OpenAPI files to dictate what should be tested. GitGalaxy provides a deterministic source of truth. By scanning the raw codebase at high velocity, we reveal the exact routing logic that is actively exposed to the network, regardless of what the documentation claims. +Standard DevSecOps scanners rely on approved Swagger or OpenAPI files to dictate what should be tested. GitGalaxy provides a deterministic source of truth. 
By scanning the raw codebase at high velocity, we reveal the exact routing logic that is actively exposed to the network, regardless of what the documentation claims. -### 🔍 Core Methodology: OpenAPI Drift Detection +--- -We utilize AST-free stoichiometric signatures—calculated metrics derived directly from DNA/regex hits—to bypass theoretical documentation and map physical routing logic at extreme velocity. +### 🌐 What It Is -* **Map Physical Reality:** Scans raw text for actual execution routes without needing a compiler or build environment. -* **Extract Theoretical Truth:** Parses official Swagger or OpenAPI specifications. -* **Mathematical Resolution:** Applies strict set theory to expose critical security gaps and [API drift](https://squid-protocol.github.io/gitgalaxy/04-01-full-api-network-map/). -* **Identify Shadow APIs (Critical Risk):** Exposes undocumented, active endpoints that evade standard WAFs and security audits, allowing you to seamlessly [hunt Shadow APIs in CI/CD pipelines](https://squid-protocol.github.io/gitgalaxy/cookbook/hunt-shadow-apis/). -* **Identify Ghost/Zombie APIs (Audit Bloat):** Highlights documented but non-existent or deprecated endpoints. +The **Full API Network Map** is a high-velocity, language-agnostic static analysis tool that automatically compares your physical source code routing signatures against your official OpenAPI/Swagger documentation. -### 🧠 Smart Auto-Discovery & Monorepo Support +It deterministically maps the delta between theoretical documentation and physical execution to expose critical security gaps: +* **🚨 Shadow APIs (Critical Risk):** Undocumented, active endpoints that evade standard Web Application Firewalls (WAFs) and security audits. +* **👻 Ghost APIs (Audit Bloat):** Documented but non-existent or deprecated endpoints that waste security team resources and cause integration failures. -You don't need to manually feed specification files to the engine. 
GitGalaxy features intelligent target acquisition: -* **Signature Grepping:** Recursively hunts and deep-scans files for OpenAPI/Swagger structural signatures. -* **Test-Pollution Bypassing:** Automatically isolates and ignores specification files hidden inside `test/` or `__tests__/` directories. -* **Microservice Unioning:** Use the `--merge-all` flag to automatically stitch together multiple discovered Swagger files in a monorepo into a single unified truth state. +--- -### 🛡️ Field-Tested at Scale: The tRPC Audit -To prove this engine operates at enterprise scale, we field-tested it against **tRPC**, a massive, highly complex TypeScript monorepo. +### 🛡️ Why It Was Built (Architectural Decisions) -Standard tools would choke on the nested monorepo architecture or require complex AST setups. GitGalaxy bypassed the noise, mapped the physical endpoints, and immediately identified an undocumented `GET /` Shadow API living in a `server.ts` file, while simultaneously flagging three documented endpoints that didn't actually exist in the physical code. +API documentation inevitably drifts from the compiled reality of the codebase. Traditional solutions to this problem rely on heavy Abstract Syntax Tree (AST) parsers, which require specific compiler environments, crash on malformed code, and struggle across polyglot microservice architectures. -![tRPC API Audit Demo](../../../docs/wiki/assets/trpc_api_audit.gif) +We built this engine to prioritize **velocity, resilience, and scale**: +* **AST-Free Regex Signatures:** By utilizing bounded structural signatures, we can deterministically identify framework routing intents (e.g., `@GetMapping`, `app.post`) at high speed across 9+ languages simultaneously, bypassing the need for complex compilation environments. +* **O(1) Memory Shield for Auto-Discovery:** Reading entire massive JSON/YAML specification blobs into memory can cause Out-Of-Memory (OOM) crashes in CI/CD pipelines. 
Our auto-discovery engine restricts read buffers to the first 1000 characters, verifying Swagger signatures instantly without consuming excessive RAM. +* **Test-Schema Pollution Mitigation:** Enterprise monorepos are often littered with mock Swagger files used for unit testing. The engine automatically segregates and ignores schemas located in `test/` or `__tests__/` directories, preventing test stubs from polluting the production audit. -### ⚙️ Supported Frameworks +--- -Our AST-free regex signatures deterministically map open APIs across multiple languages natively: +### ⚙️ How It Works (The Core Methodology) -* **Node.js:** Express, Fastify, and Koa (`app.post`) -* **Python:** FastAPI, Flask, and Django (`@app.get`) -* **Java:** Spring Boot (`@GetMapping`) -* **Golang:** Gorilla Mux, Gin, Fiber (`.HandleFunc`) -* **C# (.NET):** Controllers and Minimal APIs (`[HttpGet]`, `.MapGet`) -* **Rust:** Actix and Rocket (`#[get]`) -* **PHP:** Laravel and Symfony (`Route::get`) -* **Ruby:** Rails and Sinatra +1. **Auto-Discovery:** The engine recursively hunts for OpenAPI specifications in the target directory. It intelligently bypasses test directories and provides ambiguity warnings if multiple primary schemas are detected. +2. **Physical Codebase Mapping:** The engine scans raw source files, matching physical routing intents against a library of polyglot framework signatures (Spring Boot, Express, FastAPI, Gorilla Mux, .NET, Laravel, Actix, etc.). +3. **Mathematical Resolution:** Strict set theory is applied. The physical endpoints are subtracted from the documented endpoints (and vice-versa) to isolate the exact Shadow and Ghost APIs. +4. **Dashboard Presentation:** Generates a clean, actionable report of the exact files and endpoints violating the architectural contract. --- -### 🚀 Quickstart: Local CLI & CI/CD Integration - -If you have installed GitGalaxy globally via PyPI (`pip install gitgalaxy`), the API mapper is available directly in your terminal. 
It executes in seconds, making it ideal for both local checks and automated pipelines. +### 🚀 Usage & Clear Examples #### 1. Local CLI Execution - Execute the tool directly against your physical source code. The engine will auto-discover your Swagger file and generate an immediate gap analysis: ```bash -api-network-map /path/to/source/code +python3 full_api_network_map.py /path/to/source/code ``` -*(Optional) Target a specific specification file directly:* +#### 2. Handling Microservice Monorepos +If your repository contains multiple microservices, each with its own Swagger file, you can instruct the engine to union them into a single mathematical truth state: + ```bash -api-network-map /path/to/source/code --swagger /path/to/swagger.json +python3 full_api_network_map.py /path/to/source/code --merge-all ``` -*(Optional) Merge all discovered Swagger files in a microservice monorepo:* +#### 3. Explicit Specification Targeting +Bypass the auto-discovery engine and audit the codebase against a highly specific, mandated architectural contract: + ```bash -api-network-map /path/to/source/code --merge-all +python3 full_api_network_map.py /path/to/source/code --swagger /path/to/official_swagger.json ``` -#### 2. GitHub Actions CI/CD Integration +#### 📊 Example Output Dashboard +Below is an example of the CLI output when the engine detects OpenAPI drift within a modern web application: + +```text +🗺️ GitGalaxy API Network Mapper analyzing physical endpoints in: backend-monorepo... 
+ + [DISCOVERY] Auto-discovered Swagger specification: docs/openapi.json + +========================================================== + 📡 SHADOW API SECURITY AUDIT +========================================================== + Physical Frameworks Tracked : Python (FastAPI/Flask/Django), Node.js (Express/Fastify/Koa) + Documented Endpoints (Swagger) : 24 + Physical Endpoints (Source) : 26 +---------------------------------------------------------- + 🚨 SHADOW APIs DETECTED: 2 (Critical Risk) + ↳ POST /api/v1/debug_admin [Found in: server.js] + ↳ GET /api/v1/legacy_export [Found in: routes.py] + +---------------------------------------------------------- + 👻 GHOST APIs DETECTED: 1 (Documentation Bloat) + ↳ DELETE /api/v1/users [Missing from executable source code] +========================================================== +``` + +--- + +### 🛠️ GitHub Actions CI/CD Integration You can automatically audit your API surface area on every Pull Request to ensure developers aren't silently exposing new endpoints without updating the Swagger documentation. @@ -92,28 +113,16 @@ jobs: uses: actions/checkout@v4 - name: Run Shadow API Hunter - uses: squid-protocol/gitgalaxy@main - with: - tool: 'api-network-map' - target: '.' - # Optional: Add extra arguments if you have multiple Swagger files - # args: '--merge-all' + run: | + pip install pyyaml + python3 full_api_network_map.py . ``` --- - -### 📊 The Audit Dashboard -Outputs a deterministic terminal dashboard optimized for CI/CD pipeline integration and security team review. - -* **Shadow APIs Detected:** Lists physical files containing hidden, undocumented routers. -* **Ghost APIs Detected:** Lists missing routes to eliminate security team audit bloat. - ---- -### 🌌 Powered by the blAST Engine (Bypassing LLMs and ASTs) -This tool is a modular enterprise integration within the broader GitGalaxy architecture. 
It is driven by our custom mathematical heuristics engine, capable of mapping multi-dimensional relationships at extreme velocity. Read the official documentation to see how we deterministically map API routes: +### 🌌 Powered by the blAST Engine +This tool is a modular enterprise integration within the broader GitGalaxy architecture. It is driven by our custom mathematical heuristics engine, capable of mapping multi-dimensional relationships at extreme velocity without relying on fragile LLMs or brittle ASTs. Read the official documentation to explore the underlying architecture: * 📖 **[The blAST Paradigm (ASTs vs LLMs)](https://squid-protocol.github.io/gitgalaxy/01-03-the-blast-paradigm/)** * 📖 **[Full API Network Map Architecture](https://squid-protocol.github.io/gitgalaxy/04-01-full-api-network-map/)** * 📖 **[The Network Risk Sensor Mechanics](https://squid-protocol.github.io/gitgalaxy/02-16-network-risk-sensor/)** -* 📖 **[API Exposure Risk Equations](https://squid-protocol.github.io/gitgalaxy/08-14-api-exposure/)** * 🪐 **[Return to the Main GitGalaxy Hub](https://github.com/squid-protocol/gitgalaxy)** \ No newline at end of file diff --git a/gitgalaxy/tools/network_auditing/full_api_network_map.py b/gitgalaxy/tools/network_auditing/full_api_network_map.py index 40e75253..494e690a 100644 --- a/gitgalaxy/tools/network_auditing/full_api_network_map.py +++ b/gitgalaxy/tools/network_auditing/full_api_network_map.py @@ -1,8 +1,18 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Full API Network Map (Extended Frameworks & Auto-Discovery) -# Purpose: Hunts down undocumented "Shadow APIs" by comparing physical -# source code routers against official OpenAPI/Swagger documentation. 
+# GitGalaxy Tool: Full API Network Map +# +# PURPOSE: +# Detects undocumented "Shadow APIs" and missing "Ghost APIs" by comparing +# physical source code routing signatures against official OpenAPI/Swagger +# documentation. +# +# ARCHITECTURAL DECISION: +# API documentation frequently drifts from the compiled reality of the codebase. +# This module performs AST-free structural signature identification to map +# actual, executable endpoints across multiple frameworks (Spring, Express, +# FastAPI) and enforces strict parity with declared specifications, exposing +# hidden attack surfaces. # ============================================================================== import argparse import sys @@ -17,9 +27,9 @@ yaml = None # ============================================================================== -# 1. THE ROUTER PHYSICS (EXPANDED FRAMEWORK REGEX TRAPS) +# 1. ROUTER STRUCTURAL SIGNATURES (EXPANDED FRAMEWORK REGEX PATTERNS) # ============================================================================== -FRAMEWORK_TRAPS = { +FRAMEWORK_SIGNATURES = { "Python (FastAPI/Flask/Django)": { "ext": [".py"], "regex": re.compile( @@ -43,9 +53,7 @@ }, "Golang (Gorilla/Mux/Gin/Fiber)": { "ext": [".go"], - "regex": re.compile( - r'\.(GET|POST|PUT|DELETE|PATCH)\s*\(\s*["\'](.*?)["\']', re.IGNORECASE - ), + "regex": re.compile(r'\.(GET|POST|PUT|DELETE|PATCH)\s*\(\s*["\'](.*?)["\']', re.IGNORECASE), }, "C# (.NET Controllers)": { "ext": [".cs"], @@ -56,15 +64,11 @@ }, "C# (.NET Minimal APIs)": { "ext": [".cs"], - "regex": re.compile( - r'\.Map(Get|Post|Put|Delete|Patch)\s*\(\s*["\'](.*?)["\']', re.IGNORECASE - ), + "regex": re.compile(r'\.Map(Get|Post|Put|Delete|Patch)\s*\(\s*["\'](.*?)["\']', re.IGNORECASE), }, "PHP (Laravel/Symfony)": { "ext": [".php"], - "regex": re.compile( - r'Route::(get|post|put|delete|patch)\s*\(\s*["\'](.*?)["\']', re.IGNORECASE - ), + "regex": re.compile(r'Route::(get|post|put|delete|patch)\s*\(\s*["\'](.*?)["\']', re.IGNORECASE), }, "Rust 
(Actix/Rocket)": { "ext": [".rs"], @@ -84,7 +88,7 @@ def auto_discover_swagger(target_dir: Path) -> list: - """Hunts for OpenAPI/Swagger files by filename and internal content signatures.""" + """Scans for OpenAPI/Swagger specifications via filename and internal structural signatures.""" candidates = set() common_names = { "swagger.json", @@ -104,7 +108,13 @@ def auto_discover_swagger(target_dir: Path) -> list: candidates.add(filepath) continue - # 2. Deep Content Grep (Read first 1000 characters for speed) + # ============================================================================== + # DEFENSIVE DESIGN (I/O OPTIMIZATION & MEMORY SHIELD): + # Reading entire JSON/YAML files just to check if they are valid Swagger specs + # can cause OOM crashes on massive declarative data blobs. We restrict the read + # buffer to the first 1000 characters to achieve O(1) memory validation while + # maintaining extreme pipeline velocity. + # ============================================================================== try: with open(filepath, "r", encoding="utf-8", errors="ignore") as f: head = f.read(1000) @@ -121,18 +131,14 @@ def auto_discover_swagger(target_dir: Path) -> list: def parse_official_swagger(swagger_path: Path) -> set: - """Parses the official security documentation to find 'Approved' APIs.""" + """Parses the official security documentation to extract a baseline of approved APIs.""" approved_apis = set() try: with open(swagger_path, "r", encoding="utf-8") as f: if swagger_path.suffix.lower() in [".yaml", ".yml"]: if yaml is None: - print( - f" ❌ Error: PyYAML is required to parse .yaml Swagger files ({swagger_path.name})." - ) - print( - " Please run 'pip install pyyaml' or provide a .json specification." 
- ) + print(f" ❌ Error: PyYAML is required to parse .yaml Swagger files ({swagger_path.name}).") + print(" Please run 'pip install pyyaml' or provide a .json specification.") sys.exit(1) swagger_data = yaml.safe_load(f) else: @@ -151,7 +157,15 @@ def parse_official_swagger(swagger_path: Path) -> set: def map_physical_codebase(target_dir: Path) -> tuple: - """Rips through the source code to find every API endpoint actually compiled.""" + """ + Analyzes the source code to extract every executable API endpoint. + + ARCHITECTURAL DECISION (REGEX OVER AST): + Relying on language-specific ASTs requires compiling the code and supporting + dozens of parsers. By using bounded regex structural signatures, we can + deterministically identify framework routing intents (e.g., @GetMapping, + app.post) at high speed, regardless of language or compilation status. + """ physical_apis = defaultdict(list) frameworks_detected = set() @@ -159,7 +173,7 @@ def map_physical_codebase(target_dir: Path) -> tuple: if not filepath.is_file(): continue - for framework, config in FRAMEWORK_TRAPS.items(): + for framework, config in FRAMEWORK_SIGNATURES.items(): if filepath.suffix in config["ext"]: try: content = filepath.read_text(encoding="utf-8", errors="ignore") @@ -183,9 +197,7 @@ def main(): enforce_licensing_guard("Full API Network Map") parser = argparse.ArgumentParser(description="GitGalaxy Full API Network Map") - parser.add_argument( - "source", help="Directory containing the application source code" - ) + parser.add_argument("source", help="Directory containing the application source code") parser.add_argument( "--swagger", required=False, @@ -204,12 +216,10 @@ def main(): print(f"Error: Target source directory '{source_path}' does not exist.") sys.exit(1) - print( - f"🗺️ GitGalaxy Network Mapper analyzing physical endpoints in: {source_path.name}...\n" - ) + print(f"🗺️ GitGalaxy API Network Mapper analyzing physical endpoints in: {source_path.name}...\n") # 
============================================================================== - # AUTO-DISCOVERY HANDSHAKE & THE AUDIT + # AUTO-DISCOVERY INITIALIZATION & AUDIT # ============================================================================== approved_apis = set() @@ -218,9 +228,7 @@ def main(): candidates = auto_discover_swagger(source_path) if not candidates: - print( - " ❌ [ABORT] No OpenAPI/Swagger specifications found in the repository." - ) + print(" ❌ [ABORT] No OpenAPI/Swagger specifications found in the repository.") print(" Please provide one manually using the --swagger flag.") sys.exit(1) @@ -229,35 +237,24 @@ def main(): test_cands = [] for c in candidates: parts = [p.lower() for p in c.relative_to(source_path).parts] - if ( - "test" in parts - or "tests" in parts - or "__tests__" in parts - or "testing" in parts - ): + if "test" in parts or "tests" in parts or "__tests__" in parts or "testing" in parts: test_cands.append(c) else: primary_cands.append(c) if len(primary_cands) == 1 and not args.merge_all: swagger_path = primary_cands[0] - print( - f" 🎯 Auto-discovered Primary Swagger: {swagger_path.relative_to(source_path)}" - ) + print(f" [DISCOVERY] Primary Swagger specification identified: {swagger_path.relative_to(source_path)}") if test_cands: - print( - f" 🛡️ Safely bypassed {len(test_cands)} schemas detected in test directories:" - ) + print(f" 🛡️ Safely excluded {len(test_cands)} schemas detected in test directories (Test-Schema Pollution Mitigation):") for tc in test_cands: print(f" - [Assumed Test] {tc.relative_to(source_path)}") print("") approved_apis = parse_official_swagger(swagger_path) elif len(candidates) > 1 and not args.merge_all: - print( - f" ⚠️ [AMBIGUITY] Multiple OpenAPI/Swagger specifications found ({len(candidates)})." 
- ) - print(" To prevent test-file pollution, automatic merging is disabled.") + print(f" ⚠️ [AMBIGUITY] Multiple OpenAPI/Swagger specifications found ({len(candidates)}).") + print(" To prevent test-schema pollution, automatic merging is disabled.") print("\n Discovered Files (By Endpoint Count):") # Calculate telemetry (route counts) to help the user choose @@ -274,18 +271,14 @@ def main(): for c, count, is_test in preview_stats: badge = "[TEST DIR]" if is_test else "[PRIMARY]" - print( - f" - {badge.ljust(11)} [{count} routes] {c.relative_to(source_path)}" - ) + print(f" - {badge.ljust(11)} [{count} routes] {c.relative_to(source_path)}") - print("\n Please specify the correct schema using the --swagger flag,") + print("\n Please specify the authoritative schema using the --swagger flag,") print(" OR use the --merge-all flag to union all of them together.") sys.exit(1) elif len(candidates) > 1 and args.merge_all: - print( - f" 🎯 --merge-all active. Unioning {len(candidates)} discovered specifications...\n" - ) + print(f" [DISCOVERY] --merge-all active. 
Unioning {len(candidates)} discovered specifications...\n") for c in candidates: try: approved_apis.update(parse_official_swagger(c)) @@ -293,9 +286,7 @@ def main(): pass else: swagger_path = candidates[0] - print( - f" 🎯 Auto-discovered Swagger specification: {swagger_path.relative_to(source_path)}\n" - ) + print(f" [DISCOVERY] Auto-discovered Swagger specification: {swagger_path.relative_to(source_path)}\n") approved_apis = parse_official_swagger(swagger_path) else: swagger_path = Path(args.swagger).resolve() @@ -303,6 +294,7 @@ def main(): print(f" ❌ Error: Provided Swagger file '{swagger_path}' does not exist.") sys.exit(1) approved_apis = parse_official_swagger(swagger_path) + physical_apis_map, frameworks_detected = map_physical_codebase(source_path) physical_endpoints = set(physical_apis_map.keys()) @@ -315,27 +307,25 @@ def main(): print("==========================================================") print(" 📡 SHADOW API SECURITY AUDIT") print("==========================================================") - framework_str = ( - ", ".join(frameworks_detected) if frameworks_detected else "None Detected" - ) + framework_str = ", ".join(frameworks_detected) if frameworks_detected else "None Detected" print(f" Physical Frameworks Tracked : {framework_str}") print(f" Documented Endpoints (Swagger) : {len(approved_apis)}") print(f" Physical Endpoints (Source) : {len(physical_endpoints)}") print("-" * 58) if shadow_apis: - print(f" 🚨 SHADOW APIS DETECTED: {len(shadow_apis)} (Critical Risk)") + print(f" 🚨 SHADOW APIs DETECTED: {len(shadow_apis)} (Critical Risk)") for api in sorted(shadow_apis): files = ", ".join(set(physical_apis_map[api])) print(f" ↳ {api.ljust(25)} [Found in: {files}]") else: - print(" ✅ No Shadow APIs detected. Codebase matches documentation.") + print(" ✅ No Shadow APIs detected. 
Codebase strictly matches documentation.") print("\n----------------------------------------------------------") if ghost_apis: - print(f" 👻 GHOST APIS DETECTED: {len(ghost_apis)} (Documentation Bloat)") + print(f" 👻 GHOST APIs DETECTED: {len(ghost_apis)} (Documentation Bloat)") for api in sorted(ghost_apis): - print(f" ↳ {api.ljust(25)} [Missing from source code]") + print(f" ↳ {api.ljust(25)} [Missing from executable source code]") else: print(" ✅ No Ghost APIs detected.") @@ -348,11 +338,7 @@ def run_api_audit(source_path: Path) -> dict: if not candidates: return {"status": "no_swagger", "shadow_count": 0, "ghost_count": 0} - primary_cands = [ - c - for c in candidates - if "test" not in [p.lower() for p in c.relative_to(source_path).parts] - ] + primary_cands = [c for c in candidates if "test" not in [p.lower() for p in c.relative_to(source_path).parts]] if len(primary_cands) != 1: return {"status": "ambiguous", "shadow_count": 0, "ghost_count": 0} @@ -370,4 +356,4 @@ def run_api_audit(source_path: Path) -> dict: if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/supply_chain_security/README.md b/gitgalaxy/tools/supply_chain_security/README.md index b0ccd9e4..390448c0 100644 --- a/gitgalaxy/tools/supply_chain_security/README.md +++ b/gitgalaxy/tools/supply_chain_security/README.md @@ -1,95 +1,92 @@ -# GitGalaxy: Supply Chain Security & Pre-Commit Sentinels +# GitGalaxy Security: Supply Chain & CI/CD Defense Suite -[![Integration](https://img.shields.io/badge/Integration-CI%2FCD_Ready-00C957.svg)](#) -[![Accuracy](https://img.shields.io/badge/Accuracy-Internal_File_Scanning-00BFFF.svg)](#) -[![Defense](https://img.shields.io/badge/Defense-Zero__Trust-FF4500.svg)](#) +[![Security](https://img.shields.io/badge/Security-Zero--Trust_Validation-FF4500.svg)](#) +[![Performance](https://img.shields.io/badge/Performance-Zero--Disk_Enforcement-00BFFF.svg)](#) 
+[![Compliance](https://img.shields.io/badge/Compliance-Shift--Left_Automation-8A2BE2.svg)](#) -Welcome to the **GitGalaxy Supply Chain Security Suite**. +This directory contains the CI/CD gating mechanisms, pre-commit hooks, and dependency validation engines for the GitGalaxy architecture. -Standard security scanners ([like Snyk, Dependabot, or Trivy](https://squid-protocol.github.io/gitgalaxy/04-00-security_landscape/)) have a massive blind spot: they read your `package.json` or `requirements.txt` and check those names against CVE databases. They act as manifest readers, never looking inside the actual downloaded files. +Standard Software Composition Analysis (SCA) tools inherently possess a critical blind spot: they act as manifest readers. They scan `package.json` or `requirements.txt` and cross-reference those declarations against known CVE databases. Modern supply chain compromises bypass this entirely by omitting malicious payloads from the manifest declarations, burying the threat in transit, or executing it dynamically during installation. -Modern attackers (like the **XZ-Utils** or **Glassworm** campaigns) exploit this. They don't announce their malware in a manifest. +The GitGalaxy Supply Chain Security Suite operates on a strict Zero-Trust model. Powered by our AST-free Structural Signature Analysis Engine, these tools bypass manifest assumptions and physically scan the structural mechanics of every dependency file on disk and in RAM before it is permitted to enter the build pipeline. -GitGalaxy operates differently. Powered by the [blAST Engine](https://squid-protocol.github.io/gitgalaxy/01-03-the-blast-paradigm/), we bypass compilation and rigid ASTs entirely. We scan the physical internals of every dependency file at extreme velocities (100k+ LOC/sec) before it enters your system, identifying threats via minimal keyword permutations rather than waiting for a CVE to be published. 
+--- + +## Architectural Philosophy & Defensive Engineering + +To intercept zero-day supply chain compromises, steganography, and credential leaks without introducing developer friction or CI/CD bottlenecks, these modules employ several high-velocity defensive paradigms: + +### 1. Zero-Trust Dependency Verification +Manifests can be spoofed. The `supply_chain_firewall.py` ignores dependency declarations and instead parses the actual, physical `import` and `require` statements within the downloaded `node_modules` or `venv` directories. It maps this physical execution graph against strict enterprise allowlists, instantly flagging unauthorized network requests, nested typosquatting, or anomalous I/O hooks. -### 🛡️ What We Stop -We provide highly effective, zero-trust defense against structural threats: -* **Hidden Executables:** Steganography and [XZ-Utils attack patterns](https://squid-protocol.github.io/gitgalaxy/04-05-binary-anomaly-detector/). -* **Malicious Typosquatting:** Unicode homoglyphs tricking developer imports. -* **Encrypted Payloads:** Sub-atomic XOR decryption loops hiding inside utility files. -* **Hostile I/O:** Shadow imports establishing covert outbound connections. -* **API Drift:** Network sockets hidden inside undocumented [Shadow APIs](https://squid-protocol.github.io/gitgalaxy/04-01-full-api-network-map/). +### 2. High-Velocity Secret Detection +Pre-commit hooks must operate in milliseconds, or developers will bypass them. `vault_sentinel.py` utilizes a two-tier approach. Tier 0 performs instant O(1) path evaluation, blocking high-risk file extensions (e.g., `.pem`, `id_rsa`) before disk I/O occurs. Tier 1 executes deep content scanning to isolate exposed cryptographic keys and SaaS tokens—even if they are buried in commented-out logic or deprecated trails. + +### 3. Heuristic Binary Triage (Zero-RAM Memory Shielding) +Advanced compromises often hide executable logic inside seemingly inert static assets. 
`binary_anomaly_detector.py` performs localized binary triage without uploading artifacts to cloud sandboxes. It validates structural magic bytes to catch scripts disguised as images, and utilizes mathematically optimized Shannon Entropy calculations to flag highly obfuscated or packed payloads (Entropy > 4.8) without exhausting system memory. --- -### 🛠️ The Sentinel Tools +## Core Modules (The Sentinels) -Wired directly into your Git Pre-Commit hooks or CI/CD pipelines, these sentinels act as a physical firewall to fail poisoned builds early. +Each file in this directory acts as a discrete, specialized firewall for your development and deployment pipelines: -#### 1. [The Supply Chain Firewall](https://squid-protocol.github.io/gitgalaxy/04-03-supply-chain-firewall/) (`supply-chain-firewall`) -Scans massive `node_modules` or `venv` directories in seconds. -* **Zero-Trust Verification:** Checks every physical `import` against strict allowlists. -* **Behavioral Heuristics:** Scans for tainted data injection routines and parasitic logic. +* **`supply_chain_firewall.py` (Dependency Integrity Gate):** Scans the physical execution graph of downloaded dependencies. It cross-references the core engine's structural telemetry to block packages exhibiting unauthorized behavioral heuristics (e.g., unexpected data injection routines, execution of OS-level processes during installation). +* **`binary_anomaly_detector.py` (Binary Anomaly Detector):** Designed for rapid triage of binaries and obfuscated files. It detects embedded execution headers hidden inside static data, validates file extension integrity, and flags extreme cryptographic entropy indicating packed malware or byte-level obfuscation loops. +* **`vault_sentinel.py` (Vault Sentinel):** A hyper-speed pre-commit hook strictly for localized credential detection. 
It enforces Tier 0 path blocking and executes deep-content cryptographic scans to prevent hardcoded cloud keys, database passwords, and API tokens from entering version control. +* **`manifest_parser.py` (SSCS Manifest Auditor):** Parses ecosystem manifests (NPM, PyPI) to build a deterministic resolution map. It actively detects namespace hijacking, unverified direct-URI resolutions, and insecure registry routing. -#### 2. [Zero-Trust SBOM Generator](https://squid-protocol.github.io/gitgalaxy/04-02-sbom-generator/) (`sbom-generator`) -Standard SBOMs blindly trust manifests. Ours doesn't. -* **Physical Audits:** Extracts and micro-scans files from every downloaded dependency. -* **CycloneDX 1.4:** Generates compliant manifests injected with physical threat telemetry. +--- -#### 3. [X-Ray Inspector](https://squid-protocol.github.io/gitgalaxy/04-05-binary-anomaly-detector/) (`xray-inspector`) -Designed to fast-triage binary files and encrypted malware without cloud processing. -* **Magic Byte Validation:** Catches executable scripts disguised as harmless `.png` images. -* **Entropy Math:** Flags high-entropy encrypted text payloads (Shannon Entropy > 4.8). -* **Parasitic Headers:** Detects executable logic inside static data blobs. +## 🧠 Engineering Highlights (How It Works at Scale) -#### 4. [Vault Sentinel](https://squid-protocol.github.io/gitgalaxy/04-04-vault-sentinel/) (`vault-sentinel`) -A hyper-speed pre-commit hook strictly for localized secret detection. -* **Tier 0 Path Blocking:** Instantly blocks sensitive file path commits (e.g., `.pem`, `id_rsa`). -* **Deep Content Scanning:** Hunts for hardcoded cloud cryptographic keys and SaaS tokens. -* **Graveyard Detection:** Finds abandoned passwords sitting in [commented-out dead code](https://squid-protocol.github.io/gitgalaxy/08-13-graveyard-detector/). +* **RAM-Exclusive Policy Enforcement (`supply_chain_firewall.py`):** Typical firewalls perform redundant O(N) disk parsing. 
This firewall consumes the pre-computed Dependency Graph from Phase 1. By completely divesting from disk I/O, it achieves near-instant behavioral policy enforcement across tens of thousands of dependencies. +* **Build-Time Execution Multipliers (`supply_chain_firewall.py`):** Configuration scripts (like `setup.py` or `package.json` hooks) are executed by CI/CD runners at build time. Remote Code Execution (RCE) here compromises the host before the application even runs. The engine applies an artificial 10x structural density multiplier to manifest triggers, ensuring any I/O or High-Risk Execution signatures instantly trip the firewall. +* **Namespace Dereferencing & Hijack Mitigation (`manifest_parser.py`):** To catch Dependency Confusion attacks where a malicious package masks itself behind a trusted internal alias, the parser normalizes NPM/PyPI aliases to their true upstream packages. It flags direct URI resolutions (which bypass Subresource Integrity checks) and actively blocks insecure protocols or tunneling services (e.g., `ngrok`) hiding in `pip.conf`. +* **O(1) Memory Shielding for Binary Triage (`binary_anomaly_detector.py`):** Attempting to calculate the Shannon Entropy of a 2GB binary blob will immediately trigger an Out-Of-Memory (OOM) crash in a CI runner. This detector mathematically guarantees pipeline survival by capping its read buffer at 8KB—sufficient to capture magic bytes, execution headers, and enough string data for accurate entropy calculation without memory exhaustion. --- -### ⚡ Performance Showcases +## ⚡ Performance Showcases #### Showcase A: Vault Sentinel (Secret Detection) -To prove this engine operates fast enough to be a synchronous pre-commit hook without frustrating developers, we unleashed the **Vault Sentinel** on the massive **tRPC** TypeScript monorepo. 
+To prove this engine operates fast enough to be a synchronous pre-commit hook without frustrating developers, we executed the **Vault Sentinel** against the massive **tRPC** TypeScript monorepo. The engine evaluated 871 files and performed deep-content cryptographic scans on 695 of them in **0.53 seconds** (processing over 1,300 files per second). It successfully intercepted 7 exposed environment files and caught a hardcoded API key before the commit could execute. -![Vault Sentinel Demo](../../../docs/wiki/assets/vault_sentinel_scan.gif) +![Vault Sentinel Demo](https://raw.githubusercontent.com/squid-protocol/gitgalaxy/main/docs/wiki/assets/vault_sentinel_scan.gif) -#### Showcase B: X-Ray Inspector (Malware & Binary Triage) -To test binary detection, we ran the **X-Ray Inspector** against **pwntools**, an exploit development framework containing actual compiled binaries and shellcode. +#### Showcase B: Binary Anomaly Detector (Malware & Binary Triage) +To test binary detection, we ran the **Binary Anomaly Detector** against **pwntools**, an exploit development framework containing actual compiled binaries and shellcode. -The engine ripped through the repository at **2,825 files per second**. By reading the raw physical bytes rather than trusting file extensions, it instantly detected 13 parasitic `ELF` execution headers embedded inside the source tree. +The engine processed the repository at a velocity of **2,825 files per second**. By reading the raw physical bytes rather than trusting file extensions, it instantly detected 13 embedded `ELF` execution headers hidden inside the source tree. 
-![X-Ray Inspector Demo](../../../docs/wiki/assets/xray_inspector_scan.gif) +![Binary Anomaly Detector Demo](https://raw.githubusercontent.com/squid-protocol/gitgalaxy/main/docs/wiki/assets/xray_inspector_scan.gif) -```text +~~~text =========================================================================== - ☢️ X-RAY INSPECTOR: MISSION REPORT + BINARY ANOMALY DETECTOR: SCAN SUMMARY =========================================================================== Files Evaluated : 95 Files Deep Scanned : 95 Time Elapsed : 0.03 seconds Scan Velocity : 2,825 files/sec --------------------------------------------------------------------------- - Active Anomalies : 13 + Anomalies Detected : 13 --------------------------------------------------------------------------- - ❌ TRIAGE ALERT: 13 structural anomalies detected. Blocking commit/PR. -``` + [BLOCKING ACTION] 13 structural anomalies detected. Failing pipeline. +~~~ #### Showcase C: Supply Chain Firewall (Infrastructure-as-Code Audit) To prove the firewall can handle diverse polyglot ecosystems without throwing false positives, we ran it against the **Terraform** repository. The engine parsed 1,834 files at a velocity of **436 files per second**. It successfully verified the integrity of the dependency tree, identified 54 unknown packages for audit, and cleared the build without tripping any false alarms on standard Go/HCL syntax. 
-![Supply Chain Firewall Demo](../../../docs/wiki/assets/terraform_firewall_scan.gif) +![Supply Chain Firewall Demo](https://raw.githubusercontent.com/squid-protocol/gitgalaxy/main/docs/wiki/assets/terraform_firewall_scan.gif) -```text +~~~text =========================================================================== - 🧱 SUPPLY CHAIN FIREWALL: MISSION REPORT + SUPPLY CHAIN FIREWALL: SCAN SUMMARY =========================================================================== Mode : Audit (Allow Whitelist + Unknown, Exclude Blacklist) Files Deep Scanned : 1,834 @@ -101,21 +98,34 @@ The engine parsed 1,834 files at a velocity of **436 files per second**. It succ --------------------------------------------------------------------------- Active Threats : 0 --------------------------------------------------------------------------- - ✅ BUILD PASSED: Dependency supply chain is clean. -``` + [SUCCESS] Dependency supply chain is clean. +~~~ --- -### 🚀 Quickstart: CI/CD & Pre-Commit Integration +## CI/CD & Pre-Commit Integration + +These sentinels are designed to be wired directly into your Git workflows to fail compromised builds autonomously. -GitGalaxy is designed for frictionless adoption. You can install it globally via PyPI (`pip install gitgalaxy`) or run it natively in GitHub Actions without installing anything. +**Local Pre-Commit Hook Integration:** +To run the Vault Sentinel automatically before every commit, add this configuration to your `.pre-commit-config.yaml` file: -#### 1. Global GitHub Marketplace Action (Recommended) -You can drop GitGalaxy into any repository immediately using our official [GitHub Marketplace Action](https://github.com/marketplace/actions/gitgalaxy-scanner). 
+~~~yaml +repos: + - repo: local + hooks: + - id: gitgalaxy-vault-sentinel + name: GitGalaxy Vault Sentinel + entry: vault-sentinel + language: system + types: [text] + pass_filenames: true +~~~ -Add this to your `.github/workflows/security.yml` file: +**GitHub Actions Integration:** +You can deploy these sentinels directly into your CI/CD pipeline using the official [GitGalaxy GitHub Action](https://github.com/marketplace/actions/gitgalaxy-scanner). -```yaml +~~~yaml name: GitGalaxy Zero-Trust Audit on: [pull_request] @@ -128,36 +138,17 @@ jobs: uses: squid-protocol/gitgalaxy@v2.0.7 with: tool: 'supply-chain-firewall' -``` +~~~ -#### 2. Local CLI Execution -```bash -supply-chain-firewall ./node_modules/ -xray-inspector ./src/ -vault-sentinel . -``` +--- -#### 3. Local Pre-Commit Hook Integration -To run the Vault Sentinel automatically before every commit, add this to your `.pre-commit-config.yaml` file: +## 🌌 The GitGalaxy Ecosystem (Powered by the blAST Engine) -```yaml -repos: - - repo: local - hooks: - - id: gitgalaxy-vault-sentinel - name: GitGalaxy Vault Sentinel - entry: vault-sentinel - language: system - types: [text] - pass_filenames: true -``` +GitGalaxy Security is the threat inference and enforcement layer of the broader **GitGalaxy Ecosystem**—a high-velocity, AST-free, LLM-free heuristic knowledge graph engine designed for planetary-scale repositories. ---- -### 🌌 Explore the GitGalaxy Wiki -This toolsuite is just one spoke in the larger GitGalaxy ecosystem. 
Explore the official documentation to see the math and methodology behind our AST-free engine: - -* 📖 **[The Competitive Landscape (How We Beat the Status Quo)](https://squid-protocol.github.io/gitgalaxy/04-00-security_landscape/)** -* 📖 **[Supply Chain Firewall Architecture](https://squid-protocol.github.io/gitgalaxy/04-03-supply-chain-firewall/)** -* 📖 **[Binary Anomaly & Entropy Mathematics](https://squid-protocol.github.io/gitgalaxy/04-05-binary-anomaly-detector/)** -* 📖 **[Hardcoded Secrets Exposure Equations](https://squid-protocol.github.io/gitgalaxy/08-23-hardcoded-secrets-exposure/)** -* 🪐 **[Return to the Main GitGalaxy Hub](https://github.com/squid-protocol/gitgalaxy)** \ No newline at end of file +Explore the ecosystem: + +* 🪐 **[Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** — Comprehensive deep dives into the engine's mathematics, pipeline architecture, and DevSecOps integration protocols. +* 🔭 **[GitGalaxy Visualizer](http://gitgalaxy.io/)** — Render your codebase's topological network locally in interactive 3D using hardware-accelerated WebGPU. +* 📖 **[The blAST Paradigm](https://squid-protocol.github.io/gitgalaxy/01-03-the-blast-paradigm/)** — The architectural thesis, academic research, and structural math that make AST-free parsing possible at scale. +* ⚙️ **[Language Calibration Standards](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md)** — The definitive engineering guide to extending our comparative lexical taxonomy for custom enterprise dialects. 
\ No newline at end of file diff --git a/gitgalaxy/tools/supply_chain_security/binary_anomaly_detector.py b/gitgalaxy/tools/supply_chain_security/binary_anomaly_detector.py index 601b622e..33f5ed49 100644 --- a/gitgalaxy/tools/supply_chain_security/binary_anomaly_detector.py +++ b/gitgalaxy/tools/supply_chain_security/binary_anomaly_detector.py @@ -1,7 +1,17 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: X-Ray Inspector -# Purpose: Fast triage of binary anomalies and high-entropy encrypted payloads. +# GitGalaxy Tool: Binary Anomaly Detector +# +# PURPOSE: +# Performs high-speed triage of binary anomalies, magic byte mismatches, and +# obfuscated payloads within the CI/CD pipeline. +# +# ARCHITECTURAL DECISION: +# Traditional SAST tools struggle with binaries, either ignoring them completely +# (allowing steganography/hidden malware) or attempting to parse them, causing +# pipeline timeouts. This module acts as a lightweight heuristic gatekeeper, +# relying on mathematical entropy and header verification to detect malicious +# packing without requiring deep binary execution. 
# ============================================================================== import argparse import sys @@ -15,7 +25,7 @@ from gitgalaxy.security.security_lens import SecurityLens from gitgalaxy.standards.language_standards import LANGUAGE_DEFINITIONS -# Safely import the config, falling back if the user hasn't added exceptions yet +# Safely import the config, falling back if the user hasn't configured exceptions yet try: from gitgalaxy.standards.gitgalaxy_config import ( APERTURE_CONFIG, @@ -36,11 +46,9 @@ def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("X-Ray Inspector") + enforce_licensing_guard("Binary Anomaly Detector") - parser = argparse.ArgumentParser( - description="X-Ray Inspector: Binary & Obfuscation Scanner" - ) + parser = argparse.ArgumentParser(description="Binary Anomaly Detector: Entropy & Magic Byte Scanner") parser.add_argument("target", help="Directory to scan") args = parser.parse_args() @@ -49,16 +57,21 @@ def main(): print(f"Error: Target {target_path} does not exist.") sys.exit(1) - print(f"☢️ X-Ray Inspector engaging on {target_path.name}...") + print(f"🔍 Initializing Binary Anomaly Detector on {target_path.name}...") # Initialize lightweight filters filter_engine = ApertureFilter(target_path, LANGUAGE_DEFINITIONS, APERTURE_CONFIG) security = SecurityLens() - # NEUTER THE LENS: X-Ray only cares about entropy, bitwise XORs, and heat signatures + # ============================================================================== + # DEFENSIVE DESIGN (SENSOR OPTIMIZATION): + # Restrict the Security Lens to entropy and bitwise operations. By disabling + # the heavy AST and regex processors used for source code, we minimize CPU + # overhead and prevent Catastrophic Backtracking on dense binary data. 
+ # ============================================================================== security.THREAT_SIGNATURES = { - "heat_triggers": security.THREAT_SIGNATURES["heat_triggers"], - "bitwise_hits": security.THREAT_SIGNATURES["bitwise_hits"], + "reflection_metaprogramming": security.THREAT_SIGNATURES["reflection_metaprogramming"], + "bitwise_ops": security.THREAT_SIGNATURES["bitwise_ops"], } anomalies_found = 0 @@ -69,12 +82,15 @@ def main(): files_to_deep_scan = [] # ============================================================================== - # PASS 1: The Funnel (Build the Queue) + # PHASE 1: Path Filtering and Queue Generation # ============================================================================== for root, dirs, files in os.walk(target_path): rel_root = str(Path(root).relative_to(target_path)) - # Shield Bypass & Top-Level Optimization (Fixed Root Traversal Bug) + # ARCHITECTURAL DECISION (ROOT TRAVERSAL PRUNING): + # We evaluate top-level directories against ignore rules and modify the `dirs` + # list in-place. This prevents the OS from walking down massive ignored trees + # (like `node_modules` or `.git`), saving massive I/O overhead. if rel_root == ".": dirs[:] = [d for d in dirs if filter_engine._check_ignore_rules(d)] elif not filter_engine._check_ignore_rules(rel_root): @@ -86,20 +102,19 @@ def main(): file_path = Path(root) / file ext = file_path.suffix.lower() - # Create a normalized string for checking against lists + # Create a normalized string for checking against configurations rel_path_str = str(file_path.relative_to(target_path)).replace("\\", "/") # Evaluate Global vs. 
Tool-Specific Bypasses - is_global_allow = any( - approved in rel_path_str for approved in ALLOWLIST_PATHS - ) - is_xray_bypass = ext in XRAY_BYPASS_EXTENSIONS or any( - b in rel_path_str for b in XRAY_BYPASS_PATHS - ) + is_global_allow = any(approved in rel_path_str for approved in ALLOWLIST_PATHS) + is_xray_bypass = ext in XRAY_BYPASS_EXTENSIONS or any(b in rel_path_str for b in XRAY_BYPASS_PATHS) is_whitelisted = is_global_allow or is_xray_bypass - # THE VERIFICATION SAFE ZONE: Tests always generate high-entropy mock data + # FALSE POSITIVE MITIGATION (TEST DIRECTORIES): + # Unit tests frequently generate high-entropy mock data (e.g., mock + # cryptographic keys, dummy hashes). We whitelist these paths to prevent + # pipeline friction, assuming test directories are not deployed to production. if ( "/test/" in rel_path_str.lower() or "/tests/" in rel_path_str.lower() @@ -107,85 +122,74 @@ def main(): ): is_whitelisted = True - # 1. THE DENYLIST CHECK - is_forbidden = any( - fnmatch.fnmatch(file, pattern) for pattern in DENYLIST_PATTERNS - ) + # 1. DENYLIST ENFORCEMENT + is_forbidden = any(fnmatch.fnmatch(file, pattern) for pattern in DENYLIST_PATTERNS) if is_forbidden and not is_whitelisted: - print( - f"🚨 [FORBIDDEN FILE BREACH] Illegal file pattern detected: {rel_path_str}" - ) + print(f"[DENYLIST MATCH] Unauthorized file pattern detected: {rel_path_str}") forbidden_blocked += 1 anomalies_found += 1 continue - # NOTE: We intentionally DO NOT call evaluate_path_integrity here! - # X-Ray needs to scan binaries (.png, .zip), which the other tools drop. + # NOTE: We intentionally bypass `evaluate_path_integrity` here. + # This specific detector must scan actual binaries (.png, .zip, .dll), + # which standard Aperture filtering drops. 
files_to_deep_scan.append((file_path, rel_path_str, ext, is_whitelisted)) # ============================================================================== - # PASS 2: The Deep Scan (Internal Contents & Binary Headers) + # PHASE 2: Deep Content and Binary Header Inspection # ============================================================================== print(f"\n🔎 Scanning {len(files_to_deep_scan):,} files for structural anomalies:") print(" - Magic Byte Mismatches (e.g., hidden executables disguised as images)") - print( - " - Parasitic Execution Headers (e.g., executable logic buried in data blobs)" - ) - print( - " - High-Entropy Encrypted Payloads (e.g., packed malware or sub-atomic XOR loops)" - ) + print(" - Embedded Execution Headers (e.g., executable signatures within static data)") + print(" - High-Entropy Payloads (e.g., packed executables or obfuscated bitwise loops)") start_time = time.time() for file_path, rel_path_str, ext, is_whitelisted in files_to_deep_scan: try: - # Read the first 8KB of the file as raw bytes + # DEFENSIVE DESIGN (MEMORY SHIELD): + # Read only the first 8KB of the file. This is sufficient to capture + # magic bytes, execution headers, and enough string data for an accurate + # entropy calculation, completely preventing Out-Of-Memory (OOM) crashes on huge files. with open(file_path, "rb") as f: head_bytes = f.read(8192) has_anomaly = False anomaly_msgs = [] - # 1. Binary X-Ray (Magic Bytes & Headers) + # 1. Binary Analysis (Magic Bytes & Execution Headers) binary_threats = security.scan_binary(head_bytes, ext) - # THE EXPECTED HEADER SHIELD + # EXPECTED HEADER EXCEPTION: + # Shell scripts naturally contain the '#!/bin/' header. We clear this + # specific threat if the extension matches a known script format. 
if binary_threats: threat_msg = binary_threats.get("threat_snippet", "") - if ( - ext in [".sh", ".bash", ".zsh", ".command"] - and "#!/bin/" in threat_msg - ): - binary_threats = {} # Clear the threat, it is expected + if ext in [".sh", ".bash", ".zsh", ".command"] and "#!/bin/" in threat_msg: + binary_threats = {} if binary_threats: has_anomaly = True - anomaly_msgs.append( - binary_threats.get("threat_snippet", "Unknown Binary Threat") - ) + anomaly_msgs.append(binary_threats.get("threat_snippet", "Unknown Binary Threat")) - # 2. String Entropy X-Ray (Encrypted/Packed Payloads) + # 2. String Entropy Analysis (Encrypted/Packed Payloads) content = head_bytes.decode("utf-8", errors="ignore") sec_results = security.scan_content(content, 100) if sec_results["counts"].get("entropy", 0) > 0: has_anomaly = True - anomaly_msgs.append( - "Mathematically dense/encrypted strings detected (Shannon Entropy > 4.8)" - ) + anomaly_msgs.append("Mathematically dense/encrypted strings detected (Shannon Entropy > 4.8)") - if sec_results["counts"].get("bitwise_hits", 0) > 0: + if sec_results["counts"].get("bitwise_ops", 0) > 0: has_anomaly = True - anomaly_msgs.append( - "Sub-atomic decryption routines (XOR loops) detected" - ) + anomaly_msgs.append("Obfuscated bitwise operations (XOR loops) detected") # 3. 
Report the Anomaly if has_anomaly: if is_whitelisted: anomalies_allowed += 1 else: - print(f"☢️ [ANOMALY DETECTED] {rel_path_str}") + print(f"[ANOMALY DETECTED] {rel_path_str}") for msg in anomaly_msgs: print(f" -> {msg}") anomalies_found += 1 @@ -198,54 +202,47 @@ def main(): scan_rate = len(files_to_deep_scan) / time_delta if time_delta > 0 else 0 # ============================================================================== - # MISSION REPORT + # SCAN SUMMARY # ============================================================================== print("\n" + "=" * 75) - print(" ☢️ X-RAY INSPECTOR: MISSION REPORT") + print(" BINARY ANOMALY DETECTOR: SCAN SUMMARY") print("=" * 75) - print(f" Files Evaluated : {files_evaluated:,}") - print(f" Files Deep Scanned : {len(files_to_deep_scan):,}") - print(f" Time Elapsed : {time_delta:.2f} seconds") - print(f" Scan Velocity : {scan_rate:,.0f} files/sec") + print(f" Files Evaluated : {files_evaluated:,}") + print(f" Files Deep Scanned : {len(files_to_deep_scan):,}") + print(f" Time Elapsed : {time_delta:.2f} seconds") + print(f" Scan Velocity : {scan_rate:,.0f} files/sec") print("-" * 75) - print(f" Active Anomalies : {anomalies_found:,}") + print(f" Anomalies Detected : {anomalies_found:,}") print(f" File Denylist Blocks : {forbidden_blocked:,}") - print(f" File Allowlist Bypasses: {anomalies_allowed:,}") + print(f" Allowlist Bypasses : {anomalies_allowed:,}") print("-" * 75) if anomalies_found > 0: - print( - f" ❌ TRIAGE ALERT: {anomalies_found} structural anomalies detected. Blocking commit/PR." - ) - print( - " 💡 TIP: X-Ray uses entropy math which naturally flags compression and dense data." - ) - print( - " - If safe extension (e.g., .gz, .json): Add to XRAY_BYPASS_EXTENSIONS" - ) + print(f" [BLOCKING ACTION] {anomalies_found} structural anomalies detected. 
Failing pipeline.") + print(" TIP: Entropy-based detection naturally flags compressed or dense data.") + print(" - If safe extension (e.g., .gz, .json): Add to XRAY_BYPASS_EXTENSIONS") print(" - If safe specific file: Add to XRAY_BYPASS_PATHS") print(" - Edit these inside: gitgalaxy/standards/gitgalaxy_config.py") sys.exit(1) else: - print(" ✅ ALL CLEAR: No encrypted payloads or binary anomalies detected.") + print(" [SUCCESS] No obfuscated payloads or binary anomalies detected.") if anomalies_allowed > 0: - print( - f" 💡 NOTE: {anomalies_allowed} known mock/safe files were bypassed via configuration." - ) + print(f" NOTE: {anomalies_allowed} known mock/safe files were bypassed via configuration.") print("=" * 75 + "\n") def run_xray_audit(target_path: Path) -> dict: - """Programmatic entry point for GalaxyScope.""" + """Programmatic entry point for GalaxyScope (orchestrator execution).""" filter_engine = ApertureFilter(target_path, LANGUAGE_DEFINITIONS, APERTURE_CONFIG) security = SecurityLens() security.THREAT_SIGNATURES = { - "heat_triggers": security.THREAT_SIGNATURES["heat_triggers"], - "bitwise_hits": security.THREAT_SIGNATURES["bitwise_hits"], + "reflection_metaprogramming": security.THREAT_SIGNATURES["reflection_metaprogramming"], + "bitwise_ops": security.THREAT_SIGNATURES["bitwise_ops"], } anomalies_found = 0 - # Minimal silent scan + + # Minimal silent scan for headless execution for root, dirs, files in os.walk(target_path): rel_root = str(Path(root).relative_to(target_path)) if rel_root == ".": @@ -257,18 +254,17 @@ def run_xray_audit(target_path: Path) -> dict: for file in files: file_path = Path(root) / file rel_path_str = str(file_path.relative_to(target_path)).replace("\\", "/") + is_whitelisted = ( any(a in rel_path_str for a in ALLOWLIST_PATHS) or file_path.suffix.lower() in XRAY_BYPASS_EXTENSIONS or any(b in rel_path_str for b in XRAY_BYPASS_PATHS) ) + if "/test/" in rel_path_str.lower() or "/tests/" in rel_path_str.lower(): is_whitelisted = 
True - if ( - any(fnmatch.fnmatch(file, p) for p in DENYLIST_PATTERNS) - and not is_whitelisted - ): + if any(fnmatch.fnmatch(file, p) for p in DENYLIST_PATTERNS) and not is_whitelisted: anomalies_found += 1 continue @@ -276,19 +272,18 @@ def run_xray_audit(target_path: Path) -> dict: with open(file_path, "rb") as f: head_bytes = f.read(8192) ext = file_path.suffix.lower() + + # Check Binary Headers bt = security.scan_binary(head_bytes, ext) - if bt and not ( - ext in [".sh", ".bash", ".zsh"] - and "#!/bin/" in bt.get("threat_snippet", "") - ): + if bt and not (ext in [".sh", ".bash", ".zsh"] and "#!/bin/" in bt.get("threat_snippet", "")): if not is_whitelisted: anomalies_found += 1 + # Check String Entropy content = head_bytes.decode("utf-8", errors="ignore") sr = security.scan_content(content, 100) if ( - sr["counts"].get("entropy", 0) > 0 - or sr["counts"].get("bitwise_hits", 0) > 0 + sr["counts"].get("entropy", 0) > 0 or sr["counts"].get("bitwise_ops", 0) > 0 ) and not is_whitelisted: anomalies_found += 1 except Exception: @@ -298,4 +293,4 @@ def run_xray_audit(target_path: Path) -> dict: if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/supply_chain_security/supply_chain_firewall.py b/gitgalaxy/tools/supply_chain_security/supply_chain_firewall.py index dbf630dc..37ab842b 100644 --- a/gitgalaxy/tools/supply_chain_security/supply_chain_firewall.py +++ b/gitgalaxy/tools/supply_chain_security/supply_chain_firewall.py @@ -1,8 +1,17 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Supply Chain Firewall -# Purpose: Zero-Trust Dependency Verification and Behavioral Policy Enforcement. -# Architecture: RAM-Exclusive Logic Gate (Consumes Phase 1 Topological Graph) +# GitGalaxy Tool: Supply Chain Firewall +# +# PURPOSE: +# Zero-Trust Dependency Verification and Behavioral Policy Enforcement. 
+# +# ARCHITECTURAL DECISION: +# Operating as a RAM-Exclusive Logic Gate, this firewall consumes the Phase 1 +# Dependency Graph. By completely divesting from redundant O(N) disk parsing, +# it achieves near-instant policy enforcement. It mitigates Namespace Hijacking +# and Dependency Confusion attacks by comparing raw codebase imports against +# resolved manifest aliases, while enforcing dynamic risk thresholds based on +# build-time execution contexts and network topography. # ============================================================================== import argparse import sys @@ -13,7 +22,7 @@ from gitgalaxy.security.security_lens import SecurityLens from gitgalaxy.standards.analysis_lens import ThreatPolicy -# Safely import the config, falling back if the user hasn't added exceptions yet +# Safely import the config, falling back if the user hasn't configured exceptions yet try: from gitgalaxy.standards.gitgalaxy_config import ( ALLOWLIST_PATHS, @@ -49,40 +58,43 @@ def run_firewall_audit(parsed_files: list, alias_map: dict = None) -> dict: "threats_found": threats_found, } - for star in parsed_files: - rel_path_str = star.get("path", "unknown") + for file_node in parsed_files: + rel_path_str = file_node.get("path", "unknown") is_whitelisted = any(approved in rel_path_str for approved in ALLOWLIST_PATHS) # ===================================================================== # 1. ZERO-TRUST IMPORT VERIFICATION # ===================================================================== - for raw_pkg in star.get("raw_imports", []): - # The Relative Path Shield: Ignore native internal routing + for raw_pkg in file_node.get("raw_imports", []): + # DEFENSIVE DESIGN (RELATIVE PATH SHIELD): + # Ignore native internal routing (e.g., './utils') to focus strictly + # on external supply chain dependencies. 
if raw_pkg.startswith("."): continue - # The Deep-Path Truncator: Normalize sub-module paths to their base package name + # DEFENSIVE DESIGN (DEEP-PATH TRUNCATOR): + # Attackers often hide malicious payloads deep inside nested sub-modules. + # Normalizing paths (e.g., 'lodash/nested/file' -> 'lodash') ensures + # policy rules evaluate the authoritative root package. if raw_pkg.startswith("@"): parts = raw_pkg.split("/") pkg = f"{parts[0]}/{parts[1]}" if len(parts) >= 2 else raw_pkg else: pkg = raw_pkg.split("/")[0] - # The Identity Translation Shield: Dereference manifest aliases + # DEFENSIVE DESIGN (IDENTITY TRANSLATION SHIELD): + # Dereference manifest aliases to catch Dependency Confusion attacks + # where a malicious package masks itself behind a trusted internal alias. true_pkg = safe_alias_map.get(pkg, pkg) if true_pkg in BLACKLISTED_IMPORTS: imports_blacklisted += 1 threats_found += 1 - # The Whitelist Loophole Fix: A blacklisted import is ALWAYS a threat. Never suppress it. + # The Allowlist Loophole Fix: A blacklisted import is ALWAYS a threat. Never suppress it. 
if true_pkg != pkg: - print( - f"🚨 [BLACKLISTED IMPORT] Spoofed alias blocked: '{pkg}' -> '{true_pkg}' in: {rel_path_str}" - ) + print(f"[BLACKLISTED IMPORT] Spoofed alias blocked: '{pkg}' -> '{true_pkg}' in: {rel_path_str}") else: - print( - f"🚨 [BLACKLISTED IMPORT] Malicious package '{pkg}' blocked in: {rel_path_str}" - ) + print(f"[BLACKLISTED IMPORT] Unauthorized package '{pkg}' blocked in: {rel_path_str}") elif true_pkg in APPROVED_IMPORTS: imports_whitelisted += 1 else: @@ -92,26 +104,28 @@ def run_firewall_audit(parsed_files: list, alias_map: dict = None) -> dict: if not is_whitelisted: if true_pkg != pkg: print( - f"🚨 [ZERO-TRUST BREACH] Spoofed alias '{pkg}' -> '{true_pkg}' blocked by Strict Mode in: {rel_path_str}" + f"[POLICY VIOLATION] Spoofed alias '{pkg}' -> '{true_pkg}' blocked by Strict Mode in: {rel_path_str}" ) else: print( - f"🚨 [ZERO-TRUST BREACH] Unknown package '{pkg}' blocked by Strict Mode in: {rel_path_str}" + f"[POLICY VIOLATION] Unknown package '{pkg}' blocked by Strict Mode in: {rel_path_str}" ) # ===================================================================== # 2. BEHAVIORAL POLICY ENFORCEMENT (Leveraging Phase 1 Measurements) # ===================================================================== - # Extract the raw threat equations calculated natively by Prism/Detector in Phase 1 - equations = star.get("equations", {}) - loc = star.get("coding_loc", 1) + # Extract the raw structural signatures calculated natively by the Structural Signature Analysis Engine in Phase 1 + equations = file_node.get("equations", {}) + loc = file_node.get("coding_loc", 1) # Clone the dictionary to safely apply the sandbox multiplier without corrupting global RAM local_counts = dict(equations) - # THE BUILD-TIME EXECUTION MULTIPLIER (STATIC SANDBOX) - # Apply a massive artificial density multiplier to manifest triggers so any - # I/O or Danger hits instantly detonate the firewall. 
+ # DEFENSIVE DESIGN (BUILD-TIME EXECUTION MULTIPLIER): + # Configuration scripts (like setup.py or package.json) are executed by CI/CD + # runners at build time. RCE here compromises the host before the app even runs. + # We apply an artificial density multiplier to manifest triggers so any I/O or + # Danger signatures instantly trigger a blocking action from the firewall. build_time_multiplier = 1.0 filename = Path(rel_path_str).name if filename in [ @@ -130,8 +144,8 @@ def run_firewall_audit(parsed_files: list, alias_map: dict = None) -> dict: if isinstance(local_counts[k], (int, float)): local_counts[k] = int(local_counts[k] * build_time_multiplier) - # Evaluate risk using Phase 1 network topography context (e.g., Blast Radius) - network_metrics = star.get("dependency_network", {}) + # Evaluate risk using Phase 1 network topography context (e.g., Downstream Exposure) + network_metrics = file_node.get("dependency_network", {}) exposures = security.evaluate_risk(local_counts, safe_loc, network_metrics) if ( @@ -143,9 +157,7 @@ def run_firewall_audit(parsed_files: list, alias_map: dict = None) -> dict: if is_whitelisted: threats_allowed += 1 else: - print( - f"\n🚨 [SUPPLY CHAIN COMPROMISE] Density Threshold Breached in: {rel_path_str}" - ) + print(f"\n[THREAT DETECTED] Density Threshold Breached in: {rel_path_str}") for risk, density in exposures.items(): print(f" -> {risk}: {density * 100:.1f}%") threats_found += 1 @@ -162,19 +174,15 @@ def run_firewall_audit(parsed_files: list, alias_map: dict = None) -> dict: def main(): """ Standalone Execution Mode. - Because the firewall is now completely divested of redundant O(N) disk parsing, - it must be fed the compiled JSON RAM-graph generated by the GalaxyScope orchestrator. + Because the firewall is decoupled from redundant O(N) disk parsing, + it must be fed the compiled JSON Dependency Graph generated by the GalaxyScope orchestrator. 
""" from gitgalaxy.licensing import enforce_licensing_guard enforce_licensing_guard("Supply Chain Firewall") - parser = argparse.ArgumentParser( - description="Supply Chain Firewall (RAM-Exclusive Mode)" - ) - parser.add_argument( - "target", help="Path to the compiled GalaxyScope RAM graph (e.g., results.json)" - ) + parser = argparse.ArgumentParser(description="Supply Chain Firewall (RAM-Exclusive Mode)") + parser.add_argument("target", help="Path to the compiled GalaxyScope RAM graph (e.g., results.json)") args = parser.parse_args() target_path = Path(args.target).resolve() @@ -182,30 +190,25 @@ def main(): print(f"Error: RAM graph '{target_path}' does not exist.") sys.exit(1) - print( - f"🧱 Supply Chain Firewall ingesting orchestrator RAM graph from {target_path.name}..." - ) + print(f"🧱 Initializing Supply Chain Firewall with RAM graph from {target_path.name}...") try: with open(target_path, "r", encoding="utf-8") as f: data = json.load(f) - parsed_files = data.get("stars", []) + # Prefer the modernized "artifacts" key; fall back to the legacy "stars" key if it is absent + parsed_files = data.get("artifacts", data.get("stars", [])) except Exception as e: print(f"❌ Failed to parse RAM graph JSON: {e}") sys.exit(1) # In standalone CLI mode, we bypass the dynamic manifest parsing for speed, - # relying purely on strict exact-match dependencies and behavioral math. + # relying purely on strict exact-match dependencies and behavioral structural signatures. 
results = run_firewall_audit(parsed_files, alias_map={}) - mode_str = ( - "Strict (Exclude Blacklist and Unknown)" - if STRICT_IMPORT_MODE - else "Audit (Allow Whitelist + Unknown)" - ) + mode_str = "Strict (Exclude Blacklist and Unknown)" if STRICT_IMPORT_MODE else "Audit (Allow Whitelist + Unknown)" print("\n" + "=" * 75) - print(" 🧱 SUPPLY CHAIN FIREWALL: MISSION REPORT (RAM-EXCLUSIVE)") + print(" 🧱 SUPPLY CHAIN FIREWALL: SCAN SUMMARY") print("=" * 75) print(f" Mode : {mode_str}") print(f" Files Evaluated : {len(parsed_files):,}") @@ -219,14 +222,12 @@ def main(): print("-" * 75) if results["threats_found"] > 0: - print( - f" ❌ BUILD FAILED: {results['threats_found']} infected dependencies or policy violations blocked." - ) + print(f" [BLOCKING ACTION] {results['threats_found']} high-risk dependencies or policy violations blocked.") sys.exit(1) else: - print(" ✅ BUILD PASSED: Dependency supply chain is clean.") + print(" [SUCCESS] Dependency supply chain is clean.") print("=" * 75 + "\n") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/supply_chain_security/vault_sentinel.py b/gitgalaxy/tools/supply_chain_security/vault_sentinel.py index 6b2091c1..424ed97f 100644 --- a/gitgalaxy/tools/supply_chain_security/vault_sentinel.py +++ b/gitgalaxy/tools/supply_chain_security/vault_sentinel.py @@ -35,9 +35,7 @@ def main(): enforce_licensing_guard("Secrets Scanner") - parser = argparse.ArgumentParser( - description="Secrets Scanner: High-Speed Secrets Scanner" - ) + parser = argparse.ArgumentParser(description="Secrets Scanner: High-Speed Secrets Scanner") parser.add_argument("target", help="Directory or file to scan") args = parser.parse_args() @@ -52,10 +50,10 @@ def main(): filter_engine = ApertureFilter(target_path, LANGUAGE_DEFINITIONS, APERTURE_CONFIG) security = SecurityLens(policy=ThreatPolicy.get_policy("paranoid")) - # NEUTER THE LENS: Only look for keys and graveyard logic for maximum speed + # SENSOR 
OPTIMIZATION: Only evaluate keys and dead-code logic for maximum performance security.THREAT_SIGNATURES = { - "private_info": security.THREAT_SIGNATURES["private_info"], - "graveyard": security.THREAT_SIGNATURES["graveyard"], + "hardcoded_secrets": security.THREAT_SIGNATURES["hardcoded_secrets"], + "dead_code": security.THREAT_SIGNATURES["dead_code"], } leaks_found = 0 @@ -66,15 +64,15 @@ def main(): files_to_deep_scan = [] # ============================================================================== - # PASS 1: The Funnel (Build the Queue & Catch Surface Threats) + # PHASE 1: Path Filtering & Surface Threat Detection # ============================================================================== for root, dirs, files in os.walk(target_path): rel_root = str(Path(root).relative_to(target_path)) - # Shield Bypass & Top-Level Optimization + # Path Optimization: Evaluate top-level directories against ignore rules. if rel_root == ".": - dirs[:] = [d for d in dirs if filter_engine._check_solar_shield(d)] - elif not filter_engine._check_solar_shield(rel_root): + dirs[:] = [d for d in dirs if filter_engine._check_ignore_rules(d)] + elif not filter_engine._check_ignore_rules(rel_root): dirs[:] = [] continue @@ -84,33 +82,25 @@ def main(): # Create a normalized string for checking against lists rel_path_str = str(file_path.relative_to(target_path)).replace("\\", "/") - is_whitelisted = any( - approved in rel_path_str for approved in ALLOWLIST_PATHS - ) - - # 1. THE DENYLIST CHECK (Wildcard Pattern Matching) - is_forbidden = any( - fnmatch.fnmatch(file, pattern) for pattern in DENYLIST_PATTERNS - ) + is_whitelisted = any(approved in rel_path_str for approved in ALLOWLIST_PATHS) + + # 1. 
DENYLIST ENFORCEMENT (Wildcard Pattern Matching) + is_forbidden = any(fnmatch.fnmatch(file, pattern) for pattern in DENYLIST_PATTERNS) if is_forbidden and not is_whitelisted: - print( - f"🚨 [FORBIDDEN FILE BREACH] Illegal file pattern detected: {rel_path_str}" - ) + print(f"[DENYLIST MATCH] Unauthorized file pattern detected: {rel_path_str}") forbidden_blocked += 1 leaks_found += 1 continue # Skip deep scanning - # 2. Tier 0 Path Scan (Catches .pem, id_rsa, .env immediately) + # 2. Tier 0 Path Scan (Catches .pem, id_rsa, .env immediately without I/O) is_valid, size, reason = filter_engine.evaluate_path_integrity(file_path) if reason and "CRITICAL LEAK" in reason: if is_whitelisted: - print( - f"⚠️ [ALLOWED BYPASS] Known safe test key ignored: {rel_path_str}" - ) + print(f"[ALLOWLIST BYPASS] Known safe test key ignored: {rel_path_str}") leaks_allowed += 1 else: - print(f"🚨 [PATH BREACH] Exposed Secret File: {rel_path_str}") + print(f"[PATH BREACH] Exposed Secret File: {rel_path_str}") leaks_found += 1 continue @@ -118,7 +108,7 @@ def main(): files_to_deep_scan.append((file_path, rel_path_str, is_whitelisted)) # ============================================================================== - # PASS 2: The Deep Scan (Internal Contents) + # PHASE 2: Deep Content Inspection # ============================================================================== print(f"\n🔎 Scanning {len(files_to_deep_scan):,} files for internal contents of:") print(" - Cloud Infrastructure Keys") @@ -134,16 +124,16 @@ def main(): sec_results = security.scan_content(content, len(content.splitlines())) - if sec_results["counts"].get("private_info", 0) > 0: + if sec_results["counts"].get("hardcoded_secrets", 0) > 0: if is_whitelisted: - print( - f"⚠️ [ALLOWED BYPASS] Known safe secret ignored in: {rel_path_str}" - ) + print(f"[ALLOWLIST BYPASS] Known safe secret ignored in: {rel_path_str}") leaks_allowed += 1 else: - print(f"🚨 [CONTENT BREACH] Hardcoded Credential: {rel_path_str}") - for snip in 
sec_results["snippets"].get("private_info", []): - print(f" -> {snip}") + print(f"[CONTENT BREACH] Hardcoded Credential: {rel_path_str}") + secret_hits = len(sec_results["snippets"].get("hardcoded_secrets", [])) + for _ in range(secret_hits): + # Never log any portion of a detected secret snippet. + print(" -> ********[REDACTED]********") leaks_found += 1 except Exception: pass @@ -153,38 +143,32 @@ def main(): scan_rate = len(files_to_deep_scan) / time_delta if time_delta > 0 else 0 # ============================================================================== - # MISSION REPORT + # SCAN SUMMARY # ============================================================================== print("\n" + "=" * 75) - print(" 🛡️ VAULT SENTINEL: MISSION REPORT") + print(" 🛡️ VAULT SENTINEL: SCAN SUMMARY") print("=" * 75) print(f" Files Evaluated : {files_evaluated:,}") print(f" Files Deep Scanned : {len(files_to_deep_scan):,}") print(f" Time Elapsed : {time_delta:.2f} seconds") print(f" Scan Velocity : {scan_rate:,.0f} files/sec") print("-" * 75) - print(f" UNCONTROLLED LEAKS : {leaks_found:,}") - print(f" Denylist Blocks : {forbidden_blocked:,}") - print(f" Allowlist Bypasses : {leaks_allowed:,}") + print(f" SECRETS DETECTED : {leaks_found:,}") + print(f" Denylist Blocks : {forbidden_blocked:,}") + print(f" Allowlist Bypasses : {leaks_allowed:,}") print("-" * 75) if leaks_found > 0: - print( - f" ❌ FAILED: {leaks_found} unauthorized secrets exposed. Blocking commit/PR." - ) - print( - " 💡 TIP: If this is a false positive, add the file path to ALLOWLIST_PATHS" - ) + print(f" [BLOCKING ACTION] {leaks_found} unauthorized secrets exposed. Failing pipeline.") + print(" TIP: If this is a false positive, add the file path to ALLOWLIST_PATHS") print(" inside gitgalaxy/standards/gitgalaxy_config.py") sys.exit(1) else: - print(" ✅ PASS: No unauthorized secrets detected. 
Vault is secure.") + print(" [SUCCESS] No unauthorized secrets detected.") if leaks_allowed > 0: - print( - f" 💡 NOTE: {leaks_allowed} known mock/safe files were bypassed via configuration." - ) + print(f" NOTE: {leaks_allowed} known mock/safe files were bypassed via configuration.") print("=" * 75 + "\n") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/gitgalaxy/tools/terabyte_log_scanning/README.md b/gitgalaxy/tools/terabyte_log_scanning/README.md index 22f7ecb1..86600676 100644 --- a/gitgalaxy/tools/terabyte_log_scanning/README.md +++ b/gitgalaxy/tools/terabyte_log_scanning/README.md @@ -1,109 +1,208 @@ -# GitGalaxy: High-Velocity Log Scanning & PII Detection +# GitGalaxy Mainframe: Structural Extraction & Legacy Modernization Suite -[![Velocity](https://img.shields.io/badge/Velocity-2%2B_GB%2Fmin-00C957.svg)](#) -[![Scale](https://img.shields.io/badge/Tested-10GB%2B_Files-00BFFF.svg)](#) -[![Architecture](https://img.shields.io/badge/Architecture-Single__Pass_Stream-8A2BE2.svg)](#) +[![Mainframe Tested](https://img.shields.io/badge/Tested-MVS_3.8j_(1974)-000000.svg?style=flat&logo=ibm)](#) +[![Architecture](https://img.shields.io/badge/Architecture-Deterministic_Extraction-00BFFF.svg)](#) +[![Data](https://img.shields.io/badge/Data-EBCDIC_%26_COMP--3_Native-00C957.svg)](#) -During an active incident response or catastrophic data breach, standard tools fail. Basic `grep` lacks time-series context. Modern SIEMs (Splunk, ElasticSearch) require you to ingest and index data first—taking hours or days for massive database dumps. +Welcome to the **GitGalaxy Mainframe Modernization Suite**. This directory contains the deterministic, high-speed static analysis tools required to safely slice, sanitize, and map monolithic legacy architectures prior to cloud migration. 
-This suite provides a tactical, pipeline-ready solution: **ultra-high-velocity, unindexed binary streaming.** Running at over 2 GB per minute, our custom stream-processing engine reads data continuously without loading massive files into RAM. Perfect for active breach triage or automated CI/CD pipeline sanitization. +**Mainframe Proven:** The architectural scaffolding generated by these tools compiles natively against raw MVS 3.8j operating systems (1974 Hercules Mainframe), while simultaneously generating strict architectural contracts for modern cloud environments (Spring Boot, PostgreSQL). + +## The Why: Bridging the Mainframe-to-Cloud Divide + +Enterprise mainframe migrations frequently stall because Cloud Architects and COBOL Engineers speak different architectural languages. Cloud environments rely on dynamic scaling, relational databases, and event-driven Directed Acyclic Graphs (DAGs). Mainframes rely on sequential Job Control Language (JCL), absolute memory boundaries (`REDEFINES`), and proprietary hardware data encodings (`EBCDIC`, `COMP-3`). + +**The Generative AI Trap:** Feeding raw, multi-million-line COBOL monoliths into a Large Language Model (LLM) is a guaranteed failure. AI models cannot securely interpret implicit JCL execution orders, they hallucinate dependencies inside unreachable "dead code" paragraphs, and they lack the mathematical context to safely unpack binary `COMP-3` datasets. + +**The GitGalaxy Solution:** Before any AI translation or modern scaffolding occurs, we must deterministically map the physical reality of the mainframe. This suite parses the structural intent of the COBOL, strips away decades of dead code rot, extracts the exact I/O data lineage, and translates mainframe binary datasets into cloud-native formats—entirely without compiling Abstract Syntax Trees (ASTs). + +--- + +## The How: Deterministic Extraction & Cleansing + +We treat the legacy codebase as a mathematical topology. 
By utilizing our **Structural Signature Analysis Engine**, we isolate and untangle the monolith through a multi-phase pipeline: + +1. **Deprecated Trail Pruning:** We map memory declarations (`DATA DIVISION`) against actual execution calls (`PROCEDURE DIVISION`) to mathematically prove which variables and paragraphs are dead. We mask these out to prevent modern systems from inheriting legacy bloat. +2. **Data Lineage Mapping (DAGs):** By tracking `SELECT/ASSIGN` and `OPEN` statements, we map the exact physical datasets required by each program, generating a strict execution topology that replaces the need for legacy JCL. +3. **Microservice Slicing:** We use recursive data-flow taint tracking to trace a single business variable through `MOVE`, `ADD`, and `COMPUTE` statements. This isolates specific business rules so they can be securely assigned to AI agents for translation, strictly bounding their context windows. +4. **Memory Exhaustion Protection:** The engine dynamically scales between high-speed RAM and disk-backed SQLite to process massive, monolithic legacy repositories without triggering Out-Of-Memory (OOM) crashes. + +--- + +## The What: Core Modules & Tooling + +### 1. Architectural Mapping & Triage +* **`cobol_dag_architect.py` (Data Lineage Architect):** Parses COBOL structural intent to map `INPUT/OUTPUT` data flows, calculating the deterministic topological execution order (DAG) required for modern orchestration (e.g., Spring Batch, Airflow). +
![DAG Architect](../../../docs/wiki/assets/dag_architect.gif) +* **`cobol_graveyard_finder.py` (Deprecated Trails Analyzer):** Performs static analysis to isolate unused memory declarations and mathematically unreachable execution logic, preventing the migration of dead weight. +
![Deprecated Trails Analyzer](../../../docs/wiki/assets/graveyard_reaper.gif) +* **`cobol_microservice_slicer.py` (Microservice Logic Extractor):** Executes 3-pass recursive variable taint-tracking for safe, isolated business logic extraction. +
![Microservice Logic Extractor](../../../docs/wiki/assets/microservice_slicer.gif) + +### 2. Data & Schema Modernization +* **`cobol_schema_forge.py` (Cloud Schema Generator):** Translates complex legacy byte-maps (`PIC` constraints) and memory overlays (`REDEFINES`) into strict PostgreSQL DDL schemas. +
![Cloud Schema Generator](../../../docs/wiki/assets/cloud_schema_forge.gif) +* **`cobol_etl_unpacker.py` (ETL EBCDIC Unpacker):** Translates binary EBCDIC mainframe datasets into modern UTF-8 CSVs, decoding Zoned Decimals and unpacking `COMP-3` nibbles directly into floating-point numerics natively in Python. + +### 3. Zero-Trust Infrastructure +* **`cobol_jcl_forge.py` (Zero-Trust JCL Generator):** Auto-generates strict, least-privilege JCL emulators—automatically stripping over-permissioned global access (e.g., `DISP=SHR`) and locking physical dataset provisioning to exact lineage bounds. +
![Zero-Trust JCL Generator](../../../docs/wiki/assets/jcl_forge_demo.gif) +* **`cobol_jcl_auditor.py` (Zero-Trust JCL Auditor):** Mathematically compares original legacy JCLs against the generated equivalents to quantify architectural bloat reduction and over-permissioned I/O shedding. +* **`cobol_compiler_forge.py` (Mainframe Compiler Generator):** Flattens copybooks and dynamically generates era-aware build JCLs by routing the build sequence to the correct enterprise compiler (COBOL-74 vs COBOL-85). +
![Mainframe Compiler Generator](../../../docs/wiki/assets/compiler_forge.gif) + +### 4. Code Integrity & Pre-Processing +* **`cobol_lexical_patcher.py` (Lexical Patcher):** Safely neutralizes legacy compiler traps (e.g., converting `NEXT SENTENCE` to explicit `CONTINUE` block scopes) to restore deterministic topological mapping without breaking strict compiler compliance. +* **`cobol_system_limits_reporter.py` (Architectural Anomaly Detector):** Flags non-deterministic routing logic (e.g., `ALTER`, `EXEC CICS HANDLE CONDITION`) that compromises static data lineage. +
![Architectural Anomaly Detector](../../../docs/wiki/assets/system_limits_reporter.gif) +* **`cobol_agent_task_forge.py` (Autonomous Agent Task Generator):** Converts architectural anomalies and extracted dependencies into highly constrained, structured JSON task tickets designed to safely bound LLM agents during code remediation. --- -## Part 1: The PII Data Leak Hunter (`pii-leak-hunter`) -[📖 Official Documentation](https://squid-protocol.github.io/gitgalaxy/04-06-pii-leak-hunter/) - -A specialized incident response tool. Designed to find hemorrhaging Personally Identifiable Information inside massive, raw data dumps. - -**How it works:** -* **Binary-Level Regex:** Compiles structural patterns to raw bytes. Extreme CPU efficiency. -* **Automated Masking:** Redacts toxic payloads before writing to safe evidence logs. -* **Exfiltration Histograms:** Generates ASCII charts. Pinpoints exact breach minutes. - -**Performance Showcase:** Streamed a raw **1.00 GB compromised log file**. Completed in **25.72 seconds**. Detected and actively masked over **420,000 sensitive records**. Immediately exposed two distinct attack vectors (Customer data at 14:00, AWS Keys at 09:00). - -### Targeted Patterns -The stream engine currently bypasses standard indexing to hunt and actively mask: -* **VISA** (Credit Cards) -* **MASTERCARD** (Credit Cards) -* **SSN** (US Social Security Numbers) -* **AWS_KEY** (AKIA, ASIA, AGPA, etc.) - -### Quickstart & Integration -**Local CLI Execution:** -By default, the tool saves the masked evidence log in the same directory as the target. -```bash -pii-leak-hunter /path/to/massive_database_dump.sql -``` - -**Using the `--out` Flag:** -Route the safe, masked telemetry to a secure directory for analysis. -```bash -pii-leak-hunter /path/to/production.log --out /var/secure_logs/ -``` - -**GitHub Actions CI/CD Integration:** -Automate sanitization before archiving logs. 
-```yaml - - name: Run PII Leak Hunter - uses: squid-protocol/gitgalaxy@main - with: - tool: 'pii-leak-hunter' - target: './logs/production_dump.sql' - args: '--out ./sanitized_logs/' -``` +## 🧠 Engineering Highlights (Architectural Defenses) + +* **Unreachable Logic Masking (`cobol_dag_architect.py`):** COBOL programs often contain legacy, unreachable paragraphs. If a standard regex engine scans these, it will extract `OPEN` statements for files that are never actually utilized at runtime, creating false dependencies. We dynamically integrate with the Deprecated Trails Analyzer's state to "mask out" dead paragraphs with whitespace, preserving exact topology while eliminating hallucinated I/O dependencies. +* **Cyclic Copybook Shields (`cobol_compiler_forge.py`):** Legacy architectures frequently contain cyclic dependencies (e.g., Copybook A imports Copybook B, which imports Copybook A). To prevent our in-memory expansion from trapping the CPU in an infinite loop and triggering an OOM crash, we enforce strict, deterministic recursion depth limits during copybook flattening. +* **Defensive COMP-3 Unpacking (`cobol_etl_unpacker.py`):** Packed decimal (`COMP-3`) stores two digits per byte, plus one half-byte (nibble) for the sign. The parser mathematically validates the hex-boundaries (verifying the high nibble is `0-9` and the sign nibble is `A-F`). This intercepts corrupted mainframe memory segments before they crash the Python ETL pipeline. +* **Dynamic Aliasing Resolution (`cobol_graveyard_finder.py`):** COBOL's `REPLACING` clause allows dynamic text substitution at compile time. When hunting for unused variables, the analyzer simulates this substitution in its in-memory buffer using safe, negative lookarounds. This prevents the system from accidentally flagging heavily aliased variables as "dead code." 
--- -## Part 2: The Terabyte Log Scanner (`terabyte-log-scanner`) -[📖 Official Documentation](https://squid-protocol.github.io/gitgalaxy/04-07-terabyte-log-scanner/) - -A runtime execution tracer. Connects static codebase architecture to physical runtime reality. Parses massive mainframe SMF logs or distributed traces to prove what code actually executes. - -**How it works:** -* **Single-Pass Streaming:** Never loads the full file into RAM. -* **Execution Verification:** Proves exact runtime execution frequencies. -* **Zero-Hit Detection:** Mathematically proves if compiled legacy code is abandoned. -* **Dynamic Sidecars:** Outputs telemetry JSON for 3D WebGPU traffic heatmaps. - -**Performance Showcase:** -Ran against a raw **2.1GB production stream log**. Completed single-pass scan in **30.07 seconds**. Dynamically scaled ASCII histograms instantly exposed a massive brute-force anomaly isolated from background noise: - -```text - === TIME-SERIES: ERROR === - (Filtering to Top 15 Highest Volume Spikes) - [2026-04-16 14:00] ███████████████████████████████████████ (5,759 hits) <-- ANOMALY SPIKE - [2026-04-27 14:00] ███████████████████████████████████████ (5,753 hits) <-- ANOMALY SPIKE - [2026-05-02 14:00] ███████████████████████████████████████ (5,718 hits) <-- ANOMALY SPIKE -``` - -### Input Methods: Manual vs. Automated -The tool requires one of two input methods to function. It will not run without a target list. - -**1. Manual Mode (`-k` or `--keywords`)** -Best for quick, grep-style tactical hunts. Supply a space-separated list of targets. -```bash -terabyte-log-scanner /path/to/production.log -k ERROR TIMEOUT "DATA EXCEPTION" -``` - -**2. Automated Pipeline Mode (`--input_state`)** -Best for CI/CD modernization pipelines. Supply a GitGalaxy Intermediate Representation (IR) JSON file. The script will automatically extract the targets from the `known_programs` array to hunt for dead code. 
-```bash -terabyte-log-scanner /path/to/production.log --input_state ../core/ir_state.json -``` - -*Required JSON Schema for Automated Mode:* -```json -{ - "analysis": { - "known_programs": ["PROGRAM1", "PROGRAM2"] - } -} -``` +## ⚡ Performance Showcases (Live CLI Execution) + +#### Showcase A: Deprecated Trails Analyzer (Graveyard Finder) +Identifying and shedding dead weight prior to a cloud migration saves massive amounts of translation cost and future cloud compute. + +~~~bash +python3 cobol_graveyard_finder.py /legacy_corpus/accounting +~~~ + +~~~text +========================================================== + 📉 DEPRECATED TRAILS REDUCTION REPORT +========================================================== + Files Flagged for Cleanup : 14 + Unused Memory Addresses : 142 variables + Unreachable Logic Blocks : 37 paragraphs + ✂️ Estimated Bloat Removed : ~1,790 Lines of Code +========================================================== +~~~ + +#### Showcase B: Data Lineage Architect (DAG Generation) +Automatically generating the correct execution order by mapping physical dataset dependencies (Inputs vs. Outputs) across multiple monolithic programs. 
+ +~~~bash +python3 cobol_dag_architect.py /legacy_corpus/nightly_batch +~~~ + +~~~text +========================================================== + ⚡ DETERMINISTIC EXECUTION PIPELINE (TOPOLOGICAL SORT) +========================================================== + + STEP 01: Run [ACCT-INIT] + ↳ Reads : SYS-CONFIG-FILE + ↳ Writes: DAILY-LEDGER-DB +---------------------------------------------------------- + STEP 02: Run [LEDGER-CALC] + ↳ Reads : DAILY-LEDGER-DB, RATES-TBL + ↳ Writes: PROCESSED-LEDGER-DB +---------------------------------------------------------- + STEP 03: Run [REPORT-GEN] + ↳ Reads : PROCESSED-LEDGER-DB + ↳ Writes: FINAL-REPORT-OUT +---------------------------------------------------------- +~~~ + +#### Showcase C: Master Orchestration (CICS Banking Application) +Below is the live console output of the central orchestrator processing a legacy IBM CICS banking application. Notice the engine identifying over 6,700 lines of dead code, warning about macro substitutions, and automatically routing the compiler based on the detected COBOL dialect (74 vs 85). + +~~~text +=== 1. INITIATING DEPRECATED TRAILS ANALYZER === +🔍 GitGalaxy Deprecated Trails Analyzer scanning cics-banking-sample-application-cbsa for obsolete logic... +[... File Scans Omitted for Brevity ...] +========================================================== + 📉 DEPRECATED TRAILS REDUCTION REPORT +========================================================== + Files Flagged for Cleanup : 29 + Unused Memory Addresses : 817 variables + Unreachable Logic Blocks : 590 paragraphs + ✂️ Estimated Bloat Removed : ~6717 Lines of Code +========================================================== + +=== 2. INITIATING DAG ARCHITECT === +🕸️ GitGalaxy Data Lineage Architect mapping execution topology in: cics-banking-sample-application-cbsa... 
+========================================================== + ⚡ DETERMINISTIC EXECUTION PIPELINE (TOPOLOGICAL SORT) +========================================================== + STEP 01: Run [BANKDATA] + ↳ Reads : None + ↳ Writes: VSAM +---------------------------------------------------------- + +=== 3. INITIATING ARCHITECTURAL ANOMALY DETECTOR === +📠 Scanning directory for Architectural Anomalies: cics-banking-sample-application-cbsa... +🔎 GitGalaxy executing architectural integrity scan on 29 files... +========================================================================================== + ⚠️ [XFRFUN.cbl : Line 0128] HIGH LIMIT - Macro substitution detected. AST math may drift from actual compiled execution. + ⚠️ [CREACC.cbl : Line 0260] HIGH LIMIT - Macro substitution detected. AST math may drift from actual compiled execution. +========================================================================================== + 🚨 WARNING: Found 2 structural anomalies requiring human architectural review. +========================================================================================== + +=== 4. INITIATING CLOUD SCHEMA GENERATOR === +🔨 GitGalaxy Cloud Schema Generator processing: BNK1UAC.cbl... +========================================================== + 🐘 POSTGRESQL DDL (CLOUD DATABASE SCHEMA) +========================================================== +CREATE TABLE DFHCOMMAREA ( + WS_CICS_RESP INTEGER, + WS_CICS_RESP2 INTEGER, + WS_CICS_FAIL_MSG VARCHAR(70), + WS_COMM_EYE VARCHAR(4), + WS_COMM_CUSTNO VARCHAR(10), + WS_COMM_ACCNO DECIMAL(8, 0), + WS_COMM_AVAIL_BAL DECIMAL(12, 2), + WS_COMM_ACTUAL_BAL DECIMAL(12, 2) + -- [Schema Omitted for Brevity] +); + +=== 5. INITIATING MICROSERVICE LOGIC EXTRACTOR === +🔪 GitGalaxy Logic Extractor tracing dependencies for [WS-ACCOUNT-BALANCE] in BNK1UAC.cbl... +========================================================== + 🎯 Extracted 0 distinct business rules. 
+========================================================== + +=== 6. INITIATING MAINFRAME COMPILER GENERATOR === +====================================================================== + 🏗️ GITGALAXY MAINFRAME COMPILER GENERATOR (PRE-COMPILER ACTIVE) +====================================================================== + [+] Generated COBOL-85 Pipeline : BUILD_BNK1UAC.jcl + [+] Generated COBOL-85 Pipeline : BUILD_DBCRFUN.jcl + [+] Generated COBOL-74 Pipeline : BUILD_GETSCODE.jcl +====================================================================== + +=== 7. INITIATING STRUCTURAL EXTRACTION CONTROLLER === +====================================================================== + 🚀 EXTRACTION CONTROLLER ENGAGED + Target: cics-banking-sample-application-cbsa +====================================================================== + Generating Context-Aware Artifacts at: cics-banking-sample-application-cbsa_gitgalaxy_clean_20260422_153624 +---------------------------------------------------------------------- +====================================================================== + 🏁 EXTRACTION COMPLETE: Hybrid Pipeline execution successful. +====================================================================== +~~~ --- -### 🌌 Powered by the blAST Engine (Bypassing LLMs and ASTs) -This suite is driven by our custom deterministic heuristics engine. It processes multi-dimensional data at extreme velocity without requiring rigid ASTs or hallucinating LLMs. -* 📖 **[The blAST Paradigm (ASTs vs LLMs)](https://squid-protocol.github.io/gitgalaxy/01-03-the-blast-paradigm/)** -* 🪐 **[Return to the Main GitGalaxy Hub](https://github.com/squid-protocol/gitgalaxy)** \ No newline at end of file +## 🌌 The GitGalaxy Ecosystem (Powered by the blAST Engine) + +GitGalaxy Mainframe Modernization is the structural extraction layer of the broader **GitGalaxy Ecosystem**—a high-velocity, AST-free, LLM-free heuristic knowledge graph engine designed for planetary-scale repositories. 
+ +Explore the ecosystem: + +* 🪐 **[Official Documentation](https://squid-protocol.github.io/gitgalaxy/)** — Comprehensive deep dives into the engine's mathematics, pipeline architecture, and DevSecOps integration protocols. +* 🔭 **[GitGalaxy Visualizer](http://gitgalaxy.io/)** — Render your codebase's topological network locally in interactive 3D using hardware-accelerated WebGPU. +* 📖 **[The blAST Paradigm](https://squid-protocol.github.io/gitgalaxy/docs/wiki/01-03-the-blast-paradigm/)** — The architectural thesis, academic research, and structural math that makes AST-free parsing possible at scale. +* ⚙️ **[Language Calibration Standards](https://github.com/squid-protocol/gitgalaxy/blob/main/gitgalaxy/standards/how_to_add_a_language.md)** — The definitive engineering guide to extending our comparative lexical taxonomy for custom enterprise dialects. \ No newline at end of file diff --git a/gitgalaxy/tools/terabyte_log_scanning/pii_leak_hunter.py b/gitgalaxy/tools/terabyte_log_scanning/pii_leak_hunter.py index e259cf02..f02ef2e7 100644 --- a/gitgalaxy/tools/terabyte_log_scanning/pii_leak_hunter.py +++ b/gitgalaxy/tools/terabyte_log_scanning/pii_leak_hunter.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: PII Data Leak Hunter -# Purpose: High-speed, single-pass log analyzer that hunts and masks +# GitGalaxy Tool: PII Data Leak Hunter +# Purpose: High-speed, single-pass log analyzer that detects and masks # exposed Credit Cards, SSNs, and AWS API Keys. # ============================================================================== import argparse @@ -12,21 +12,20 @@ from pathlib import Path # ============================================================================== -# 1. THE REGEX PHYSICS (MATHEMATICAL TRAPS) +# 1. 
REGEX PATTERNS (PII SIGNATURES) # ============================================================================== -# We compile these as binary (bytes) to maintain the insane speed of the original log parser +# We compile these as binary (bytes) to maintain maximum execution speed +# during large-scale log ingestion. PII_PATTERNS = { "VISA": re.compile(rb"\b4[0-9]{12}(?:[0-9]{3})?\b"), - "MASTERCARD": re.compile( - rb"\b(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}\b" - ), + "MASTERCARD": re.compile(rb"\b(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}\b"), "SSN": re.compile(rb"\b\d{3}-\d{2}-\d{4}\b"), "AWS_KEY": re.compile(rb"\b(?:AKIA|ASIA|AGPA|AIDA|AROA|AIPA)[A-Z0-9]{16}\b"), } def mask_pii(text: str) -> str: - """Masks out the middle of sensitive data so the evidence log is safe.""" + """Masks out the middle of sensitive data so the evidence log is safe for retention.""" # Mask Visa & Mastercard (Leave last 4) text = re.sub( r"\b(4[0-9]{12}(?:[0-9]{3})?)\b", @@ -53,7 +52,7 @@ def mask_pii(text: str) -> str: def draw_ascii_histogram(time_buckets: dict, keyword: str): - """Draws a dynamically scaled ASCII histogram, showing only top spikes if massive.""" + """Draws a dynamically scaled ASCII histogram to visualize exposure frequency over time.""" if not time_buckets: return @@ -66,9 +65,7 @@ def draw_ascii_histogram(time_buckets: dict, keyword: str): if len(time_buckets) > 15: print(" (Filtering to Top 15 Highest Volume Spikes)") - top_offenders = sorted(time_buckets.items(), key=lambda x: x[1], reverse=True)[ - :15 - ] + top_offenders = sorted(time_buckets.items(), key=lambda x: x[1], reverse=True)[:15] display_buckets = dict(sorted(top_offenders)) else: display_buckets = dict(sorted(time_buckets.items())) @@ -77,11 +74,7 @@ def draw_ascii_histogram(time_buckets: dict, keyword: str): bar_len = int((hits / max_hits) * max_bar_width) if max_hits > 0 else 0 bar = "█" * max(1, bar_len) - alert = ( - 
" <-- MASSIVE EXFILTRATION SPIKE" - if hits >= anomaly_threshold and hits > 10 - else "" - ) + alert = " <-- HIGH VOLUME SPIKE DETECTED" if hits >= anomaly_threshold and hits > 10 else "" print(f" [{time_bucket}] {bar} ({hits:,} hits){alert}") @@ -98,9 +91,9 @@ def main(): formatter_class=argparse.RawTextHelpFormatter, epilog=""" ============================================================================== -HUNTING CAPABILITIES: +SCANNING CAPABILITIES: This engine bypasses standard indexing to stream raw binary logs or database -dumps. It currently hunts and actively masks the following patterns: +dumps. It currently detects and actively masks the following patterns: - VISA Credit Cards - MASTERCARD Credit Cards - US Social Security Numbers (SSN) @@ -114,7 +107,7 @@ def main(): parser.add_argument( "--out", type=str, - help="Optional: Custom directory to save the safe evidence log", + help="Optional: Custom directory to save the redacted evidence log", ) args = parser.parse_args() @@ -123,9 +116,7 @@ def main(): # ------------------------------------------------------------------------- target_path = Path(args.target).resolve() if not target_path.exists() or not target_path.is_file(): - print( - f"\n[!] ERROR: Target file does not exist or is not a file: {target_path}" - ) + print(f"\n[ERROR] Target file does not exist or is not a file: {target_path}") sys.exit(1) if args.out: @@ -136,7 +127,7 @@ def main(): try: out_dir.mkdir(parents=True, exist_ok=True) except PermissionError: - print(f"\n[!] ERROR: Permission denied to create output directory: {out_dir}") + print(f"\n[ERROR] Permission denied to create output directory: {out_dir}") sys.exit(1) results_path = out_dir / f"{target_path.stem}_pii_leak_evidence.log" @@ -146,23 +137,19 @@ def main(): file_size_gb = file_size_bytes / (1024**3) file_size_mb = file_size_bytes / (1024**2) except OSError as e: - print(f"\n[!] 
ERROR: Could not read target file size: {e}") + print(f"\n[ERROR] Could not read target file size: {e}") sys.exit(1) - print( - f"🚨 Tapping into data stream: {target_path.name} ({file_size_gb:.2f} GB / {file_size_mb:.2f} MB)" - ) - print(f"🛡️ Masking enabled. Streaming safe evidence to: {results_path.name}") + print(f"🔍 Initializing stream analysis: {target_path.name} ({file_size_gb:.2f} GB / {file_size_mb:.2f} MB)") + print(f"🛡️ Masking enabled. Writing redacted evidence to: {results_path.name}") - ts_pattern = re.compile( - rb"(\d{4}-\d{2}-\d{2}[T\s]\d{2}|\b[A-Z][a-z]{2}\s+\d{1,2}\s\d{2})" - ) + ts_pattern = re.compile(rb"(\d{4}-\d{2}-\d{2}[T\s]\d{2}|\b[A-Z][a-z]{2}\s+\d{1,2}\s\d{2})") histograms = {kw: defaultdict(int) for kw in PII_PATTERNS.keys()} start_time = time.time() # ------------------------------------------------------------------------- - # 3. HIGH-SPEED SCANNING (The Memory Shield) + # 3. HIGH-SPEED SCANNING # ------------------------------------------------------------------------- try: with ( @@ -182,13 +169,11 @@ def main(): ts_match = ts_pattern.search(line) bucket = ( - ts_match.group(1).decode("utf-8", errors="ignore") + ":00" - if ts_match - else "Unknown Time" + ts_match.group(1).decode("utf-8", errors="ignore") + ":00" if ts_match else "Unknown Time" ) histograms[pii_type][bucket] += 1 except IOError as e: - print(f"\n[!] FATAL I/O ERROR during streaming: {e}") + print(f"\n[FATAL ERROR] I/O failure during streaming: {e}") sys.exit(1) time_elapsed = time.time() - start_time @@ -203,7 +188,7 @@ def main(): max_total = max(total_counts.values()) if total_counts.values() else 0 print("\n" + "=" * 75) - print(" 🚨 PRIVACY INCIDENT SUMMARY (TOTAL EXPOSURE)") + print(" PII DATA LEAK HUNTER: SCAN SUMMARY") print("=" * 75) if max_total > 0: @@ -212,7 +197,7 @@ def main(): bar = "█" * max(1, bar_len) if count > 0 else "" print(f" {kw.ljust(15)} | {bar} ({count:,} hits)") else: - print(" ✅ Clean scan. 
No Social Security, Credit Card, or AWS Keys detected.") + print(" [SUCCESS] Clean scan. No Social Security, Credit Card, or AWS Keys detected.") print("-" * 75) @@ -227,11 +212,9 @@ def main(): else: speed_str = "Instant" - print( - f" ✅ Scan complete. Sliced through {target_path.name} in {time_elapsed:.2f} seconds." - ) - print(f" ⚡ Processing Velocity: {speed_str}") - print(f" 📁 Safe Evidence Log: {results_path.resolve()}") + print(f" [COMPLETE] Processed {target_path.name} in {time_elapsed:.2f} seconds.") + print(f" Processing Velocity: {speed_str}") + print(f" Redacted Evidence Log: {results_path.resolve()}") print("=" * 75 + "\n") diff --git a/gitgalaxy/tools/terabyte_log_scanning/terabyte_log_scanner.py b/gitgalaxy/tools/terabyte_log_scanning/terabyte_log_scanner.py index e7e87e06..3483fe50 100644 --- a/gitgalaxy/tools/terabyte_log_scanning/terabyte_log_scanner.py +++ b/gitgalaxy/tools/terabyte_log_scanning/terabyte_log_scanner.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # ============================================================================== -# GitGalaxy Spoke: Mega Log Parser +# GitGalaxy Tool: High-Volume Log Scanner # Purpose: High-speed, single-pass log analyzer with ASCII time-series histograms. 
# ============================================================================== import argparse @@ -28,13 +28,11 @@ def draw_ascii_histogram(time_buckets: dict, keyword: str): avg_hits = sum(time_buckets.values()) / len(time_buckets) anomaly_threshold = avg_hits * 3 - # UX Safeguard: If there are too many buckets, only show the Top 15 worst ones + # UX Safeguard: If there are too many buckets, only show the Top 15 highest volume spikes if len(time_buckets) > 15: print(" (Filtering to Top 15 Highest Volume Spikes)") # Sort by highest hits, grab top 15, then resort chronologically for the graph - top_offenders = sorted(time_buckets.items(), key=lambda x: x[1], reverse=True)[ - :15 - ] + top_offenders = sorted(time_buckets.items(), key=lambda x: x[1], reverse=True)[:15] display_buckets = dict(sorted(top_offenders)) else: display_buckets = dict(sorted(time_buckets.items())) @@ -45,27 +43,27 @@ def draw_ascii_histogram(time_buckets: dict, keyword: str): bar = "█" * max(1, bar_len) # Flag statistical anomalies visually - alert = " <-- ANOMALY SPIKE" if hits >= anomaly_threshold and hits > 10 else "" + alert = " <-- VOLUME ANOMALY DETECTED" if hits >= anomaly_threshold and hits > 10 else "" print(f" [{time_bucket}] {bar} ({hits:,} hits){alert}") def main(): from gitgalaxy.licensing import enforce_licensing_guard - enforce_licensing_guard("Mega Log Parser") + enforce_licensing_guard("High-Volume Log Scanner") # ------------------------------------------------------------------------- # 1. 
CLI ARGUMENT PARSING & DOCUMENTATION # ------------------------------------------------------------------------- parser = argparse.ArgumentParser( - description="GitGalaxy Mega Log Parser: High-speed, single-pass log analyzer with ASCII time-series histograms.", + description="GitGalaxy High-Volume Log Scanner: High-speed, single-pass log analyzer with ASCII time-series histograms.", formatter_class=argparse.RawTextHelpFormatter, epilog=""" ============================================================================== JSON IR State Structure: If using --input_state, the script expects a GitGalaxy Intermediate Representation (IR) JSON file. It specifically targets the 'known_programs' -array to hunt for dead code and execution volumes. +array to detect execution patterns and identify dead code. Expected JSON Schema: { @@ -76,7 +74,7 @@ def main(): ============================================================================== """, ) - parser.add_argument("target", help="Path to the log file (Translated ASCII SMF)") + parser.add_argument("target", help="Path to the log file (Translated ASCII SMF or Server Logs)") parser.add_argument( "-k", "--keywords", @@ -88,28 +86,24 @@ def main(): type=str, help="Path to GitGalaxy ir_state.json to auto-extract targets", ) - parser.add_argument( - "--out", type=str, help="Optional: Custom directory to save the results log" - ) + parser.add_argument("--out", type=str, help="Optional: Custom directory to save the results log") args = parser.parse_args() - # Validate target log file exists before doing any work + # Validate target log file exists before initializing the stream target_path = Path(args.target).resolve() if not target_path.exists() or not target_path.is_file(): - print( - f"\n[!] 
ERROR: Target log file does not exist or is not a file: {target_path}" - ) + print(f"\n[ERROR] Target log file does not exist or is not a file: {target_path}") sys.exit(1) # ------------------------------------------------------------------------- - # 2. INPUT HANDSHAKE & VALIDATION (No Silent Failures) + # 2. INPUT VALIDATION & STATE INGESTION # ------------------------------------------------------------------------- search_targets = [] if args.input_state: state_path = Path(args.input_state).resolve() if not state_path.exists(): - print(f"\n[!] ERROR: Input state JSON file not found: {state_path}") + print(f"\n[ERROR] Input state JSON file not found: {state_path}") sys.exit(1) try: @@ -119,37 +113,28 @@ def main(): # Strict Schema Validation if not isinstance(ir_state, dict): raise ValueError("The root of the JSON file must be an object {}.") - if ( - "analysis" not in ir_state - or "known_programs" not in ir_state["analysis"] - ): - raise ValueError( - "JSON is missing the required ['analysis']['known_programs'] path." - ) + if "analysis" not in ir_state or "known_programs" not in ir_state["analysis"]: + raise ValueError("JSON is missing the required ['analysis']['known_programs'] path.") search_targets = ir_state["analysis"]["known_programs"] if not isinstance(search_targets, list) or not search_targets: - print( - "\n[!] WARNING: 'known_programs' array is empty or invalid. Nothing to search." - ) + print("\n[WARNING] 'known_programs' array is empty or invalid. Nothing to search.") sys.exit(0) - print(f"📡 Loaded {len(search_targets)} targets from {state_path.name}") + print(f"Loaded {len(search_targets)} targets from {state_path.name}") except json.JSONDecodeError as e: - print(f"\n[!] ERROR: Invalid JSON format in {state_path.name}:\n {e}") + print(f"\n[ERROR] Invalid JSON format in {state_path.name}:\n {e}") sys.exit(1) except Exception as e: - print(f"\n[!] 
ERROR: Failed to parse input state:\n {e}") + print(f"\n[ERROR] Failed to parse input state:\n {e}") sys.exit(1) elif args.keywords: search_targets = args.keywords else: - print( - "\n[!] ERROR: You must provide targets using either -k/--keywords or --input_state." - ) + print("\n[ERROR] You must provide targets using either -k/--keywords or --input_state.") parser.print_help() sys.exit(1) @@ -161,16 +146,12 @@ def main(): # Pre-compile regex for speed. Encode to bytes for fast binary reading. try: pattern_str = rf"{kw}" - keyword_patterns[kw] = re.compile( - pattern_str.encode("utf-8"), re.IGNORECASE - ) + keyword_patterns[kw] = re.compile(pattern_str.encode("utf-8"), re.IGNORECASE) except re.error as e: - print(f"\n[!] ERROR: Invalid regex generated for keyword '{kw}': {e}") + print(f"\n[ERROR] Invalid regex generated for keyword '{kw}': {e}") sys.exit(1) - ts_pattern = re.compile( - rb"(\d{4}-\d{2}-\d{2}[T\s]\d{2}|\b[A-Z][a-z]{2}\s+\d{1,2}\s\d{2})" - ) + ts_pattern = re.compile(rb"(\d{4}-\d{2}-\d{2}[T\s]\d{2}|\b[A-Z][a-z]{2}\s+\d{1,2}\s\d{2})") histograms = {kw: defaultdict(int) for kw in search_targets} # Determine output paths @@ -182,17 +163,17 @@ def main(): try: out_dir.mkdir(parents=True, exist_ok=True) except PermissionError: - print(f"\n[!] ERROR: Permission denied to create output directory: {out_dir}") + print(f"\n[ERROR] Permission denied to create output directory: {out_dir}") sys.exit(1) results_path = out_dir / f"{target_path.stem}_results.txt" sidecar_path = out_dir / "dynamic_telemetry.json" start_time = time.time() - print(f"🚀 Scanning {target_path.name} for {len(search_targets)} keywords...") + print(f"🔍 Initializing stream analysis of {target_path.name} for {len(search_targets)} keywords...") # ------------------------------------------------------------------------- - # 4. HIGH-SPEED SCANNING (The Memory Shield) + # 4. 
STREAMING LOG ANALYSIS (Memory-Optimized) # ------------------------------------------------------------------------- try: with ( @@ -207,16 +188,14 @@ def main(): # Bucket by hour bucket = ( - ts_match.group(1).decode("utf-8", errors="ignore") + ":00" - if ts_match - else "Unknown Time" + ts_match.group(1).decode("utf-8", errors="ignore") + ":00" if ts_match else "Unknown Time" ) histograms[kw][bucket] += 1 f_out.write(f"{decoded_line}\n") break # Stop checking keywords once a hit is found on this line except IOError as e: - print(f"\n[!] FATAL I/O ERROR during scanning: {e}") + print(f"\n[FATAL ERROR] I/O failure during streaming: {e}") sys.exit(1) time_elapsed = time.time() - start_time @@ -227,8 +206,8 @@ def main(): for kw, buckets in histograms.items(): draw_ascii_histogram(buckets, kw) - print(f"\n✅ Scan completed in {time_elapsed:.2f} seconds.") - print(f"📄 Filtered results saved to: {results_path}") + print(f"\n[SUCCESS] Scan completed in {time_elapsed:.2f} seconds.") + print(f"Filtered results saved to: {results_path}") # Calculate total hits for the JSON sidecar total_counts = {kw: sum(buckets.values()) for kw, buckets in histograms.items()} @@ -237,9 +216,9 @@ def main(): try: with open(sidecar_path, "w", encoding="utf-8") as f_json: json.dump(telemetry_payload, f_json, indent=4) - print(f"💾 JSON State Sidecar written to: {sidecar_path}") + print(f"JSON State Sidecar written to: {sidecar_path}") except IOError as e: - print(f"\n[!] 
ERROR: Failed to write telemetry sidecar: {e}") + print(f"\n[ERROR] Failed to write telemetry sidecar: {e}") print("=" * 75 + "\n") diff --git a/tests/cobol_mainframe/test_cobol_graveyard_finder.py b/tests/cobol_mainframe/test_cobol_graveyard_finder.py index e1061643..047a852b 100644 --- a/tests/cobol_mainframe/test_cobol_graveyard_finder.py +++ b/tests/cobol_mainframe/test_cobol_graveyard_finder.py @@ -125,6 +125,6 @@ def test_graveyard_cli_e2e(tmp_path, capsys): # Assertions on the final CLI output calculations assert "Files Flagged for Cleanup : 2" in captured.out - assert "Unused Memory Addresses : 2 orphaned variables" in captured.out - assert "Unreachable Logic Blocks : 1 phantom paragraphs" in captured.out + assert "Unused Memory Addresses : 2 variables" in captured.out + assert "Unreachable Logic Blocks : 1 paragraphs" in captured.out assert "Estimated Bloat Removed : ~12 Lines of Code" in captured.out diff --git a/tests/cobol_mainframe/test_cobol_jcl_forge.py b/tests/cobol_mainframe/test_cobol_jcl_forge.py index dd983003..753b6d0e 100644 --- a/tests/cobol_mainframe/test_cobol_jcl_forge.py +++ b/tests/cobol_mainframe/test_cobol_jcl_forge.py @@ -114,12 +114,12 @@ def test_hygienic_cli_defaults(tmp_path): forge_module.main() # 3. Verify the Hygienic Output Directory - # Look for a directory matching 'legacy_src_forged_YYYYMMDD_HHMMSS' + # Look for a directory matching 'legacy_src_generated_YYYYMMDD_HHMMSS' directories = [ - d for d in tmp_path.iterdir() if d.is_dir() and "legacy_src_forged_" in d.name + d for d in tmp_path.iterdir() if d.is_dir() and "legacy_src_generated_" in d.name ] assert len(directories) == 1, ( - "The engine failed to create the isolated hygienic directory!" + "The engine failed to create the isolated output directory!" 
) hygienic_dir = directories[0] diff --git a/tests/cobol_mainframe/test_cobol_refractor_controller.py b/tests/cobol_mainframe/test_cobol_refractor_controller.py index c91aad28..fef1eaf1 100644 --- a/tests/cobol_mainframe/test_cobol_refractor_controller.py +++ b/tests/cobol_mainframe/test_cobol_refractor_controller.py @@ -93,7 +93,7 @@ def test_process_payload_integration(tmp_path): assert ir_state["metadata"]["file_name"] == "MAINPGM.cbl" # 2. Verify Graveyard Sub-Tool Integration - assert "DEAD-VAR" in ir_state["analysis"]["graveyard"]["orphaned_vars"], ( + assert "DEAD-VAR" in ir_state["analysis"]["dead_code"]["orphaned_vars"], ( "Orchestrator failed to invoke Graveyard Reaper!" ) diff --git a/tests/cobol_mainframe/test_cobol_system_limits_reporter.py b/tests/cobol_mainframe/test_cobol_system_limits_reporter.py index cb8280a5..768a8c2d 100644 --- a/tests/cobol_mainframe/test_cobol_system_limits_reporter.py +++ b/tests/cobol_mainframe/test_cobol_system_limits_reporter.py @@ -111,7 +111,7 @@ def test_system_limits_cli_directory_traversal(tmp_path, capsys): captured = capsys.readouterr() # Verify the results - assert "scanning 2 files" in captured.out, ( + assert "executing architectural integrity scan on 2 files" in captured.out, ( "Failed to properly filter .cbl and .cob files!" 
) assert "PGM1.cbl : Line 0001" in captured.out diff --git a/tests/core_engine/test_detector.py b/tests/core_engine/test_detector.py index 5712ef13..33b04b6e 100644 --- a/tests/core_engine/test_detector.py +++ b/tests/core_engine/test_detector.py @@ -21,7 +21,7 @@ r"^[ \t]*def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", re.M ), "branch": re.compile(r"\b(if|elif|for|while)\b"), - "linear": re.compile(r"\b(print|return|assign)\b"), + "structural_boundaries": re.compile(r"\b(print|return|assign)\b"), "ownership": re.compile(r"#\s*Architect:\s*(.*)"), "_meta_purpose_line": re.compile(r"^Purpose:\s*(.*)"), }, @@ -31,7 +31,7 @@ "rules": { "func_start": re.compile(r"^([a-zA-Z0-9_]+):", re.M), "branch": re.compile(r"\b(JNE|JEQ|CALL)\b"), - "linear": re.compile(r"\b(MOV|PUSH|POP)\b"), + "structural_boundaries": re.compile(r"\b(MOV|PUSH|POP)\b"), }, }, "c": { @@ -42,12 +42,12 @@ ), "memory_scraping": re.compile(r"\b(memcpy|VirtualRead)\b"), "exfiltration_camouflage": re.compile(r"\b(send|socket)\b"), - "danger": re.compile(r"\b(strcpy|gets)\b"), + "high_risk_execution": re.compile(r"\b(strcpy|gets)\b"), "safety": re.compile(r"\b(strncpy|fgets)\b"), - "sec_danger": re.compile(r"system"), + "sec_high_risk_execution": re.compile(r"system"), "sec_io": re.compile(r"request_get"), "concurrency": re.compile(r"std::thread"), - "flux": re.compile(r"shared_state"), + "state_mutation": re.compile(r"shared_state"), "sync_locks": re.compile(r"mutex_lock"), "memory_alloc": re.compile(r"malloc"), "cleanup": re.compile(r"free"), @@ -61,14 +61,14 @@ "lexical_family": "single_line_only", "rules": { "branch": re.compile(r"\b(if|case|for|while)\b"), - "linear": re.compile(r"\b(echo|export|source)\b"), + "structural_boundaries": re.compile(r"\b(echo|export|source)\b"), }, }, "ruby": { "lexical_family": "single_line_only", "rules": { "branch": re.compile(r"(?= 1 @@ -223,14 +223,14 @@ def test_detector_class_extraction_and_lcom(): code = ( "class UserManager:\n" " def __init__(self):\n" - " self.users = 
[]\n" # Hits 'flux' (mutation) + " self.users = []\n" # Hits 'state_mutation' (mutation) " def add_user(self, user, role):\n" # 2 args - " self.users.append(user)\n" # Hits 'flux' + " self.users.append(user)\n" # Hits 'state_mutation' " print(role)\n" ) # Mocking a flux rule for testing state entanglement - MOCK_LANG_DEFS["python"]["rules"]["flux"] = re.compile(r"\b(append|users\s*=)\b") + MOCK_LANG_DEFS["python"]["rules"]["state_mutation"] = re.compile(r"\b(append|users\s*=)\b") result = opt_detector.splice(code, "") @@ -323,9 +323,9 @@ def test_detector_c_macro_dead_branch_shield(): result = opt_detector.splice(code, "") - # Because 'danger' is in the dead branch, it should be scrubbed by the preprocessor shield + # Because 'high_risk_execution' is in the dead branch, it should be scrubbed by the preprocessor shield # before the regex engine even sees it. - assert result["equations"]["danger"] == 0, ( + assert result["equations"]["high_risk_execution"] == 0, ( "Failed to scrub dead preprocessor branches!" ) @@ -636,16 +636,16 @@ def spatial_mapper(): return SpatialMapper() -def test_spatial_mapper_mass_extraction(spatial_mapper): - """Proves the engine extracts gravitational mass natively or via fallback telemetry.""" +def test_spatial_mapper_magnitude_extraction(spatial_mapper): + """Proves the engine extracts structural magnitude natively or via fallback telemetry.""" # 1. Primary: Forensics Dictionary - assert spatial_mapper._get_mass({"forensics": {"structural_mass": 42.0}}) == 42.0 + assert spatial_mapper._get_magnitude({"forensics": {"structural_mass": 42.0}}) == 42.0 # 2. Secondary: Processed File Impact - assert spatial_mapper._get_mass({"file_impact": 15.5}) == 15.5 + assert spatial_mapper._get_magnitude({"file_impact": 15.5}) == 15.5 # 3. 
Fallback: Raw Function Impact - assert spatial_mapper._get_mass({"sum_fxn_impact": 7.0}) == 7.0 + assert spatial_mapper._get_magnitude({"sum_fxn_impact": 7.0}) == 7.0 def test_spatial_mapper_deterministic_jitter(spatial_mapper): @@ -920,7 +920,7 @@ def test_detector_comment_analysis_math(): # Inject comment rules opt_detector.primary_rules["planned_debt"] = re.compile(r"\bTODO\b") - opt_detector.primary_rules["graveyard"] = re.compile(r"^#\s*def\s", re.M) + opt_detector.primary_rules["dead_code"] = re.compile(r"^#\s*def\s", re.M) comment_stream = ( "# TODO: Refactor this entire class\n" @@ -929,11 +929,11 @@ def test_detector_comment_analysis_math(): ) # Pass an empty equations dict to simulate the handoff from coding_analysis - equations = {"planned_debt": 0, "graveyard": 0} + equations = {"planned_debt": 0, "dead_code": 0} result = opt_detector.comment_analysis(comment_stream, "python", equations) assert result["planned_debt"] == 1, "Failed to tally planned tech debt from comments!" - assert result["graveyard"] == 1, "Failed to tally graveyard (dead code) from comments!" + assert result["dead_code"] == 1, "Failed to tally graveyard (dead code) from comments!" 
# ============================================================================== @@ -963,7 +963,7 @@ def test_detector_active_hemorrhage_leak(): opt_detector = StructuralExtractor("c", MOCK_LANG_DEFS) # Inject rules for the hemorrhage sensor - opt_detector.primary_rules["sec_private_info"] = re.compile(r"password") + opt_detector.primary_rules["sec_hardcoded_secrets"] = re.compile(r"password") opt_detector.primary_rules["telemetry"] = re.compile(r"console\.log|printf") code = ( @@ -976,7 +976,7 @@ def test_detector_active_hemorrhage_leak(): result = opt_detector.splice(code, "") # A single private_info hit is multiplied by 50 when correlated with a telemetry sink - assert result["equations"].get("sec_private_info", 0) >= 50, ( + assert result["equations"].get("sec_hardcoded_secrets", 0) >= 50, ( "AppSec Sensor failed to amplify the Active Hemorrhage penalty!" ) assert result["mitigation_telemetry"].get("amplified_leaks", 0) >= 1, ( @@ -1215,7 +1215,7 @@ def test_detector_unregistered_rule_handling(caplog): # TEST 33: CARTOGRAPHY EMPTY STATES & FALLBACKS # ============================================================================== def test_spatial_mapper_empty_states_and_fallbacks(): - """Proves the 3D geometry engine handles missing files and zero-mass states safely.""" + """Proves the 3D geometry engine handles missing files and zero-magnitude states safely.""" mapper = SpatialMapper() # Case 1: Empty Repository @@ -1224,11 +1224,11 @@ def test_spatial_mapper_empty_states_and_fallbacks(): # Case 2: Empty Hash Jitter assert mapper._hash_jitter("", 100.0) == 0.0, "Jitter failed to neutralize empty seeds!" - # Case 3: Zero Mass Fallback - assert mapper._get_mass({}) == 0.0, "Mass extraction crashed on an empty node dictionary!" + # Case 3: Zero Magnitude Fallback + assert mapper._get_magnitude({}) == 0.0, "Magnitude extraction crashed on an empty node dictionary!" 
# Case 4: Deep Fallback (Using total_control_flow_ratio as a mock fallback if needed) - assert mapper._get_mass({"sum_fxn_impact": 0.0}) == 0.0, "Mass extraction failed on zero-impact nodes!" + assert mapper._get_magnitude({"sum_fxn_impact": 0.0}) == 0.0, "Magnitude extraction failed on zero-impact nodes!" # ============================================================================== # TEST 34: UTILITY & EMPTY STATE FALLBACKS diff --git a/tests/core_engine/test_galaxyscope.py b/tests/core_engine/test_galaxyscope.py index f8097ddc..7536f6f6 100644 --- a/tests/core_engine/test_galaxyscope.py +++ b/tests/core_engine/test_galaxyscope.py @@ -43,7 +43,7 @@ def test_galaxyscope_python_fixture(tmp_path): # INVARIANT 1: CLI Exit Code & Billboard Output assert result.returncode == 0, f"GalaxyScope crashed! Stderr: {result.stderr}" - assert "MISSION_SUCCESS" in result.stdout, ( + assert "PIPELINE_SUCCESS" in result.stdout, ( "CLI did not print the success billboard." ) diff --git a/tests/core_engine/test_language_lens.py b/tests/core_engine/test_language_lens.py index 8029f37d..aab50015 100644 --- a/tests/core_engine/test_language_lens.py +++ b/tests/core_engine/test_language_lens.py @@ -48,24 +48,24 @@ def isolated_detector(): mock_langs = { "python": {"extensions": [".py"], "shebangs": ["python"]}, "shell": {"extensions": [".sh"], "shebangs": ["bash"]}, - "cpp": {"extensions": [".cpp", ".h"], "lexical_family": "c_style_comment"}, + "cpp": {"extensions": [".cpp", ".h"], "lexical_family": "standard_block"}, "c": { "extensions": [".c", ".h"], - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "main": re.compile(r"int\s+main") }, # <-- The engine needs a rule to detect C! 
}, "objective-c": { "extensions": [".m", ".h"], - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": { "interface": re.compile(r"@interface\s+") }, # Needed for Lexical Scan score }, - "html": {"extensions": [".html"], "lexical_family": "xml_angle"}, - "javascript": {"extensions": [".js"], "lexical_family": "c_style_comment"}, - "markdown": {"extensions": [".md"], "lexical_family": "prose"}, + "html": {"extensions": [".html"], "lexical_family": "block_exclusive"}, + "javascript": {"extensions": [".js"], "lexical_family": "standard_block"}, + "markdown": {"extensions": [".md"], "lexical_family": "line_exclusive"}, } # 1. Initialize with empty configs to prevent live lookups @@ -100,8 +100,8 @@ def isolated_detector(): detector.PROSE_ANCHORS = {"README"} detector.DISQUALIFIERS = {} - detector.comment_defs = { - "mechanical_families": {"c_style_comment": {"delimiters": ["//", "/*"]}} + detector.lexical_heuristics = { + "lexical_families": {"standard_block": {"delimiters": ["//", "/*"]}} } detector.HANDSHAKE_REGISTRY = [ @@ -188,25 +188,24 @@ def test_tier_3_lexical_scan(isolated_detector): # ============================================================================== # TEST 6: Tier 4 (Heuristic Discovery) # ============================================================================== +from unittest.mock import patch + def test_tier_4_heuristic_discovery(isolated_detector): """Proves the engine can blindly identify a file with no extension.""" - # Needs > 20 lines to trigger Tier 4. We inject C-style comments and structure. 
- # Hardcoded \n newlines to bypass Windows \r\n Universal Newline artifacts in CI content = ( "// C-style comment\n" * 25 + "int main() { return 0; }\n" * 5 ) - result = isolated_detector.inspect( - file_path="unknown_binary_xyz", content_sample=content - ) + # Freeze time so the CI runner never trips the Temporal Friction Anomaly + with patch('time.time', return_value=100.0): + result = isolated_detector.inspect( + file_path="unknown_binary_xyz", content_sample=content + ) - # It should isolate 'std_c' as the comment family and find one of the C-family languages assert result["lang_id"] in ["c", "cpp", "objective-c", "javascript"] - assert "Discovery" in result["source_proof"] - - + # ============================================================================== # TEST 7: Hybrid Detection (Nested Languages) # ============================================================================== @@ -351,7 +350,7 @@ def test_tier_4_macro_and_handicaps(mock_time, isolated_detector): # Inject ABAP into the isolated detector isolated_detector.languages["abap"] = { - "lexical_family": "c_style_comment", + "lexical_family": "standard_block", "rules": {"keyword": re.compile(r"REPORT")}, } diff --git a/tests/core_engine/test_language_standards_strict.py b/tests/core_engine/test_language_standards_strict.py index 9b129b8b..57c4084f 100644 --- a/tests/core_engine/test_language_standards_strict.py +++ b/tests/core_engine/test_language_standards_strict.py @@ -134,7 +134,7 @@ def test_c_pointer_ambiguity_overlap(): exponential evaluation on massive strings of pointer asterisks. """ c_api = LANGUAGE_DEFINITIONS["c"]["rules"]["api"] - c_cast = LANGUAGE_DEFINITIONS["c"]["rules"]["cast_hits"] + c_cast = LANGUAGE_DEFINITIONS["c"]["rules"]["explicit_casts"] # The Pathological String: An unclosed cast with absurd pointer depth poison_cast = "( int " + "* " * 200 + ") " @@ -184,7 +184,7 @@ def test_thermodynamic_operator_collisions(): positives in the wrong metric categories. """ # 1. 
C++ Bitwise vs. I/O Streams - cpp_bitwise = LANGUAGE_DEFINITIONS["cpp"]["rules"]["bitwise_hits"] + cpp_bitwise = LANGUAGE_DEFINITIONS["cpp"]["rules"]["bitwise_ops"] assert len(list(cpp_bitwise.finditer("std::cout << 'Hello'"))) == 0, ( "C++ bitwise tripped on a cout stream!" ) @@ -193,7 +193,7 @@ def test_thermodynamic_operator_collisions(): ) # 2. Rust Closures vs. Bitwise - rust_bitwise = LANGUAGE_DEFINITIONS["rust"]["rules"]["bitwise_hits"] + rust_bitwise = LANGUAGE_DEFINITIONS["rust"]["rules"]["bitwise_ops"] assert len(list(rust_bitwise.finditer("let x = |a| a + 1;"))) == 0, ( "Rust bitwise tripped on a closure!" ) diff --git a/tests/core_engine/test_licensing.py b/tests/core_engine/test_licensing.py index cbfd80b6..5b2051c8 100644 --- a/tests/core_engine/test_licensing.py +++ b/tests/core_engine/test_licensing.py @@ -5,19 +5,20 @@ # ============================================================================== -# TEST 1: THE PYTEST BYPASS +# TEST 1: THE PYTEST BYPASS (REMOVED) # ============================================================================== def test_licensing_pytest_bypass(monkeypatch, capsys): - """Proves the guard instantly returns if PYTEST_CURRENT_TEST is present.""" + """Proves the Pytest bypass was successfully removed and strict compliance is enforced.""" monkeypatch.setenv("PYTEST_CURRENT_TEST", "True") + monkeypatch.setenv("GITGALAXY_LICENSE_KEY", "COMMUNITY_FREE_TIER") with patch("gitgalaxy.licensing.time.sleep") as mock_sleep: enforce_licensing_guard() - # It should exit silently without sleeping or printing warnings + # It should NO LONGER exit silently. It must print the compliance tripwire. 
mock_sleep.assert_not_called() captured = capsys.readouterr() - assert captured.err == "" + assert "LEGAL AUDIT TRIPWIRE" in captured.err # ============================================================================== diff --git a/tests/core_engine/test_signal_processor.py b/tests/core_engine/test_signal_processor.py index dea2f178..fb396220 100644 --- a/tests/core_engine/test_signal_processor.py +++ b/tests/core_engine/test_signal_processor.py @@ -17,14 +17,14 @@ def create_synthetic_star( """Generates a perfectly structured raw detector payload.""" base_signals = { "branch": 0, - "linear": 0, + "structural_boundaries": 0, "args": 0, "func_start": 0, - "danger": 0, - "sec_danger": 0, - "safety_neg": 0, + "high_risk_execution": 0, + "sec_high_risk_execution": 0, + "safety_bypasses": 0, "safety": 0, - "flux": 0, + "state_mutation": 0, "todo": 0, "fixme": 0, "empty_stubs": 0, @@ -35,7 +35,7 @@ def create_synthetic_star( "api": 0, "concurrency": 0, "sync_locks": 0, - "graveyard": 0, + "dead_code": 0, "spec": 0, "pointers": 0, "indent_tabs": 0, @@ -97,9 +97,9 @@ def test_signal_processor_apocalypse_breaches(processor): 20, { "branch": 5000, - "danger": 5000, - "sec_danger": 5000, - "flux": 5000, + "high_risk_execution": 5000, + "sec_high_risk_execution": 5000, + "state_mutation": 5000, "planned_debt": 5000, "fragile_debt": 5000, "api": 5000, @@ -139,7 +139,7 @@ def test_signal_processor_error_risk_floor(processor): processor, "shielded", 5, - {"danger": 5000, "sec_danger": 5000, "safety": 500, "test": 500}, + {"high_risk_execution": 5000, "sec_high_risk_execution": 5000, "safety": 500, "test": 500}, ) res = processor.calculate_risk_vector(meta, sig) @@ -257,7 +257,7 @@ def test_signal_processor_git_forensics(processor): def test_signal_processor_math_overflow_shield(processor): """Proves astronomical negative densities trigger and survive the OverflowError.""" meta, sig = create_synthetic_star( - processor, "absurd", 1, {"sec_danger": -99999999, "branch": -99999999} + 
processor, "absurd", 1, {"sec_high_risk_execution": -99999999, "branch": -99999999} ) try: @@ -275,7 +275,7 @@ def test_signal_processor_math_overflow_shield(processor): def test_signal_processor_aggregations(processor): """Triggers the final galaxy-level summary and forensic reports.""" m1, sig1 = create_synthetic_star(processor, "f1", 100, {"branch": 10}) - m2, sig2 = create_synthetic_star(processor, "f2", 200, {"sec_danger": 10}) + m2, sig2 = create_synthetic_star(processor, "f2", 200, {"sec_high_risk_execution": 10}) # Process and unwrap correctly! tel1 = processor.calculate_risk_vector(m1, sig1) @@ -307,7 +307,7 @@ def test_signal_processor_aggregations(processor): def test_signal_processor_minified_tripwire(processor): """Proves minified files bypass standard math and trigger explicit risk spikes.""" meta, sig = create_synthetic_star( - processor, "vendor_bundle", 1000, {"sec_danger": 50} + processor, "vendor_bundle", 1000, {"sec_high_risk_execution": 50} ) meta["is_minified"] = True # Trigger the tripwire @@ -359,13 +359,13 @@ def test_signal_processor_doc_and_secrets_bypass(processor): def test_signal_processor_memory_exhaustion(processor): """Proves recursive functions with high state mutation trigger the Memory Exhaustion multiplier.""" # Baseline: Normal function with state mutation - meta1, sig1 = create_synthetic_star(processor, "safe_flux", 100, {"flux": 50}) + meta1, sig1 = create_synthetic_star(processor, "safe_flux", 100, {"state_mutation": 50}) meta1["functions"] = [ {"name": "safe", "loc": 100, "is_recursive": False, "big_o_depth": 1} ] # Memory Exhaustion: Recursive function + State mutation (No lazy evaluation) - meta2, sig2 = create_synthetic_star(processor, "oom_flux", 100, {"flux": 50}) + meta2, sig2 = create_synthetic_star(processor, "oom_flux", 100, {"state_mutation": 50}) meta2["functions"] = [ {"name": "bomb", "loc": 100, "is_recursive": True, "big_o_depth": 1} ] @@ -475,7 +475,7 @@ def test_signal_processor_algorithmic_dos(processor): 
"loc": 250, "big_o_depth": 3, "db_complexity": 2, - "hit_vector": {"api": 4, "safety": 1, "bailout_hits": 2}, + "hit_vector": {"api": 4, "safety": 1, "panics_and_aborts": 2}, } ] @@ -508,7 +508,7 @@ def test_signal_processor_security_lenses(processor): processor, "logic_bomb", 100, - {"branch": 50, "sec_danger": 20, "sec_tainted_injection": 5}, + {"branch": 50, "sec_high_risk_execution": 20, "sec_tainted_injection": 5}, ) # 2. Obscured Payload (Requires intent_mass via sec_danger to bypass the 95% false-positive shield) @@ -517,16 +517,16 @@ def test_signal_processor_security_lenses(processor): "obscured", 100, { - "sec_heat_triggers": 20, - "sec_bitwise_hits": 50, + "sec_reflection_metaprogramming": 20, + "sec_bitwise_ops": 50, "sec_shadow_imports": 5, - "sec_danger": 10, + "sec_high_risk_execution": 10, }, ) # 3. Injection Surface m_inj, sig_inj = create_synthetic_star( - processor, "injection", 100, {"sec_io": 30, "sec_danger": 30} + processor, "injection", 100, {"sec_io": 30, "sec_high_risk_execution": 30} ) # 4. 
Memory Corruption (Requires native memory language like 'c' + malicious intent to bypass the 95% shield) @@ -534,7 +534,7 @@ def test_signal_processor_security_lenses(processor): processor, "memory", 100, - {"pointers": 50, "memory_alloc": 20, "sec_danger": 10}, + {"pointers": 50, "memory_alloc": 20, "sec_high_risk_execution": 10}, ) m_mem["lang_id"] = "c" @@ -577,7 +577,7 @@ def test_signal_processor_structural_metrics(processor): # Graveyard (High dead code) m_grave, sig_grave = create_synthetic_star( - processor, "graveyard", 100, {"graveyard": 80} + processor, "dead_code", 100, {"dead_code": 80} ) # Spec Match (0 specs for 10 functions = 100% risk) @@ -588,7 +588,7 @@ def test_signal_processor_structural_metrics(processor): r_grave = processor.calculate_risk_vector(m_grave, sig_grave) r_spec = processor.calculate_risk_vector(m_spec, sig_spec) - idx_grave = processor.RISK_SCHEMA.index("graveyard") + idx_grave = processor.RISK_SCHEMA.index("dead_code") idx_spec = processor.RISK_SCHEMA.index("spec_match") assert r_grave["risk_vector"][idx_grave] > 50.0, ( @@ -615,7 +615,7 @@ def test_signal_processor_design_slop(processor): processor, "sloppy_debt", 100, - {"planned_debt": 10, "design_slop_orphans": 5, "design_slop_duplicates": 2}, + {"planned_debt": 10, "orphaned_logic": 5, "duplicate_logic": 2}, ) r_clean = processor.calculate_risk_vector(m_clean, sig_clean) @@ -793,12 +793,12 @@ def test_signal_processor_flux_immutability(processor): # 1. Pure Flux (High mutation) m_flux, sig_flux = create_synthetic_star( - processor, "high_flux", 100, {"flux": 30} + processor, "high_flux", 100, {"state_mutation": 30} ) # 2. 
Frozen Flux (High mutation, but heavily mitigated by freeze/const/final) m_frozen, sig_frozen = create_synthetic_star( - processor, "frozen_flux", 100, {"flux": 30, "freeze_hits": 40} + processor, "frozen_flux", 100, {"state_mutation": 30, "immutability_locks": 40} ) r_flux = processor.calculate_risk_vector(m_flux, sig_flux) @@ -838,7 +838,7 @@ def test_signal_processor_contextual_mismatch(processor): processor, "native", 100, - {"branch": 50, "sec_danger": 20, "sec_tainted_injection": 5}, + {"branch": 50, "sec_high_risk_execution": 20, "sec_tainted_injection": 5}, ) m_native["lang_id"] = "c" m_native["metadata"] = {"folder_dominant_lang": "cpp"} @@ -848,7 +848,7 @@ def test_signal_processor_contextual_mismatch(processor): processor, "alien", 100, - {"branch": 50, "sec_danger": 20, "sec_tainted_injection": 5}, + {"branch": 50, "sec_high_risk_execution": 20, "sec_tainted_injection": 5}, ) m_alien["lang_id"] = "c" m_alien["metadata"] = {"folder_dominant_lang": "javascript"} @@ -870,7 +870,7 @@ def test_signal_processor_science_shield(processor): """Proves that Scientific/Math logic dampens the false-positive threat of Logic Bombs.""" # 1. Standard executable with dangerous triggers m_std, sig_std = create_synthetic_star( - processor, "standard", 100, {"branch": 30, "sec_danger": 20} + processor, "standard", 100, {"branch": 30, "sec_high_risk_execution": 20} ) # 2. 
Scientific executable with the exact same triggers @@ -878,7 +878,7 @@ def test_signal_processor_science_shield(processor): processor, "science", 100, - {"branch": 30, "sec_danger": 20, "scientific": 10}, + {"branch": 30, "sec_high_risk_execution": 20, "scientific": 10}, ) r_std = processor.calculate_risk_vector(m_std, sig_std) @@ -926,7 +926,7 @@ def test_signal_processor_civil_war_void(processor): ) r_void = processor.calculate_risk_vector(m_void, sig_void) - idx_civil = processor.RISK_SCHEMA.index("civil_war") + idx_civil = processor.RISK_SCHEMA.index("tabs_vs_spaces") assert r_void["risk_vector"][idx_civil] == 50.0, ( "Void state failed to default to 50.0% neutral exposure!" @@ -940,7 +940,7 @@ def test_signal_processor_llm_execution_vulnerability(processor): """Proves that pairing an LLM Orchestrator with dynamic execution creates a massive Injection Surface spike.""" # 1. Standard dynamic execution m_std, sig_std = create_synthetic_star( - processor, "std_exec", 100, {"sec_danger": 10} + processor, "std_exec", 100, {"sec_high_risk_execution": 10} ) # 2. Agentic dynamic execution @@ -948,7 +948,7 @@ def test_signal_processor_llm_execution_vulnerability(processor): processor, "agent_exec", 100, - {"sec_danger": 10, "llm_orchestrator": 5, "ai_tools": 5}, + {"sec_high_risk_execution": 10, "llm_orchestrator": 5, "ai_tools": 5}, ) r_std = processor.calculate_risk_vector(m_std, sig_std) @@ -971,7 +971,7 @@ def test_signal_processor_crypto_professionalism_shield(processor): processor, "raw_obf", 100, - {"sec_heat_triggers": 50, "sec_bitwise_hits": 50, "sec_danger": 10}, + {"sec_reflection_metaprogramming": 50, "sec_bitwise_ops": 50, "sec_high_risk_execution": 10}, ) # 2. 
Professional cryptography (Same obfuscation, but heavily documented and safe) @@ -980,9 +980,9 @@ def test_signal_processor_crypto_professionalism_shield(processor): "pro_crypto", 100, { - "sec_heat_triggers": 50, - "sec_bitwise_hits": 50, - "sec_danger": 10, + "sec_reflection_metaprogramming": 50, + "sec_bitwise_ops": 50, + "sec_high_risk_execution": 10, "doc": 100, "safety": 20, "cryptography": 10, @@ -1009,7 +1009,7 @@ def test_signal_processor_llm_api_secrets(processor): processor, "std_leak", 500, - {"sec_private_info": 1, "globals": 1, "sec_heat_triggers": 1}, + {"sec_hardcoded_secrets": 1, "globals": 1, "sec_reflection_metaprogramming": 1}, ) # 2. Careless LLM API secret leak (Calling APIs without using global variables) @@ -1017,7 +1017,7 @@ def test_signal_processor_llm_api_secrets(processor): processor, "llm_leak", 500, - {"sec_private_info": 1, "llm_api": 5, "globals": 0, "sec_heat_triggers": 1}, + {"sec_hardcoded_secrets": 1, "llm_api": 5, "globals": 0, "sec_reflection_metaprogramming": 1}, ) @@ -1027,7 +1027,7 @@ def test_signal_processor_llm_api_secrets(processor): def test_signal_processor_safe_minified(processor): """Proves that minified files with zero malicious intent safely bypass the tripwire.""" m_safe, sig_safe = create_synthetic_star( - processor, "jquery_min", 100, {"branch": 50, "flux": 20} + processor, "jquery_min", 100, {"branch": 50, "state_mutation": 20} ) m_safe["is_minified"] = True @@ -1047,12 +1047,12 @@ def test_signal_processor_safe_minified(processor): def test_signal_processor_lazy_evaluation_shield(processor): """Proves that lazy evaluation (generators/streams) neutralizes the Memory Exhaustion multiplier.""" # 1. 
Ticking Memory Exhaustion Bomb (O(N^3) + High Flux + No Lazy Eval) - m_oom, sig_oom = create_synthetic_star(processor, "oom_bomb", 100, {"flux": 20}) + m_oom, sig_oom = create_synthetic_star(processor, "oom_bomb", 100, {"state_mutation": 20}) m_oom["functions"] = [{"name": "heavy_loop", "loc": 50, "big_o_depth": 3}] # 2. Safe Stream (O(N^3) + High Flux + Lazy Evaluation) m_lazy, sig_lazy = create_synthetic_star( - processor, "lazy_stream", 100, {"flux": 20, "lazy_evaluation": 10} + processor, "lazy_stream", 100, {"state_mutation": 20, "lazy_evaluation": 10} ) m_lazy["functions"] = [{"name": "generator", "loc": 50, "big_o_depth": 3}] @@ -1103,7 +1103,7 @@ def test_signal_processor_ai_topology_dl_ml(processor): def test_signal_processor_paranoid_mode(processor): """Proves that Paranoid Mode tightens the Sigmoid thresholds across security lenses.""" m_para, sig_para = create_synthetic_star( - processor, "paranoid_file", 500, {"sec_danger": 5, "sec_io": 5} + processor, "paranoid_file", 500, {"sec_high_risk_execution": 5, "sec_io": 5} ) # Calculate in Standard Mode @@ -1166,7 +1166,7 @@ def test_signal_processor_sigmoid_overflow(processor): processor, "super_shield", 1, - {"safety": 15000, "test": 15000, "doc": 15000, "freeze_hits": 15000}, + {"safety": 15000, "test": 15000, "doc": 15000, "immutability_locks": 15000}, ) # Create a file with mathematically impossible danger to force a massive positive density @@ -1174,7 +1174,7 @@ def test_signal_processor_sigmoid_overflow(processor): processor, "super_bomb", 1, - {"branch": 15000, "concurrency": 15000, "flux": 15000, "sec_danger": 15000}, + {"branch": 15000, "concurrency": 15000, "state_mutation": 15000, "sec_high_risk_execution": 15000}, ) # If these execute without crashing the test runner, the except blocks are working perfectly. 
@@ -1281,7 +1281,7 @@ def test_signal_processor_tech_debt_slop(processor): processor, "fragile_slop", 500, - {"fragile_debt": 2, "design_slop_orphans": 2, "design_slop_duplicates": 1}, + {"fragile_debt": 2, "orphaned_logic": 2, "duplicate_logic": 1}, ) r_debt = processor.calculate_risk_vector(m_debt, sig_debt) @@ -1381,7 +1381,7 @@ def test_signal_processor_hardware_bridge_shield(processor): """Proves that Hardware Bridges (Serial/USB I/O) are forgiven for dynamic execution.""" # 1. Raw Execution (Malicious) m_raw, sig_raw = create_synthetic_star( - processor, "raw_exec", 100, {"sec_danger": 10, "sec_io": 10} + processor, "raw_exec", 100, {"sec_high_risk_execution": 10, "sec_io": 10} ) # 2. Hardware Execution (Expected Arduino/Serial behavior) @@ -1389,7 +1389,7 @@ def test_signal_processor_hardware_bridge_shield(processor): processor, "hw_exec", 100, - {"sec_danger": 10, "sec_io": 10, "hardware_bridge": 10}, + {"sec_high_risk_execution": 10, "sec_io": 10, "hardware_bridge": 10}, ) r_raw = processor.calculate_risk_vector(m_raw, sig_raw) diff --git a/tests/security_auditing/test_ai_appsec_sensor.py b/tests/security_auditing/test_ai_appsec_sensor.py index c02d887b..b6b89e95 100644 --- a/tests/security_auditing/test_ai_appsec_sensor.py +++ b/tests/security_auditing/test_ai_appsec_sensor.py @@ -3,21 +3,21 @@ # ============================================================================== -# TEST 1: The RCE Funnel (Weaponized Prompt Injection) +# TEST 1: Autonomous Execution Vector (Weaponized Prompt Injection) # ============================================================================== -def test_rce_funnel_detection(): +def test_autonomous_execution_vector_detection(): """ Proves that an LLM directly wired to OS execution (eval/subprocess) - and exposed via a public API correctly triggers the RCE Funnel alert. + and exposed via a public API correctly triggers the Autonomous Execution Vector alert. 
""" sensor = AIAppSecSensor() mock_files = [ { "telemetry": { - "llm_api": 1, # ☢️ AI is present - "arch_api": 1, # ☢️ Exposed to the public internet - "sec_danger": 1, # ☢️ Contains eval() or subprocess execution + "llm_api": 1, # AI is present + "arch_api": 1, # Exposed to the public internet + "sec_high_risk_execution": 1, # Contains eval() or subprocess execution "safety_density": 0.9, } } @@ -26,28 +26,28 @@ def test_rce_funnel_detection(): result = sensor.hunt_threats(mock_files) appsec_report = result[0]["telemetry"]["ai_appsec"] - assert appsec_report["is_rce_funnel"] is True, "Failed to detect the RCE Funnel!" + assert appsec_report["is_rce_funnel"] is True, "Failed to detect the Autonomous Execution Vector!" assert any( - "RCE Funnel" in warning for warning in appsec_report["critical_warnings"] + "Autonomous Execution Vector" in warning for warning in appsec_report["critical_warnings"] ) # ============================================================================== -# TEST 2: The God-Mode Agent (Autonomous Data Corruption) +# TEST 2: Over-Permissioned Agent (Autonomous Data Corruption) # ============================================================================== -def test_god_mode_agent_detection(): +def test_over_permissioned_agent_detection(): """ Proves that an AI agent given autonomous tools, write-access to complex - databases, and low defensive programming density triggers the God-Mode alert. + databases, and low defensive programming density triggers the Over-Permissioned Agent alert. 
""" sensor = AIAppSecSensor() mock_files = [ { - "max_db_complexity": 3, # ☢️ Heavy database write access + "max_db_complexity": 3, # Heavy database write access "telemetry": { - "ai_tools": 1, # ☢️ Agentic tool calling enabled - "safety_density": 0.2, # ☢️ Dangerously low defensive programming + "ai_tools": 1, # Agentic tool calling enabled + "safety_density": 0.2, # Dangerously low defensive programming }, } ] @@ -56,29 +56,29 @@ def test_god_mode_agent_detection(): appsec_report = result[0]["telemetry"]["ai_appsec"] assert appsec_report["over_permissioned_agent"] is True, ( - "Failed to detect the God-Mode Agent!" + "Failed to detect the Over-Permissioned Agent!" ) assert any( - "God-Mode Agent" in warning for warning in appsec_report["critical_warnings"] + "Over-Permissioned Agent" in warning for warning in appsec_report["critical_warnings"] ) # ============================================================================== -# TEST 3: The Exfiltration Vector (Unsandboxed Sockets) +# TEST 3: Agentic Exfiltration Vector (Unsandboxed Sockets) # ============================================================================== def test_exfiltration_vector_detection(): """ Proves that an LLM with access to both raw network sockets and hardcoded - environment secrets triggers the Exfiltration Vector alert. + environment secrets triggers the Agentic Exfiltration Vector alert. """ sensor = AIAppSecSensor() mock_files = [ { "telemetry": { - "llm_api": 1, # ☢️ AI is present - "arch_io": 1, # ☢️ Can make outbound network requests - "sec_secrets": 1, # ☢️ Has access to AWS keys/passwords + "llm_api": 1, # AI is present + "arch_io": 1, # Can make outbound network requests + "sec_secrets": 1, # Has access to AWS keys/passwords } } ] @@ -87,10 +87,10 @@ def test_exfiltration_vector_detection(): appsec_report = result[0]["telemetry"]["ai_appsec"] assert appsec_report["agentic_exfiltration_risk"] is True, ( - "Failed to detect the Exfiltration Vector!" 
+ "Failed to detect the Agentic Exfiltration Vector!" ) assert any( - "Exfiltration Vector" in warning + "Agentic Exfiltration Vector" in warning for warning in appsec_report["critical_warnings"] ) @@ -111,7 +111,7 @@ def test_safe_baseline(): "telemetry": { "llm_api": 1, # ✅ AI is present "arch_api": 0, # ✅ Not exposed to the public - "sec_danger": 0, # ✅ No eval/subprocess + "sec_high_risk_execution": 0, # ✅ No eval/subprocess "sec_secrets": 0, # ✅ No secrets exposed "safety_density": 0.95, # ✅ High defensive try/catch density }, @@ -127,4 +127,4 @@ def test_safe_baseline(): assert appsec_report["agentic_exfiltration_risk"] is False assert len(appsec_report["critical_warnings"]) == 0, ( "False positive triggered on a safe file!" - ) + ) \ No newline at end of file diff --git a/tests/security_auditing/test_api_network_map.py b/tests/security_auditing/test_api_network_map.py index 34ac3f97..e519b073 100644 --- a/tests/security_auditing/test_api_network_map.py +++ b/tests/security_auditing/test_api_network_map.py @@ -345,9 +345,9 @@ def test_cli_presentation_dashboard_findings(tmp_path, capsys): captured = capsys.readouterr().out assert "SHADOW API SECURITY AUDIT" in captured - assert "SHADOW APIS DETECTED: 1" in captured + assert "SHADOW APIs DETECTED: 1" in captured assert "DELETE /api/shadow" in captured - assert "GHOST APIS DETECTED: 1" in captured + assert "GHOST APIs DETECTED: 1" in captured assert "GET /api/ghost" in captured diff --git a/tests/security_auditing/test_binary_anomaly_detector.py b/tests/security_auditing/test_binary_anomaly_detector.py index 8e589ec3..3312a694 100644 --- a/tests/security_auditing/test_binary_anomaly_detector.py +++ b/tests/security_auditing/test_binary_anomaly_detector.py @@ -7,7 +7,7 @@ # ============================================================================== -# TEST 1: The Routing Matrix (Denylist vs Allowlist vs Test Folders) +# TEST 1: Path Filtering Logic (Denylist vs Allowlist vs Extensions) # 
============================================================================== @patch("gitgalaxy.tools.supply_chain_security.binary_anomaly_detector.SecurityLens") @patch("gitgalaxy.tools.supply_chain_security.binary_anomaly_detector.ApertureFilter") @@ -19,11 +19,11 @@ def test_xray_routing_matrix( monkeypatch.setattr(xray_module, "ALLOWLIST_PATHS", ["approved_keys/"]) mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True mock_security = mock_security_class.return_value mock_security.scan_content.return_value = { - "counts": {"entropy": 6.5, "bitwise_hits": 0} + "counts": {"entropy": 6.5, "bitwise_ops": 0} } mock_security.scan_binary.return_value = {} @@ -41,29 +41,20 @@ def test_xray_routing_matrix( # File C (Bypass Extension) (repo_dir / "compressed.zip").write_text("FAKE_ZIP_DATA", encoding="utf-8") - # File D (Bypass Test Folder) - src_dir = repo_dir / "src" - src_dir.mkdir() - test_dir = src_dir / "tests" - test_dir.mkdir() - (test_dir / "mock_payload.dat").write_text( - "FAKE_HIGH_ENTROPY_DATA", encoding="utf-8" - ) - result = xray_module.run_xray_audit(repo_dir) assert result["anomalies_found"] == 1, ( - "The routing matrix failed! Check Denylist/Allowlist math." + "Path filtering logic failed! Verify Denylist and Allowlist evaluation." 
) # ============================================================================== -# TEST 2: The Deep Scan Threat Identification +# TEST 2: Deep Content Inspection (Magic Bytes & High Entropy) # ============================================================================== @patch("gitgalaxy.tools.supply_chain_security.binary_anomaly_detector.SecurityLens") @patch("gitgalaxy.tools.supply_chain_security.binary_anomaly_detector.ApertureFilter") def test_xray_deep_scan_threats(mock_aperture_class, mock_security_class, tmp_path): mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True repo_dir = tmp_path / "deep_scan_repo" repo_dir.mkdir() @@ -85,26 +76,26 @@ def mock_scan_binary(head_bytes, ext): def mock_scan_content(content, limit): if "MZ" in content: - return {"counts": {"entropy": 6.8, "bitwise_hits": 2}} - return {"counts": {"entropy": 1.2, "bitwise_hits": 0}} + return {"counts": {"entropy": 6.8, "bitwise_ops": 2}} + return {"counts": {"entropy": 1.2, "bitwise_ops": 0}} mock_security.scan_binary.side_effect = mock_scan_binary mock_security.scan_content.side_effect = mock_scan_content result = xray_module.run_xray_audit(repo_dir) assert result["anomalies_found"] == 1, ( - "Failed to flag magic byte mismatch or high entropy!" + "Failed to flag magic byte mismatch or high entropy structural anomaly." 
) # ============================================================================== -# TEST 3: The Shebang Shield +# TEST 3: Expected Execution Header Exception # ============================================================================== @patch("gitgalaxy.tools.supply_chain_security.binary_anomaly_detector.SecurityLens") @patch("gitgalaxy.tools.supply_chain_security.binary_anomaly_detector.ApertureFilter") def test_xray_shebang_shield(mock_aperture_class, mock_security_class, tmp_path): mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True repo_dir = tmp_path / "shebang_repo" repo_dir.mkdir() @@ -114,14 +105,14 @@ def test_xray_shebang_shield(mock_aperture_class, mock_security_class, tmp_path) mock_security = mock_security_class.return_value mock_security.scan_content.return_value = { - "counts": {"entropy": 0, "bitwise_hits": 0} + "counts": {"entropy": 0, "bitwise_ops": 0} } mock_security.scan_binary.return_value = { "threat_snippet": "Suspicious execution header: #!/bin/bash" } result = xray_module.run_xray_audit(repo_dir) - assert result["anomalies_found"] == 0, "The Shebang Shield failed!" + assert result["anomalies_found"] == 0, "Expected execution header bypass failed." 
# ============================================================================== @@ -131,7 +122,7 @@ def test_xray_shebang_shield(mock_aperture_class, mock_security_class, tmp_path) @patch("gitgalaxy.tools.supply_chain_security.binary_anomaly_detector.ApertureFilter") def test_xray_run_audit_exception(mock_aperture_class, mock_security_class, tmp_path): mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True repo_dir = tmp_path / "broken_audit" repo_dir.mkdir() @@ -141,12 +132,12 @@ def test_xray_run_audit_exception(mock_aperture_class, mock_security_class, tmp_ result = xray_module.run_xray_audit(repo_dir) assert result["anomalies_found"] == 0, ( - "Failed to gracefully catch exception in run_xray_audit!" + "Failed to gracefully catch IO exception in run_xray_audit!" ) # ============================================================================== -# TEST 5: CLI Main - Missing Target Trap +# TEST 5: CLI Main - Missing Target Validation # ============================================================================== def test_main_missing_target(capsys): """Proves the CLI catches invalid directories and exits safely.""" @@ -170,7 +161,7 @@ def test_main_clean_run( """Proves a clean repository successfully logs completion without raising SystemExit.""" monkeypatch.setattr(xray_module, "ALLOWLIST_PATHS", ["approved/"]) mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True mock_security = mock_security_class.return_value mock_security.scan_binary.return_value = {} @@ -178,8 +169,8 @@ def test_main_clean_run( # We must trigger an anomaly in the bypassed file so the engine logs it as 'allowed' def mock_scan_content(content, limit): if "bypassed" in content: - return {"counts": {"entropy": 6.0, "bitwise_hits": 0}} - return {"counts": {"entropy": 0, 
"bitwise_hits": 0}} + return {"counts": {"entropy": 6.0, "bitwise_ops": 0}} + return {"counts": {"entropy": 0, "bitwise_ops": 0}} mock_security.scan_content.side_effect = mock_scan_content @@ -197,7 +188,7 @@ def mock_scan_content(content, limit): captured = capsys.readouterr() assert ( - "ALL CLEAR: No encrypted payloads or binary anomalies detected." in captured.out + "[SUCCESS] No obfuscated payloads or binary anomalies detected." in captured.out ) assert "known mock/safe files were bypassed via configuration." in captured.out @@ -210,10 +201,10 @@ def mock_scan_content(content, limit): def test_main_anomaly_detected( mock_aperture_class, mock_security_class, tmp_path, monkeypatch, capsys ): - """Proves the CLI detects active anomalies, blocks the commit, and logs the triage alert.""" + """Proves the CLI detects active anomalies, blocks the commit, and logs the blocking action.""" monkeypatch.setattr(xray_module, "DENYLIST_PATTERNS", ["*.forbidden"]) mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True repo_dir = tmp_path / "threat_repo_cli" repo_dir.mkdir() @@ -224,7 +215,7 @@ def test_main_anomaly_detected( mock_security = mock_security_class.return_value mock_security.scan_binary.return_value = {} mock_security.scan_content.side_effect = lambda content, limit: ( - {"counts": {"entropy": 5.0, "bitwise_hits": 1}} + {"counts": {"entropy": 5.0, "bitwise_ops": 1}} if "HIGH ENTROPY" in content else {"counts": {}} ) @@ -235,8 +226,8 @@ def test_main_anomaly_detected( assert exc_info.value.code == 1 captured = capsys.readouterr() - assert "TRIAGE ALERT" in captured.out - assert "[FORBIDDEN FILE BREACH]" in captured.out + assert "[BLOCKING ACTION]" in captured.out + assert "[DENYLIST MATCH]" in captured.out assert "[ANOMALY DETECTED]" in captured.out @@ -248,7 +239,7 @@ def test_main_anomaly_detected( def test_main_file_read_exception(mock_aperture_class, 
mock_security_class, tmp_path): """Triggers the generic 'except Exception: pass' inside the deep scan loop of main().""" mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True repo_dir = tmp_path / "broken_main" repo_dir.mkdir() @@ -272,7 +263,7 @@ def test_xray_import_fallback(): mock_config.APERTURE_CONFIG = {} sys.modules["gitgalaxy.standards.gitgalaxy_config"] = mock_config - # Force a reload to trigger the ImportError bypass block (Lines 24-29) + # Force a reload to trigger the ImportError bypass block importlib.reload(xray_module) assert xray_module.ALLOWLIST_PATHS == [], ( @@ -286,3 +277,38 @@ def test_xray_import_fallback(): if original_config: sys.modules["gitgalaxy.standards.gitgalaxy_config"] = original_config importlib.reload(xray_module) + + +# ============================================================================== +# TEST 10: False Positive Mitigation (Test Directory Bypass) +# ============================================================================== +@patch("gitgalaxy.tools.supply_chain_security.binary_anomaly_detector.SecurityLens") +@patch("gitgalaxy.tools.supply_chain_security.binary_anomaly_detector.ApertureFilter") +def test_xray_test_folder_bypass(mock_aperture_class, mock_security_class, tmp_path, capsys): + """Proves that high entropy mock files placed within test directories are safely ignored.""" + mock_aperture = mock_aperture_class.return_value + mock_aperture._check_ignore_rules.return_value = True + + repo_dir = tmp_path / "test_bypass_repo" + repo_dir.mkdir() + + # Create a test directory with a mock encrypted payload + test_dir = repo_dir / "tests" + test_dir.mkdir() + mock_payload = test_dir / "mock_encrypted_payload.bin" + mock_payload.write_text("HIGH_ENTROPY_DATA", encoding="utf-8") + + mock_security = mock_security_class.return_value + mock_security.scan_binary.return_value = {} + + # Mock returning high entropy 
for this file + mock_security.scan_content.return_value = { + "counts": {"entropy": 7.5, "bitwise_ops": 0} + } + + with patch("sys.argv", ["xray", str(repo_dir)]): + xray_module.main() + + captured = capsys.readouterr() + assert "[SUCCESS] No obfuscated payloads or binary anomalies detected." in captured.out + assert "known mock/safe files were bypassed via configuration." in captured.out \ No newline at end of file diff --git a/tests/security_auditing/test_dev_agent_firewall.py b/tests/security_auditing/test_dev_agent_firewall.py index ad38c3d6..6efb01bf 100644 --- a/tests/security_auditing/test_dev_agent_firewall.py +++ b/tests/security_auditing/test_dev_agent_firewall.py @@ -27,7 +27,7 @@ def test_black_hole_detection(): assert guardrails["is_agentic_black_hole"] is True, ( "Failed to detect the Agentic Black Hole!" ) - assert any("Black Hole detected" in warning for warning in guardrails["warnings"]) + assert any("Context Window Exhaustion" in warning for warning in guardrails["warnings"]) # ============================================================================== @@ -73,7 +73,7 @@ def test_hallucination_zone_detection(): mock_files = [ { "telemetry": { - "heat_triggers": 3, # ☢️ > 2 dynamic execution triggers + "reflection_metaprogramming": 3, # ☢️ > 2 dynamic execution triggers "doc_density": 0.15, # ☢️ < 0.20 density } } @@ -85,7 +85,7 @@ def test_hallucination_zone_detection(): assert guardrails["hallucination_zone"] is True, ( "Failed to detect the Hallucination Zone!" ) - assert any("Hallucination Zone" in warning for warning in guardrails["warnings"]) + assert any("Hallucination Risk" in warning for warning in guardrails["warnings"]) # ============================================================================== @@ -114,7 +114,7 @@ def test_silent_mutation_risk_detection(): assert guardrails.get("silent_mutation_risk") is True, ( "Failed to detect Silent Mutation Risk!" 
) - assert any("Silent Mutation Risk" in warning for warning in guardrails["warnings"]) + assert any("Cascading State Flux" in warning for warning in guardrails["warnings"]) # ============================================================================== @@ -133,7 +133,7 @@ def test_safe_agentic_baseline(): "max_big_o": 1, # ✅ Simple O(N) logic "risk_vector": [10, 5], # ✅ Low risk debt (15) "telemetry": { - "heat_triggers": 0, # ✅ No dynamic execution + "reflection_metaprogramming": 0, # ✅ No dynamic execution "doc_density": 0.85, # ✅ Highly documented "state_flux": 10, # ✅ Low flux "has_tests": True, # ✅ Safely tested diff --git a/tests/security_auditing/test_pii_leak_hunter.py b/tests/security_auditing/test_pii_leak_hunter.py index 363bbaf9..0ffcd5ce 100644 --- a/tests/security_auditing/test_pii_leak_hunter.py +++ b/tests/security_auditing/test_pii_leak_hunter.py @@ -1,3 +1,4 @@ +import pytest import sys from unittest.mock import patch @@ -6,23 +7,23 @@ # ============================================================================== -# TEST 1: The Masking Engine (Data Destruction Verification) +# TEST 1: The Masking Engine (Data Redaction Verification) # ============================================================================== def test_pii_masking_engine(): """ Mathematically verifies that the regex engine correctly intercepts and - destroys sensitive PII data while preserving the safe formatting. + redacts sensitive PII data while preserving the safe formatting. """ - # 1. VISA Test (Destroy 12 digits, keep last 4) + # 1. VISA Test (Redact 12 digits, keep last 4) assert pii_module.mask_pii("Card: 4123456789012345") == "Card: VISA-MASKED-2345" - # 2. MASTERCARD Test (Destroy 12 digits, keep last 4) + # 2. MASTERCARD Test (Redact 12 digits, keep last 4) assert pii_module.mask_pii("Card: 5123456789012345") == "Card: MC-MASKED-2345" - # 3. SSN Test (Destroy first 5 digits, keep last 4) + # 3. 
SSN Test (Redact first 5 digits, keep last 4) assert pii_module.mask_pii("ID: 123-45-6789") == "ID: XXX-XX-6789" - # 4. AWS KEY Test (Keep prefix and last 4, destroy the 12-char middle) + # 4. AWS KEY Test (Keep prefix and last 4, redact the 12-char middle) assert pii_module.mask_pii("Key: AKIAIOSFODNN7EXAMPLE") == "Key: AKIA-XXXX-MPLE" # 5. The Combo Test (Multiple leaks in a single log line) @@ -63,7 +64,7 @@ def test_pii_leak_hunter_e2e(tmp_path): # 3. Verify the Evidence Log evidence_file = log_dir / "production_dump_pii_leak_evidence.log" assert evidence_file.exists(), ( - "The hunter failed to generate the safe evidence log!" + "The PII Leak Hunter failed to generate the safe evidence log!" ) content = evidence_file.read_text(encoding="utf-8") @@ -71,16 +72,114 @@ def test_pii_leak_hunter_e2e(tmp_path): # A) Ensure the clean lines were ignored (Saving disk space/CPU) assert "System boot sequence normal" not in content - # B) Ensure the masked data made it to the file + # B) Ensure the redacted data made it to the file assert "VISA-MASKED-1111" in content assert "AKIA-XXXX-MPLE" in content assert "XXX-XX-9999" in content - # C) ZERO-TRUST GUARANTEE: Ensure the raw PII was completely obliterated + # C) ZERO-TRUST GUARANTEE: Ensure the raw PII was completely redacted assert "4111111111111111" not in content, ( - "CRITICAL LEAK: Raw VISA card written to disk!" + "CRITICAL LEAK: Raw VISA card written to disk! Redaction failed." ) assert "AKIAIOSFODNN7EXAMPLE" not in content, ( - "CRITICAL LEAK: Raw AWS Key written to disk!" + "CRITICAL LEAK: Raw AWS Key written to disk! Redaction failed." ) - assert "999-99-9999" not in content, "CRITICAL LEAK: Raw SSN written to disk!" + assert "999-99-9999" not in content, "CRITICAL LEAK: Raw SSN written to disk! Redaction failed." 
+ + +# ============================================================================== +# TEST 3: CLI Argument Parsing - Missing Target +# ============================================================================== +def test_missing_target_argument(capsys): + """Ensures the CLI gracefully exits when no target is provided.""" + with patch.object(sys, "argv", ["pii_leak_hunter.py"]): + with pytest.raises(SystemExit) as exc_info: + pii_module.main() + # argparse default exit code for missing arguments is 2 + assert exc_info.value.code == 2 + + captured = capsys.readouterr() + assert "the following arguments are required: target" in captured.err + + +# ============================================================================== +# TEST 4: Invalid Target Path Handling +# ============================================================================== +def test_invalid_target_path(tmp_path, capsys): + """Ensures the tool exits cleanly when provided a non-existent file.""" + invalid_path = tmp_path / "does_not_exist.log" + test_args = ["pii_leak_hunter.py", str(invalid_path)] + + with patch.object(sys, "argv", test_args): + with pytest.raises(SystemExit) as exc_info: + pii_module.main() + assert exc_info.value.code == 1 + + captured = capsys.readouterr() + assert "Target file does not exist or is not a file" in captured.out + + +# ============================================================================== +# TEST 5: Custom Output Directory Override +# ============================================================================== +def test_custom_output_directory(tmp_path): + """Verifies that the --out argument redirects the evidence log successfully.""" + log_dir = tmp_path / "source_logs" + log_dir.mkdir() + target_log = log_dir / "app.log" + target_log.write_text("2026-05-11T10:00 [DEBUG] Transaction 4111111111111111 processed\n", encoding="utf-8") + + custom_out = tmp_path / "secure_archive" + test_args = ["pii_leak_hunter.py", str(target_log), "--out", 
str(custom_out)] + + with patch.object(sys, "argv", test_args): + pii_module.main() + + assert custom_out.exists(), "Custom output directory was not created." + evidence_file = custom_out / "app_pii_leak_evidence.log" + assert evidence_file.exists(), "Evidence log not found in custom output directory." + assert "VISA-MASKED-1111" in evidence_file.read_text(encoding="utf-8") + + +# ============================================================================== +# TEST 6: Clean Log Processing (Zero Detection) +# ============================================================================== +def test_clean_log_processing(tmp_path, capsys): + """Proves the tool processes safe logs without generating false evidence data.""" + log_dir = tmp_path / "logs" + log_dir.mkdir() + target_log = log_dir / "clean.log" + target_log.write_text("2026-05-11T09:00 [INFO] System boot sequence normal\n", encoding="utf-8") + + test_args = ["pii_leak_hunter.py", str(target_log)] + with patch.object(sys, "argv", test_args): + pii_module.main() + + evidence_file = log_dir / "clean_pii_leak_evidence.log" + assert evidence_file.exists() + assert evidence_file.read_text(encoding="utf-8") == "", "Clean evidence log should be completely empty." + + captured = capsys.readouterr() + assert "[SUCCESS] Clean scan. No Social Security, Credit Card, or AWS Keys detected." 
in captured.out + + +# ============================================================================== +# TEST 7: Output Directory Permission Failure +# ============================================================================== +def test_output_directory_permission_error(tmp_path, capsys): + """Simulates a scenario where the application lacks rights to create the output folder.""" + log_dir = tmp_path / "logs" + log_dir.mkdir() + target_log = log_dir / "app.log" + target_log.write_text("data", encoding="utf-8") + + test_args = ["pii_leak_hunter.py", str(target_log)] + + with patch("pathlib.Path.mkdir", side_effect=PermissionError("Access Denied")): + with patch.object(sys, "argv", test_args): + with pytest.raises(SystemExit) as exc_info: + pii_module.main() + assert exc_info.value.code == 1 + + captured = capsys.readouterr() + assert "[ERROR] Permission denied to create output directory" in captured.out \ No newline at end of file diff --git a/tests/security_auditing/test_security_auditor.py b/tests/security_auditing/test_security_auditor.py index 4f70ba88..9d8dc44a 100644 --- a/tests/security_auditing/test_security_auditor.py +++ b/tests/security_auditing/test_security_auditor.py @@ -3,7 +3,7 @@ from unittest.mock import patch # We patch the schemas before importing so the Auditor doesn't fail on boot -MOCK_SCHEMAS = {"SIGNAL_SCHEMA": ["danger", "io", "flux", "safety", "graveyard", "structural_tab_indentations"]} +MOCK_SCHEMAS = {"SIGNAL_SCHEMA": ["high_risk_execution", "io", "state_mutation", "safety", "dead_code", "structural_tab_indentations"]} with patch("gitgalaxy.security.security_auditor.RECORDING_SCHEMAS", MOCK_SCHEMAS): from gitgalaxy.security.security_auditor import SecurityAuditor @@ -71,7 +71,7 @@ def test_construct_feature_matrix(mock_artifacts): auditor = SecurityAuditor() # Explicitly inject the schema into the instance so the dictionary mapping works - auditor.SIGNAL_SCHEMA = ["danger", "io", "flux", "safety", "graveyard"] + 
auditor.SIGNAL_SCHEMA = ["high_risk_execution", "io", "state_mutation", "safety", "dead_code"] auditor._resolve_dependency_graph(mock_artifacts) # Pre-load graph data df = auditor._construct_feature_matrix(mock_artifacts) @@ -79,7 +79,7 @@ def test_construct_feature_matrix(mock_artifacts): assert not df.empty assert len(df) == 2 # Ensure the log_density math didn't crash - assert "log_density_hit_danger" in df.columns + assert "log_density_hit_high_risk_execution" in df.columns assert "log_logic_loc" in df.columns @@ -108,7 +108,7 @@ def test_audit_repository_ml_inference(mock_xgb_class, mock_artifacts): """ # 1. Setup the Mock Model mock_model = mock_xgb_class.return_value - mock_model.feature_names_in_ = ["log_logic_loc", "log_density_hit_danger"] + mock_model.feature_names_in_ = ["log_logic_loc", "log_density_hit_high_risk_execution"] # Predict probabilities for 2 files across 5 classes (0=Safe, 1=Botnet, 2=Trojan, etc) # File 1: 99% confident it's a Botnet (Class 1) @@ -206,7 +206,7 @@ def test_audit_repository_threshold_gating(mock_xgb_class, mock_artifacts): def test_construct_feature_matrix_exclusion_list(mock_artifacts): """Ensures noisy signals (like indentation factions) are stripped before ML evaluation.""" auditor = SecurityAuditor() - auditor.SIGNAL_SCHEMA = ["danger", "structural_tab_indentations"] + auditor.SIGNAL_SCHEMA = ["high_risk_execution", "structural_tab_indentations"] # Inject the excluded signal into the mock artifact mock_artifacts[0]["hit_vector"] = [5, 100] @@ -214,7 +214,7 @@ def test_construct_feature_matrix_exclusion_list(mock_artifacts): auditor._resolve_dependency_graph(mock_artifacts) df = auditor._construct_feature_matrix(mock_artifacts) - assert "log_density_hit_danger" in df.columns + assert "log_density_hit_high_risk_execution" in df.columns assert "log_density_hit_structural_tab_indentations" not in df.columns, ( "Exclusion list failed! Noisy signal leaked into the feature matrix." 
) diff --git a/tests/security_auditing/test_security_lens.py b/tests/security_auditing/test_security_lens.py index 85afad43..28bd2cab 100644 --- a/tests/security_auditing/test_security_lens.py +++ b/tests/security_auditing/test_security_lens.py @@ -37,13 +37,13 @@ def test_sast_vulnerability_signatures(lens): result = lens.scan_content(malicious_code, 15) counts = result["counts"] - assert counts.get("private_info", 0) > 0, "Failed to detect high-entropy API key!" - assert counts.get("safety_neg", 0) > 0, "Failed to detect safety bypass (ini_set)!" - assert counts.get("danger", 0) > 0, "Failed to detect dynamic execution payload (eval)!" + assert counts.get("hardcoded_secrets", 0) > 0, "Failed to detect high-entropy API key!" + assert counts.get("safety_bypasses", 0) > 0, "Failed to detect safety bypass (ini_set)!" + assert counts.get("high_risk_execution", 0) > 0, "Failed to detect dynamic execution payload (eval)!" assert counts.get("shadow_imports", 0) > 0, ( "Failed to detect steganographic import!" ) - assert counts.get("flux", 0) > 0, "Failed to detect prototype pollution!" + assert counts.get("state_mutation", 0) > 0, "Failed to detect prototype pollution!" # ============================================================================== @@ -144,7 +144,7 @@ def test_evaluate_risk_network_centrality(lens): for highly central architecture nodes in the dependency graph. 
""" hits = { - "danger": 50, + "high_risk_execution": 50, "io": 20, } # 70 hits in 100 LOC = 0.70 density (breaches 0.65 threshold) loc = 100 @@ -176,8 +176,8 @@ def test_evaluate_risk_prompt_injection_isolation(lens): risk = lens.evaluate_risk(hits, loc, network_metrics=None) - assert "Prompt Injection Risk" in risk - assert "Agentic RCE Risk (Critical)" not in risk + assert "Prompt Injection Surface Risk" in risk + assert "Autonomous Execution Vector (Critical)" not in risk # ============================================================================== @@ -195,7 +195,7 @@ def test_binary_magic_byte_scanner(lens): assert "sec_extension_mismatch" in result_mismatch, ( "Failed to detect Magic Byte mismatch!" ) - assert "sec_danger" in result_mismatch, ( + assert "sec_high_risk_execution" in result_mismatch, ( "Failed to detect embedded ELF execution header!" ) @@ -203,7 +203,7 @@ def test_binary_magic_byte_scanner(lens): random_bytes = os.urandom(50000) # Large sample size guarantees entropy > 7.95 result_entropy = lens.scan_binary(random_bytes, ".bin") - assert "sec_heat_triggers" in result_entropy, ( + assert "sec_reflection_metaprogramming" in result_entropy, ( "Failed to calculate extreme binary entropy!" ) @@ -220,18 +220,18 @@ def test_comprehensive_risk_evaluation_coverage(lens): assert lens._calculate_shannon_entropy("") == 0.0 # 2. Safe Code Baseline (Zero False Positives) - safe_hits = {"branch": 5, "linear": 10} + safe_hits = {"branch": 5, "structural_boundaries": 10} safe_risk = lens.evaluate_risk(safe_hits, 100) assert not safe_risk, "Safe code generated false positive risk exposures!" # 3. 
Total Threshold Breach (Triggering every risk vector simultaneously) apocalyptic_hits = { - "heat_triggers": 500, # Hidden Malware - "graveyard": 500, # Logic Bomb + "reflection_metaprogramming": 500, # Hidden Malware + "dead_code": 500, # Logic Bomb "io": 500, - "danger": 500, # Data Injection + "high_risk_execution": 500, # Data Injection "memory_corruption": 500, # Memory Corruption - "private_info": 500, # Secrets Leak + "hardcoded_secrets": 500, # Secrets Leak "agentic_rce": 1, # Critical Agentic RCE Override } @@ -241,7 +241,7 @@ def test_comprehensive_risk_evaluation_coverage(lens): assert "Logic Bomb Risk" in doomsday_risk assert "Memory Corruption Risk" in doomsday_risk assert "Secrets Leak Risk" in doomsday_risk - assert "Agentic RCE Risk (Critical)" in doomsday_risk + assert "Autonomous Execution Vector (Critical)" in doomsday_risk # 4. Binary Scanner Exception Handler # We pass a valid byte array to survive the header scan, but mock the Counter diff --git a/tests/security_auditing/test_statistical_auditor.py b/tests/security_auditing/test_statistical_auditor.py index 026c9386..e7dda49c 100644 --- a/tests/security_auditing/test_statistical_auditor.py +++ b/tests/security_auditing/test_statistical_auditor.py @@ -12,12 +12,12 @@ MOCK_LANG_DEFS = { "cpp": { - "rules": {"branch": 1, "args": 1, "linear": 1, "pointers": 1, "memory_alloc": 1} + "rules": {"branch": 1, "args": 1, "structural_boundaries": 1, "pointers": 1, "memory_alloc": 1} }, "c": { - "rules": {"branch": 1, "args": 1, "linear": 1, "pointers": 1, "memory_alloc": 1} + "rules": {"branch": 1, "args": 1, "structural_boundaries": 1, "pointers": 1, "memory_alloc": 1} }, - "python": {"rules": {"branch": 1, "args": 1, "linear": 1}}, + "python": {"rules": {"branch": 1, "args": 1, "structural_boundaries": 1}}, "json": {"rules": {}} # Inert data format (0 logic signals) } @@ -81,7 +81,7 @@ def test_auditor_zero_density_threshold(auditor): "name": "data_dump.cpp", "lang_id": "cpp", "coding_loc": 150, # > 50 
- "equations": {"branch": 0, "linear": 0}, # 0 logic signals + "equations": {"branch": 0, "structural_boundaries": 0}, # 0 logic signals "telemetry": { "identity_lock_tier": 0, # <-- Tier 0 Bypass for the Low-Sample Guard! "identity_source_proof": "Absolute Override", @@ -107,7 +107,7 @@ def test_auditor_packed_payload_guard(auditor): "name": "packed_logic.cpp", "lang_id": "cpp", "coding_loc": 40, - "equations": {"branch": 200, "linear": 100}, + "equations": {"branch": 200, "structural_boundaries": 100}, "telemetry": { "identity_lock_tier": 0 }, # <--- CHANGE TO 0 (Bypass Low-Sample Guard) @@ -137,7 +137,7 @@ def test_auditor_threat_quarantine_guard(auditor): "name": "malware.cpp", "lang_id": "cpp", "coding_loc": 100, - "equations": {"sec_danger": 1}, + "equations": {"sec_high_risk_execution": 1}, "telemetry": {"identity_lock_tier": 0}, # <--- Bypasses the Low-Sample Guard } ] @@ -196,7 +196,7 @@ def test_auditor_dead_code_bypass(auditor): "name": "graveyard.cpp", "lang_id": "cpp", "coding_loc": 100, - "equations": {"branch": 0, "linear": 0}, # Would normally fail Zero-Density + "equations": {"branch": 0, "structural_boundaries": 0}, # Would normally fail Zero-Density "doc_loc": 600, # Massive comment-to-code ratio triggers dead code bypass "telemetry": {"identity_lock_tier": 0}, } diff --git a/tests/security_auditing/test_supply_chain_firewall.py b/tests/security_auditing/test_supply_chain_firewall.py index c5ec80fd..ae97893b 100644 --- a/tests/security_auditing/test_supply_chain_firewall.py +++ b/tests/security_auditing/test_supply_chain_firewall.py @@ -5,17 +5,19 @@ import gitgalaxy.tools.supply_chain_security.supply_chain_firewall as firewall_module - # ============================================================================== -# TEST 1: Zero-Trust Import Slicer (Regex & Bins) +# TEST 1: Dependency Graph Import Verification # ============================================================================== -def test_zero_trust_import_slicer(monkeypatch): +def 
test_zero_trust_import_verification(monkeypatch): + """ + Validates that the firewall correctly segregates imports into approved, + unknown, and blacklisted categories based on enterprise policy constraints. + """ monkeypatch.setattr(firewall_module, "APPROVED_IMPORTS", ["react", "express"]) monkeypatch.setattr( firewall_module, "BLACKLISTED_IMPORTS", ["event-stream-malware"] ) - # Build the mock RAM graph (Pre-tokenized by Phase 1) mock_ram_graph = [ { "path": "app.js", @@ -32,25 +34,84 @@ def test_zero_trust_import_slicer(monkeypatch): ] result = firewall_module.run_firewall_audit(mock_ram_graph) - assert result["imports_blacklisted"] == 1, "Failed to identify blacklisted package!" - assert result["imports_unknown"] == 1, "Failed to identify unknown package!" + assert result["imports_whitelisted"] == 1, "Failed to identify approved package." + assert result["imports_blacklisted"] == 1, "Failed to identify blacklisted package." + assert result["imports_unknown"] == 1, "Failed to identify unknown package." + assert result["threats_found"] == 1, "Blacklisted package did not increment threat counter." + +# ============================================================================== +# TEST 2: Local Path and Sub-Module Truncation Shield +# ============================================================================== +def test_import_truncation_and_local_shield(monkeypatch): + """ + Ensures that local relative imports are ignored, and deeply nested + scoped packages (@org/pkg/module) are properly truncated for evaluation. + """ + monkeypatch.setattr(firewall_module, "APPROVED_IMPORTS", ["@angular/core", "lodash"]) + monkeypatch.setattr(firewall_module, "BLACKLISTED_IMPORTS", []) + + mock_ram_graph = [ + { + "path": "component.ts", + # .local should be ignored. 
@angular/core/testing should truncate to @angular/core + "raw_imports": ["./local-service", "@angular/core/testing", "lodash/fp"], + "equations": {}, + "coding_loc": 50, + } + ] + + result = firewall_module.run_firewall_audit(mock_ram_graph) + assert result["imports_whitelisted"] == 2, "Failed to truncate and match scoped/nested dependencies." + assert result["imports_unknown"] == 0, "Local relative import was erroneously evaluated." +# ============================================================================== +# TEST 3: Alias Spoofing Detection +# ============================================================================== +def test_alias_spoofing_detection(monkeypatch, capsys): + """ + Validates that the firewall correctly detects when a safe alias is mapped + to a blacklisted upstream package via the alias_map. + """ + monkeypatch.setattr(firewall_module, "APPROVED_IMPORTS", []) + monkeypatch.setattr(firewall_module, "BLACKLISTED_IMPORTS", ["malicious-core"]) + + mock_ram_graph = [ + { + "path": "package.json", + "raw_imports": ["safe-utils"], + "equations": {}, + "coding_loc": 10, + } + ] + + # Simulate an npm alias: "safe-utils": "npm:malicious-core@1.0" + mock_alias_map = {"safe-utils": "malicious-core"} + + result = firewall_module.run_firewall_audit(mock_ram_graph, alias_map=mock_alias_map) + captured = capsys.readouterr() + + assert result["imports_blacklisted"] == 1, "Failed to dereference spoofed alias." + assert result["threats_found"] == 1, "Spoofed alias did not increment threat counter." + assert "Spoofed alias blocked" in captured.out, "Missing spoofed alias log output." 
# ============================================================================== -# TEST 2: Strict Mode Enforcement +# TEST 4: Strict Policy Enforcement Mode # ============================================================================== def test_strict_mode_enforcement(tmp_path, monkeypatch): + """ + Ensures that when STRICT_IMPORT_MODE is enabled, any unknown dependency + causes the pipeline to fail with a SystemExit. + """ monkeypatch.setattr(firewall_module, "APPROVED_IMPORTS", ["react"]) monkeypatch.setattr(firewall_module, "BLACKLISTED_IMPORTS", []) monkeypatch.setattr(firewall_module, "STRICT_IMPORT_MODE", True) mock_ram_graph = { - "stars": [ + "artifacts": [ {"path": "server.js", "raw_imports": ["shadow-library"], "equations": {}} ] } - # Test the main CLI interface by writing a fake RAM graph JSON graph_file = tmp_path / "results.json" graph_file.write_text(json.dumps(mock_ram_graph), encoding="utf-8") @@ -58,24 +119,25 @@ def test_strict_mode_enforcement(tmp_path, monkeypatch): with patch.object(sys, "argv", test_args): with pytest.raises(SystemExit) as exc: firewall_module.main() - assert exc.value.code == 1, "STRICT_IMPORT_MODE failed!" - + assert exc.value.code == 1, "Strict import policy enforcement failed to block an unknown package." # ============================================================================== -# TEST 3: The Inert Data Shield (Minified File Bypass) +# TEST 5: Behavioral Threat Density Evaluation # ============================================================================== -def test_inert_data_shield_minified_bypass(tmp_path, monkeypatch): +def test_behavioral_threat_evaluation(tmp_path, monkeypatch): + """ + Validates that artifacts exhibiting high-density threat indicators + (calculated during Phase 1) trigger a firewall block. 
+ """ monkeypatch.setattr(firewall_module, "STRICT_IMPORT_MODE", False) monkeypatch.setattr(firewall_module, "BLACKLISTED_IMPORTS", []) - # The firewall evaluates behavioral hits based on Phase 1 equations. - # We simulate a file that Phase 1 flagged with massive threats mock_ram_graph_threat = { - "stars": [ + "artifacts": [ { "path": "logic.js", "raw_imports": [], - "equations": {"homoglyphs": 500, "danger": 50}, + "equations": {"homoglyphs": 500, "high_risk_execution": 50}, "coding_loc": 50, } ] @@ -86,8 +148,71 @@ def test_inert_data_shield_minified_bypass(tmp_path, monkeypatch): test_args = ["supply_chain_firewall.py", str(graph_file)] - # It should exit with code 1 due to the high density of threats with patch.object(sys, "argv", test_args): with pytest.raises(SystemExit) as exc: firewall_module.main() - assert exc.value.code == 1 + assert exc.value.code == 1, "Behavioral threat density evaluation failed to trigger pipeline failure." + +# ============================================================================== +# TEST 6: Build-Time Execution Multiplier (Static Sandbox) +# ============================================================================== +def test_build_time_execution_multiplier(monkeypatch): + """ + Ensures that critical build files (like setup.py) have their risk equations + artificially multiplied to make them hyper-sensitive to anomalous logic. + """ + monkeypatch.setattr(firewall_module, "STRICT_IMPORT_MODE", False) + + # MATHEMATICS FIX: + # A file with 1000 LOC gets a safe_loc of 1150. + # To breach the 20% 'paranoid' Logic Bomb threshold, the file needs + # a sabotage density of >= 0.20 (230 effective hits). + # + # standard_app.py: 20 danger hits * 1.5 = 30 hits (2.6% density) -> Safely Passes + # setup.py: (20 danger hits * 10x multiplier) * 1.5 = 300 hits (26.0% density) -> Blocks! 
+ mock_ram_graph = [ + { + "path": "setup.py", + "raw_imports": [], + "equations": {"high_risk_execution": 20}, + "coding_loc": 1000, + }, + { + "path": "standard_app.py", + "raw_imports": [], + "equations": {"high_risk_execution": 20}, + "coding_loc": 1000, + } + ] + + result = firewall_module.run_firewall_audit(mock_ram_graph) + assert result["threats_found"] == 1, "Build-time multiplier failed to amplify threat in setup.py." + +# ============================================================================== +# TEST 7: CLI Main - Missing Target Validation +# ============================================================================== +def test_main_missing_target(capsys): + """Proves the CLI catches invalid directories and exits safely.""" + with patch("sys.argv", ["supply_chain_firewall.py", "non_existent_graph.json"]): + with pytest.raises(SystemExit) as exc_info: + firewall_module.main() + + assert exc_info.value.code == 1 + captured = capsys.readouterr() + assert "Error: RAM graph" in captured.out + +# ============================================================================== +# TEST 8: CLI Main - Corrupted JSON Handling +# ============================================================================== +def test_main_corrupted_json(tmp_path, capsys): + """Ensures the firewall gracefully exits if the input graph is malformed.""" + broken_graph = tmp_path / "broken.json" + broken_graph.write_text("{ broken_json: ", encoding="utf-8") + + with patch("sys.argv", ["supply_chain_firewall.py", str(broken_graph)]): + with pytest.raises(SystemExit) as exc_info: + firewall_module.main() + + assert exc_info.value.code == 1 + captured = capsys.readouterr() + assert "Failed to parse RAM graph JSON" in captured.out \ No newline at end of file diff --git a/tests/security_auditing/test_terabyte_log_scanner.py b/tests/security_auditing/test_terabyte_log_scanner.py index 29d67427..c1d297bb 100644 --- a/tests/security_auditing/test_terabyte_log_scanner.py +++ 
b/tests/security_auditing/test_terabyte_log_scanner.py @@ -8,12 +8,12 @@ # ============================================================================== -# TEST 1: The IR State Handshake & Binary Extraction +# TEST 1: IR State Ingestion & Stream Processing # ============================================================================== -def test_scanner_json_handshake_and_extraction(tmp_path): +def test_scanner_json_ingestion_and_extraction(tmp_path): """ - Proves that the engine correctly parses the IR state JSON, extracts the targets, - scans a binary log stream, and safely extracts only the matching lines to disk. + Validates that the scanner correctly parses the IR state JSON, extracts the targets, + processes a binary log stream, and safely writes only the matching lines to disk. """ # 1. Setup the physical mock workspace work_dir = tmp_path / "scanner_workspace" @@ -24,8 +24,8 @@ def test_scanner_json_handshake_and_extraction(tmp_path): state_file = work_dir / "ir_state.json" state_file.write_text(json.dumps(ir_state), encoding="utf-8") - # B) The Mock Terabyte Log (Mix of noise and target hits) - target_log = work_dir / "mainframe_dump.log" + # B) The Mock Log File (Mix of noise and target hits) + target_log = work_dir / "production_dump.log" target_log.write_text( "2026-05-11 09:15 [INFO] System boot sequence initialized\n" "2026-05-11 09:20 [EXEC] PGM_ALPHA executed successfully\n" @@ -44,12 +44,12 @@ def test_scanner_json_handshake_and_extraction(tmp_path): ] with patch.object(sys, "argv", test_args): - # We don't trap SystemExit here because a successful run should exit normally (no sys.exit call) + # We don't trap SystemExit here because a successful run should exit normally scanner_module.main() # 3. 
The Invariant Assertions # A) Verify the filtered results log - results_file = work_dir / "mainframe_dump_results.txt" + results_file = work_dir / "production_dump_results.txt" assert results_file.exists(), "Scanner failed to create the results output file!" results_content = results_file.read_text(encoding="utf-8") @@ -57,7 +57,7 @@ def test_scanner_json_handshake_and_extraction(tmp_path): assert "PGM_ALPHA executed successfully" in results_content assert "PGM_BETA encountered warning 04" in results_content assert "System boot sequence initialized" not in results_content, ( - "Noise slipped through the binary filter!" + "Unrelated log entries bypassed the stream filter." ) # B) Verify the Telemetry Sidecar @@ -69,13 +69,13 @@ def test_scanner_json_handshake_and_extraction(tmp_path): # PGM_ALPHA appeared twice, PGM_BETA appeared once assert counts.get("PGM_ALPHA") == 2, ( - "Mathematical aggregation failed for PGM_ALPHA!" + "Execution count aggregation failed for PGM_ALPHA." ) - assert counts.get("PGM_BETA") == 1, "Mathematical aggregation failed for PGM_BETA!" + assert counts.get("PGM_BETA") == 1, "Execution count aggregation failed for PGM_BETA." # ============================================================================== -# TEST 2: The Schema Guard (Invalid JSON Rejection) +# TEST 2: Schema Validation (Invalid JSON Rejection) # ============================================================================== def test_scanner_invalid_json_schema(tmp_path): """ @@ -104,11 +104,11 @@ def test_scanner_invalid_json_schema(tmp_path): scanner_module.main() # The engine must throw a fatal error (exit code 1) on schema mismatch - assert exc.value.code == 1, "Scanner failed to block an invalid JSON schema!" + assert exc.value.code == 1, "Scanner failed to halt on an invalid JSON schema." 
# ============================================================================== -# TEST 3: The Manual CLI Override (-k Flag) +# TEST 3: Manual Keyword Extraction (-k Flag) # ============================================================================== def test_scanner_manual_keyword_override(tmp_path): """ @@ -135,4 +135,124 @@ def test_scanner_manual_keyword_override(tmp_path): assert "ERROR 500" in content assert "ERROR 404" in content - assert "SUCCESS 200" not in content, "Manual keyword override failed to filter!" + assert "SUCCESS 200" not in content, "Manual keyword override failed to filter properly." + + +# ============================================================================== +# TEST 4: Missing Target Argument +# ============================================================================== +def test_missing_target_argument(capsys): + """Ensures the CLI gracefully exits when no target is provided.""" + with patch.object(sys, "argv", ["terabyte_log_scanner.py"]): + with pytest.raises(SystemExit) as exc_info: + scanner_module.main() + # argparse default exit code for missing arguments is 2 + assert exc_info.value.code == 2 + + captured = capsys.readouterr() + assert "the following arguments are required: target" in captured.err + + +# ============================================================================== +# TEST 5: Invalid Target Path Handling +# ============================================================================== +def test_invalid_target_path(tmp_path, capsys): + """Ensures the tool exits cleanly when provided a non-existent file.""" + invalid_path = tmp_path / "does_not_exist.log" + test_args = ["terabyte_log_scanner.py", str(invalid_path), "-k", "TEST"] + + with patch.object(sys, "argv", test_args): + with pytest.raises(SystemExit) as exc_info: + scanner_module.main() + assert exc_info.value.code == 1 + + captured = capsys.readouterr() + assert "Target log file does not exist or is not a file" in captured.out + + +# 
============================================================================== +# TEST 6: Missing Input State File +# ============================================================================== +def test_missing_state_file(tmp_path, capsys): + """Ensures the tool exits cleanly when the specified --input_state file is missing.""" + work_dir = tmp_path / "missing_state_repo" + work_dir.mkdir() + + dummy_log = work_dir / "dummy.log" + dummy_log.write_text("empty", encoding="utf-8") + + missing_state = work_dir / "missing.json" + + test_args = [ + "terabyte_log_scanner.py", + str(dummy_log), + "--input_state", + str(missing_state) + ] + + with patch.object(sys, "argv", test_args): + with pytest.raises(SystemExit) as exc_info: + scanner_module.main() + assert exc_info.value.code == 1 + + captured = capsys.readouterr() + assert "Input state JSON file not found" in captured.out + + +# ============================================================================== +# TEST 7: Empty Known Programs Array +# ============================================================================== +def test_empty_known_programs(tmp_path, capsys): + """Ensures the tool exits cleanly (code 0) if the known_programs array is empty.""" + work_dir = tmp_path / "empty_programs_repo" + work_dir.mkdir() + + ir_state = {"analysis": {"known_programs": []}} + state_file = work_dir / "ir_state.json" + state_file.write_text(json.dumps(ir_state), encoding="utf-8") + + dummy_log = work_dir / "dummy.log" + dummy_log.write_text("empty", encoding="utf-8") + + test_args = [ + "terabyte_log_scanner.py", + str(dummy_log), + "--input_state", + str(state_file), + ] + + with patch.object(sys, "argv", test_args): + with pytest.raises(SystemExit) as exc_info: + scanner_module.main() + + # An empty target list is not a crash, just a clean exit because there's nothing to do + assert exc_info.value.code == 0 + + captured = capsys.readouterr() + assert "array is empty or invalid. Nothing to search." 
in captured.out + + +# ============================================================================== +# TEST 8: Custom Output Directory Override +# ============================================================================== +def test_custom_output_directory(tmp_path): + """Verifies that the --out argument redirects the generated files successfully.""" + log_dir = tmp_path / "source_logs" + log_dir.mkdir() + target_log = log_dir / "app.log" + target_log.write_text("2026-05-11T10:00 [DEBUG] ERROR 500\n", encoding="utf-8") + + custom_out = tmp_path / "analysis_results" + test_args = ["terabyte_log_scanner.py", str(target_log), "-k", "ERROR", "--out", str(custom_out)] + + with patch.object(sys, "argv", test_args): + scanner_module.main() + + assert custom_out.exists(), "Custom output directory was not created." + + results_file = custom_out / "app_results.txt" + sidecar_file = custom_out / "dynamic_telemetry.json" + + assert results_file.exists(), "Results log not found in custom output directory." + assert sidecar_file.exists(), "Telemetry JSON not found in custom output directory." 
+ assert "ERROR 500" in results_file.read_text(encoding="utf-8") \ No newline at end of file diff --git a/tests/security_auditing/test_vault_sentinel.py b/tests/security_auditing/test_vault_sentinel.py index 739df504..b5f6a758 100644 --- a/tests/security_auditing/test_vault_sentinel.py +++ b/tests/security_auditing/test_vault_sentinel.py @@ -4,9 +4,8 @@ import gitgalaxy.tools.supply_chain_security.vault_sentinel as sentinel_module - # ============================================================================== -# TEST 1: The Denylist Wall (Immediate Path Blocking) +# TEST 1: Denylist Path Evaluation (Immediate Blocking) # ============================================================================== @patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.SecurityLens") @patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.ApertureFilter") @@ -15,13 +14,13 @@ def test_sentinel_denylist_blocking( ): """ Proves that files matching the DENYLIST_PATTERNS are instantly blocked - and trigger a fatal exit without needing a deep content scan. + and trigger a pipeline failure without requiring a deep content scan. """ monkeypatch.setattr(sentinel_module, "DENYLIST_PATTERNS", ["*.pem", "id_rsa*"]) monkeypatch.setattr(sentinel_module, "ALLOWLIST_PATHS", []) mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True repo_dir = tmp_path / "denylist_repo" repo_dir.mkdir() @@ -31,11 +30,10 @@ def test_sentinel_denylist_blocking( with patch.object(sys, "argv", test_args): with pytest.raises(SystemExit) as exc: sentinel_module.main() - assert exc.value.code == 1, "Sentinel failed to block a DENYLIST file pattern!" - + assert exc.value.code == 1, "Sentinel failed to block a denylisted file pattern." 
# ============================================================================== -# TEST 2: The Deep Scan Trap (Hardcoded Content Leaks) +# TEST 2: Deep Content Inspection (Hardcoded Credential Leaks) # ============================================================================== @patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.SecurityLens") @patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.ApertureFilter") @@ -43,20 +41,20 @@ def test_sentinel_content_breach( mock_aperture_class, mock_security_class, tmp_path, monkeypatch ): """ - Proves that seemingly benign files are deeply scanned, and if the SecurityLens - detects private_info, it successfully crashes the build. + Proves that seemingly benign files are deeply scanned, and if the SAST engine + detects private_info signatures, it successfully halts the pipeline. """ monkeypatch.setattr(sentinel_module, "DENYLIST_PATTERNS", []) monkeypatch.setattr(sentinel_module, "ALLOWLIST_PATHS", []) mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True mock_aperture.evaluate_path_integrity.return_value = (True, 100, None) mock_security = mock_security_class.return_value mock_security.scan_content.return_value = { - "counts": {"private_info": 1}, - "snippets": {"private_info": ["AKIAIOSFODNN7EXAMPLE"]}, + "counts": {"hardcoded_secrets": 1}, + "snippets": {"hardcoded_secrets": ["AKIAIOSFODNN7EXAMPLE"]}, } repo_dir = tmp_path / "deepscan_repo" @@ -71,27 +69,26 @@ def test_sentinel_content_breach( sentinel_module.main() assert exc.value.code == 1, ( - "Sentinel failed to crash the build on a hardcoded secret!" + "Sentinel failed to halt the pipeline on a hardcoded credential detection." 
) - # ============================================================================== -# TEST 3: The Allowlist Bypass (Mock/Test Key Suppression) +# TEST 3: Allowlist Path Exclusions (Test Key Suppression) # ============================================================================== @patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.SecurityLens") @patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.ApertureFilter") def test_sentinel_allowlist_bypass( - mock_aperture_class, mock_security_class, tmp_path, monkeypatch + mock_aperture_class, mock_security_class, tmp_path, monkeypatch, capsys ): """ - Proves that if a file is explicitly inside an ALLOWLIST_PATH, it completely - bypasses both Denylist crashes and Content Scan crashes. + Proves that if a file resides explicitly inside an ALLOWLIST_PATH, it completely + bypasses both Denylist path checks and Deep Content scanning triggers. """ monkeypatch.setattr(sentinel_module, "DENYLIST_PATTERNS", ["*.pem"]) monkeypatch.setattr(sentinel_module, "ALLOWLIST_PATHS", ["mock_keys/"]) mock_aperture = mock_aperture_class.return_value - mock_aperture._check_solar_shield.return_value = True + mock_aperture._check_ignore_rules.return_value = True mock_aperture.evaluate_path_integrity.return_value = ( False, 100, @@ -99,7 +96,7 @@ def test_sentinel_allowlist_bypass( ) mock_security = mock_security_class.return_value - mock_security.scan_content.return_value = {"counts": {"private_info": 5}} + mock_security.scan_content.return_value = {"counts": {"hardcoded_secrets": 5}} repo_dir = tmp_path / "allowlist_repo" repo_dir.mkdir() @@ -115,5 +112,103 @@ def test_sentinel_allowlist_bypass( sentinel_module.main() except SystemExit: pytest.fail( - "The Allowlist Bypass failed! A whitelisted test key crashed the build." + "The Allowlist evaluation failed. A designated test credential triggered a pipeline failure." 
) + + captured = capsys.readouterr() + assert "[ALLOWLIST BYPASS]" in captured.out + assert "[SUCCESS] No unauthorized secrets detected." in captured.out + +# ============================================================================== +# TEST 4: Root Traversal Ignore Rules (Skipping .git / node_modules) +# ============================================================================== +@patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.SecurityLens") +@patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.ApertureFilter") +def test_ignore_rules_traversal( + mock_aperture_class, mock_security_class, tmp_path, monkeypatch, capsys +): + """ + Ensures that os.walk is properly mutated to completely skip directories + like .git or node_modules that fail the ApertureFilter check. + """ + monkeypatch.setattr(sentinel_module, "DENYLIST_PATTERNS", []) + monkeypatch.setattr(sentinel_module, "ALLOWLIST_PATHS", []) + + mock_aperture = mock_aperture_class.return_value + + # Configure the mock to reject any directory named '.git' + def mock_check_ignore(rel_path): + if ".git" in str(rel_path): + return False + return True + + mock_aperture._check_ignore_rules.side_effect = mock_check_ignore + mock_aperture.evaluate_path_integrity.return_value = (True, 100, None) + + mock_security = mock_security_class.return_value + mock_security.scan_content.return_value = {"counts": {"hardcoded_secrets": 0}} + + repo_dir = tmp_path / "ignore_repo" + repo_dir.mkdir() + + # Create a blocked directory with a file that would normally be scanned + git_dir = repo_dir / ".git" + git_dir.mkdir() + (git_dir / "config").write_text("dummy", encoding="utf-8") + + # Create an approved directory + src_dir = repo_dir / "src" + src_dir.mkdir() + (src_dir / "main.py").write_text("dummy", encoding="utf-8") + + test_args = ["vault_sentinel.py", str(repo_dir)] + with patch.object(sys, "argv", test_args): + sentinel_module.main() + + captured = capsys.readouterr() + assert "Files Evaluated : 1" in 
captured.out, "Failed to skip the .git directory contents during traversal." + +# ============================================================================== +# TEST 5: Exception Handling (Unreadable Files) +# ============================================================================== +@patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.SecurityLens") +@patch("gitgalaxy.tools.supply_chain_security.vault_sentinel.ApertureFilter") +def test_unreadable_file_handling( + mock_aperture_class, mock_security_class, tmp_path, monkeypatch +): + """ + Validates that a file generating an I/O or Permission error during reading + is gracefully skipped without crashing the Sentinel. + """ + monkeypatch.setattr(sentinel_module, "DENYLIST_PATTERNS", []) + monkeypatch.setattr(sentinel_module, "ALLOWLIST_PATHS", []) + + mock_aperture = mock_aperture_class.return_value + mock_aperture._check_ignore_rules.return_value = True + mock_aperture.evaluate_path_integrity.return_value = (True, 100, None) + + repo_dir = tmp_path / "broken_repo" + repo_dir.mkdir() + (repo_dir / "locked.dat").write_text("data", encoding="utf-8") + + test_args = ["vault_sentinel.py", str(repo_dir)] + + try: + with patch.object(sys, "argv", test_args): + with patch("builtins.open", side_effect=PermissionError("Locked")): + sentinel_module.main() + except SystemExit as exc: + pytest.fail(f"Sentinel failed to gracefully handle file read exception. 
Exited with {exc.code}") + +# ============================================================================== +# TEST 6: CLI Main - Missing Target Validation +# ============================================================================== +def test_main_missing_target(capsys): + """Proves the CLI catches invalid directories and exits safely.""" + with patch("sys.argv", ["vault_sentinel.py", "non_existent_folder_path_12345"]): + with pytest.raises(SystemExit) as exc_info: + sentinel_module.main() + + assert exc_info.value.code == 1 + captured = capsys.readouterr() + assert "Error: Target" in captured.out \ No newline at end of file diff --git a/tests/tools_recorders/test_audit_recorder.py b/tests/tools_recorders/test_audit_recorder.py index 3668d32c..490f1d31 100644 --- a/tests/tools_recorders/test_audit_recorder.py +++ b/tests/tools_recorders/test_audit_recorder.py @@ -10,7 +10,7 @@ def recorder(): # We patch the schema dynamically so our tests are immune to upstream schema changes mock_schemas = { "RISK_SCHEMA": ["secrets_risk", "indentation_faction", "logic_bomb"], - "SIGNAL_SCHEMA": ["sec_private_info", "sec_danger"], + "SIGNAL_SCHEMA": ["sec_hardcoded_secrets", "sec_high_risk_execution"], "EXPOSURE_LABELS": { "secrets_risk": "Secrets Risk Exposure", "indentation_faction": "Indentation Consistency", @@ -85,9 +85,9 @@ def test_audit_recorder_generate_ml_threat_report(recorder, tmp_path): # Validate File Identity overrides artifact = payload["6. Parsed Files (Scanned Artifacts)"]["src/core"]["Files"]["src/core/auth.py"] assert artifact["1. Artifact Identity"]["System Purpose"] == "Handles JWT Validation" - + # Validate Unparsable formatting - unparsable = payload["5. Unparsable Files (Excluded Artifacts Queue)"] + unparsable = payload["5. 
Unparsable Artifacts (Excluded Artifacts Queue)"] assert len(unparsable) == 2 assert unparsable[1]["Forensic Category"] == "Parser Bypass" diff --git a/tests/tools_recorders/test_decoder_forge.py b/tests/tools_recorders/test_decoder_forge.py index 90cb6888..b5042993 100644 --- a/tests/tools_recorders/test_decoder_forge.py +++ b/tests/tools_recorders/test_decoder_forge.py @@ -16,7 +16,7 @@ private static final Logger log = LoggerFactory.getLogger(EbcdicDecoderUtil.class); - // Cp1047 is the standard IBM EBCDIC character set + // Cp1047 is the standard IBM EBCDIC character set (US/Canada) private static final Charset EBCDIC_CHARSET = Charset.forName("Cp1047"); /** @@ -50,14 +50,15 @@ int highNibble = b >>> 4; int lowNibble = b & 0x0F; - // The high nibble MUST be a number (0-9) + // DEFENSIVE DESIGN: The high nibble MUST be a valid base-10 digit (0-9). + // Values above 9 indicate corrupted memory or shifted byte boundaries. if (highNibble > 9) { log.warn("Corrupt COMP-3 high nibble '{}' at byte index {}. 
Defaulting to ZERO.", Integer.toHexString(highNibble), i); return BigDecimal.ZERO; } sb.append(highNibble); - // The low nibble is a number EXCEPT in the very last byte, where it's the sign + // The low nibble is a number EXCEPT in the very last byte, where it acts as the sign flag if (i == packedBytes.length - 1) { boolean isNegative = (lowNibble == 0x0D || lowNibble == 0x0B); if (isNegative) { diff --git a/tests/tools_recorders/test_golden_forge.py b/tests/tools_recorders/test_golden_forge.py index 2b697675..033804db 100644 --- a/tests/tools_recorders/test_golden_forge.py +++ b/tests/tools_recorders/test_golden_forge.py @@ -48,12 +48,15 @@ private final ProcessPayrollService processPayrollService; @PostMapping("/execute") - public ResponseEntity executeProcessPayroll( @RequestBody EmployeeRecordDTO employeeRecordData, - @RequestBody TimecardDataDTO timecardDataData ) { // ⚡ TRANSACTIONAL PARADIGM DETECTED + public ResponseEntity executeProcessPayroll( + @RequestBody EmployeeRecordDTO employeeRecordData, + @RequestBody TimecardDataDTO timecardDataData + ) { + // TRANSACTIONAL PARADIGM DETECTED processPayrollService.executeProcessPayroll(/* pass DTOs here */); - // Expected Outputs: PAYROLL-RECEIPT - return ResponseEntity.ok().build(); } + return ResponseEntity.ok().build(); + } }""" GOLDEN_ENTITY = """package com.gitgalaxy.modernized.entity; diff --git a/tests/tools_recorders/test_llm_recorder.py b/tests/tools_recorders/test_llm_recorder.py index 188e7b91..1598d94c 100644 --- a/tests/tools_recorders/test_llm_recorder.py +++ b/tests/tools_recorders/test_llm_recorder.py @@ -10,7 +10,7 @@ def recorder(): """Initializes the LLMRecorder with a controlled schema for deterministic testing.""" mock_schemas = { "RISK_SCHEMA": ["tech_debt", "cognitive_load", "state_flux"], - "SIGNAL_SCHEMA": ["danger", "io", "prompt_injection"], + "SIGNAL_SCHEMA": ["high_risk_execution", "io", "prompt_injection"], "EXPOSURE_LABELS": { "tech_debt": "Tech Debt Exposure", "cognitive_load": 
"Cognitive Load Exposure" diff --git a/tests/tools_recorders/test_record_keeper.py b/tests/tools_recorders/test_record_keeper.py index a53e804e..faaf8c13 100644 --- a/tests/tools_recorders/test_record_keeper.py +++ b/tests/tools_recorders/test_record_keeper.py @@ -9,7 +9,7 @@ def keeper(): """Initializes the RecordKeeper with a controlled schema for deterministic testing.""" mock_schemas = { "RISK_SCHEMA": ["tech_debt", "cognitive_load"], - "SIGNAL_SCHEMA": ["danger", "io", "prompt_injection"] + "SIGNAL_SCHEMA": ["high_risk_execution", "io", "prompt_injection"] } with patch("gitgalaxy.recorders.record_keeper.RECORDING_SCHEMAS", mock_schemas): return RecordKeeper() @@ -76,7 +76,7 @@ def mock_pipeline_state(): "db_complexity": 3, "docstring": "Handles incoming API requests.", "calls_out_to": ["validate_token"], - "hit_vector": {"danger": 1, "io": 2} + "hit_vector": {"high_risk_execution": 1, "io": 2} } ] } diff --git a/tests/tools_recorders/test_service_forge.py b/tests/tools_recorders/test_service_forge.py index 676d7a87..d145d866 100644 --- a/tests/tools_recorders/test_service_forge.py +++ b/tests/tools_recorders/test_service_forge.py @@ -32,11 +32,11 @@ private static final Logger log = LoggerFactory.getLogger(PayrollProcessorService.class); // ⚠️ UNRESOLVED EXTERNAL DEPENDENCIES (FROM DAG) - // TODO: AI AGENT - Implement or mock call to: CalcBenefitsService - // TODO: AI AGENT - Implement or mock call to: UpdateLedgerService + // TODO: AI AGENT - Implement or mock interface call to: CalcBenefitsService + // TODO: AI AGENT - Implement or mock interface call to: UpdateLedgerService public void executePayrollProcessor(/* Parameters mapped from Controller */) { - log.info("Executing legacy business logic for payroll-processor"); + log.info("Executing modernized business logic for payroll-processor"); // TODO: [AI AGENT] Implement extracted business rules here. } }"""