# Copyright 2026 UCP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Reusable workflow: checks that every added/modified source file in the
# calling repository carries a copyright header, using the shared
# org-tools/check_copyright.py script from the central tools repository.
name: Reusable Copyright Check

on:
  workflow_call:
    inputs:
      central_repo:
        required: false
        type: string
        default: ".github"
        description: "Name of the central organization tools repository"
      exclude_filename_regex:
        required: false
        type: string
        default: ""
        description: "Optional custom regex pattern to identify and exclude auto-generated files by filename."
    secrets:
      ORG_READ_TOKEN:
        required: false
        description: "Org-level read token to check out central repository if private"

jobs:
  check-copyright:
    name: Check Copyright Headers
    runs-on: ubuntu-latest
    # The job only reads repository contents; never grant more than that.
    permissions:
      contents: read
    steps:
      # 1. Check out the caller repository (the PR code)
      - name: Check out PR code
        uses: actions/checkout@v5
        with:
          # Full history so the base commit is available for `git diff`.
          fetch-depth: 0

      # 2. Check out the central tools repository containing check_copyright.py
      - name: Check out central tools
        uses: actions/checkout@v5
        with:
          repository: ${{ github.repository_owner }}/${{ inputs.central_repo }}
          path: .github-central
          token: ${{ secrets.ORG_READ_TOKEN || github.token }}

      # 3. Set up uv using the standard action
      - name: Set up uv
        uses: astral-sh/setup-uv@v8.1.0

      # 4. Get changed files and run check_copyright.py
      - name: Run Copyright Check
        shell: bash
        # Workflow inputs and event data are passed through the environment
        # instead of being expanded into the script text: a `${{ ... }}`
        # expression inside `run:` is substituted before the shell parses the
        # script, so a crafted value could inject arbitrary commands.
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha || github.event.before }}
          EXCLUDE_FILENAME_REGEX: ${{ inputs.exclude_filename_regex }}
        run: |
          # A missing base (e.g. first push of a new branch reports an
          # all-zero "before" SHA, or the event carries no base at all) means
          # there is nothing to diff against: skip, but say so explicitly.
          if ! git cat-file -e "${BASE_SHA}^{commit}" 2>/dev/null; then
            echo "::warning::Base commit '${BASE_SHA}' is not available; skipping copyright check."
            exit 0
          fi

          # Get list of added/modified files. NUL-delimited so that paths
          # containing spaces, quotes or glob characters survive intact.
          mapfile -d '' -t CHANGED_FILES < <(
            git diff -z --name-only --diff-filter=AM "$BASE_SHA" HEAD
          )

          if [ "${#CHANGED_FILES[@]}" -eq 0 ]; then
            echo "No relevant source files changed. Skipping check."
            exit 0
          fi

          echo "Checking copyright headers for:"
          printf '%s\n' "${CHANGED_FILES[@]}"

          # Build the command as an array (no `eval`, no re-parsing of data).
          # --no-project: the script is stdlib-only, so uv must not discover
          # and sync the caller repository's own Python project.
          CMD=(uv run --no-project python3 .github-central/org-tools/check_copyright.py)
          if [ -n "$EXCLUDE_FILENAME_REGEX" ]; then
            CMD+=(--exclude-filename-regex "$EXCLUDE_FILENAME_REGEX")
          fi

          # `--` keeps a path that starts with "-" from being read as an option.
          "${CMD[@]}" -- "${CHANGED_FILES[@]}"
#!/usr/bin/env python3
# Copyright 2026 UCP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Linter script to check for copyright headers in source files, excluding generated files."""

import argparse
import re
import sys
from itertools import islice
from pathlib import Path
from typing import Optional, Sequence

# Regular expression to identify copyright lines: either an explicit
# "Copyright [(c)] YYYY" notice or the opening words of the Apache 2.0 header.
COPYRIGHT_RE = re.compile(
    r"(copyright\s+(\(c\)\s+)?\d{4})|(licensed\s+under\s+the\s+apache\s+license)",
    re.IGNORECASE,
)

# Regular expressions to identify exact, anchored auto-generation comment headers.
# This prevents normal English docstrings or inline comments in hand-written code
# from triggering exemptions: the marker must be the only content on its line,
# optionally preceded by comment punctuation.
GENERATED_MARKERS_RE = re.compile(
    r"^\s*([#/*]+\s*)?"  # Optional comment prefix (#, //, /*, *)
    r"("
    r"generated by datamodel-codegen|"
    r"this file was automatically generated by json-schema-to-typescript|"
    r"@generated|"
    r"this file is auto-generated by\b.*|"
    r"this file was automatically generated - do not edit|"
    # Standard marker emitted by Go code generators (protoc-gen-go, mockgen,
    # stringer, ...): "// Code generated <by tool>. DO NOT EDIT."
    r"code generated\b.*\bdo not edit\."
    r")\s*$",
    re.IGNORECASE,
)

# File extensions that require a copyright header
INCLUDED_EXTENSIONS = {
    ".py",
    ".ts",
    ".js",
    ".sh",
    ".yaml",
    ".yml",
    ".css",
    ".html",
    ".go",
    ".rs",
    ".c",
    ".cpp",
    ".h",
}

# Safe, conservative default regex pattern to identify universally recognized auto-generated files.
# Matches common protobuf, grpc, and thrift generated file signatures.
# Matches: foo.pb.go, foo_pb2.py, foo_pb2_grpc.py
DEFAULT_GENERATED_FILENAME_RE = re.compile(
    r"(\.pb\.[a-z]+|.*_pb2(_grpc)?\.py)$",
    re.IGNORECASE,
)

# Number of lines to inspect at the top of the file
MAX_COPYRIGHT_LINES = 15
MAX_GENERATED_CHECK_LINES = 15


def build_generated_filename_re(custom_pattern: Optional[str] = None) -> re.Pattern:
    """Return the filename regex used to exempt generated files.

    Args:
        custom_pattern: Optional user-supplied regex. When empty or ``None``
            the conservative defaults are returned unchanged.

    Returns:
        A case-insensitive pattern matching either the custom pattern or
        ``DEFAULT_GENERATED_FILENAME_RE``.

    Raises:
        re.error: If ``custom_pattern`` is not a valid regular expression.
    """
    if not custom_pattern:
        return DEFAULT_GENERATED_FILENAME_RE

    # Validate the user's pattern on its own first. Otherwise a pattern with
    # unbalanced parentheses (e.g. "foo)|(bar") can accidentally become valid
    # once it is embedded in the combined expression.
    re.compile(custom_pattern, re.IGNORECASE)

    # Non-capturing wrappers keep the user's own group numbering (and thus
    # backreferences such as \1) intact.
    combined_pattern = (
        f"(?:{custom_pattern})|(?:{DEFAULT_GENERATED_FILENAME_RE.pattern})"
    )
    return re.compile(combined_pattern, re.IGNORECASE)


def is_generated(
    file_path: Path, content_lines: list[str], generated_filename_re: re.Pattern
) -> bool:
    """Check if the file is automatically generated based on its name or content.

    Args:
        file_path: Path of the file; only its final component is matched.
        content_lines: Leading lines of the file, in order.
        generated_filename_re: Pattern searched against the file name.

    Returns:
        True if the name matches or one of the first
        ``MAX_GENERATED_CHECK_LINES`` lines is a generation marker.
    """
    # 1. Check filename pattern
    if generated_filename_re.search(file_path.name):
        return True

    # 2. Check the first few lines for exact generation marker patterns
    return any(
        GENERATED_MARKERS_RE.search(line)
        for line in content_lines[:MAX_GENERATED_CHECK_LINES]
    )


def has_copyright(content_lines: list[str]) -> bool:
    """Check if the file contains a valid copyright header in the top lines.

    Only the first ``MAX_COPYRIGHT_LINES`` entries of ``content_lines`` are
    inspected.
    """
    return any(
        COPYRIGHT_RE.search(line) for line in content_lines[:MAX_COPYRIGHT_LINES]
    )


def check_file(file_path: Path, generated_filename_re: re.Pattern) -> bool:
    """Check a single file for a copyright header.

    Args:
        file_path: File to inspect.
        generated_filename_re: Pattern identifying generated files by name.

    Returns:
        True if the file is valid or skipped (not a regular file, extension
        not covered, unreadable, empty, or generated); False if it is a
        covered source file without a copyright header.
    """
    if not file_path.is_file():
        return True

    # Skip files that do not match our target source code extensions
    if file_path.suffix.lower() not in INCLUDED_EXTENSIONS:
        return True

    # Read enough lines for BOTH header checks, so the two limits can be
    # tuned independently without one check silently seeing a shorter window.
    header_line_count = max(MAX_COPYRIGHT_LINES, MAX_GENERATED_CHECK_LINES)

    try:
        # "utf-8-sig" drops a leading byte-order mark; a BOM would otherwise
        # sit in front of the first line and defeat the anchored marker regex.
        with file_path.open("r", encoding="utf-8-sig", errors="ignore") as f:
            # We only need the top lines for the check
            lines = list(islice(f, header_line_count))
    except OSError as e:
        # Best effort: an unreadable file must not fail the whole run.
        print(f"Warning: Could not read {file_path}: {e}")
        return True

    # If the file is empty, skip it
    if not lines:
        return True

    # If the file is automatically generated, we exempt it
    if is_generated(file_path, lines, generated_filename_re):
        return True

    # Check for copyright header
    if not has_copyright(lines):
        print(f"Error: {file_path} is missing a copyright header.")
        return False

    return True


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.

    Exits with status 0 when every file passes, 1 when at least one file is
    missing a header, and 2 when the custom filename regex is invalid.
    """
    parser = argparse.ArgumentParser(
        description="Check files for copyright headers, exempting auto-generated files."
    )
    parser.add_argument(
        "--exclude-filename-regex",
        type=str,
        help="Optional custom regex pattern to identify and exclude auto-generated files by filename.",
    )
    parser.add_argument(
        "files",
        nargs="+",
        type=Path,
        help="List of files to check.",
    )
    args = parser.parse_args(argv)

    # Combine custom regex with safe defaults if provided, otherwise fall back to defaults
    try:
        generated_filename_re = build_generated_filename_re(
            args.exclude_filename_regex
        )
    except re.error as e:
        print(f"Error: Invalid custom regex pattern: {e}")
        sys.exit(2)

    # Check every file (no short-circuit) so that all offenders are reported.
    failed_files = sum(
        1
        for file_path in args.files
        if not check_file(file_path, generated_filename_re)
    )

    if failed_files > 0:
        print(
            f"\nCopyright header check failed: {failed_files} file(s) missing headers."
        )
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# Copyright 2026 UCP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for check_copyright.py."""

import re
import tempfile
import unittest
from pathlib import Path

# Import the check functions from check_copyright
import check_copyright

# Minimal hand-written source with no copyright header and no generation marker.
_UNLICENSED_SOURCE = "print('hello')"


class TestCheckCopyright(unittest.TestCase):
    """Exercises check_copyright.check_file against files in a scratch directory."""

    def setUp(self):
        # Every test gets its own throw-away directory for fixture files.
        self.test_dir = tempfile.TemporaryDirectory()
        self.test_dir_path = Path(self.test_dir.name)

    def tearDown(self):
        self.test_dir.cleanup()

    def create_test_file(self, filename: str, content: str) -> Path:
        """Write ``content`` to ``filename`` in the scratch directory and return its path."""
        target = self.test_dir_path / filename
        target.write_text(content, encoding="utf-8")
        return target

    def _check(self, path: Path, filename_re=None) -> bool:
        """Run check_file, using the default generated-filename regex unless one is given."""
        if filename_re is None:
            filename_re = check_copyright.DEFAULT_GENERATED_FILENAME_RE
        return check_copyright.check_file(path, filename_re)

    def test_valid_copyright_header(self):
        source = "\n".join(
            [
                "# Copyright 2026 UCP Authors",
                "# Licensed under the Apache License, Version 2.0",
                "",
                "print('hello')",
            ]
        )
        self.assertTrue(self._check(self.create_test_file("valid.py", source)))

    def test_valid_apache_license_only(self):
        source = "\n".join(
            [
                '// Licensed under the Apache License, Version 2.0 (the "License");',
                "// you may not use this file except in compliance with the License.",
                "console.log('hello');",
            ]
        )
        self.assertTrue(self._check(self.create_test_file("valid.ts", source)))

    def test_missing_copyright_header(self):
        path = self.create_test_file("invalid.py", _UNLICENSED_SOURCE)
        self.assertFalse(self._check(path))

    def test_empty_file_passes(self):
        path = self.create_test_file("empty.py", "")
        self.assertTrue(self._check(path))

    def test_untracked_extension_skipped(self):
        path = self.create_test_file("image.png", "binarydata")
        self.assertTrue(self._check(path))

    def test_default_generated_filename_passes(self):
        # Default Protobuf / GRPC generated filename format should pass
        first = self.create_test_file("foo_pb2.py", _UNLICENSED_SOURCE)
        self.assertTrue(self._check(first))

        second = self.create_test_file("foo_pb2_grpc.py", _UNLICENSED_SOURCE)
        self.assertTrue(self._check(second))

    def test_custom_exclude_filename_regex_override(self):
        spec_path = self.create_test_file("spec_generated.ts", _UNLICENSED_SOURCE)

        # 1. Under default regex, spec_generated.ts should FAIL (no copyright
        #    header and doesn't match default pb2 regex)
        self.assertFalse(self._check(spec_path))

        # 2. Under combined regex, spec_generated.ts should PASS
        custom_pattern = r"^(spec_generated\.[a-z]+)$"
        default_pattern = check_copyright.DEFAULT_GENERATED_FILENAME_RE.pattern
        combined_re = re.compile(
            f"({custom_pattern}|{default_pattern})", re.IGNORECASE
        )
        self.assertTrue(self._check(spec_path, combined_re))

        # 3. Default Protobuf files should STILL pass under combined regex
        protobuf_path = self.create_test_file("foo_pb2.py", _UNLICENSED_SOURCE)
        self.assertTrue(self._check(protobuf_path, combined_re))

    def test_generated_content_marker_passes(self):
        source = "\n".join(
            [
                "# This file is auto-generated by some generator tool.",
                "# DO NOT EDIT.",
                "print('hello')",
            ]
        )
        self.assertTrue(self._check(self.create_test_file("auto_gen.py", source)))

    def test_datamodel_codegen_marker_passes(self):
        source = "# generated by datamodel-codegen\nprint('hello')"
        self.assertTrue(self._check(self.create_test_file("codegen.py", source)))


if __name__ == "__main__":
    unittest.main()