Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions .github/workflows/reusable-copyright-check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Reusable workflow: checks that files added/modified by the triggering
# change carry a copyright header. The actual check is performed by
# org-tools/check_copyright.py from the organization's central tools repo.
name: Reusable Copyright Check

on:
  workflow_call:
    inputs:
      central_repo:
        required: false
        type: string
        default: ".github"
        description: "Name of the central organization tools repository"
      exclude_filename_regex:
        required: false
        type: string
        default: ""
        description: "Optional custom regex pattern to identify and exclude auto-generated files by filename."
    secrets:
      ORG_READ_TOKEN:
        required: false
        description: "Org-level read token to check out central repository if private"

jobs:
  check-copyright:
    name: Check Copyright Headers
    runs-on: ubuntu-latest
    steps:
      # 1. Check out the caller repository (the PR code).
      #    Full history is fetched so the base commit is available to `git diff`.
      - name: Check out PR code
        uses: actions/checkout@v5
        with:
          fetch-depth: 0

      # 2. Check out the central tools repository containing check_copyright.py
      - name: Check out central tools
        uses: actions/checkout@v5
        with:
          repository: ${{ github.repository_owner }}/${{ inputs.central_repo }}
          path: .github-central
          token: ${{ secrets.ORG_READ_TOKEN || github.token }}

      # 3. Set up uv using the standard action
      - name: Set up uv
        uses: astral-sh/setup-uv@v8.1.0

      # 4. Get changed files and run check_copyright.py
      - name: Run Copyright Check
        # Arrays and process substitution below require bash.
        shell: bash
        # Expression values are handed over as environment variables instead
        # of being interpolated into the script text, so that a crafted input
        # (e.g. a regex containing quotes or `$(...)`) cannot inject shell code.
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
          PUSH_BEFORE_SHA: ${{ github.event.before }}
          EXCLUDE_FILENAME_REGEX: ${{ inputs.exclude_filename_regex }}
        run: |
          if [ "$EVENT_NAME" = "pull_request" ]; then
            BASE_SHA="$PR_BASE_SHA"
          else
            BASE_SHA="$PUSH_BEFORE_SHA"
          fi

          # Get list of added/modified files. Paths are read NUL-delimited so
          # names containing spaces or glob characters stay intact. A failing
          # diff (e.g. unknown base commit) yields an empty list, as before.
          CHANGED_FILES=()
          while IFS= read -r -d '' changed_path; do
            CHANGED_FILES+=("$changed_path")
          done < <(git diff -z --name-only --diff-filter=AM "$BASE_SHA" HEAD || true)

          if [ "${#CHANGED_FILES[@]}" -eq 0 ]; then
            echo "No relevant source files changed. Skipping check."
            exit 0
          fi

          echo "Checking copyright headers for:"
          printf '%s\n' "${CHANGED_FILES[@]}"

          # Build execution command as an argument array (no eval, no re-parsing).
          CMD=(uv run python3 .github-central/org-tools/check_copyright.py)
          if [ -n "$EXCLUDE_FILENAME_REGEX" ]; then
            # The --opt=value form keeps a regex starting with "-" from being
            # mistaken for another option.
            CMD+=("--exclude-filename-regex=$EXCLUDE_FILENAME_REGEX")
          fi
          # "--" ends option parsing so a path starting with "-" is a file.
          CMD+=(-- "${CHANGED_FILES[@]}")

          "${CMD[@]}"
174 changes: 174 additions & 0 deletions org-tools/check_copyright.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#!/usr/bin/env python3
# Copyright 2026 UCP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Linter script to check for copyright headers in source files, excluding generated files."""

import argparse
import re
import sys
from pathlib import Path

# Regular expression to identify copyright lines. Case-insensitive; matches
# either "copyright", optionally followed by "(c)", then a four-digit number,
# or the phrase "licensed under the apache license".
COPYRIGHT_RE = re.compile(
    r"(copyright\s+(\(c\)\s+)?\d{4})|(licensed\s+under\s+the\s+apache\s+license)",
    re.IGNORECASE,
)

# Regular expressions to identify exact, anchored auto-generation comment headers.
# This prevents normal English docstrings or inline comments in hand-written code from triggering exemptions.
# The pattern is anchored at both ends, so the marker must be the only content
# of the line apart from the comment prefix and surrounding whitespace. The
# "this file is auto-generated by" alternative additionally accepts any
# trailing text on the same line.
GENERATED_MARKERS_RE = re.compile(
    r"^\s*([#//\*]+\s*)?"  # Optional comment prefix: any run of '#', '/' or '*' characters
    r"("
    r"generated by datamodel-codegen|"
    r"this file was automatically generated by json-schema-to-typescript|"
    r"@generated|"
    r"this file is auto-generated by\b.*|"
    r"this file was automatically generated - do not edit"
    r")\s*$",
    re.IGNORECASE,
)

# File extensions that require a copyright header. Entries are lowercase and
# include the leading dot; files with any other suffix are skipped.
INCLUDED_EXTENSIONS = {
    ".py",
    ".ts",
    ".js",
    ".sh",
    ".yaml",
    ".yml",
    ".css",
    ".html",
    ".go",
    ".rs",
    ".c",
    ".cpp",
    ".h",
}

# Safe, conservative default regex pattern to identify universally recognized auto-generated files.
# Matches file names ending in ".pb.<letters>", "_pb2.py" or "_pb2_grpc.py".
# Matches: foo.pb.go, foo_pb2.py, foo_pb2_grpc.py
# NOTE(review): an earlier comment also mentioned Thrift, but no
# Thrift-specific alternative is present in the pattern - confirm intent.
# The raw pattern text is also embedded into the combined regex built in
# main() when --exclude-filename-regex is given.
DEFAULT_GENERATED_FILENAME_RE = re.compile(
    r"(\.pb\.[a-z]+|.*_pb2(_grpc)?\.py)$",
    re.IGNORECASE,
)

# Number of lines to inspect at the top of the file
MAX_COPYRIGHT_LINES = 15  # leading lines searched for a copyright line
MAX_GENERATED_CHECK_LINES = 15  # leading lines searched for a generation marker


def is_generated(
    file_path: Path, content_lines: list[str], generated_filename_re: re.Pattern
) -> bool:
    """Report whether a file should be treated as auto-generated.

    Args:
        file_path: Path of the file; only its final name component is matched.
        content_lines: Leading lines of the file's text.
        generated_filename_re: Pattern searched for in the file name.

    Returns:
        True if the file name matches ``generated_filename_re`` or if one of
        the first ``MAX_GENERATED_CHECK_LINES`` lines matches
        ``GENERATED_MARKERS_RE``; False otherwise.
    """
    # The file name alone is sufficient evidence; skip the content scan.
    if generated_filename_re.search(file_path.name) is not None:
        return True

    # Otherwise look for a generation marker near the top of the file.
    header = content_lines[:MAX_GENERATED_CHECK_LINES]
    return any(GENERATED_MARKERS_RE.search(text) is not None for text in header)


def has_copyright(content_lines: list[str]) -> bool:
    """Report whether a copyright line appears near the top of a file.

    Args:
        content_lines: Leading lines of the file's text.

    Returns:
        True if any of the first ``MAX_COPYRIGHT_LINES`` lines matches
        ``COPYRIGHT_RE``; False otherwise.
    """
    header = content_lines[:MAX_COPYRIGHT_LINES]
    return any(COPYRIGHT_RE.search(text) is not None for text in header)


def check_file(file_path: Path, generated_filename_re: re.Pattern) -> bool:
    """Check a single file for a copyright header.

    Args:
        file_path: Path of the file to inspect.
        generated_filename_re: Pattern identifying auto-generated files by
            file name; passed through to ``is_generated``.

    Returns:
        True if the file is valid or skipped: not a regular file, extension
        not in ``INCLUDED_EXTENSIONS``, unreadable, empty, or auto-generated.
        False if the header is missing, in which case an error line is
        printed.
    """
    if not file_path.is_file():
        return True

    # Skip files that do not match our target source code extensions
    if file_path.suffix.lower() not in INCLUDED_EXTENSIONS:
        return True

    # Read enough lines for BOTH checks. Each helper applies its own limit,
    # so reading only MAX_COPYRIGHT_LINES would silently cap the
    # generated-marker scan if the two limits ever diverge.
    head_limit = max(MAX_COPYRIGHT_LINES, MAX_GENERATED_CHECK_LINES)

    try:
        # "utf-8-sig" drops a leading byte-order mark; otherwise the BOM stays
        # on the first line and defeats the "^"-anchored generated-marker regex.
        with file_path.open("r", encoding="utf-8-sig", errors="ignore") as f:
            # zip() stops at whichever ends first: the limit or end of file.
            lines = [line for _, line in zip(range(head_limit), f)]
    except OSError as e:
        # Best effort: an unreadable file is reported but does not fail the run.
        print(f"Warning: Could not read {file_path}: {e}")
        return True

    # If the file is empty, skip it
    if not lines:
        return True

    # If the file is automatically generated, we exempt it
    if is_generated(file_path, lines, generated_filename_re):
        return True

    # Check for copyright header
    if not has_copyright(lines):
        print(f"Error: {file_path} is missing a copyright header.")
        return False

    return True


def main() -> None:
    """Command-line entry point: check every given file and exit with a status.

    Exit codes:
        0: all files passed or were skipped.
        1: at least one file is missing a copyright header.
        2: the custom ``--exclude-filename-regex`` pattern failed to compile.
    """
    cli = argparse.ArgumentParser(
        description="Check files for copyright headers, exempting auto-generated files."
    )
    cli.add_argument(
        "--exclude-filename-regex",
        type=str,
        help="Optional custom regex pattern to identify and exclude auto-generated files by filename.",
    )
    cli.add_argument(
        "files",
        nargs="+",
        type=Path,
        help="List of files to check.",
    )
    options = cli.parse_args()

    # Start from the safe defaults; widen them only when a custom pattern is given.
    generated_filename_re = DEFAULT_GENERATED_FILENAME_RE
    custom_pattern = options.exclude_filename_regex
    if custom_pattern:
        merged = f"({custom_pattern}|{DEFAULT_GENERATED_FILENAME_RE.pattern})"
        try:
            generated_filename_re = re.compile(merged, re.IGNORECASE)
        except re.error as e:
            print(f"Error: Invalid custom regex pattern: {e}")
            sys.exit(2)

    # Every file is checked (no short-circuit) so all offenders get reported.
    failed_files = sum(
        1 for candidate in options.files
        if not check_file(candidate, generated_filename_re)
    )

    if failed_files > 0:
        print(
            f"\nCopyright header check failed: {failed_files} file(s) missing headers."
        )
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()
Loading
Loading