# .github/workflows/sir-scan.yml
#
# SIR Engine — Semantic Duplicate Check
# ─────────────────────────────────────────────────────────────────────────────
# Runs on every pull request. Scans changed Python / JS / TS files for
# semantically duplicate functions and posts a summary comment on the PR.
#
# LICENSING
# Public repositories — free, no license key required.
# Private repositories — requires a SIR_LICENSE_KEY repository secret.
# Get a license at sir-engine.com.
#
# QUICK START (adding to your own repo):
# 1. Copy this file to .github/workflows/sir-scan.yml in your repo.
# 2. Edit the env vars in the "Configuration" block below.
# 3. For private repos, add SIR_LICENSE_KEY to Settings → Secrets → Actions.
# 4. Push — the check runs automatically on every PR.
#
# See README.md → "GitHub Actions" for the full setup guide.
# ─────────────────────────────────────────────────────────────────────────────
name: SIR Engine — Semantic Duplicate Check
on:
  # Direct trigger: every PR opened, updated, or reopened against this repo.
  pull_request:
    types: [opened, synchronize, reopened]
  # Reusable-workflow trigger: callers (`uses: .../sir-scan.yml`) supply these
  # inputs, which take precedence over the env-var defaults defined below.
  workflow_call:
    inputs:
      strict:
        description: "Fail the check if duplicates are found in changed files"
        type: boolean
        default: false
      min_cluster_size:
        description: "Minimum copies to count as a duplicate cluster"
        type: number
        default: 2
      scan_path:
        description: "Root directory to scan (relative to repo root)"
        type: string
        default: "."
      ai_backend:
        description: "AI backend for non-Python/JS files: '' (off) or 'anthropic'"
        type: string
        default: ""
      # When reused, the callee's github context describes the caller's event
      # only partially, so PR coordinates are passed in explicitly.
      base_sha:
        description: "PR base commit SHA — pass github.event.pull_request.base.sha from the caller"
        type: string
        default: ""
      head_sha:
        description: "PR head commit SHA — pass github.sha from the caller"
        type: string
        default: ""
      pr_number:
        description: "PR number — pass github.event.pull_request.number from the caller"
        type: string
        default: ""
    secrets:
      SIR_LICENSE_KEY:
        required: false  # only needed for private repositories (see step 6)
      ANTHROPIC_API_KEY:
        required: false  # only needed when ai_backend == 'anthropic' (step 11)
permissions:
  contents: read
  pull-requests: write # required to post and update PR comments (steps 7 and 12)
# ── Configuration ─────────────────────────────────────────────────────────────
# Edit these when copying this file into your own repo.
# workflow_call inputs (above) take precedence when the workflow is called
# from another workflow; these env vars are the defaults for direct PR triggers.
# Values are quoted strings because step scripts read them via the shell.
env:
  SIR_STRICT: "false" # "true" → block merges until duplicates are resolved
  SIR_MIN_CLUSTER_SIZE: "2" # minimum copies to flag as a duplicate cluster
  SIR_SCAN_PATH: "." # root path to scan (relative to repo root)
  SIR_AI_BACKEND: "" # "" = Python/JS/TS only | "anthropic" = also scan other languages
# ─────────────────────────────────────────────────────────────────────────────
jobs:
  sir-scan:
    name: Semantic Duplicate Check
    runs-on: ubuntu-latest
    steps:
      # ── 1. Checkout ──────────────────────────────────────────────────────────
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Full history: the changed-files step (step 8) diffs the PR base SHA
          # against head, and a shallow checkout may not contain the base commit.
          fetch-depth: 0
      # ── 2. Python ────────────────────────────────────────────────────────────
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"  # quoted — unquoted 3.10-style floats truncate
          cache: pip
# ── 3. Fetch SIR Engine ──────────────────────────────────────────────────
- name: Fetch SIR Engine
run: |
if [ -f "sir_cli.py" ]; then
echo "sir_cli.py found in repo — skipping clone."
echo "SIR_DIR=." >> "$GITHUB_ENV"
else
git clone --depth 1 https://github.com/lflin00/SIR-ENGINE.git .sir_engine
echo "SIR_DIR=.sir_engine" >> "$GITHUB_ENV"
fi
# ── 4. Install dependencies ──────────────────────────────────────────────
- name: Install dependencies
run: |
BACKEND="${{ inputs.ai_backend }}"
[ -z "$BACKEND" ] && BACKEND="$SIR_AI_BACKEND"
if [ "$BACKEND" = "anthropic" ]; then
pip install --quiet anthropic
fi
# ── 5. Resolve configuration ─────────────────────────────────────────────
- name: Resolve configuration
id: config
run: |
STRICT="${{ inputs.strict }}"
[ -z "$STRICT" ] && STRICT="$SIR_STRICT"
MIN="${{ inputs.min_cluster_size }}"
[ -z "$MIN" ] && MIN="$SIR_MIN_CLUSTER_SIZE"
SCANPATH="${{ inputs.scan_path }}"
[ -z "$SCANPATH" ] && SCANPATH="$SIR_SCAN_PATH"
AI="${{ inputs.ai_backend }}"
[ -z "$AI" ] && AI="$SIR_AI_BACKEND"
BASE="${{ inputs.base_sha }}"
[ -z "$BASE" ] && BASE="${{ github.event.pull_request.base.sha }}"
HEAD="${{ inputs.head_sha }}"
[ -z "$HEAD" ] && HEAD="${{ github.sha }}"
PRNUM="${{ inputs.pr_number }}"
[ -z "$PRNUM" ] && PRNUM="${{ github.event.pull_request.number }}"
echo "strict=$STRICT" >> "$GITHUB_OUTPUT"
echo "min=$MIN" >> "$GITHUB_OUTPUT"
echo "scan_path=$SCANPATH" >> "$GITHUB_OUTPUT"
echo "ai_backend=$AI" >> "$GITHUB_OUTPUT"
echo "base_sha=$BASE" >> "$GITHUB_OUTPUT"
echo "head_sha=$HEAD" >> "$GITHUB_OUTPUT"
echo "pr_number=$PRNUM" >> "$GITHUB_OUTPUT"
# ── 6. Check license ─────────────────────────────────────────────────────
# Public repos: always licensed (free tier).
# Private repos: require SIR_LICENSE_KEY validated against the API.
# If the validation API is unreachable, we fail open so paying customers
# are never blocked by a transient outage.
- name: Check license
id: license
env:
SIR_LICENSE_KEY: ${{ secrets.SIR_LICENSE_KEY }}
run: |
python3 << 'PYEOF'
import hashlib, json, os, sys, urllib.request, urllib.error
VALIDATE_URL = "https://api.sir-engine.com/validate"
GITHUB_OUTPUT = os.environ["GITHUB_OUTPUT"]
def output(key, value):
with open(GITHUB_OUTPUT, "a") as f:
f.write(f"{key}={value}\n")
is_private = "${{ github.event.repository.private }}" == "true"
key = os.environ.get("SIR_LICENSE_KEY", "").strip()
# ── Public repo: always free ──────────────────────────────────────
if not is_private:
print("Public repository — license not required.")
output("licensed", "true")
output("reason", "public")
sys.exit(0)
# ── Private repo: no key provided ────────────────────────────────
if not key:
print("Private repository — SIR_LICENSE_KEY secret not set.")
output("licensed", "false")
output("reason", "no_key")
sys.exit(0)
# ── Private repo: validate key against API ────────────────────────
try:
payload = json.dumps({"key": key}).encode()
req = urllib.request.Request(
VALIDATE_URL,
data=payload,
headers={"Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=10) as resp:
data = json.loads(resp.read())
print(f"License valid — issued to: {data.get('issued_to', 'unknown')}")
output("licensed", "true")
output("reason", "valid_key")
output("customer", data.get("issued_to", ""))
except urllib.error.HTTPError as e:
if e.code in (401, 403):
print(f"License invalid or expired (HTTP {e.code}).")
output("licensed", "false")
output("reason", "invalid_key")
else:
# Unexpected server error — fail open to protect paying customers
print(f"Warning: license API returned HTTP {e.code} — proceeding.")
output("licensed", "true")
output("reason", "api_error")
except Exception as e:
# Network error — fail open
print(f"Warning: could not reach license API ({e}) — proceeding.")
output("licensed", "true")
output("reason", "api_error")
PYEOF
# ── 7. Post upgrade prompt (unlicensed private repos) ────────────────────
- name: Post upgrade prompt
if: steps.license.outputs.licensed == 'false'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ steps.config.outputs.pr_number }}
REPO: ${{ github.repository }}
REASON: ${{ steps.license.outputs.reason }}
run: |
python3 << 'PYEOF'
import json, os, urllib.request
MARKER = ""
token = os.environ["GITHUB_TOKEN"]
repo = os.environ["REPO"]
pr_number = int(os.environ["PR_NUMBER"])
reason = os.environ.get("REASON", "no_key")
if reason == "invalid_key":
key_msg = (
"> :warning: The `SIR_LICENSE_KEY` secret in this repository "
"is invalid or has expired. "
"[Manage your license at sir-engine.com →](https://sir-engine.com)"
)
else:
key_msg = (
"> :key: Add your license key as a repository secret named "
"`SIR_LICENSE_KEY` under **Settings → Secrets and variables → Actions**."
)
body = "\n".join([
MARKER,
"",
"## SIR Engine · License Required",
"",
"SIR Engine's GitHub Action is **free for public repositories**.",
"",
f"This repository is **private** — a license is required to run "
f"semantic duplicate checks in CI/CD.",
"",
"[**Get a license at sir-engine.com →**](https://sir-engine.com)",
"",
key_msg,
"",
"---",
"*The CLI tool (`sir scan`) remains free for all use. "
"Install it locally to scan without a license.*",
])
api = "https://api.github.com"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
"Content-Type": "application/json",
}
# Find and update an existing SIR comment, or post a new one
req = urllib.request.Request(
f"{api}/repos/{repo}/issues/{pr_number}/comments?per_page=100",
headers=headers,
)
with urllib.request.urlopen(req) as resp:
comments = json.loads(resp.read())
existing_id = next(
(c["id"] for c in comments if MARKER in c.get("body", "")), None
)
payload = json.dumps({"body": body}).encode()
if existing_id:
req = urllib.request.Request(
f"{api}/repos/{repo}/issues/comments/{existing_id}",
data=payload, headers=headers, method="PATCH",
)
else:
req = urllib.request.Request(
f"{api}/repos/{repo}/issues/{pr_number}/comments",
data=payload, headers=headers,
)
with urllib.request.urlopen(req) as resp:
result = json.loads(resp.read())
print(f"Comment posted: {result['html_url']}")
PYEOF
      # ── 8. Detect changed files ──────────────────────────────────────────────
      - name: Detect changed Python / JS / TS files
        if: steps.license.outputs.licensed == 'true'
        id: changed
        run: |
          BASE="${{ steps.config.outputs.base_sha }}"
          HEAD="${{ steps.config.outputs.head_sha }}"
          # Every path touched by the PR (needs fetch-depth: 0 from step 1 so
          # the base commit is present locally).
          git diff --name-only "$BASE" "$HEAD" > /tmp/all_changed.txt
          # Keep only the extensions SIR scans natively. `|| true` because
          # grep exits 1 when nothing matches — an empty list is a valid result.
          grep -E '\.(py|js|ts|jsx|tsx)$' /tmp/all_changed.txt \
            > /tmp/sir_changed.txt || true
          COUNT=$(wc -l < /tmp/sir_changed.txt | tr -d ' ')
          echo "count=$COUNT" >> "$GITHUB_OUTPUT"
          echo "Changed files ($COUNT relevant for SIR scan):"
          cat /tmp/sir_changed.txt
      # ── 9. Early exit if nothing relevant changed ────────────────────────────
      - name: Skip — no Python / JS / TS files changed
        if: steps.license.outputs.licensed == 'true' && steps.changed.outputs.count == '0'
        run: echo "No Python, JS, or TS files changed in this PR — skipping scan."
      # ── 10. Run native scan (Python + JS / TS) ───────────────────────────────
      - name: Run SIR scan (Python + JS / TS)
        if: steps.license.outputs.licensed == 'true' && steps.changed.outputs.count != '0'
        run: |
          # `|| true`: the scan's exit status is deliberately ignored — results
          # are read from sir_report.json by steps 12 and 13, and strict-mode
          # failure is decided there, not here.
          python3 "$SIR_DIR/sir_cli.py" scan \
            "${{ steps.config.outputs.scan_path }}" \
            --min "${{ steps.config.outputs.min }}" \
            --output sir_report.json \
            || true
      # ── 11. Run AI scan (optional) ───────────────────────────────────────────
      # Only runs when an AI backend is configured (ai_backend != '').
      - name: Run SIR AI scan
        if: >-
          steps.license.outputs.licensed == 'true' &&
          steps.changed.outputs.count != '0' &&
          steps.config.outputs.ai_backend != ''
        run: |
          python3 "$SIR_DIR/sir_cli.py" ai-scan \
            "${{ steps.config.outputs.scan_path }}" \
            --backend "${{ steps.config.outputs.ai_backend }}" \
            --output sir_ai_report.json \
            || true
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
# ── 12. Post PR comment ──────────────────────────────────────────────────
- name: Post PR comment
if: steps.license.outputs.licensed == 'true' && steps.changed.outputs.count != '0'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ steps.config.outputs.pr_number }}
REPO: ${{ github.repository }}
SIR_STRICT: ${{ steps.config.outputs.strict }}
run: |
python3 << 'PYEOF'
import json, os, sys, urllib.request
MARKER = ""
def load_report(path):
try:
with open(path) as f:
return json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
return None
def load_changed():
try:
with open("/tmp/sir_changed.txt") as f:
return {ln.strip() for ln in f if ln.strip()}
except FileNotFoundError:
return set()
def relevant_clusters(report, changed):
if not report:
return []
return [
c for c in report.get("duplicates", [])
if any(o["file"] in changed for o in c.get("occurrences", []))
]
def cluster_rows(clusters, changed, extra_cols=False):
lines = []
for i, cluster in enumerate(clusters, 1):
h = cluster.get("hash", "")
occs = cluster.get("occurrences", [])
lines.append(
f"#### Cluster {i} · "
f"`{h}…` · {len(occs)} copies"
)
lines.append("")
if extra_cols:
lines.append("| Function | File | Language | Confidence | Line |")
lines.append("|----------|------|----------|------------|-----:|")
else:
lines.append("| Function | File | Line |")
lines.append("|----------|------|-----:|")
for o in occs:
tag = " ◀ **in this PR**" if o["file"] in changed else ""
if extra_cols:
lines.append(
f"| `{o['name']}` | `{o['file']}` "
f"| {o.get('lang', '')} | {o.get('confidence', '')} "
f"| {o['lineno']}{tag} |"
)
else:
lines.append(
f"| `{o['name']}` | `{o['file']}` | {o['lineno']}{tag} |"
)
lines.append("")
return lines
def build_comment(native, ai_clusters, report, changed, strict):
total = len(native) + len(ai_clusters)
health = report.get("health_score", "n/a") if report else "n/a"
total_fns = report.get("total_functions", 0) if report else 0
files_scanned = report.get("files", 0) if report else 0
lines = [MARKER, ""]
lines.append("## SIR Engine · Semantic Duplicate Report")
lines.append("")
if total == 0:
lines.append(
":white_check_mark: **No semantic duplicates** found "
"in the files changed by this PR."
)
else:
noun = "cluster" if total == 1 else "clusters"
lines.append(
f":warning: **{total} duplicate {noun}** found "
"in this PR's changed files."
)
lines.append("")
lines.append(
f"> Scanned **{files_scanned} file(s)** / "
f"**{total_fns} function(s)** · "
f"Health score: **{health}/100**"
)
lines.append("")
if native:
lines.append("")
lines.append(
f"Python / JS / TS — "
f"{len(native)} cluster(s)
"
)
lines.append("")
lines.extend(cluster_rows(native, changed, extra_cols=False))
lines.append(" ")
lines.append("")
if ai_clusters:
lines.append("")
lines.append(
f"AI-translated languages — "
f"{len(ai_clusters)} cluster(s)
"
)
lines.append("")
lines.extend(cluster_rows(ai_clusters, changed, extra_cols=True))
lines.append(" ")
lines.append("")
if total > 0 and strict == "true":
lines.append(
"> :no_entry: **Strict mode is enabled.** "
"This check will fail until all duplicate clusters "
"in the changed files are resolved."
)
lines.append("")
lines.append("---")
lines.append(
"*Powered by [SIR Engine](https://sir-engine.com) · "
"[What is semantic duplication?]"
"(https://github.com/lflin00/SIR-ENGINE#how-it-works)*"
)
return "\n".join(lines)
def gh_api(method, url, token, data=None):
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}
if data is not None:
headers["Content-Type"] = "application/json"
req = urllib.request.Request(url, data=data, headers=headers, method=method)
with urllib.request.urlopen(req) as resp:
return json.loads(resp.read())
def post_or_update(body, token, repo, pr_number):
api = "https://api.github.com"
try:
comments = gh_api(
"GET",
f"{api}/repos/{repo}/issues/{pr_number}/comments?per_page=100",
token,
)
except Exception as e:
print(f"Warning: could not fetch comments: {e}", file=sys.stderr)
comments = []
existing_id = next(
(c["id"] for c in comments if MARKER in c.get("body", "")), None
)
payload = json.dumps({"body": body}).encode()
try:
if existing_id:
result = gh_api(
"PATCH",
f"{api}/repos/{repo}/issues/comments/{existing_id}",
token, payload,
)
print(f"Updated existing comment: {result['html_url']}")
else:
result = gh_api(
"POST",
f"{api}/repos/{repo}/issues/{pr_number}/comments",
token, payload,
)
print(f"Posted new comment: {result['html_url']}")
except Exception as e:
print(f"Warning: could not post comment: {e}", file=sys.stderr)
report = load_report("sir_report.json")
ai_report = load_report("sir_ai_report.json")
changed = load_changed()
strict = os.environ.get("SIR_STRICT", "false")
token = os.environ["GITHUB_TOKEN"]
repo = os.environ["REPO"]
pr_number = int(os.environ["PR_NUMBER"])
native = relevant_clusters(report, changed)
ai_dupes = relevant_clusters(ai_report, changed)
body = build_comment(native, ai_dupes, report, changed, strict)
post_or_update(body, token, repo, pr_number)
PYEOF
# ── 13. Enforce strict mode ──────────────────────────────────────────────
- name: Enforce strict mode
if: >-
steps.license.outputs.licensed == 'true' &&
steps.changed.outputs.count != '0' &&
steps.config.outputs.strict == 'true'
run: |
python3 << 'PYEOF'
import json, sys
def relevant_count(path, changed):
try:
with open(path) as f:
data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
return 0
return sum(
1 for c in data.get("duplicates", [])
if any(o["file"] in changed for o in c.get("occurrences", []))
)
try:
with open("/tmp/sir_changed.txt") as f:
changed = {ln.strip() for ln in f if ln.strip()}
except FileNotFoundError:
changed = set()
total = (
relevant_count("sir_report.json", changed)
+ relevant_count("sir_ai_report.json", changed)
)
if total > 0:
noun = "cluster" if total == 1 else "clusters"
print(
f"::error::SIR Engine (strict mode): {total} duplicate {noun} "
"found in changed files. Resolve duplicates before merging."
)
sys.exit(1)
else:
print("Strict mode: no duplicates in changed files — check passed.")
PYEOF