package workflow import ( "os" "os/exec" "path/filepath" "strings" ) // CommitGuardResult is the outcome of inspecting a candidate shell // command for a `git commit` that would carry AI attribution. IsCommit // is true when a real `git commit` segment was detected — token-based, // never substring, so `echo "git commit"` does not qualify. Findings is // the union of attribution findings across the assembled commit message, // the staged diff, and the raw command string; an empty Findings means // allow. type CommitGuardResult struct { IsCommit bool Findings []Finding } // stagedDiff returns the staged diff for the repo rooted at cwd. It is a // package var so the test suite can stub it; the default shells out to // git and is degrade-open — any error yields "" so the guard never // blocks on infrastructure trouble (not a repo, git missing, …). var stagedDiff = func(cwd string) string { cmd := exec.Command("git", "-C", cwd, "diff", "--cached") out, err := cmd.Output() if err != nil { return "" } return string(out) } // ScanCommitGuard inspects command — the full Bash command string from a // Claude Code PreToolUse tool call — for a pending `git commit` and // scans the text it would commit for AI attribution. cwd is the hook's // working directory (the repo the commit targets); det is the shared // attribution detector that also backs leak-guard and the pre-write // scanner, so the patterns stay one source of truth. // // Degrade-open by contract: Findings is populated only on a positive // detector match. Any parse/infra uncertainty — not a commit, a message // it cannot statically resolve (heredoc / command substitution), an // unreadable file, git unavailable — yields no findings, so a harness // session is never wedged. Defense-in-depth keeps the git pre-commit // hook and CI leak-guard as the hard gates; this guard covers the common // case (an inline `-m` trailer) reliably. 
func ScanCommitGuard(det *Detector, command, cwd string) CommitGuardResult { var res CommitGuardResult for _, words := range commandSegments(command) { if !isGitCommit(words) { continue } res.IsCommit = true if msg := assembleMessage(words, cwd); msg != "" { res.Findings = append(res.Findings, det.Scan("commit message", msg)...) } } if !res.IsCommit { return res } // The staged diff catches a non-line-anchored generated-by line added // to a file rather than the message. if diff := stagedDiff(cwd); diff != "" { res.Findings = append(res.Findings, det.Scan("staged diff", diff)...) } // Belt: the raw command string catches a trailer or generated-by line // embedded with a real newline inside the command. res.Findings = append(res.Findings, det.Scan("command", command)...) return res } // --- shell command lexing ------------------------------------------- // tokKind distinguishes a word from a command separator. type tokKind int const ( tokWord tokKind = iota tokSep ) type token struct { kind tokKind text string } // commandSegments splits a shell command into independent segments at // unquoted separators (&&, ||, ;, |, &, newline) and returns each // segment as its quote-aware word list. It is best-effort: a construct // it cannot statically resolve (command substitution, heredoc) degrades // to literal text, which the detector then finds nothing in — allow. func commandSegments(command string) [][]string { toks := lex(command) var segs [][]string var cur []string for _, t := range toks { if t.kind == tokSep { if len(cur) > 0 { segs = append(segs, cur) cur = nil } continue } cur = append(cur, t.text) } if len(cur) > 0 { segs = append(segs, cur) } return segs } // lex tokenizes s into words and separators, honoring single quotes, // double quotes (with the standard backslash escapes), and backslash // escaping. Adjacent quoted and unquoted runs concatenate into one word, // matching shell word-splitting. 
Operator characters inside quotes are // literal, so a `;` or `&&` inside an `-m "…"` value never splits. func lex(s string) []token { var toks []token var buf strings.Builder hasWord := false flush := func() { if hasWord { toks = append(toks, token{tokWord, buf.String()}) buf.Reset() hasWord = false } } emitSep := func() { flush() toks = append(toks, token{kind: tokSep}) } i, n := 0, len(s) for i < n { c := s[i] switch c { case '\n': emitSep() i++ case ' ', '\t', '\r': flush() i++ case ';': emitSep() i++ case '&': emitSep() if i+1 < n && s[i+1] == '&' { i += 2 } else { i++ } case '|': emitSep() if i+1 < n && s[i+1] == '|' { i += 2 } else { i++ } case '\'': hasWord = true i++ for i < n && s[i] != '\'' { buf.WriteByte(s[i]) i++ } if i < n { i++ // closing quote } case '"': hasWord = true i++ for i < n && s[i] != '"' { if s[i] == '\\' && i+1 < n { switch s[i+1] { case '"', '\\', '`', '$': buf.WriteByte(s[i+1]) i += 2 continue case '\n': i += 2 // line continuation continue } } buf.WriteByte(s[i]) i++ } if i < n { i++ // closing quote } case '\\': hasWord = true if i+1 < n { if s[i+1] == '\n' { i += 2 // line continuation continue } buf.WriteByte(s[i+1]) i += 2 } else { i++ } default: hasWord = true buf.WriteByte(c) i++ } } flush() return toks } // --- git commit detection ------------------------------------------- // isGitCommit reports whether a segment's word list invokes `git commit`. // It skips leading NAME=VALUE env assignments, requires the program token // to be git (bare or a path ending in /git), then walks git's global // options to the subcommand and checks it is exactly "commit". A bare `--` // before any subcommand, or any other subcommand, disqualifies — so // `git status` and `git log -m commit` never fire. func isGitCommit(words []string) bool { i := 0 for i < len(words) && isEnvAssign(words[i]) { i++ } if i >= len(words) || !isGitProg(words[i]) { return false } i++ // past git // git global options that consume the following token as their value. 
valueOpts := map[string]bool{ "-C": true, "-c": true, "--git-dir": true, "--work-tree": true, "--namespace": true, "--exec-path": true, "--super-prefix": true, "--config-env": true, } for i < len(words) { w := words[i] if w == "--" { return false // end of options without a subcommand } if strings.HasPrefix(w, "-") { if valueOpts[w] { i += 2 // skip the option and its value } else { i++ // a flag or an --opt=val single token } continue } return w == "commit" // first bare token is the subcommand } return false } // isGitProg reports whether tok names the git program. func isGitProg(tok string) bool { return tok == "git" || strings.HasSuffix(tok, "/git") } // isEnvAssign reports whether w is a leading NAME=VALUE env assignment // (NAME is a shell identifier), e.g. GIT_AUTHOR_NAME=x before `git …`. func isEnvAssign(w string) bool { eq := strings.IndexByte(w, '=') if eq <= 0 { return false } for i := range eq { c := w[i] switch { case c == '_': case c >= 'A' && c <= 'Z': case c >= 'a' && c <= 'z': case i > 0 && c >= '0' && c <= '9': default: return false } } return true } // --- commit message assembly ---------------------------------------- // assembleMessage reconstructs the text a `git commit` segment would // commit as its message. -m/--message values join with a blank line (how // git forms paragraphs), so an attribution trailer lands at line-start // for the line-anchored detector pattern. It falls back to -F/--file // (read relative to cwd) and then to /.git/COMMIT_EDITMSG. An // unresolved or unreadable source returns "" (degrade-open). 
//
// Short options are read the way git's option parser reads them: flags
// may be bundled (-am), and a value-taking flag consumes the rest of its
// bundle as the value (-mMSG, -amMSG) or, when nothing follows it in the
// bundle, the next word. One leading "=" on an attached value is dropped
// so -m=MSG scans as MSG.
func assembleMessage(words []string, cwd string) string {
	var parts []string
	var filePath string
	for i := 0; i < len(words); i++ {
		w := words[i]

		// Long forms first: the value is the next word or follows "=".
		switch {
		case w == "--message":
			if i+1 < len(words) {
				i++
				parts = append(parts, words[i])
			}
			continue
		case w == "--file":
			if i+1 < len(words) {
				i++
				filePath = words[i]
			}
			continue
		case strings.HasPrefix(w, "--message="):
			parts = append(parts, w[len("--message="):])
			continue
		case strings.HasPrefix(w, "--file="):
			filePath = w[len("--file="):]
			continue
		}

		// Short form or bundle: locate the first value-taking flag.
		flag, val, ok := shortValueFlag(w)
		if !ok {
			continue
		}
		if val == "" {
			// -S and -u take only an attached value; alone they are
			// plain switches and must not swallow the next word.
			if flag == 'S' || flag == 'u' {
				continue
			}
			if i+1 >= len(words) {
				continue // value missing: nothing to resolve
			}
			i++
			val = words[i]
		} else {
			val = strings.TrimPrefix(val, "=")
		}
		switch flag {
		case 'm':
			parts = append(parts, val)
		case 'F':
			filePath = val
		}
		// Any other value-taking flag (-C, -c, -t, -S, -u) only has its
		// value skipped so it cannot be mistaken for an option.
	}

	// Inline messages win over -F, which wins over the editor file.
	if len(parts) > 0 {
		return strings.Join(parts, "\n\n")
	}
	if filePath != "" {
		if !filepath.IsAbs(filePath) {
			filePath = filepath.Join(cwd, filePath)
		}
		if b, err := os.ReadFile(filePath); err == nil {
			return string(b)
		}
		return "" // unreadable -F source: degrade-open
	}
	if b, err := os.ReadFile(filepath.Join(cwd, ".git", "COMMIT_EDITMSG")); err == nil {
		return string(b)
	}
	return ""
}

// shortValueFlag treats w as a single-dash bundle of short flags and
// returns the first flag in it that takes a value, together with the
// text following that flag inside the bundle ("" when the flag is the
// last byte). ok is false when w is not a single-dash word, when no such
// flag is present, or when a non-letter is met first. The flags
// recognised are those of `git commit` that accept a value (m, F, C, c,
// t, S, u); the git global options -C and -c share the same shape.
func shortValueFlag(w string) (flag byte, rest string, ok bool) {
	if len(w) < 2 || w[0] != '-' || w[1] == '-' {
		return 0, "", false
	}
	for j := 1; j < len(w); j++ {
		c := w[j]
		switch c {
		case 'm', 'F', 'C', 'c', 't', 'S', 'u':
			return c, w[j+1:], true
		}
		if (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') {
			return 0, "", false
		}
	}
	return 0, "", false
}

// clusterEndsInM reports whether w is a combined short-flag cluster whose
// last flag is m (so it consumes the next token as the message), e.g.
// -am or -sm. Pure -m (length 2) is not a cluster. A word in which an
// earlier flag already takes the remainder as its value — -mconfirm,
// -Fm — is not one either: there the trailing m is value text, not a
// flag.
func clusterEndsInM(w string) bool {
	flag, rest, ok := shortValueFlag(w)
	return ok && len(w) >= 3 && flag == 'm' && rest == ""
}