package workflow

import (
	"bufio"
	"fmt"
	"regexp"
	"strings"
)

// Detector finds AI-attribution fingerprints in text. It backs both the
// comment-hygiene and leak-guard workflows and enforces Constraint 3
// ("no AI attribution") — which eeco applies to its own repository.
//
// Self-clean by construction: the sensitive trigger literals are
// assembled from fragments at runtime, so this source file contains no
// contiguous attribution string for the detector to flag when it scans
// eeco's own tracked tree. The trailer rule is line-anchored so a prose
// mention of the trailer's name (for example in documentation) is not a
// false positive — only an actual trailer line is.
type Detector struct {
	// patterns is evaluated in order for every line; the first match wins.
	patterns []namedPattern
}

// namedPattern pairs a compiled expression with the label reported in a
// Finding when that expression matches.
type namedPattern struct {
	what string
	re   *regexp.Regexp
}

// Fragment assembly: keeping these split means the full trigger token
// never appears verbatim in tracked source.
var (
	coAuthored = "[Cc]o-" + "[Aa]uthored-" + "[Bb]y"
	genVerb    = "[Gg]enerated"
	// Tool tokens are word-bounded and case-scoped on purpose: a global
	// (?i) would let the bare letters "ai" inside ordinary prose (for
	// example "fair") trip the gate, which would make it untrustworthy
	// (Constraint 5). Generic words like "model" are excluded for the
	// same reason; operators add project-specific tokens via config.
	assistanten = `\b(?:[Aa]ssistant|[Aa]gent|[Cc]opilot|[Bb]ot|AI|CLI|LLM)\b`
	robotEmoji  = "\\x{1F916}" // U+1F916; not written as a literal glyph here.
)

// NewDetector builds the detector with the default denylist plus any
// operator-supplied extra patterns (compiled as regular expressions).
//
// Every entry of extra is compiled with regexp.Compile and appended after
// the defaults, so defaults take precedence when a line matches both.
// An invalid extra pattern is an error so a typo is loud, not silent; the
// error names the offending index and pattern and wraps the compile error.
// On error the returned *Detector is nil.
func NewDetector(extra []string) (*Detector, error) {
	d := &Detector{patterns: []namedPattern{
		// An actual trailer line: anchored to line start so a prose or
		// backticked mention of the name is not flagged.
		{"co-authored-by trailer", regexp.MustCompile(`(?m)^\s*` + coAuthored + `:\s*\S`)},
		// The "<verb> with/by <tool>" co-marketing line.
		{"generated-by attribution", regexp.MustCompile(genVerb + ` (?:[Ww]ith|[Bb]y) [^\n]{0,40}?` + assistanten)},
		// Robot-emoji-prefixed line that goes on to use the same verb.
		{"robot-emoji attribution", regexp.MustCompile(`(?m)` + robotEmoji + `[^\n]{0,20}` + genVerb)},
	}}
	for i, p := range extra {
		re, err := regexp.Compile(p)
		if err != nil {
			return nil, fmt.Errorf("attribution_pattern[%d] %q: %w", i, p, err)
		}
		d.patterns = append(d.patterns, namedPattern{"configured pattern", re})
	}
	return d, nil
}

// Scan returns one Finding per matching line. path is recorded on each
// Finding for reporting; it is not inspected. A line that trips several
// patterns is reported once (first match wins) to keep reports terse.
//
// Lines are split on "\n" with one trailing "\r" removed, and numbered
// from 1. A final line without a terminating newline is still scanned.
// The result is nil when nothing matches.
//
// Lines are read with bufio.Reader.ReadString rather than bufio.Scanner:
// a Scanner stops with ErrTooLong at its token cap, which would leave
// everything after one oversized line unscanned without any signal.
// ReadString has no per-line limit, so every line is always examined.
func (d *Detector) Scan(path, content string) []Finding {
	var out []Finding
	rd := bufio.NewReader(strings.NewReader(content))
	for ln := 1; ; ln++ {
		// The source is an in-memory string, so the only error ReadString
		// can report here is end of input (io.EOF).
		raw, err := rd.ReadString('\n')
		if raw == "" {
			break // input exhausted exactly at a line boundary
		}
		line := strings.TrimSuffix(strings.TrimSuffix(raw, "\n"), "\r")
		for _, p := range d.patterns {
			// The trailer pattern is intentionally multi-line-anchored;
			// evaluating it per line keeps the anchor meaningful and the
			// line number exact.
			if p.re.MatchString(line) {
				out = append(out, Finding{Path: path, Line: ln, Msg: p.what})
				break
			}
		}
		if err != nil {
			break // that was the final, unterminated line
		}
	}
	return out
}

// ScanResponse adapts the detector to ai.ResponseScanner: scans an AI response
// body, returns one description per flagged line (nil for clean). Signature
// matches ai.ResponseScanner without importing internal/ai, so the cmd / tui
// layer can wire d.ScanResponse with no import cycle.
//
// Each description has the form "line N: <pattern label>"; the text is
// scanned under the fixed path "ai-response".
func (d *Detector) ScanResponse(text string) []string {
	findings := d.Scan("ai-response", text)
	if len(findings) == 0 {
		return nil
	}
	out := make([]string, 0, len(findings))
	for _, f := range findings {
		out = append(out, fmt.Sprintf("line %d: %s", f.Line, f.Msg))
	}
	return out
}