package workflow

import (
	"bytes"
	"io/fs"
	"os"
	"path/filepath"
	"strings"
)

// maxScanBytes is the upper bound, in bytes, on the size of a file the
// text scanners will read. Larger files are skipped exactly as if they
// were non-text; the scanners target source and documentation rather
// than large generated blobs.
const maxScanBytes = 4 << 20 // 4 MiB

// walkText traverses the tree rooted at root and invokes fn once for
// every regular text file it finds.
//
// fn receives the file's path relative to root, always slash-separated,
// and the file's full content. A non-nil error from fn is handed back to
// filepath.WalkDir unchanged.
//
// Directories named ".git" or workspaceName are pruned at any depth
// (the workspace holds gitignored engine output, which must never gate
// the tracked tree). root itself is never pruned, whatever its name.
//
// Silently skipped: non-regular entries (symlinks, devices, ...), files
// whose metadata or content cannot be read, files larger than
// maxScanBytes, and files that isText rejects. An error reported by the
// walk itself, such as a directory that cannot be listed, is returned
// and stops the traversal.
func walkText(root, workspaceName string, fn func(rel, content string) error) error {
	visit := func(path string, de fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		if de.IsDir() {
			// Prune by directory name, but never the starting point.
			pruned := path != root && (de.Name() == ".git" || de.Name() == workspaceName)
			if pruned {
				return filepath.SkipDir
			}
			return nil
		}

		if !de.Type().IsRegular() {
			return nil
		}

		// Check the size before reading so oversized files are never loaded.
		info, statErr := de.Info()
		if statErr != nil {
			return nil
		}
		if info.Size() > maxScanBytes {
			return nil
		}

		data, readErr := os.ReadFile(path)
		if readErr != nil {
			return nil
		}
		if !isText(data) {
			return nil
		}

		relPath, relErr := filepath.Rel(root, path)
		if relErr != nil {
			return nil
		}
		return fn(filepath.ToSlash(relPath), string(data))
	}

	return filepath.WalkDir(root, visit)
}

// isText reports whether b looks like text. Only the first 8000 bytes
// are inspected; a NUL byte anywhere in that prefix marks the data as
// binary. An empty slice counts as text. The heuristic is cheap and
// adequate for source and documentation trees.
func isText(b []byte) bool {
	head := b
	if len(head) > 8000 {
		head = head[:8000]
	}
	return bytes.IndexByte(head, 0) < 0
}

// splitLines breaks content at every "\n" and strips a single trailing
// "\r" from each piece, so a CRLF file yields the same line text as an
// LF one. The result matches strings.Split: empty content produces one
// empty line, and content ending in "\n" produces a final empty line.
func splitLines(content string) []string {
	lines := make([]string, 0, strings.Count(content, "\n")+1)
	rest := content
	for {
		line, tail, more := strings.Cut(rest, "\n")
		lines = append(lines, strings.TrimSuffix(line, "\r"))
		if !more {
			return lines
		}
		rest = tail
	}
}