#!/usr/bin/env python3 """ scan.py - Universal AI-slop + Comprehension scanner (slop-cop, dual axis). Scans prose on two parallel axes: 1. AI-Slop axis — ~45 rhetorical patterns, ~150 vocabulary tells, ~33 formatting tells. Density score, burstiness, model fingerprint. 2. Comprehension axis — ~17 mechanically-detectable comprehension patterns plus 8 readability metrics (Flesch RE, FK Grade, SMOG, Coleman-Liau, Dale-Chall, lexical density, sentence length variance, passive %). Catches what regex can; qualitative patterns (anaphora, symmetry, real-vs- decorative judgment, missing thesis, curse of knowledge) require reading. Usage: python3 scan.py path/to/draft.md python3 scan.py --json path/to/draft.md python3 scan.py --quick path/to/draft.md python3 scan.py --genre academic path/to/draft.md python3 scan.py --audience marketing path/to/draft.md python3 scan.py --strict-em-dash path/to/draft.md cat draft.md | python3 scan.py echo "draft text" | python3 scan.py """ import argparse import json import math import re import sys from pathlib import Path # ============================================================================= # VOCABULARY — every item from references/vocabulary.md # Severity: H (always cut) / M (often cut) / L (context-dependent) # ============================================================================= # 2A. 
# LLM-favored verbs
# Plain phrases (not regexes). Severity H = always cut, M = often cut.
# NOTE(review): some entries contain others ("delve into" / "delve",
# "pave the way" / "pave the way for"), so a whole-word scan such as
# find_phrase_hits would report the same text under both entries — confirm
# whether that double weighting is intended.
# NOTE(review): a few verbs lack forms the others have ("supercharging",
# "reverberating", "transformed the") — check against references/vocabulary.md.
VERBS_H = [
    "delve into", "delves", "delved", "delve",
    "leverage", "leverages", "leveraged", "leveraging",
    "harness", "harnesses", "harnessed", "harnessing",
    "foster", "fosters", "fostered", "fostering",
    "empower", "empowers", "empowered", "empowering",
    "unlock", "unlocks", "unlocked", "unlocking",
    "elevate", "elevates", "elevated", "elevating",
    "streamline", "streamlines", "streamlined", "streamlining",
    "revolutionize", "revolutionizes", "revolutionized", "revolutionizing",
    "underscore", "underscores", "underscored", "underscoring",
    "illuminate", "illuminates", "illuminated", "illuminating",
    "navigate the", "navigates the", "navigated the", "navigating the",
    "garner", "garners", "garnered", "garnering",
    "utilize", "utilizes", "utilized", "utilizing",
    "facilitate", "facilitates", "facilitated", "facilitating",
    "embark on", "embarks on", "embarked on", "embarking on",
    "showcase", "showcases", "showcased", "showcasing",
    "boast", "boasts", "boasted", "boasting",
    "dive into", "dives into", "dove into", "diving into",
    "pave the way", "pave the way for", "paves the way",
    "shed light on", "sheds light on",
    "transform the", "transforms the", "transforming the",
]
VERBS_M = [
    "demystify", "demystifies", "demystified", "demystifying",
    "ignite", "ignites", "ignited", "igniting",
    "supercharge", "supercharges", "supercharged",
    "unleash", "unleashes", "unleashed", "unleashing",
    "unveil", "unveils", "unveiled", "unveiling",
    "resonate", "resonates", "resonated", "resonating",
    "transcend", "transcends", "transcended", "transcending",
    "spearhead", "spearheads", "spearheaded", "spearheading",
    "reimagine", "reimagines", "reimagined", "reimagining",
    "reverberate", "reverberates", "reverberated",
]

# 2B.
# Cliché metaphors and grandiose nouns
# Plain phrases (not regexes); entries ending in "of"/"to" only match when
# that trailing word is present in the text.
NOUNS_H = [
    "tapestry", "treasure trove", "symphony of", "embark on a journey",
    "beacon of", "myriad of", "plethora", "paradigm shift", "testament to",
    "arsenal of", "ecosystem of",
]
NOUNS_M = [
    "landscape of", "realm of", "journey of", "roadmap", "cornerstone of",
    "crucible", "labyrinth", "metropolis", "enigma", "kaleidoscope", "arena of",
]

# 2C. Empty intensifiers, hedges, vague adjectives
INTENSIFIERS_H = [
    "crucial", "essential", "vital", "pivotal", "paramount", "robust",
    "seamless", "comprehensive", "multifaceted", "intricate", "intricacies",
    "meticulous", "meticulously", "unwavering", "transformative",
    "groundbreaking", "cutting-edge", "state-of-the-art", "game-changer",
    "game-changing", "ever-evolving", "ever-changing", "fast-paced",
]
INTENSIFIERS_M = [
    "profound", "holistic", "nuanced", "compelling", "commendable",
    "insightful", "invaluable", "next-generation", "future-proof", "dynamic",
    "vibrant", "bustling", "daunting", "ever-expanding", "timeless", "enduring",
    "diverse array", "unique blend", "hyper-connected",
]

# 2D. Sycophantic openers / closers
# These are regex source strings (note the r"" prefix and \b anchors), unlike
# the plain phrase lists above.
SYCOPHANCY_OPEN_H = [
    r"\bGreat question[!.]",
    r"\bExcellent question[!.]",
    r"\bExcellent point[!.]",
    r"\bAbsolutely[!.]",
    r"\bCertainly[!.]",
    r"\bOf course[!.]",
    r"\bSure[!,]\s+Here'?s",
    r"\bI'?d be happy to help",
    r"\bWhat a (?:great|wonderful|fantastic) (?:question|idea)",
]
SYCOPHANCY_CLOSE_H = [
    r"\bI hope this helps",
    r"\bLet me know if you have any questions",
    r"\bLet me know if you'?d like me to (?:elaborate|continue|expand)",
    r"\bFeel free to reach out",
    r"\bDon'?t hesitate to (?:ask|reach out)",
    r"\bIs there anything else I can help you with",
    r"\bI hope this answers your question",
    r"\bHappy to clarify",
]

# 2E.
# Vague-authority weasel attribution
# Regex source strings: claims attributed to unnamed studies, experts or observers.
VAGUE_AUTH_H = [
    r"\bStudies show\b",
    r"\bResearch suggests\b",
    r"\bResearch indicates\b",
    r"\bMany experts (?:agree|believe)\b",
    r"\bIndustry reports indicate\b",
    r"\bIt is widely understood\b",
    r"\bIt'?s widely (?:believed|understood)\b",
    r"\bObservers have noted\b",
    r"\bSome critics argue\b",
]
VAGUE_AUTH_M = [
    r"\bGenerally speaking\b",
    r"\bIn many cases\b",
    r"\bIt is commonly (?:known|believed)\b",
]

# 2F. Closing / connector clichés
# Plain phrases (not regexes).
CONNECTORS_H = [
    "in conclusion", "to conclude", "in summary", "to summarize",
    "at the end of the day", "in essence", "to put it simply", "furthermore",
    "moreover", "additionally", "first and foremost", "last but not least",
]
# NOTE(review): "ultimately" is also listed in MAGIC_ADVERBS below; if both
# lists are scanned, one occurrence is reported twice — confirm intent.
CONNECTORS_M = [
    "overall", "ultimately", "all things considered", "in a nutshell",
    "on the other hand", "that being said", "with that in mind", "notably",
    "indeed",
]

# Decorative / "magic" adverbs (low+ severity)
MAGIC_ADVERBS = [
    "genuinely", "actually", "truly", "really", "honestly", "frankly",
    "ultimately", "basically", "obviously", "clearly", "simply", "literally",
    "fundamentally", "remarkably", "arguably", "deeply", "quietly", "subtly",
]

# Buzzwords for density check (3+ in one paragraph = flag)
# Several entries also appear in INTENSIFIERS_H / INTENSIFIERS_M.
BUZZWORDS = [
    "scalable", "repeatable", "defensible", "mission-critical",
    "enterprise-grade", "world-class", "best-in-class", "ai-native",
    "agent-driven", "autonomous", "high-velocity", "outcome-oriented",
    "robust", "seamless", "innovative", "cutting-edge", "state-of-the-art",
    "synergy", "holistic", "next-generation", "transformative",
    "groundbreaking", "comprehensive", "multifaceted",
]

# =============================================================================
# PATTERNS — sentence-level and structural
# =============================================================================

# 1.
# Negation reversal openers
# Anchored with ^, so each pattern is meant to be tested against one sentence
# (or with re.MULTILINE against line starts).
NEGATION_OPENERS = [
    r"^\s*It wasn'?t\b",
    r"^\s*It was not\b",
    r"^\s*It'?s not\b",
    r"^\s*It is not\b",
    r"^\s*This isn'?t\b",
    r"^\s*This is not\b",
    r"^\s*Not just\b",
    r"^\s*Not a\b",
    r"^\s*Not because\b",
]

# 2. Dramatic countdown — "Not X. Not Y. Just Z."
# Detected via consecutive short sentences starting with "Not"

# 3. Self-posed rhetorical question + immediate answer
# "The result? X." "The catch? Y."
# Has one capture group (the lead-in noun phrase), so findall() returns that
# phrase rather than the whole match.
RHETORICAL_QA = re.compile(
    r"\b(The result|The catch|The kicker|The thing|The point|The bottom line|The real question)\?\s+\w",
    re.IGNORECASE,
)

# 8. Performative opening patterns
PERFORMATIVE_OPENINGS = [
    r"^\s*Let me cut to it[:\.]",
    r"^\s*Picture this[:\.]?",
    r"^\s*Imagine a world",
    r"^\s*In a world where",
    r"^\s*Have you ever wondered",
    r"^\s*Are you struggling with",
    r"^\s*In today'?s fast-paced",
    r"^\s*In today'?s (?:world|landscape|digital age)",
    r"^\s*Here'?s the thing\b",
    r"^\s*I'?ll be brief",
    r"^\s*When I read\b.*I closed",
    r"^\s*Most\s+\w+\s+\w+\s+(?:waste|won'?t)",
]

# 9. Setup-reveal phrases
# "Bottom line" also matches inside "The bottom line"; both are listed.
SETUP_REVEAL_PHRASES = [
    r"\bThe point is\b",
    r"\bThe thing is\b",
    r"\bWhat this means is\b",
    r"\bIn short\b",
    r"\bBottom line\b",
    r"\bThe bottom line\b",
    r"\bIn summary\b",
    r"\bTo summarize\b",
    r"\bThe real takeaway\b",
    r"\bWhat matters here\b",
]

# 10. Crafted closer indicators
# No leading \s* here: these expect an already-stripped line.
CRAFTED_CLOSERS = [
    r"^Build it\.?\s+Ship it\.?\s+Run it\.?$",
    r"^Let'?s go\.?$",
    r"^The future is now\.?$",
    r"^The future belongs to\b",
    r"^And that'?s the point\.?$",
]

# 13. Present-participle "-ing" tails
# Matches ", <participle> " mid-sentence; the participle is the capture group.
ING_TAIL = re.compile(
    r",\s+(highlighting|emphasizing|symbolizing|contributing to|reflecting|"
    r"underscoring|demonstrating|showcasing|embodying|representing|reinforcing|"
    r"signaling|illustrating|exemplifying|marking|paving|fostering)\s+",
    re.IGNORECASE,
)

# 14. False range "From X to Y"
# X and Y are each 1-4 words; the range must start the text, or follow ". " or
# ": ", and end in a comma or period.
FALSE_RANGE = re.compile(
    r"(?:^|\.\s+|:\s+)From\s+\w+(?:\s+\w+){0,3}\s+to\s+\w+(?:\s+\w+){0,3}[,.]",
    re.IGNORECASE,
)

# 15.
# Copula avoidance verbs
# Verbs used in place of a plain "is"; regex source strings.
COPULA_AVOIDANCE = [
    r"\bserves as (?:a|an|the)\b",
    r"\bstands as (?:a|an|the)\b",
    r"\bmarks (?:a|an|the)\b",
    r"\brepresents (?:a|an|the)\b",
    r"\bembodies\b",
]

# 16. Hedge stacking — clusters of hedges in one sentence
# NOTE(review): when applied case-insensitively, r"\bmay\b" also matches the
# month "May".
HEDGE_WORDS = [
    r"\bmay\b",
    r"\bmight\b",
    r"\bcould\b",
    r"\bpossibly\b",
    r"\bpotentially\b",
    r"\bperhaps\b",
    r"\bgenerally\b",
    r"\bsomewhat\b",
    r"\bprobably\b",
    r"\bin many cases\b",
    r"\bit'?s possible that\b",
]

# 17. Hedged superlatives
HEDGED_SUPERLATIVES = [
    r"\bperhaps the most\b",
    r"\barguably the (?:best|most|greatest)\b",
    r"\bone of the most\b",
    r"\bamong the most\b",
    r"\bquite possibly the\b",
]

# 18. "While X, Y" sentence opener
# MULTILINE: ^ matches at the start of every line, not of every sentence.
WHILE_OPENER = re.compile(r"^\s*While\s+\w+", re.IGNORECASE | re.MULTILINE)

# 19. "X meets Y" / "X is more than just Y"
X_MEETS_Y = re.compile(r"\b\w+\s+meets\s+\w+\b", re.IGNORECASE)
MORE_THAN_JUST = re.compile(r"\bmore than just\s+(?:a|an)?\s*\w+", re.IGNORECASE)

# 21. False concession openers
FALSE_CONCESSION = [
    r"^\s*Despite (?:its |the |these )?(?:challenges|limitations|drawbacks)",
    r"^\s*While (?:there are|the evidence is|some)\s+\w+\s+(?:limitations|concerns|challenges)",
    r"^\s*Although (?:there are|some)\s+",
]

# 26. Pedagogical voice
PEDAGOGICAL = [
    r"^\s*Let'?s dive into\b",
    r"^\s*Let'?s explore\b",
    r"^\s*Let'?s break (?:this|it) down\b",
    r"^\s*We'?ll walk through\b",
    r"^\s*Let'?s unpack\b",
]

# 27. Royal-we / "as a society" framing
ROYAL_WE = [
    r"\bWe live in (?:an? |the )?(?:age|era|world)\b",
    r"\bAs a society,? we\b",
    r"\bIn our (?:time|age|world)\b",
    r"\bOur collective\b",
]

# 29. Knowledge-cutoff disclaimer leakage
KNOWLEDGE_CUTOFF = [
    r"\bAs of my (?:last update|knowledge cutoff)\b",
    r"\bI don'?t have access to real-time\b",
    r"\bMy training data\b",
    r"\bWhile my training\b",
    r"\bbased on (?:my|the) training data\b",
]

# 31.
# Stake inflation / future-flourish
# r"\bThe future of\b.*\bis\b" uses a greedy .* and so can span most of a line.
STAKE_INFLATION = [
    r"\bThis will revolutionize\b",
    r"\bWe'?re entering (?:a|the) new era\b",
    r"\bA new paradigm\b",
    r"\bThe future of\b.*\bis\b",
    r"\bUshering in (?:a|the) new\b",
]

# 32. Grandiose framing
# NOTE(review): the first two patterns are identical to entries in
# COPULA_AVOIDANCE; text matching them is reported by both lists if both are
# scanned — confirm intent.
GRANDIOSE = [
    r"\bstands as (?:a|an|the)\b",
    r"\bserves as (?:a|an|the)\b",
    r"\b(?:a|the) testament to\b",
    r"\bAt its core,?\s+(?:this|the|it)\b",
    r"\bembodies (?:the|a) spirit\b",
    r"\brepresents (?:a|an|the)\s+\w+\s+(?:moment|era|chapter)",
]

# 36. Fabricated case study / generic name
# Case-sensitive on purpose: the capture group is a capitalised first name of
# 3-11 letters following "Take" / "Meet" / "Consider".
FABRICATED_CASE = re.compile(
    r"\b(?:Take|Meet|Consider)\s+([A-Z][a-z]{2,10})(?:\s+[A-Z][a-z]+)?,\s+(?:a|an)\s+",
)

# 41. Throat-clearing meta-comments
THROAT_CLEARING = [
    r"\bIt'?s worth noting (?:that)?\b",
    r"\bIt'?s important to (?:mention|note)\b",
    r"\bIt bears (?:mentioning|noting)\b",
    r"^\s*Notably,\s",
    r"^\s*Interestingly,\s",
]

# Whether-or openers (12)
WHETHER_OR = re.compile(r"^\s*Whether you'?re\s+", re.IGNORECASE | re.MULTILINE)

# 20. Both-sides-ism — on one hand / on the other hand
BOTH_SIDES = [
    r"\bon (?:the )?one hand\b",
    r"\bon the other hand\b",
    r"\bboth (?:sides|perspectives) have merit\b",
    r"\badvantages and disadvantages\b",
]

# 22. The "real" tic — "real X" as an authenticity intensifier
REAL_TIC = re.compile(
    r"\breal\s+(?:money|stakes|outcomes|connection|impact|results|deal|talk|research|world)\b",
    re.IGNORECASE,
)

# 34. Vapid analogies — "Think of it as a", "It's like having a"
VAPID_ANALOGY = [
    r"\bThink of it as (?:a |an |the )",
    r"\bIt'?s like having (?:a |an )",
    r"\bImagine it as (?:a |an )",
    r"\bIt'?s the (?:Uber|Airbnb|Spotify|Netflix) of\b",
]

# 39. Historical analogy stacking — printing press / electricity / internet within ~150 chars
# The regex finds individual mentions only; the ~150-character proximity test
# is not part of this pattern.
HISTORICAL_ANALOGY = re.compile(
    r"\b(?:printing press|electricity|internet|industrial revolution|wheel|fire|atomic age)\b",
    re.IGNORECASE,
)

# 38.
# Dead-metaphor repetition — count cliché metaphor reuse
DEAD_METAPHORS = ["journey", "landscape", "tapestry", "ecosystem", "realm", "beacon", "symphony", "tide"]

# =============================================================================
# MODEL FINGERPRINT MARKERS
# =============================================================================
# Regex source strings grouped by the model family they are taken to indicate.
GPT_MARKERS = [
    r"\bdelve(?:s|d)?\b",
    r"\bunderscore(?:s|d)?\b",
    r"\bnoteworthy\b",
    r"\bcommendable\b",
    r"\bintricate\b",
    r"\bmeticulous(?:ly)?\b",
    r"\bsupercharge\b",
    r"\bunleash(?:es|ed)?\b",
    r"\bdive in\b",
    r"\bgame-changing\b",
    r"\bindividuals with\b",
    r"\bcharacterized by elevated\b",
    r"\bplay a significant role\b",
]
CLAUDE_MARKERS = [
    r"\bmeaningfully\b",
    r"\bthe distinction is worth examining\b",
    r"\bI notice that\b",
    r"\bit'?s worth examining\b",
    r"\bI should be careful here\b",
    r"\bworth noting that\b",
    r"\bmore carefully\b",
]
GEMINI_MARKERS = [
    r"\bthe way for\b",
    r"\bthe cascade of\b",
    r"\bin the world of\b",
    r"\blet'?s explore\b",
    r"\bunderstand how\b",
    r"\blet'?s take a closer look\b",
]

# =============================================================================
# COMPREHENSION AXIS — patterns and constants
# Sourced from references/comprehension.md and readability-metrics.md.
# =============================================================================

# F1. Known-acronym allowlist (~50 well-known across domains).
# Anything outside this list counts as "undefined" unless introduced with
# a parenthetical expansion earlier in the document, e.g. "search request agent (SRA)".
# Allowlist of acronyms treated as already known to the reader.
# NOTE(review): "AM" is listed twice (harmless in a set literal, but linters
# flag it). The mixed-case entries ("SaaS", "PaaS", "IaaS") and "CAPTCHA"
# cannot be produced by the uppercase-only, 2-5 character ACRONYM_TOKEN regex
# defined later in this file — confirm how lookups against this set are made.
KNOWN_ACRONYMS = {
    # Tech / web
    "USB", "FAQ", "URL", "API", "JSON", "HTML", "CSS", "SQL", "AWS", "GCP",
    "PDF", "GIF", "JPG", "PNG", "MP3", "MP4", "HTTP", "HTTPS", "IP", "DNS",
    "GPS", "VPN", "RAM", "CPU", "GPU", "SSD", "HDD", "OS", "IOS", "AI", "ML",
    "LLM", "UI", "UX", "SDK", "CLI", "GUI", "CDN", "DOM", "XML", "IDE", "REST",
    "RPC", "TLS", "SSL", "FTP", "SMTP", "IMAP",
    # Business
    "CEO", "CFO", "CTO", "COO", "CMO", "VP", "HR", "PR", "QA", "ROI", "KPI",
    "CRM", "ERP", "SaaS", "PaaS", "IaaS", "B2B", "B2C", "B2G", "MVP", "OKR",
    "PMF", "ICP", "MRR", "ARR", "LTV", "CAC", "NPS",
    # Government / countries / agencies
    "USA", "UK", "EU", "UN", "NATO", "NASA", "FBI", "CIA", "IRS", "DMV", "DOJ",
    "DOD", "FDA", "EPA", "CDC", "WHO", "OECD", "IMF",
    # Time / measurement
    "AM", "PM", "GMT", "UTC", "EST", "PST", "BC", "AD", "CE", "BCE",
    # Media / docs
    "TV", "FM", "AM", "DVD", "CD", "VHS",
    # Common short
    "OK", "ID", "TLDR", "FYI", "ASAP", "DIY", "RSVP", "AKA", "ETA", "ETC",
    "CV", "LLC",
    # Legacy abbrev set (from existing ABBREVIATIONS)
    "MR", "MRS", "MS", "DR", "PROF", "SR", "JR",
    # Misc common
    "PIN", "ATM", "ZIP", "CAPTCHA", "GDPR", "CCPA", "PCI",
}

# Audience presets that affect comprehension thresholds.
# Each preset carries the same five keys. Judging by the key names these are a
# Flesch Reading Ease floor, a Flesch-Kincaid grade ceiling, a sentence-length
# ceiling, a passive-voice ceiling and a lexical-density ceiling — units are
# not stated here; confirm against the code that consumes them.
AUDIENCE_PRESETS = {
    "casual": {"flesch_min": 60, "fk_max": 9, "sent_max": 18, "passive_max": 10, "lex_max": 55},
    "marketing": {"flesch_min": 65, "fk_max": 8, "sent_max": 16, "passive_max": 5, "lex_max": 50},
    "academic": {"flesch_min": 30, "fk_max": 16, "sent_max": 28, "passive_max": 20, "lex_max": 65},
    "encyclopedic":{"flesch_min": 40, "fk_max": 14, "sent_max": 24, "passive_max": 15, "lex_max": 60},
    "technical": {"flesch_min": 40, "fk_max": 14, "sent_max": 25, "passive_max": 15, "lex_max": 60},
    "fiction": {"flesch_min": 60, "fk_max": 10, "sent_max": 22, "passive_max": 12, "lex_max": 55},
    "healthcare": {"flesch_min": 70, "fk_max": 8, "sent_max": 15, "passive_max": 5, "lex_max": 50},
}

# G5.
# Glue-word bloat — sentence-start patterns that delay the real subject
# Anchored with ^: intended for per-sentence (or MULTILINE) matching.
GLUE_WORD_OPENERS = [
    r"^\s*There\s+(?:is|are|was|were)\b",
    r"^\s*It\s+is\b",
    r"^\s*It\s+was\b",
    r"^\s*What\s+is\b",
    r"^\s*What\s+I'?m\s+trying\s+to\s+say\s+is\b",
    r"^\s*What\s+I\s+mean\s+is\b",
    r"^\s*The\s+thing\s+is\b",
]

# H5. Forward-reference / "we'll see later"
FORWARD_REFERENCE = [
    r"\bas we'?ll see\b",
    r"\bmore on this later\b",
    r"\bcovered below\b",
    r"\bwe'?ll discuss\b",
    r"\bas discussed below\b",
    r"\bsee section \d+\b",
    r"\bsee below\b",
    r"\bdetailed (?:later|below)\b",
    r"\bin a later (?:section|chapter)\b",
]

# J1. Passive voice — be-verb + past participle
# Shape: <be-verb> [<-ly adverb>] <participle>. The participle is either any
# word ending in "ed" or one of the listed irregular forms. The only capture
# group is the be-verb, so findall() returns be-verbs, not whole matches.
# Each irregular participle is listed once; repeating an alternative cannot
# change what the pattern matches.
PASSIVE_VOICE = re.compile(
    r"\b(is|are|was|were|been|being|am)\s+(?:[a-z]+ly\s+)?"
    r"(?:[a-z]+ed|known|made|done|seen|given|taken|written|sent|shown|"
    r"found|left|kept|paid|met|read|put|set|cut|hit|lost|won|brought|"
    r"caught|chosen|driven|spoken|stolen|broken|thrown|drawn|drunk|"
    r"swum|sworn|torn|worn|sung|sunk|run|begun|come|become|gone|"
    r"borne|built|burnt|spent|bent|lent|meant|slept|wept|"
    r"crept|swept|felt|dealt|spilt|spoilt|told|sold|held|bound|wound)\b",
    re.IGNORECASE,
)

# J2. Nominalization / zombie noun suffixes
# A word of 3+ characters followed by one of the suffixes. "ization" and
# "isation" are kept for readability although "tion" already covers them.
NOMINALIZATION_SUFFIXES = re.compile(
    r"\b\w{3,}(?:tion|ment|ance|ence|ity|ization|isation|ism|ness)\b",
    re.IGNORECASE,
)

# J5. Decorative qualifiers (comprehension-axis version)
DECORATIVE_QUALIFIERS = re.compile(
    r"\b(very|really|quite|extremely|incredibly|just|literally|"
    r"basically|actually|simply|truly|highly|fairly|rather|somewhat)\b",
    re.IGNORECASE,
)

# J8. Negative-construction-where-positive-available
# Every phrase must be matched by exactly one pattern so that per-pattern
# counts are not inflated: the former r"\bnot\s+un\w+\b" repeated the first
# entry, and r"\bnot\s+infrequent(?:ly)?\b" was already covered by the
# "not in..." entry.
# NOTE(review): "not un..." / "not in..." are broad and also hit ordinary
# words ("not until", "not into", "not interested"); narrowing them needs a
# decision on which litotes to target.
NEGATIVE_CONSTRUCTIONS = [
    r"\bnot\s+un[a-z]+\b",
    r"\bnot\s+in[a-z]+\b",
    r"\bdon'?t\s+fail\s+to\b",
    r"\bnever\s+fail\s+to\b",
]

# Acronym-detector regex: 2-5 uppercase letters/digits, surrounded by word boundaries.
# Case-sensitive on purpose: one uppercase letter followed by 1-4 uppercase
# letters or digits (2-5 characters in total). The token is the capture group.
ACRONYM_TOKEN = re.compile(r"\b([A-Z][A-Z0-9]{1,4})\b")

# Parenthetical expansion regex — captures "Search Request Agent (SRA)" style introductions.
# One to five words directly before the parenthesised acronym; the acronym is
# the capture group. Words may start with a lowercase letter, so an
# introduction written as "search request agent (SRA)" (the example given
# with the known-acronym allowlist) is recognised as well as the title-case form.
PAREN_EXPANSION = re.compile(r"\b(?:[A-Za-z][a-zA-Z]+\s+){1,5}\(([A-Z][A-Z0-9]{1,4})\)")

# Numeric-token regex for stat bombing (F3)
# Either a dollar amount with optional decimals and K/M/B suffix, or a bare
# number with an optional %, K/M/B, "x" or "×" suffix. No capture groups, so
# findall() returns whole matches.
NUMERIC_TOKEN = re.compile(r"(?:\$\d+(?:\.\d+)?(?:[KkMmBb])?|\b\d+(?:\.\d+)?(?:%|[KkMmBb]|x|×)?\b)")

# Telegraphic colon-label regex (G1): "Word(s): Capital..." mid-sentence.
# The label (a capitalised word plus up to three more words) is the capture group.
COLON_LABEL = re.compile(r"\b([A-Z][a-zA-Z]+(?:\s+[a-zA-Z]+){0,3}):\s+[A-Z]")

# Stoplist for lexical-density heuristic.
# All entries are lowercase.
STOPWORDS = {
    "the", "a", "an", "of", "in", "on", "at", "to", "for", "with", "by",
    "and", "or", "but", "is", "are", "was", "were", "be", "been", "being",
    "am", "has", "have", "had", "do", "does", "did", "will", "would", "can",
    "could", "should", "may", "might", "must", "shall", "it", "its", "this",
    "that", "these", "those", "he", "she", "they", "we", "i", "you", "me",
    "him", "her", "them", "us", "my", "your", "his", "their", "our", "who",
    "what", "which", "where", "when", "why", "how", "as", "if", "then",
    "else", "than", "so", "because", "while", "though", "from", "into",
    "onto", "upon", "about", "over", "under", "again", "not", "no", "yes",
    "out", "up", "down", "off", "all", "any", "some", "each", "every",
    "other", "another", "such",
}

# Dale-Chall simplified word list — curated subset of ~500 of the most common
# English words. Source: Dale-Chall 3,000-word list, abridged for inline embedding.
DALE_CHALL_WORDLIST = { "a", "able", "about", "above", "across", "act", "add", "afraid", "after", "afternoon", "again", "against", "age", "ago", "agree", "ah", "ahead", "air", "alike", "all", "allow", "almost", "alone", "along", "already", "also", "always", "am", "among", "an", "and", "angry", "another", "answer", "any", "apart", "apple", "are", "arm", "around", "art", "as", "ask", "at", "ate", "away", "baby", "back", "bad", "bag", "ball", "band", "bank", "bar", "base", "be", "bear", "beat", "beautiful", "became", "because", "become", "bed", "been", "before", "began", "begin", "begun", "behind", "being", "believe", "bell", "below", "best", "better", "between", "big", "bird", "bit", "black", "blank", "blew", "block", "blow", "blue", "board", "boat", "body", "boil", "bone", "book", "born", "both", "bottle", "bottom", "bought", "box", "boy", "branch", "brave", "bread", "break", "breakfast", "breath", "brick", "bridge", "bright", "bring", "broke", "brother", "brown", "brought", "build", "built", "burn", "burst", "bury", "business", "busy", "but", "buy", "by", "cake", "call", "came", "can", "candy", "cap", "captain", "car", "card", "care", "carry", "case", "cast", "cat", "catch", "cause", "caught", "cell", "cent", "center", "chair", "chance", "change", "chase", "cheap", "check", "cheer", "child", "children", "chose", "circle", "city", "class", "clean", "clear", "climb", "close", "cloth", "clothes", "cloud", "club", "coal", "coat", "cold", "color", "come", "common", "company", "complete", "cook", "cool", "corn", "corner", "cost", "could", "country", "course", "cover", "cow", "crack", "cried", "cross", "cry", "cup", "cut", "dad", "daily", "dance", "danger", "dare", "dark", "date", "daughter", "day", "dead", "dear", "death", "decide", "deep", "deer", "did", "die", "different", "dig", "dinner", "dirt", "do", "dog", "done", "door", "down", "draw", "drawn", "dream", "dress", "drew", "drink", "drive", "drop", "drove", "dry", "duck", "dust", "each", "ear", "early", "earth", 
"east", "easy", "eat", "egg", "eight", "either", "else", "empty", "end", "enemy", "enjoy", "enough", "enter", "even", "evening", "ever", "every", "everybody", "everyone", "everything", "expect", "eye", "face", "fact", "fail", "fair", "fall", "false", "family", "far", "farm", "fast", "fat", "father", "fault", "favor", "fear", "feed", "feel", "feet", "fell", "felt", "few", "field", "fight", "fill", "find", "fine", "finish", "fire", "first", "fish", "fit", "five", "flag", "flat", "floor", "flow", "flower", "fly", "follow", "food", "foot", "for", "forget", "form", "forth", "forward", "fought", "found", "four", "free", "fresh", "friend", "from", "front", "fruit", "full", "fun", "funny", "game", "garden", "gate", "gave", "get", "gift", "girl", "give", "glad", "glass", "go", "goes", "going", "gold", "gone", "good", "got", "grade", "grand", "grass", "great", "green", "grew", "ground", "group", "grow", "guess", "had", "hair", "half", "hall", "hand", "happen", "happy", "hard", "has", "hat", "have", "head", "hear", "heart", "heat", "heavy", "held", "help", "her", "here", "hide", "high", "hill", "him", "his", "history", "hit", "hold", "hole", "home", "hope", "horse", "hot", "hour", "house", "how", "however", "huge", "hundred", "hung", "hunt", "hurry", "hurt", "i", "ice", "idea", "if", "ill", "important", "in", "inch", "indeed", "inside", "into", "is", "it", "its", "job", "join", "joy", "judge", "jump", "just", "keep", "kept", "kid", "kill", "kind", "king", "kiss", "kitchen", "knee", "knew", "know", "known", "lake", "land", "large", "last", "late", "laugh", "law", "lay", "lead", "learn", "least", "leave", "led", "left", "leg", "less", "let", "letter", "lie", "life", "lift", "light", "like", "line", "lion", "lip", "list", "listen", "little", "live", "lone", "long", "look", "lose", "lost", "lot", "love", "low", "luck", "made", "main", "make", "man", "many", "march", "mark", "may", "me", "mean", "meant", "meat", "meet", "men", "met", "mid", "middle", "might", "mile", "milk", 
"mill", "mind", "mine", "minute", "miss", "mix", "money", "month", "moon", "more", "morning", "most", "mother", "mountain", "mouse", "mouth", "move", "much", "must", "my", "name", "near", "neck", "need", "neighbor", "neither", "never", "new", "next", "nice", "night", "nine", "no", "none", "noise", "north", "nose", "not", "note", "nothing", "now", "nut", "of", "off", "office", "often", "oh", "old", "on", "once", "one", "only", "open", "or", "order", "other", "our", "out", "outside", "over", "own", "page", "paid", "pain", "paint", "pair", "paper", "part", "party", "pass", "past", "pay", "people", "perhaps", "person", "pick", "picture", "pie", "piece", "pig", "pink", "place", "plain", "plan", "plant", "play", "please", "point", "police", "pond", "poor", "post", "pot", "power", "press", "pretty", "price", "prince", "print", "promise", "prove", "pull", "push", "put", "queen", "question", "quick", "quiet", "quite", "rabbit", "race", "rain", "ran", "rather", "reach", "read", "ready", "real", "really", "reason", "red", "remember", "rest", "return", "rich", "ride", "right", "ring", "river", "road", "rock", "roll", "roof", "room", "rose", "round", "row", "rub", "run", "sad", "safe", "said", "same", "sang", "sat", "save", "saw", "say", "school", "sea", "seat", "second", "secret", "see", "seed", "seem", "seen", "self", "sell", "send", "sent", "serve", "set", "seven", "several", "shade", "shake", "shall", "shape", "share", "she", "sheep", "shelf", "shell", "shine", "ship", "shoe", "shoot", "shop", "shore", "short", "should", "show", "shut", "sick", "side", "sight", "sign", "silent", "silly", "silver", "since", "sing", "sister", "sit", "six", "size", "sky", "sleep", "slow", "small", "smell", "smile", "smoke", "snake", "snow", "so", "soap", "soft", "sold", "soldier", "some", "son", "song", "soon", "sound", "soup", "south", "speak", "spell", "spend", "spent", "spread", "spring", "stand", "star", "start", "state", "stay", "step", "stick", "still", "stone", "stop", "store", "story", 
"straight", "strange", "street", "string", "strong", "such", "sugar", "summer", "sun", "supper", "suppose", "sure", "surprise", "sweet", "swim", "table", "tail", "take", "talk", "tall", "taste", "teach", "team", "tear", "tell", "ten", "than", "thank", "that", "the", "their", "them", "then", "there", "these", "they", "thick", "thin", "thing", "think", "third", "this", "those", "though", "thought", "three", "threw", "throat", "through", "throw", "tie", "till", "time", "tin", "tiny", "tip", "tire", "to", "today", "toe", "told", "tomorrow", "tone", "too", "took", "top", "touch", "town", "track", "train", "tree", "trip", "true", "truly", "trust", "truth", "try", "turn", "twelve", "twenty", "two", "under", "until", "up", "upon", "us", "use", "used", "very", "view", "visit", "voice", "wait", "walk", "wall", "want", "war", "warm", "was", "wash", "watch", "water", "way", "we", "wear", "week", "weigh", "well", "went", "were", "west", "wet", "what", "wheel", "when", "where", "whether", "which", "while", "white", "who", "whole", "whose", "why", "wide", "wife", "wild", "will", "win", "wind", "winter", "wise", "wish", "with", "within", "without", "woke", "woman", "women", "wonder", "wood", "word", "wore", "work", "world", "worn", "worry", "would", "wound", "write", "written", "wrong", "wrote", "yard", "year", "yes", "yet", "you", "young", "your", # Plain modern additions outside the historic Dale-Chall list "online", "email", "phone", "mobile", "today", "okay", "list", "test", "try", "post", "blog", "page", "click", "type", "send", "free", "help", "user", "site", "data", "code", "team", "task", } # ============================================================================= # TEXT PROCESSING # ============================================================================= ABBREVIATIONS = [ "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Sr.", "Jr.", "U.S.", "U.K.", "E.U.", "i.e.", "e.g.", "etc.", "vs.", "Inc.", "Ltd.", "St.", "Ave.", "No.", "Vol.", "ch.", "ed.", ] def 
def strip_code_blocks(text):
    """Remove fenced code blocks and inline code from markdown.

    Fenced blocks (triple backticks) are removed first, non-greedily and
    across newlines; single-backtick inline spans are removed afterwards.
    """
    text = re.sub(r"```[\s\S]*?```", "", text)
    text = re.sub(r"`[^`]+`", "", text)
    return text


def split_sentences(text, abbreviations=None):
    """Split text into sentences. Imperfect but good enough.

    Periods inside known abbreviations are swapped for a NUL placeholder
    before splitting on whitespace that follows '.', '!' or '?', then
    restored afterwards.

    An abbreviation is only protected when it is not the tail of a longer
    word. Matching by bare substring let "ed." and "ch." fire inside
    "walked." or "watch.", which glued such sentences to the next one.

    Args:
        text: Prose to split.
        abbreviations: Abbreviations (with their periods) to protect.
            Defaults to the module-level ABBREVIATIONS list.

    Returns:
        List of stripped, non-empty sentence strings.

    Known limitation: an abbreviation that really ends a sentence
    ("... apples, etc. Then ...") still suppresses the break.
    """
    if abbreviations is None:
        abbreviations = ABBREVIATIONS

    def _protect(match):
        return match.group(0).replace(".", "\x00")

    protected = text
    for ab in abbreviations:
        # (?<!\w): the abbreviation must start at a word boundary of its own.
        protected = re.sub(r"(?<!\w)" + re.escape(ab), _protect, protected)
    sentences = re.split(r"(?<=[.!?])\s+", protected)
    return [s.replace("\x00", ".").strip() for s in sentences if s.strip()]


def split_paragraphs(text):
    """Split text into paragraphs by blank lines.

    Also treats lone-blockquote-marker lines (just '>') as paragraph
    separators — common in pasted letter / quoted-text formats where the
    user uses '>' to delimit blocks.

    Returns:
        List of stripped, non-empty paragraph strings.
    """
    # Treat lines that contain only ">" or whitespace+">" as blank
    text = re.sub(r"^\s*>\s*$", "", text, flags=re.MULTILINE)
    paras = re.split(r"\n\s*\n", text)
    return [p.strip() for p in paras if p.strip()]


def count_words(s):
    """Return the number of word tokens (runs of word characters) in *s*.

    An apostrophe splits a token, so "It's" counts as two words.
    """
    return len(re.findall(r"\b\w+\b", s))


def find_phrase_hits(text, phrases):
    """Return [(phrase, count), ...] for whole-word phrases (case-insensitive).

    Only phrases that occur at least once are reported, in the order given.
    """
    hits = []
    for phrase in phrases:
        # Word boundaries around the phrase
        pattern = r"\b" + re.escape(phrase) + r"\b"
        matches = re.findall(pattern, text, flags=re.IGNORECASE)
        if matches:
            hits.append((phrase, len(matches)))
    return hits


def find_regex_hits(text, patterns):
    """Return [(pattern, count, sample), ...] for each pattern with matches.

    Patterns are regex source strings, applied case-insensitively. The sample
    is the first match's full text, cut to 80 characters. It is taken from the
    whole match (group 0), so patterns with capture groups still yield
    readable samples rather than the group contents.
    """
    hits = []
    for pat in patterns:
        matched = [m.group(0) for m in re.finditer(pat, text, flags=re.IGNORECASE)]
        if matched:
            hits.append((pat, len(matched), matched[0][:80]))
    return hits
# =============================================================================
# DETECTORS
# =============================================================================

# NOTE(review): the two definitions that open this section are truncated in
# the reviewed copy of the file — the text between `re.findall(r"(?` and
# `>= min_run:` is missing (the end of find_em_dashes and the header and start
# of a run detector). They are not reconstructed here; the surviving text is
# kept verbatim below until the original can be restored:
#
#   def find_em_dashes(text): em = text.count("—") en = text.count("–")
#   double_hyphen = len(re.findall(r"(?= min_run: runs.append(list(current))
#   current = [] if len(current) >= min_run: runs.append(list(current))
#   return runs


def find_two_word_punchlines(sentences, short_max=4, long_min=15):
    """Flag a very short sentence that directly follows a long one.

    A sentence qualifies when it has at most `short_max` words and the
    sentence before it has at least `long_min` words.

    Returns:
        List of (index, sentence, word_count, first 80 chars of the preceding
        sentence), where index is the position of the short sentence.
    """
    punchlines = []
    neighbours = zip(sentences, sentences[1:])
    for idx, (setup, punch) in enumerate(neighbours, start=1):
        setup_wc = count_words(setup)
        punch_wc = count_words(punch)
        if setup_wc < long_min or punch_wc > short_max:
            continue
        punchlines.append((idx, punch, punch_wc, setup[:80]))
    return punchlines


def find_negation_reversal_candidates(sentences):
    """Collect sentences that open with a negation-reversal lead-in.

    Each sentence is tested against NEGATION_OPENERS (case-sensitively) and
    recorded at most once, with the first pattern that matched.

    Returns:
        List of (index, sentence, pattern).
    """
    candidates = []
    for idx, sentence in enumerate(sentences):
        opener = next(
            (pat for pat in NEGATION_OPENERS if re.search(pat, sentence)),
            None,
        )
        if opener is not None:
            candidates.append((idx, sentence, opener))
    return candidates


def find_cross_sentence_negation(sentences):
    """Detect "X isn't Y. It's Z." spread over two adjacent sentences.

    The first sentence must contain a negated be-verb and be at most 25 words
    long; the second must open with an affirming lead-in ("It's", "They are",
    "That is", ...) and be at most 15 words long.

    Returns:
        List of (index of the first sentence, first sentence, second sentence).
    """
    negated_be = re.compile(
        r"\b(?:isn't|is not|aren't|are not|wasn't|was not|weren't|were not)\b",
        re.IGNORECASE,
    )
    affirming_lead = re.compile(
        r"^\s*(?:It'?s|It is|They'?re|They are|That'?s|That is|What it is)\b",
        re.IGNORECASE,
    )
    pairs = []
    for idx, (setup, reveal) in enumerate(zip(sentences, sentences[1:])):
        # Long sentences do not read as a setup/reveal beat.
        compact = count_words(setup) <= 25 and count_words(reveal) <= 15
        if compact and negated_be.search(setup) and affirming_lead.search(reveal):
            pairs.append((idx, setup, reveal))
    return pairs
The negation-reveal pattern that the single-sentence regex misses.""" hits = [] neg_pattern = re.compile( r"\b(?:isn't|is not|aren't|are not|wasn't|was not|weren't|were not)\b", re.IGNORECASE, ) affirm_start = re.compile( r"^\s*(?:It'?s|It is|They'?re|They are|That'?s|That is|What it is)\b", re.IGNORECASE, ) for i in range(len(sentences) - 1): cur = sentences[i] nxt = sentences[i + 1] # Both sentences must be reasonably short for the pattern to read as setup-reveal if count_words(cur) > 25 or count_words(nxt) > 15: continue if neg_pattern.search(cur) and affirm_start.search(nxt): hits.append((i, cur, nxt)) return hits def find_dramatic_countdown(sentences): """Find 2+ consecutive short sentences starting with 'Not'.""" hits = [] for i in range(1, len(sentences)): prev = sentences[i - 1] cur = sentences[i] if ( count_words(prev) <= 8 and count_words(cur) <= 8 and re.match(r"^\s*Not\b", prev, re.IGNORECASE) and re.match(r"^\s*Not\b", cur, re.IGNORECASE) ): hits.append((i, [prev, cur])) return hits def find_anaphora(sentences, min_run=3): """3+ consecutive sentences starting with the same 2-word opening.""" hits = [] if len(sentences) < min_run: return hits current_run = [0] for i in range(1, len(sentences)): prev_words = sentences[i - 1].split()[:2] cur_words = sentences[i].split()[:2] if ( len(prev_words) == 2 and len(cur_words) == 2 and prev_words[0].lower() == cur_words[0].lower() and prev_words[1].lower() == cur_words[1].lower() ): current_run.append(i) else: if len(current_run) >= min_run: hits.append([(idx, sentences[idx]) for idx in current_run]) current_run = [i] if len(current_run) >= min_run: hits.append([(idx, sentences[idx]) for idx in current_run]) return hits def find_three_beat_stacks(text): """Heuristic: 'word, word, and word' pattern.""" pattern = r"\b(\w+(?:\s+\w+)?)\s*,\s*(\w+(?:\s+\w+)?)\s*,\s*and\s+(\w+(?:\s+\w+)?)\b" return re.findall(pattern, text) def find_setup_reveal_endings(paragraphs): """Paragraphs ending with a setup-reveal phrase.""" 
def find_hedge_stacking(sentences):
    """Report sentences that contain three or more hedge matches.

    Every pattern in HEDGE_WORDS is searched case-insensitively and all
    matches within a sentence are added together.

    Returns a list of (index, sentence, hedge_count) tuples.
    """
    flagged = []
    for idx, sentence in enumerate(sentences):
        hedge_total = sum(
            len(re.findall(hedge, sentence, flags=re.IGNORECASE))
            for hedge in HEDGE_WORDS
        )
        if hedge_total >= 3:
            flagged.append((idx, sentence, hedge_total))
    return flagged
def contraction_ratio(text):
    """Return the share of contracted forms among contraction opportunities.

    The ratio is ``contractions / (contractions + expansions)`` rounded to
    two decimals, where "expansions" are spelled-out phrases that could
    have been contracted ("do not", "it is", ...). 0 means a formal / AI
    lean; the result is None when neither form occurs.

    Contractions are matched with either the ASCII apostrophe or the
    typographic apostrophe (U+2019), in any letter case, so text with
    "smart quotes" is not misread as contraction-free. Expansions may be
    split by any whitespace, including a line wrap.

    Note: the ``'s`` suffix is counted as a contraction, so possessives
    ("John's") are included in the contraction count.
    """
    contracted = re.findall(
        r"\b\w+['\u2019](?:s|t|re|ve|ll|d|m)\b",
        text,
        flags=re.IGNORECASE,
    )

    expanded_forms = (
        "do not", "does not", "did not", "will not", "would not",
        "could not", "should not", "cannot", "can not", "is not",
        "are not", "was not", "were not", "has not", "have not",
        "had not", "it is", "that is", "there is", "i am",
    )
    # Allow any run of whitespace between the words so hard-wrapped prose
    # ("do\nnot") still counts as an expansion.
    expansion_pattern = (
        r"\b(?:"
        + "|".join(form.replace(" ", r"\s+") for form in expanded_forms)
        + r")\b"
    )
    expanded = re.findall(expansion_pattern, text, flags=re.IGNORECASE)

    total = len(contracted) + len(expanded)
    if total == 0:
        return None
    return round(len(contracted) / total, 2)
def find_markdown_tells(text):
    """Collect markdown formatting tells found in `text`.

    Returns a dict that contains a key only for each tell that fired:
    - "bold_first_bullets": count of bullets opening with **bold** followed
      by ':' or '.', reported when there are 3 or more.
    - "emoji_bullets": count of lines that start with one of the listed
      emoji, reported when there is at least one.
    - "excessive_headers": h2/h3 counts and the word count, reported when
      h2 + h3 exceeds one heading per 200 words.
    - "clichéd_title_patterns": the heading matches for stock title
      formulas ("The Ultimate Guide to", ...), reported when non-empty.
    """

    def count_lines(pattern):
        # Number of line-anchored matches of `pattern` in the text.
        return len(re.findall(pattern, text, flags=re.MULTILINE))

    tells = {}

    bold_bullets = count_lines(r"^\s*[-*]\s+\*\*[^*]+\*\*\s*[:.]")
    if bold_bullets >= 3:
        tells["bold_first_bullets"] = bold_bullets

    emoji_bullets = count_lines(r"^\s*[🔹✨📌📍🎯💡⭐🚀🔥]")
    if emoji_bullets >= 1:
        tells["emoji_bullets"] = emoji_bullets

    h2_total = count_lines(r"^##\s+")
    h3_total = count_lines(r"^###\s+")
    total_words = count_words(text)
    if total_words > 0 and (h2_total + h3_total) > total_words / 200:
        tells["excessive_headers"] = {
            "h2": h2_total,
            "h3": h3_total,
            "word_count": total_words,
        }

    cliche_titles = re.findall(
        r"^#+\s+(?:[\w\s]+:\s+(?:A|The|Your|Everything)\s+(?:Comprehensive|Ultimate|Definitive|Complete)\s+Guide|The Ultimate Guide to|Everything You Need to Know|How to \w+ in 20\d{2})",
        text,
        flags=re.MULTILINE | re.IGNORECASE,
    )
    if cliche_titles:
        tells["clichéd_title_patterns"] = cliche_titles

    return tells
def count_syllables(word):
    """Estimate the number of syllables in `word` with a vowel-group heuristic.

    The word is lower-cased and everything except a-z is removed. Words of
    up to three letters count as one syllable. Otherwise each run of
    vowels (a, e, i, o, u, y) is one syllable, and a trailing 'e' that is
    not part of a final "le" is treated as silent. The result is at least
    1 for any word that contains letters and 0 for input without letters.

    Approximate but stdlib-only. Used for Flesch / FK / SMOG.
    """
    letters = re.sub(r"[^a-z]", "", word.lower().strip())
    if not letters:
        return 0
    if len(letters) <= 3:
        return 1

    groups = len(re.findall(r"[aeiouy]+", letters))

    # A final "le" keeps its syllable (e.g. "table" = 2); any other final
    # 'e' is dropped as silent, but never below one syllable.
    silent_e = letters.endswith("e") and not letters.endswith("le")
    if silent_e and groups > 1:
        groups -= 1

    return max(1, groups)
Named-entity bombing — capitalized non-sentence-start tokens. Heuristic — no NER. Counts capitalized words that aren't: - first word of a sentence - common acronyms - first word of a heading line (markdown # ## ###) - the pronoun "I" - month/day-of-week (very common false positives) """ # Build a set of words occurring as sentence-initial. We approximate by # taking the first non-trivial word of each sentence. sentence_starts = set() for s in sentences: ws = re.findall(r"\b\w+\b", s) if ws: sentence_starts.add(ws[0]) common_calendar = { "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", } # Strip markdown heading lines (count their tokens but don't double-flag the first word) body = re.sub(r"^#+\s+(.*)$", r"\1", text, flags=re.MULTILINE) # Scan for capitalized tokens that aren't sentence-initial tokens = re.findall(r"\b([A-Z][a-zA-Z]+)\b", body) counts = {} for tok in tokens: if tok in common_calendar: continue if tok in KNOWN_ACRONYMS: continue if tok in sentence_starts and (tok in {"The", "This", "That", "These", "Those", "It", "We", "I", "You", "They", "He", "She", "A", "An", "Our", "Your", "My", "Their", "His", "Her", "If", "When", "Where", "What", "Why", "How", "After", "Before", "In", "On", "At", "From", "To", "But", "And", "Or", "So", "Then", "While", "Now", "First", "Last", "Second", "Third", "Most", "Some", "All", "Each", "Every", "No", "Yes", "Note", "TLDR", "TL", "Re", "Over", "Since", "Until", "About", "Among", "Through", "During", "Within", "Across", "Despite", "Although", "Though", "Because", "Whereas", "Without", "With", "Like", "Unlike", "Once", "Twice", "Whether", "However", "Moreover", "Therefore", "Thus", "Hence", "Otherwise", "Even", "Still", "Just", "Only", "Already", "Yet", "Sometimes", "Often", "Rarely", "Always", "Never", "Maybe", "Perhaps", "Probably"}): continue counts[tok] = counts.get(tok, 
def find_density_without_headings(text):
    """F5. Flag long text that has no headings or too few of them.

    Only '##' and '###' headings are counted. The text is flagged when it
    has 500+ words and no such heading, or when it has 300+ words and
    more than 300 words per heading.

    Returns a dict with "flagged", "h_count" and "words"; flagged results
    also carry a human-readable "reason".
    """
    heading_total = sum(
        len(re.findall(marker, text, flags=re.MULTILINE))
        for marker in (r"^##\s+", r"^###\s+")
    )
    word_total = count_words(text)

    if heading_total == 0:
        if word_total >= 500:
            return {
                "flagged": True,
                "reason": "500+ words, zero headings",
                "h_count": 0,
                "words": word_total,
            }
    elif word_total >= 300 and (word_total / heading_total) > 300:
        return {
            "flagged": True,
            "reason": f"heading density too low ({heading_total} headings for {word_total} words)",
            "h_count": heading_total,
            "words": word_total,
        }

    return {"flagged": False, "h_count": heading_total, "words": word_total}
def find_runon_sentences(sentences, clause_threshold=4):
    """G4. Flag sentences with `clause_threshold`+ clause joins.

    A clause join is any of: a comma followed by a listed conjunction
    (and, but, or, so, yet, because, while, although, though, however,
    since, whereas; case-insensitive), an em dash, or a semicolon.

    Returns a list of (index, join_count, first 120 characters) tuples.
    """
    joiner = re.compile(
        r",\s+(and|but|or|so|yet|because|while|although|though|however|since|whereas)\b",
        re.IGNORECASE,
    )

    def clause_joins(sentence):
        # Comma+conjunction joins plus em-dash and semicolon joins.
        return (
            sum(1 for _ in joiner.finditer(sentence))
            + sentence.count("—")
            + sentence.count(";")
        )

    scored = ((idx, clause_joins(s), s) for idx, s in enumerate(sentences))
    return [
        (idx, joins, s[:120])
        for idx, joins, s in scored
        if joins >= clause_threshold
    ]
def find_parallelism_failure(text):
    """I12. Flag bullet lists whose items mix grammatical forms.

    Consecutive bullet lines ('-', '*' or '+' followed by whitespace) form
    a block; any non-bullet line, including a blank one, ends the block.
    Only blocks with 4+ bullets are examined. Each bullet is classified by
    a simple heuristic, checked in this order:

    - "question": the item ends with '?' (ignoring trailing '.' / '!').
    - "gerund":   the first word ends with "ing".
    - "noun":     the first word is capitalized and longer than one char.
    - "verb":     anything else (assumed imperative).

    Surrounding markdown emphasis asterisks are removed before
    classification, so a bold-wrapped question such as ``**Why?**`` is
    still recognised as a question. Items without any word are ignored.

    Returns a list of dicts, one per block with 3+ distinct forms, holding
    "block_size", the sorted "forms" and a "sample" of the first 3 items.
    """
    bullet_line = re.compile(r"^\s*[-*+]\s+(.+)$")

    # Pass 1: group consecutive bullet lines into blocks of 4+ items.
    blocks = []
    current = []
    for line in text.split("\n"):
        m = bullet_line.match(line)
        if m:
            current.append(m.group(1))
            continue
        if len(current) >= 4:
            blocks.append(current)
        current = []
    if len(current) >= 4:
        blocks.append(current)

    # Pass 2: classify each item and flag blocks with 3+ distinct forms.
    flagged = []
    for block in blocks:
        forms = set()
        for item in block:
            # Strip emphasis markers on BOTH sides; stripping only the
            # leading ones leaves "Why?**", which hides the question mark.
            clean = item.strip().strip("*").strip()
            words = re.findall(r"\b\w+\b", clean)
            if not words:
                continue
            first = words[0]
            if clean.rstrip(".!").endswith("?"):
                forms.add("question")
            elif first.lower().endswith("ing"):
                forms.add("gerund")
            elif first[0].isupper() and len(first) > 1:
                # Title-case starting word: likely a noun phrase.
                forms.add("noun")
            else:
                # Lowercase starting word: assume an imperative verb.
                forms.add("verb")
        if len(forms) >= 3:
            flagged.append({
                "block_size": len(block),
                "forms": sorted(forms),
                "sample": block[:3],
            })
    return flagged
def acronym_window_compound(text, words_per_window=100):
    """Compound trigger: 4+ DISTINCT undefined acronyms in one word window.

    The text is tokenized and scanned with overlapping windows of
    `words_per_window` tokens, advancing a quarter window at a time. A
    final window anchored at the end of the text is always included, so
    tokens after the last stride-aligned window are not skipped.

    The threshold is on distinct acronyms (not occurrences) to avoid
    escalating when one acronym is repeated. The spec says "3+ undefined
    acronyms", but with instance counting that fires too readily on
    cover-letter prose with a few project names, hence 4 distinct ones.

    An acronym counts as undefined when it is neither in KNOWN_ACRONYMS
    nor captured by PAREN_EXPANSION anywhere in the text.

    Returns True if any window reaches the threshold. Texts shorter than
    one window always return False.
    """
    word_tokens = re.findall(r"\b\S+\b", text)
    if len(word_tokens) < words_per_window:
        return False

    introduced = set(PAREN_EXPANSION.findall(text))

    stride = max(1, words_per_window // 4)
    last_start = len(word_tokens) - words_per_window
    starts = list(range(0, last_start + 1, stride))
    # The stride rarely lands exactly on the last possible window; add it
    # explicitly so a dense cluster at the very end is still inspected.
    if starts[-1] != last_start:
        starts.append(last_start)

    for start in starts:
        window = " ".join(word_tokens[start : start + words_per_window])
        undefined = {
            acro
            for acro in ACRONYM_TOKEN.findall(window)
            if acro not in KNOWN_ACRONYMS and acro not in introduced
        }
        if len(undefined) >= 4:
            return True
    return False
""" word_tokens = re.findall(r"\b\S+\b", text) if len(word_tokens) < words_per_window: return False sentence_starts = set() for s in sentences: ws = re.findall(r"\b\w+\b", s) if ws: sentence_starts.add(ws[0]) common_calendar = { "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", } common_starts = {"The", "This", "That", "These", "Those", "It", "We", "I", "You", "They", "He", "She", "A", "An", "Our", "Your", "My", "Their", "If", "When", "Where", "What", "Why", "How", "After", "Before", "In", "On", "At", "From", "To", "But", "And", "Or", "So", "Then"} for start in range(0, len(word_tokens) - words_per_window + 1, max(1, words_per_window // 4)): window = " ".join(word_tokens[start : start + words_per_window]) # Find capitalized tokens toks = re.findall(r"\b([A-Z][a-zA-Z]+)\b", window) ents = [t for t in toks if t not in common_calendar and t not in KNOWN_ACRONYMS and not (t in sentence_starts and t in common_starts)] # Use distinct count — repeated mentions of the same brand shouldn't escalate. if len(set(ents)) >= 7: return True return False def long_paragraph_no_subheading(text, paragraphs, threshold_words=200): """Compound trigger: any paragraph >200 words with no subheading inside. Threshold 200 (rather than 150) avoids escalating standard letter / essay paragraphs. The trigger fires for genuinely intimidating wall-of-text blocks that lack any internal structure — see calibration.md §9. 
""" for p in paragraphs: if count_words(p) > threshold_words: if not re.search(r"^#+\s+", p, flags=re.MULTILINE): return True return False # ============================================================================= # READABILITY METRICS # ============================================================================= def compute_readability_metrics(text, sentences, words_total): """Compute the 8 metrics from references/readability-metrics.md. Returns a dict with all metric values rounded to 2 decimal places (or None if the input is too short to compute reliably). """ if not sentences or words_total < 5: return { "flesch_reading_ease": None, "flesch_kincaid_grade": None, "smog": None, "coleman_liau": None, "dale_chall": None, "lexical_density": None, "avg_sentence_length": None, "sentence_length_stddev": None, "passive_voice_pct": None, "polysyllable_count": None, "difficult_word_pct": None, } # Tokenize words for syllable + Dale-Chall accounting word_list = re.findall(r"\b[a-zA-Z']+\b", text) word_count = len(word_list) if word_count == 0: return { "flesch_reading_ease": None, "flesch_kincaid_grade": None, "smog": None, "coleman_liau": None, "dale_chall": None, "lexical_density": None, "avg_sentence_length": None, "sentence_length_stddev": None, "passive_voice_pct": None, "polysyllable_count": None, "difficult_word_pct": None, } # Syllable totals syllables_total = 0 polysyllables = 0 for w in word_list: s = count_syllables(w) syllables_total += s if s >= 3: polysyllables += 1 sentence_count = len(sentences) asl = word_count / sentence_count # avg sentence length asw = syllables_total / word_count # avg syllables per word # 1. Flesch Reading Ease fre = 206.835 - 1.015 * asl - 84.6 * asw # 2. Flesch-Kincaid Grade Level fkgl = 0.39 * asl + 11.8 * asw - 15.59 # 3. SMOG (only meaningful for ≥30 sentences) smog = 1.0430 * math.sqrt(polysyllables * 30 / sentence_count) + 3.1291 # 4. 
def analyze_comprehension(text, audience="casual", sentences=None, paragraphs=None, total_words=None):
    """Run the comprehension axis and return a structured result.

    Mirrors the structure of the AI-slop axis in analyze().

    Args:
        text: raw draft text. Code blocks are stripped before most
            detectors run; the heading / bold / bullet detectors
            (density-without-headings, skim layer, hierarchy, parallelism,
            long-paragraph trigger) receive the raw text.
        audience: key into AUDIENCE_PRESETS; unknown values fall back to
            the "casual" preset. "academic" and "technical" may downgrade
            the verdict by one tier when long sentences dominate.
        sentences, paragraphs, total_words: optional precomputed values;
            each is derived from the code-stripped text when None.

    Scoring: every detector contributes high (x3), medium (x1) or low
    (x0.25) points; the sum is divided by the number of 500-word units
    (minimum 1) to give the density, which maps to PASS / LOW / MEDIUM /
    HIGH / CRITICAL. Any compound trigger escalates the verdict one tier.

    Returns:
        dict with "verdict", "density", "totals", "audience",
        "audience_targets", "audience_adjustment", "escalations",
        "patterns" (raw detector output per pattern id) and "metrics".
    """
    tier_order = ["PASS", "LOW", "MEDIUM", "HIGH", "CRITICAL"]

    clean = strip_code_blocks(text)
    if sentences is None:
        sentences = split_sentences(clean)
    if paragraphs is None:
        paragraphs = split_paragraphs(clean)
    if total_words is None:
        total_words = sum(count_words(s) for s in sentences)

    # Detector outputs
    acronyms = find_undefined_acronyms(clean)
    entities = find_named_entities(clean, sentences)
    stat_bomb = find_stat_bombing(sentences)
    walls = find_wall_of_text(paragraphs)
    density_no_h = find_density_without_headings(text)
    colons = find_telegraphic_colons(paragraphs)
    list_prose = find_list_pretending_prose(paragraphs)
    long_sents = find_long_sentences(sentences, threshold=30)
    runons = find_runon_sentences(sentences)
    glue = find_glue_word_starts(sentences)
    forward = find_forward_references(clean)
    skim = find_no_skim_layer(text, total_words)
    hierarchy = find_hierarchy_collapse(text)
    parallelism = find_parallelism_failure(text)
    passive = count_passive_voice(sentences)
    nominalizations = count_nominalizations(clean)
    hedge_stack = find_hedge_stacking(sentences)
    decorative = count_decorative_qualifiers(clean)
    negatives = find_negative_constructions(clean)

    # Readability metrics panel
    metrics = compute_readability_metrics(clean, sentences, total_words)

    # Severity counting per comprehension.md / calibration.md §9
    high = 0
    medium = 0
    low = 0

    # F1: H if density >= 3 per 100 words, plus 1 H per 5 occurrences.
    if acronyms["density_per_100w"] >= 3:
        high += 1
    if acronyms["total_count"] >= 5:
        high += acronyms["total_count"] // 5

    # F2: H if density >= 5 per 100 words. Extra H per 10 entities only
    # when the density is well above threshold (>= 8 per 100 words).
    if entities["density_per_100w"] >= 5:
        high += 1
    if entities["density_per_100w"] >= 8 and entities["total_count"] >= 10:
        high += entities["total_count"] // 10

    # F3: H per stat-bombed sentence
    high += len(stat_bomb)

    # F4: M per wall paragraph
    medium += len(walls)

    # F5: H if flagged
    if density_no_h["flagged"]:
        high += 1

    # G1: H per paragraph with 3+ telegraphic colons
    high += len(colons)

    # G2: M per list-pretending-to-be-prose paragraph
    medium += len(list_prose)

    # G3: M per long sentence (31-40 words), H per very long (>40 words).
    # 30+ word sentences are common in polished prose; only the 40-word
    # "comprehension cliff" counts as high severity.
    very_long = [s for s in long_sents if s[1] > 40]
    moderate_long = [s for s in long_sents if 30 < s[1] <= 40]
    high += len(very_long)
    medium += len(moderate_long)

    # G4: H per run-on sentence
    high += len(runons)

    # G5: L per glue-word instance
    low += len(glue)

    # H5: H per forward-reference match
    high += sum(c for _, c, _ in forward)

    # I9: M if no skim layer
    if skim["flagged"]:
        medium += 1

    # I5: M per hierarchy skip
    medium += len(hierarchy)

    # I12: M per parallelism block
    medium += len(parallelism)

    # J1: M if passive % > 10
    if passive["percent"] > 10:
        medium += 1

    # J2: M if nominalization density > 5 per 100w
    if nominalizations["density_per_100w"] > 5:
        medium += 1

    # J4: M per hedge-stacked sentence (reuses the AI-slop detector)
    medium += len(hedge_stack)

    # J5: once density exceeds 2 per 100w, every decorative qualifier
    # instance counts as one L point (the full count, not just overage).
    if decorative["density_per_100w"] > 2:
        low += decorative["count"]

    # J8: L per negative construction match
    low += sum(c for _, c, _ in negatives)

    # Density score per calibration.md §9
    units = max(1, total_words / 500)
    comp_density = ((high * 3) + (medium * 1) + (low * 0.25)) / units

    # Verdict thresholds: same scale as AI-Slop (calibration.md §9)
    if comp_density >= 18:
        verdict = "CRITICAL"
    elif comp_density >= 10:
        verdict = "HIGH"
    elif comp_density >= 5:
        verdict = "MEDIUM"
    elif comp_density >= 2:
        verdict = "LOW"
    else:
        verdict = "PASS"

    # Compound triggers: any of them escalates the verdict by one tier.
    escalations = []
    if acronym_window_compound(clean):
        escalations.append("4+ distinct undefined acronyms in a 100-word window")
    if named_entity_window_compound(clean, sentences):
        escalations.append("7+ distinct named entities in a 100-word window")
    if any(c >= 3 for _, c, _ in colons):
        escalations.append("3+ telegraphic colon-labels in one paragraph")
    if long_paragraph_no_subheading(text, paragraphs):
        # The trigger runs with its default threshold of 200 words, so the
        # message reports 200 (it previously claimed 150).
        escalations.append("Paragraph over 200 words with no subheading")
    if escalations:
        idx = tier_order.index(verdict)
        verdict = tier_order[min(len(tier_order) - 1, idx + 1)]

    # Audience adjustment: relax the verdict for academic/technical readers.
    audience_preset = AUDIENCE_PRESETS.get(audience, AUDIENCE_PRESETS["casual"])
    audience_adjustment = None
    if audience in ("academic", "technical"):
        # Share of the weighted score attributed to long sentences.
        # NOTE(review): every long sentence is weighted 3 here although
        # 31-40 word sentences only added 1 point above; confirm intent.
        long_sent_share = (
            len(long_sents) * 3 / max(1, comp_density * units)
            if comp_density > 0 else 0
        )
        if long_sent_share > 0.4 and verdict in ("HIGH", "MEDIUM"):
            idx = tier_order.index(verdict)
            verdict = tier_order[max(0, idx - 1)]
            audience_adjustment = (
                f"{audience} audience: downgraded one tier "
                f"(long sentences expected in this register)"
            )

    return {
        "verdict": verdict,
        "density": round(comp_density, 2),
        "totals": {"high": high, "medium": medium, "low": low},
        "audience": audience,
        "audience_targets": audience_preset,
        "audience_adjustment": audience_adjustment,
        "escalations": escalations,
        "patterns": {
            "F1_undefined_acronyms": acronyms,
            "F2_named_entities": entities,
            "F3_stat_bombing": stat_bomb,
            "F4_wall_of_text": walls,
            "F5_density_no_headings": density_no_h,
            "G1_telegraphic_colons": colons,
            "G2_list_as_prose": list_prose,
            "G3_long_sentences": long_sents,
            "G4_runon_sentences": runons,
            "G5_glue_word_starts": glue,
            "H5_forward_references": forward,
            "I5_hierarchy_collapse": hierarchy,
            "I9_no_skim_layer": skim,
            "I12_parallelism_failure": parallelism,
            "J1_passive_voice": passive,
            "J2_nominalizations": nominalizations,
            "J4_hedge_stacking": hedge_stack,
            "J5_decorative_qualifiers": decorative,
            "J8_negative_constructions": negatives,
        },
        "metrics": metrics,
    }
def _phrase_total(hits):
    """Sum the counts of (phrase, count) pairs."""
    return sum(count for _, count in hits)


def _regex_total(hits):
    """Sum the counts of (pattern, count, sample) triples."""
    return sum(count for _, count, _ in hits)


def analyze(text, genre=None, strict_em_dash=False, audience="casual"):
    """Run the full scan (both axes) and return a structured result.

    Args:
        text: Raw input. Fenced code is stripped before the AI-slop checks;
            the markdown-tell check and the comprehension axis receive the
            raw text.
        genre: Optional genre override. When falsy, the detected genre is
            applied.
        strict_em_dash: When True every em dash stays a high-severity tell.
            Otherwise em dashes are moved to low severity unless there are
            at least 3 per 500 words.
        audience: Audience name forwarded to ``analyze_comprehension``.

    Returns:
        dict with keys ``stats``, ``high``, ``medium``, ``low`` (raw
        findings), ``verdict``, ``totals``, ``density``, ``calibration``,
        ``comprehension`` and ``combined_recommendation``.
    """
    clean = strip_code_blocks(text)
    paragraphs = split_paragraphs(clean)
    sentences = split_sentences(clean)
    word_counts = [count_words(s) for s in sentences]
    total_words = sum(word_counts)
    em, en, dh = find_em_dashes(clean)

    # Genre detection
    detected_genre = detect_genre(clean)
    if not genre:
        genre = detected_genre

    # Model fingerprint
    fingerprint, fp_counts = detect_model_fingerprint(clean)

    # Burstiness
    burst = compute_burstiness(sentences)

    # Computed once and split by run length below (4+ is H, exactly 3 is M).
    short_clusters = list(find_short_sentence_clusters(sentences))

    # A single entry holding all matches, only when 3+ analogies are stacked.
    historical_matches = HISTORICAL_ANALOGY.findall(clean)
    historical_stacking = [historical_matches] if len(historical_matches) >= 3 else []

    # Dead metaphors used 3+ times; each pattern is searched once.
    dead_metaphor_hits = []
    for metaphor in DEAD_METAPHORS:
        uses = len(re.findall(r"\b" + metaphor + r"\b", clean, flags=re.IGNORECASE))
        if uses >= 3:
            dead_metaphor_hits.append((metaphor, uses))

    # Build results
    result = {
        "stats": {
            "words": total_words,
            "paragraphs": len(paragraphs),
            "sentences": len(sentences),
            "sentence_avg": round(total_words / len(word_counts), 1) if word_counts else 0,
            "sentence_min": min(word_counts) if word_counts else 0,
            "sentence_max": max(word_counts) if word_counts else 0,
            "burstiness": burst,
            "contraction_ratio": contraction_ratio(clean),
            "detected_genre": detected_genre,
            "applied_genre": genre,
            "model_fingerprint": fingerprint,
            "fingerprint_counts": fp_counts,
        },
        "high": {
            "em_dashes": em,
            "en_dashes": en,
            "double_hyphens": dh,
            "verbs_h": find_phrase_hits(clean, VERBS_H),
            "nouns_h": find_phrase_hits(clean, NOUNS_H),
            "intensifiers_h": find_phrase_hits(clean, INTENSIFIERS_H),
            "connectors_h": find_phrase_hits(clean, CONNECTORS_H),
            "sycophancy_open": find_regex_hits(clean, SYCOPHANCY_OPEN_H),
            "sycophancy_close": find_regex_hits(clean, SYCOPHANCY_CLOSE_H),
            "vague_authority_h": find_regex_hits(clean, VAGUE_AUTH_H),
            "knowledge_cutoff": find_regex_hits(clean, KNOWLEDGE_CUTOFF),
            "stake_inflation": find_regex_hits(clean, STAKE_INFLATION),
            "grandiose": find_regex_hits(clean, GRANDIOSE),
            "copula_avoidance": find_regex_hits(clean, COPULA_AVOIDANCE),
            "ing_tails": ING_TAIL.findall(clean),
            "throat_clearing": find_regex_hits(clean, THROAT_CLEARING),
            "rhetorical_qa": RHETORICAL_QA.findall(clean),
            "crafted_closer": find_crafted_closer(clean),
            "performative_opening": find_performative_opening(clean),
            "setup_reveal_endings": find_setup_reveal_endings(paragraphs),
            "fabricated_cases": find_fabricated_cases(clean),
            "buzzword_density": find_buzzword_density(paragraphs),
            "negation_reversals": find_negation_reversal_candidates(sentences),
            "cross_sentence_negation": find_cross_sentence_negation(sentences),
            "short_sentence_clusters_h": [r for r in short_clusters if len(r) >= 4],
        },
        "medium": {
            "dramatic_countdown": find_dramatic_countdown(sentences),
            "anaphora": find_anaphora(sentences),
            "short_sentence_clusters_m": [r for r in short_clusters if len(r) == 3],
            "two_word_punchlines": find_two_word_punchlines(sentences),
            "three_beat_stacks": find_three_beat_stacks(clean),
            "verbs_m": find_phrase_hits(clean, VERBS_M),
            "nouns_m": find_phrase_hits(clean, NOUNS_M),
            "intensifiers_m": find_phrase_hits(clean, INTENSIFIERS_M),
            "connectors_m": find_phrase_hits(clean, CONNECTORS_M),
            "vague_authority_m": find_regex_hits(clean, VAGUE_AUTH_M),
            "hedge_stacking": find_hedge_stacking(sentences),
            "hedged_superlatives": find_regex_hits(clean, HEDGED_SUPERLATIVES),
            "while_openers": find_while_openers(clean),
            "x_meets_y": len(X_MEETS_Y.findall(clean)),
            "more_than_just": len(MORE_THAN_JUST.findall(clean)),
            "false_concession": find_regex_hits(clean, FALSE_CONCESSION),
            "false_range": len(FALSE_RANGE.findall(clean)),
            "pedagogical": find_regex_hits(clean, PEDAGOGICAL),
            "royal_we": find_regex_hits(clean, ROYAL_WE),
            "whether_or_openers": len(WHETHER_OR.findall(clean)),
            "both_sides_ism": find_regex_hits(clean, BOTH_SIDES),
            "real_tic": len(REAL_TIC.findall(clean)),
            "vapid_analogies": find_regex_hits(clean, VAPID_ANALOGY),
            "historical_analogy_stacking": historical_stacking,
            "dead_metaphor_repetition": dead_metaphor_hits,
        },
        "low": {
            "magic_adverbs": find_phrase_hits(clean, MAGIC_ADVERBS),
            "bigram_repetition": find_bigram_repetition(clean),
            "markdown_tells": find_markdown_tells(text),
        },
    }
    high = result["high"]
    medium = result["medium"]
    low = result["low"]

    # Compute counts
    intensifier_h_total = _phrase_total(high["intensifiers_h"])
    high_count = (
        high["em_dashes"]
        + high["en_dashes"]
        + high["double_hyphens"]
        + _phrase_total(high["verbs_h"])
        + _phrase_total(high["nouns_h"])
        + intensifier_h_total
        + _phrase_total(high["connectors_h"])
        + _regex_total(high["sycophancy_open"])
        + _regex_total(high["sycophancy_close"])
        + _regex_total(high["vague_authority_h"])
        + _regex_total(high["knowledge_cutoff"])
        + _regex_total(high["stake_inflation"])
        + _regex_total(high["grandiose"])
        + _regex_total(high["copula_avoidance"])
        + len(high["ing_tails"])
        + _regex_total(high["throat_clearing"])
        + len(high["rhetorical_qa"])
        + (1 if high["crafted_closer"] else 0)
        + (1 if high["performative_opening"] else 0)
        + len(high["setup_reveal_endings"])
        + len(high["fabricated_cases"])
        + len(high["buzzword_density"])
        + len(high["negation_reversals"])
        + len(high["cross_sentence_negation"])
        + len(high["short_sentence_clusters_h"])
    )
    medium_count = (
        len(medium["dramatic_countdown"])
        + len(medium["anaphora"])
        + len(medium["short_sentence_clusters_m"])
        + len(medium["two_word_punchlines"])
        + len(medium["three_beat_stacks"])
        + _phrase_total(medium["verbs_m"])
        + _phrase_total(medium["nouns_m"])
        + _phrase_total(medium["intensifiers_m"])
        + _phrase_total(medium["connectors_m"])
        + _regex_total(medium["vague_authority_m"])
        + len(medium["hedge_stacking"])
        + _regex_total(medium["hedged_superlatives"])
        + medium["while_openers"]
        + medium["x_meets_y"]
        + medium["more_than_just"]
        + _regex_total(medium["false_concession"])
        + medium["false_range"]
        + _regex_total(medium["pedagogical"])
        + _regex_total(medium["royal_we"])
        + medium["whether_or_openers"]
        + _regex_total(medium["both_sides_ism"])
        + medium["real_tic"]
        + _regex_total(medium["vapid_analogies"])
        + len(medium["historical_analogy_stacking"])
        + len(medium["dead_metaphor_repetition"])
    )
    low_count = (
        _phrase_total(low["magic_adverbs"])
        + len(low["bigram_repetition"])
        + len(low["markdown_tells"])
    )

    # Apply genre adjustments
    if genre == "marketing":
        # Marketing legitimately uses some intensifiers: discount 30% of the
        # H-intensifier hits.
        high_count = max(0, high_count - int(0.3 * intensifier_h_total))
    elif genre == "academic":
        # Academic legitimately uses hedging
        medium_count = max(0, medium_count - len(medium["hedge_stacking"]))
    elif genre == "encyclopedic":
        # Wikipedia-style triggers false positives: forgive two H and two M hits.
        high_count = max(0, high_count - 2)
        medium_count = max(0, medium_count - 2)

    # Em dashes alone = L unless 3+ per 500 words. In strict mode they stay
    # counted as H, so nothing needs to move.
    if not strict_em_dash and total_words > 0 and em < (3 * total_words / 500):
        # Move em dashes from high to low. Clamp at zero: a genre adjustment
        # above may already have reduced high_count below the em-dash count.
        high_count = max(0, high_count - em)
        low_count += em

    # Compute density score per calibration.md §1
    units = max(1, total_words / 500)
    density = ((high_count * 3) + (medium_count * 1) + (low_count * 0.25)) / units

    # Verdict thresholds
    if density >= 18:
        verdict = "CRITICAL"
    elif density >= 10:
        verdict = "HIGH"
    elif density >= 5:
        verdict = "MEDIUM"
    elif density >= 2:
        verdict = "LOW"
    else:
        verdict = "PASS"

    # Compound triggers
    escalated = False

    # Three or more H vocabulary tells in one paragraph. Patterns are compiled
    # once rather than once per paragraph.
    h_patterns = [
        re.compile(r"\b" + re.escape(phrase) + r"\b", re.IGNORECASE)
        for phrases in (VERBS_H, NOUNS_H, INTENSIFIERS_H, CONNECTORS_H)
        for phrase in phrases
    ]
    paragraphs_with_h = []
    for para in paragraphs:
        h_in_para = sum(len(pattern.findall(para)) for pattern in h_patterns)
        if h_in_para >= 3:
            paragraphs_with_h.append((para[:80], h_in_para))
    if paragraphs_with_h:
        escalated = True

    # Uncanny valley: no H tells, many weak tells, low burstiness
    uncanny_valley = False
    if (
        high_count == 0
        and (medium_count + low_count) >= 8 * units
        and burst is not None
        and burst < 0.5
    ):
        uncanny_valley = True
        escalated = True

    # Escalate at most one tier, even when both triggers fire.
    if escalated:
        verdict_order = ["PASS", "LOW", "MEDIUM", "HIGH", "CRITICAL"]
        idx = verdict_order.index(verdict)
        verdict = verdict_order[min(len(verdict_order) - 1, idx + 1)]

    # Sanded-prose signature: little H vocabulary but many structural tells
    h_vocab_total = _phrase_total(high["verbs_h"]) + _phrase_total(high["nouns_h"])
    structural_count = (
        _regex_total(high["copula_avoidance"])
        + len(high["ing_tails"])
        + len(high["negation_reversals"])
        + len(high["cross_sentence_negation"])
        + len(medium["anaphora"])
        + medium["false_range"]
        + medium["while_openers"]
        + len(medium["hedge_stacking"])
    )
    sanded = h_vocab_total <= 1 and structural_count >= 5

    result["verdict"] = verdict
    result["totals"] = {"high": high_count, "medium": medium_count, "low": low_count}
    result["density"] = round(density, 2)
    result["calibration"] = {
        "compound_escalation": bool(paragraphs_with_h),
        "uncanny_valley": uncanny_valley,
        "sanded_prose": sanded,
        "em_dash_mode": "strict" if strict_em_dash else "default",
    }

    # =========================================================================
    # COMPREHENSION AXIS (parallel to AI-Slop)
    # =========================================================================
    comp = analyze_comprehension(
        text,
        audience=audience,
        sentences=sentences,
        paragraphs=paragraphs,
        total_words=total_words,
    )
    result["comprehension"] = comp

    # Combined cross-axis recommendation
    result["combined_recommendation"] = combined_recommendation(verdict, comp["verdict"])

    return result
def _append_phrase_hits(lines, label, hits):
    """Append a labelled bullet list of (phrase, count) pairs; no-op when empty."""
    if not hits:
        return
    lines.append(label)
    for phrase, count in hits:
        lines.append(f" - \"{phrase}\" ×{count}")


def _append_regex_hits(lines, label, hits):
    """Append a labelled bullet list of (pattern, count, sample) triples; no-op when empty."""
    if not hits:
        return
    lines.append(label)
    for _pattern, count, sample in hits:
        lines.append(f" - \"{sample}\" ×{count}")


def _human_overview(lines, result, comp):
    """Append the banner, both verdicts, AI-slop totals, stats and calibration notes."""
    stats = result["stats"]
    lines.append("=" * 70)
    lines.append("SLOP-COP DUAL-AXIS SCAN")
    lines.append("=" * 70)
    lines.append("")
    lines.append(f"AI-Slop: {result['verdict']:<10} (density {result['density']})")
    if comp:
        lines.append(
            f"Comprehension: {comp['verdict']:<10} (density {comp['density']}) "
            f"[audience: {comp['audience']}]"
        )
    lines.append("")
    rec = result.get("combined_recommendation", "")
    if rec:
        lines.append(f"Combined: {rec}")
        lines.append("")

    lines.append("-" * 70)
    lines.append("AI-SLOP AXIS")
    lines.append("-" * 70)
    lines.append(f"Verdict: {result['verdict']}")
    lines.append(f"Density score: {result['density']} per 500w")
    lines.append(
        f"Violations: {result['totals']['high']}H, "
        f"{result['totals']['medium']}M, {result['totals']['low']}L"
    )
    lines.append("")

    lines.append("--- Stats ---")
    lines.append(
        f"Words: {stats['words']} | Paragraphs: {stats['paragraphs']} | Sentences: {stats['sentences']}"
    )
    lines.append(
        f"Sentence avg: {stats['sentence_avg']}w | min {stats['sentence_min']}w | max {stats['sentence_max']}w"
    )
    burst = stats["burstiness"]
    burst_str = (
        f"{burst} (humans 0.6-1.2, AI 0.2-0.4)" if burst is not None else "n/a (too few sentences)"
    )
    lines.append(f"Burstiness: {burst_str}")
    contr = stats["contraction_ratio"]
    lines.append(f"Contraction ratio: {contr if contr is not None else 'n/a'}")
    lines.append(f"Detected genre: {stats['detected_genre']}")
    if stats["applied_genre"] != stats["detected_genre"]:
        lines.append(f"Applied genre: {stats['applied_genre']} (override)")
    lines.append(f"Model fingerprint: {stats['model_fingerprint']} {stats['fingerprint_counts']}")
    lines.append("")

    # Calibration
    cal = result["calibration"]
    lines.append("--- Calibration ---")
    if cal["compound_escalation"]:
        lines.append("COMPOUND TRIGGER: 3+ H tells in one paragraph — verdict escalated one tier")
    if cal["uncanny_valley"]:
        lines.append("UNCANNY VALLEY: many weak tells stacking with low burstiness")
    if cal["sanded_prose"]:
        lines.append("SANDED-PROSE SIGNATURE: low famous-vocab, high structural — looks prompt-engineered")
    lines.append(f"Em-dash mode: {cal['em_dash_mode']}")
    lines.append("")


def _human_slop_high(lines, high):
    """Append the AI-slop HIGH SEVERITY section."""
    lines.append("--- HIGH SEVERITY ---")
    if high["em_dashes"] or high["en_dashes"] or high["double_hyphens"]:
        lines.append(
            f"Em/en dashes / double-hyphens: {high['em_dashes']}/{high['en_dashes']}/{high['double_hyphens']}"
        )
    _append_phrase_hits(lines, "LLM-favored verbs:", high["verbs_h"])
    _append_phrase_hits(lines, "Cliché metaphors / grandiose nouns:", high["nouns_h"])
    _append_phrase_hits(lines, "Empty intensifiers:", high["intensifiers_h"])
    _append_phrase_hits(lines, "Closing/connector clichés:", high["connectors_h"])
    _append_regex_hits(lines, "Sycophancy openers:", high["sycophancy_open"])
    _append_regex_hits(lines, "Sycophancy closers:", high["sycophancy_close"])
    _append_regex_hits(lines, "Vague-authority weasels:", high["vague_authority_h"])
    _append_regex_hits(lines, "Knowledge-cutoff disclaimer leakage:", high["knowledge_cutoff"])
    _append_regex_hits(lines, "Stake inflation / future-flourish:", high["stake_inflation"])
    _append_regex_hits(lines, "Grandiose framing:", high["grandiose"])
    _append_regex_hits(lines, "Copula avoidance:", high["copula_avoidance"])
    if high["ing_tails"]:
        lines.append(f"Present-participle '-ing' tails: {len(high['ing_tails'])}")
        for tail in high["ing_tails"][:5]:
            lines.append(f" - \"...{tail}...\"")
    _append_regex_hits(lines, "Throat-clearing meta-comments:", high["throat_clearing"])
    if high["rhetorical_qa"]:
        lines.append(f"Self-posed rhetorical Q+A: {len(high['rhetorical_qa'])}")
    if high["performative_opening"]:
        lines.append(f"Performative opening: \"{high['performative_opening'][0][:80]}\"")
    if high["crafted_closer"]:
        lines.append(f"Crafted closer: \"{high['crafted_closer'][0]}\"")
    if high["setup_reveal_endings"]:
        lines.append("Setup-reveal endings:")
        for idx, sentence, _pattern in high["setup_reveal_endings"]:
            lines.append(f" - Para {idx+1}: \"{sentence[:120]}\"")
    if high["fabricated_cases"]:
        lines.append(f"Fabricated case studies: {high['fabricated_cases']}")
    if high["buzzword_density"]:
        lines.append("Buzzword density (3+ in one paragraph):")
        for idx, count, found in high["buzzword_density"]:
            words = ", ".join(f"{w}×{n}" for w, n in found)
            lines.append(f" - Para {idx+1}: {count} buzzwords ({words})")

    # These three are high-severity findings, so they belong in this section
    # rather than after the MEDIUM header.
    if high.get("negation_reversals"):
        lines.append("Negation reversal candidates (high severity):")
        for idx, sentence, _pattern in high["negation_reversals"]:
            lines.append(f" - Sentence {idx+1}: \"{sentence[:120]}\"")
    if high.get("cross_sentence_negation"):
        lines.append("Cross-sentence negation reversal (X isn't Y. It's Z.):")
        for _idx, cur, nxt in high["cross_sentence_negation"]:
            lines.append(f" - \"{cur[:80]}\" → \"{nxt[:80]}\"")
    if high.get("short_sentence_clusters_h"):
        lines.append("Short-sentence clusters (4+ in a row, high severity):")
        for run in high["short_sentence_clusters_h"][:2]:
            for idx, sent, wc in run:
                lines.append(f" - Sentence {idx+1} ({wc}w): \"{sent[:80]}\"")
    lines.append("")


def _human_slop_medium(lines, medium):
    """Append the AI-slop MEDIUM SEVERITY section."""
    lines.append("--- MEDIUM SEVERITY ---")
    if medium.get("short_sentence_clusters_m"):
        lines.append("Short-sentence clusters (3 in a row):")
        for run in medium["short_sentence_clusters_m"][:2]:
            for idx, sent, wc in run:
                lines.append(f" - Sentence {idx+1} ({wc}w): \"{sent[:80]}\"")
    if medium["dramatic_countdown"]:
        lines.append("Dramatic countdown candidates:")
        for _idx, sents in medium["dramatic_countdown"]:
            for sent in sents:
                lines.append(f" - \"{sent[:80]}\"")
    if medium["anaphora"]:
        lines.append("Anaphora abuse (3+ identical openings):")
        for run in medium["anaphora"]:
            lines.append(f" - {len(run)} consecutive sentences:")
            for _idx, sent in run:
                lines.append(f" \"{sent[:80]}\"")
    if medium["two_word_punchlines"]:
        lines.append("Two-word punchline candidates:")
        for idx, sentence, wc, prev in medium["two_word_punchlines"]:
            lines.append(f" - Sentence {idx+1} ({wc}w): \"{sentence}\" after \"{prev}...\"")
    if medium["three_beat_stacks"]:
        lines.append(f"Three-beat stack candidates: {len(medium['three_beat_stacks'])}")
        for triple in medium["three_beat_stacks"][:5]:
            lines.append(f" - \"{triple[0]}, {triple[1]}, and {triple[2]}\"")
    _append_phrase_hits(lines, "LLM-favored verbs (M):", medium["verbs_m"])
    _append_phrase_hits(lines, "Cliché metaphors (M):", medium["nouns_m"])
    _append_phrase_hits(lines, "Empty intensifiers (M):", medium["intensifiers_m"])
    _append_phrase_hits(lines, "Connectors (M):", medium["connectors_m"])
    # Counted in the medium total, so it is listed for the reader too.
    _append_regex_hits(lines, "Vague-authority weasels (M):", medium.get("vague_authority_m"))
    if medium["hedge_stacking"]:
        lines.append(f"Hedge stacking (3+ hedges in one sentence): {len(medium['hedge_stacking'])}")
        for _idx, sent, count in medium["hedge_stacking"][:3]:
            lines.append(f" - {count} hedges: \"{sent[:120]}\"")
    _append_regex_hits(lines, "Hedged superlatives:", medium["hedged_superlatives"])
    if medium["while_openers"] >= 2:
        lines.append(f"'While X, Y' openers: {medium['while_openers']} (pattern emerges at >2)")
    if medium["x_meets_y"]:
        lines.append(f"'X meets Y' formula: {medium['x_meets_y']}")
    if medium["more_than_just"]:
        lines.append(f"'More than just X' formula: {medium['more_than_just']}")
    if medium["false_range"]:
        lines.append(f"False range ('From X to Y'): {medium['false_range']}")
    _append_regex_hits(lines, "False concession openers:", medium["false_concession"])
    _append_regex_hits(lines, "Pedagogical voice:", medium["pedagogical"])
    _append_regex_hits(lines, "Royal-we / 'as a society':", medium["royal_we"])
    if medium["whether_or_openers"]:
        lines.append(f"'Whether you're X or Y' openers: {medium['whether_or_openers']}")

    # The findings below also feed the medium total; .get() keeps result
    # dicts that lack these keys printable.
    _append_regex_hits(lines, "Both-sides-ism:", medium.get("both_sides_ism"))
    if medium.get("real_tic"):
        lines.append(f"'Real' tic: {medium['real_tic']}")
    _append_regex_hits(lines, "Vapid analogies:", medium.get("vapid_analogies"))
    for stack in medium.get("historical_analogy_stacking") or []:
        lines.append(f"Historical analogy stacking: {len(stack)} analogies")
    _append_phrase_hits(
        lines, "Dead metaphor repetition (3+ uses):", medium.get("dead_metaphor_repetition")
    )
    lines.append("")


def _human_slop_low(lines, low):
    """Append the AI-slop LOW SEVERITY section."""
    lines.append("--- LOW SEVERITY ---")
    if low["magic_adverbs"]:
        lines.append("Magic adverbs:")
        for adv, count in low["magic_adverbs"]:
            note = " (survives only when contrasting reality with theory)" if adv == "actually" else ""
            lines.append(f" - \"{adv}\" ×{count}{note}")
    if low["bigram_repetition"]:
        lines.append(f"Bigram repetition (5+ uses): {len(low['bigram_repetition'])}")
        for bigram, count in low["bigram_repetition"][:5]:
            lines.append(f" - \"{bigram[0]} {bigram[1]}\" ×{count}")
    if low["markdown_tells"]:
        lines.append("Markdown / formatting tells:")
        for tell, val in low["markdown_tells"].items():
            lines.append(f" - {tell}: {val}")
    lines.append("")


def _human_comprehension(lines, comp):
    """Append the comprehension-axis section: verdict, metrics panel, pattern hits."""
    lines.append("-" * 70)
    lines.append("COMPREHENSION AXIS")
    lines.append("-" * 70)
    lines.append(f"Verdict: {comp['verdict']}")
    lines.append(f"Density score: {comp['density']} per 500w")
    lines.append(
        f"Violations: {comp['totals']['high']}H, "
        f"{comp['totals']['medium']}M, {comp['totals']['low']}L"
    )
    lines.append(f"Audience: {comp['audience']}")
    if comp.get("audience_adjustment"):
        lines.append(f"Audience tweak: {comp['audience_adjustment']}")
    if comp.get("escalations"):
        lines.append("Compound triggers (escalated one tier):")
        for esc in comp["escalations"]:
            lines.append(f" - {esc}")
    lines.append("")

    # Readability metric panel
    metrics = comp.get("metrics", {})
    if metrics and metrics.get("flesch_reading_ease") is not None:
        lines.append("--- Readability metrics panel ---")
        tgt = comp.get("audience_targets", {})
        fre = metrics["flesch_reading_ease"]
        fre_tag = "" if fre >= tgt.get("flesch_min", 60) else " (below target)"
        lines.append(f"Flesch Reading Ease: {fre}{fre_tag} (target ≥{tgt.get('flesch_min', 60)})")
        fkgl = metrics["flesch_kincaid_grade"]
        fk_tag = "" if fkgl <= tgt.get("fk_max", 9) else " (above target)"
        lines.append(f"Flesch-Kincaid Grade: {fkgl}{fk_tag} (target ≤{tgt.get('fk_max', 9)})")
        lines.append(f"SMOG Index: {metrics['smog']}")
        lines.append(f"Coleman-Liau Index: {metrics['coleman_liau']}")
        lines.append(
            f"Dale-Chall Score: {metrics['dale_chall']} ({metrics['difficult_word_pct']}% difficult)"
        )
        lex = metrics["lexical_density"]
        lex_tag = "" if lex <= tgt.get("lex_max", 55) else " (above target)"
        lines.append(f"Lexical density: {lex}%{lex_tag} (target ≤{tgt.get('lex_max', 55)}%)")
        asl = metrics["avg_sentence_length"]
        asl_tag = "" if asl <= tgt.get("sent_max", 18) else " (above target)"
        lines.append(
            f"Avg sentence length: {asl}w (stddev {metrics['sentence_length_stddev']}){asl_tag} (target ≤{tgt.get('sent_max', 18)}w)"
        )
        pv = metrics["passive_voice_pct"]
        pv_tag = "" if pv <= tgt.get("passive_max", 10) else " (above target)"
        lines.append(f"Passive voice: {pv}%{pv_tag} (target ≤{tgt.get('passive_max', 10)}%)")
        lines.append("")

    # Density signals
    patterns = comp.get("patterns", {})
    lines.append("--- Density signals ---")
    f1 = patterns.get("F1_undefined_acronyms", {})
    lines.append(
        f"Acronym density: {f1.get('density_per_100w', 0)} per 100w (threshold 3+, count {f1.get('total_count', 0)})"
    )
    f2 = patterns.get("F2_named_entities", {})
    lines.append(
        f"Named-entity density: {f2.get('density_per_100w', 0)} per 100w (threshold 5+, count {f2.get('total_count', 0)})"
    )
    sb = patterns.get("F3_stat_bombing", [])
    max_num = max((n for _, _, n in sb), default=0)
    lines.append(f"Stat-bombed sentences: {len(sb)} (max numerics in one sentence: {max_num})")
    col = patterns.get("G1_telegraphic_colons", [])
    max_col = max((n for _, n, _ in col), default=0)
    lines.append(
        f"Telegraphic colon-labels: {len(col)} paragraphs flagged (max in one paragraph: {max_col})"
    )
    wt = patterns.get("F4_wall_of_text", [])
    max_para_w = max((w for _, _, w, _ in wt), default=0)
    lines.append(f"Wall-of-text paragraphs: {len(wt)} (max paragraph words: {max_para_w})")
    lines.append("")

    # H severity hits
    lines.append("--- HIGH SEVERITY (comprehension) ---")
    if f1.get("total_count", 0) > 0:
        lines.append(
            f"Undefined acronyms ({f1['total_count']} total, {f1['distinct_count']} distinct):"
        )
        for ac, cnt in f1["acronyms"][:8]:
            lines.append(f" - {ac} ×{cnt}")
    if f2.get("total_count", 0) > 0:
        lines.append(f"Named entities without context ({f2['total_count']} total):")
        for ent, cnt in f2["entities"][:8]:
            lines.append(f" - {ent} ×{cnt}")
    if sb:
        lines.append("Stat-bombed sentences (3+ numerics):")
        for idx, sample, n in sb[:5]:
            lines.append(f" - Sentence {idx+1} ({n} numerics): \"{sample}\"")
    if col:
        lines.append("Telegraphic colon-labeling paragraphs:")
        for idx, n, labels in col[:3]:
            lines.append(f" - Para {idx+1} ({n} colons): {labels[:3]}")
    ls = patterns.get("G3_long_sentences", [])
    if ls:
        lines.append(f"Long sentences (>30 words): {len(ls)}")
        for idx, wc, sample in ls[:3]:
            lines.append(f" - Sentence {idx+1} ({wc}w): \"{sample}\"")
    ro = patterns.get("G4_runon_sentences", [])
    if ro:
        lines.append(f"Run-on sentences (4+ clauses): {len(ro)}")
        for idx, n, sample in ro[:3]:
            lines.append(f" - Sentence {idx+1} ({n} clauses): \"{sample}\"")
    _append_regex_hits(lines, "Forward references:", patterns.get("H5_forward_references", []))
    dnh = patterns.get("F5_density_no_headings", {})
    if dnh.get("flagged"):
        lines.append(f"Density-without-headings: {dnh.get('reason')}")

    # M severity hits
    lines.append("")
    lines.append("--- MEDIUM SEVERITY (comprehension) ---")
    if wt:
        lines.append(f"Wall-of-text paragraphs ({len(wt)}):")
        for idx, sc, wc, sample in wt[:3]:
            lines.append(f" - Para {idx+1} ({sc} sentences, {wc} words): \"{sample}\"")
    lp = patterns.get("G2_list_as_prose", [])
    if lp:
        lines.append(f"List-pretending-to-be-prose paragraphs: {len(lp)}")
        for idx, semi, plus, sample in lp[:3]:
            lines.append(f" - Para {idx+1} ({semi} semicolons, {plus} plus signs): \"{sample}\"")
    sk = patterns.get("I9_no_skim_layer", {})
    if sk.get("flagged"):
        lines.append(f"No skim layer: 0 bold/strong markers in {sk.get('words')} words")
    hc = patterns.get("I5_hierarchy_collapse", [])
    if hc:
        lines.append(f"Hierarchy collapse (heading skips): {len(hc)}")
        for skip in hc[:3]:
            lines.append(
                f" - H{skip['from_level']} → H{skip['to_level']}: \"{skip['from']}\" → \"{skip['to']}\""
            )
    pf = patterns.get("I12_parallelism_failure", [])
    if pf:
        lines.append(f"Parallelism failure in lists: {len(pf)} blocks")
        for blk in pf[:2]:
            lines.append(f" - {blk['block_size']} bullets, mixed forms: {blk['forms']}")
    pv_data = patterns.get("J1_passive_voice", {})
    if pv_data.get("percent", 0) > 10:
        lines.append(f"Passive voice excess: {pv_data['percent']}% (threshold 10%)")
    nm = patterns.get("J2_nominalizations", {})
    if nm.get("density_per_100w", 0) > 5:
        lines.append(f"Nominalization density: {nm['density_per_100w']} per 100w (threshold 5)")
        if nm.get("examples"):
            lines.append(f" - examples: {nm['examples'][:6]}")
    hs = patterns.get("J4_hedge_stacking", [])
    if hs:
        lines.append(f"Hedge stacking (3+ hedges/sentence): {len(hs)}")
        for _idx, sent, n in hs[:2]:
            lines.append(f" - {n} hedges: \"{sent[:120]}\"")

    # L severity hits
    lines.append("")
    lines.append("--- LOW SEVERITY (comprehension) ---")
    gw = patterns.get("G5_glue_word_starts", [])
    if gw:
        lines.append(f"Glue-word sentence starts: {len(gw)}")
        for idx, sample, _ in gw[:3]:
            lines.append(f" - Sentence {idx+1}: \"{sample}\"")
    dq = patterns.get("J5_decorative_qualifiers", {})
    if dq.get("density_per_100w", 0) > 2:
        lines.append(
            f"Decorative qualifiers: {dq['count']} ({dq['density_per_100w']} per 100w; threshold 2)"
        )
        if dq.get("examples"):
            lines.append(f" - examples: {dq['examples'][:6]}")
    nc = patterns.get("J8_negative_constructions", [])
    if nc:
        lines.append(f"Negative constructions: {sum(c for _, c, _ in nc)} occurrences")
        for _pattern, cnt, sample in nc[:3]:
            lines.append(f" - \"{sample}\" ×{cnt}")
    lines.append("")


def format_human(result):
    """Render an ``analyze()`` result as a multi-section plain-text report.

    Args:
        result: dict with the ``stats``, ``verdict``, ``density``, ``totals``,
            ``calibration``, ``high``, ``medium`` and ``low`` keys. The
            ``comprehension`` and ``combined_recommendation`` keys are
            optional; the comprehension section is skipped when absent.

    Returns:
        The report as one newline-joined string.
    """
    lines = []
    comp = result.get("comprehension", {})

    _human_overview(lines, result, comp)
    _human_slop_high(lines, result["high"])
    _human_slop_medium(lines, result["medium"])
    _human_slop_low(lines, result["low"])
    if comp:
        _human_comprehension(lines, comp)

    lines.append("=" * 70)
    lines.append("Note: scanner catches mechanical violations only.")
    lines.append("Qualitative patterns (the actual force of metaphors, real-vs-")
    lines.append("decorative judgment, voice consistency, missing thesis,")
    lines.append("curse of knowledge) require reading.")
    lines.append("=" * 70)
    return "\n".join(lines)
def format_quick(result):
    """Render a short summary of an ``analyze()`` result.

    Emits the verdict line(s), violation totals, burstiness, genre and
    fingerprint, the combined recommendation when present, and up to five
    "top fixes". Returns one newline-joined string.
    """
    comp = result.get("comprehension", {})
    totals = result["totals"]
    stats = result["stats"]
    slop_summary = f"{result['verdict']} (density {result['density']})"

    if comp:
        headline = (
            f"AI-Slop: {slop_summary} | "
            f"Comprehension: {comp['verdict']} (density {comp['density']})"
        )
    else:
        headline = f"Verdict: {slop_summary}"

    out = [
        headline,
        f"AI-Slop violations: {totals['high']}H, {totals['medium']}M, {totals['low']}L",
    ]
    if comp:
        comp_totals = comp["totals"]
        out.append(
            f"Comp violations: {comp_totals['high']}H, "
            f"{comp_totals['medium']}M, {comp_totals['low']}L "
            f"[audience: {comp['audience']}]"
        )

    burstiness = stats["burstiness"]
    out.append("Burstiness: " + ("n/a" if burstiness is None else f"{burstiness}"))
    out.append(f"Genre: {stats['detected_genre']} | Fingerprint: {stats['model_fingerprint']}")
    if result.get("combined_recommendation"):
        out.append(f"Combined: {result['combined_recommendation']}")

    # Top fixes: the leading entries of each high-severity vocabulary list,
    # then dash and sycophancy flags.
    high = result["high"]
    top = []
    for key, limit in (("verbs_h", 2), ("nouns_h", 1), ("intensifiers_h", 1)):
        top.extend(f"\"{phrase}\" ×{count}" for phrase, count in high[key][:limit])
    if high["em_dashes"]:
        top.append(f"em dashes ×{high['em_dashes']}")
    if high["sycophancy_open"]:
        top.append("opener sycophancy")
    if high["sycophancy_close"]:
        top.append("closer sycophancy")

    # Comprehension fixes
    if comp:
        patterns = comp.get("patterns", {})
        acronym_total = patterns.get("F1_undefined_acronyms", {}).get("total_count", 0)
        if acronym_total >= 3:
            top.append(f"undefined acronyms ×{acronym_total}")
        entity_total = patterns.get("F2_named_entities", {}).get("total_count", 0)
        if entity_total >= 5:
            top.append(f"named-entity bombing ×{entity_total}")
        for key, label in (
            ("G3_long_sentences", "long sentences"),
            ("G1_telegraphic_colons", "telegraphic colon-labels"),
            ("G4_runon_sentences", "run-on sentences"),
        ):
            found = patterns.get(key, [])
            if found:
                top.append(f"{label} ×{len(found)}")

    if top:
        out.append("Top fixes: " + ", ".join(top[:5]))
    return "\n".join(out)
def main():
    """Command-line entry point.

    Parses the arguments, reads the draft from ``path`` (or stdin when no
    path is given), runs ``analyze`` and prints the result as JSON
    (``--json``), compact text (``--quick``) or the full report. ``--json``
    takes precedence when both flags are given.

    Exits with status 1, after a message on stderr, when the file is missing
    or unreadable, is not valid UTF-8, or the input is empty.
    """
    parser = argparse.ArgumentParser(
        description="slop-cop dual-axis scanner — AI-slop + comprehension."
    )
    parser.add_argument("path", nargs="?", help="Path to a text/markdown file. Reads stdin if omitted.")
    parser.add_argument("--json", action="store_true", help="Output structured JSON.")
    parser.add_argument("--quick", action="store_true", help="Compact one-screen output.")
    parser.add_argument(
        "--genre",
        choices=["casual", "marketing", "academic", "encyclopedic", "fiction"],
        help="Override detected genre. Adjusts AI-slop severity thresholds per calibration.md §3.",
    )
    parser.add_argument(
        "--audience",
        choices=["casual", "marketing", "academic", "encyclopedic", "technical", "fiction", "healthcare"],
        default="casual",
        help="Audience for the comprehension axis. Adjusts metric targets per calibration.md §10. Default: casual.",
    )
    parser.add_argument(
        "--strict-em-dash",
        action="store_true",
        help="Treat ALL em dashes as H severity (Mahmoud-mode). Default: clusters only.",
    )
    args = parser.parse_args()

    if args.path:
        try:
            text = Path(args.path).read_text(encoding="utf-8")
        except FileNotFoundError:
            print(f"File not found: {args.path}", file=sys.stderr)
            sys.exit(1)
        except (OSError, UnicodeDecodeError) as exc:
            # A directory, a permission problem, or a file that is not UTF-8:
            # report it like the other input errors instead of a traceback.
            print(f"Could not read {args.path}: {exc}", file=sys.stderr)
            sys.exit(1)
    else:
        text = sys.stdin.read()

    if not text.strip():
        print("Empty input.", file=sys.stderr)
        sys.exit(1)

    result = analyze(
        text,
        genre=args.genre,
        strict_em_dash=args.strict_em_dash,
        audience=args.audience,
    )

    if args.json:
        print(json.dumps(result, indent=2, default=str))
    elif args.quick:
        print(format_quick(result))
    else:
        print(format_human(result))


if __name__ == "__main__":
    main()