#!/usr/bin/env python3 """LLM Law Library MCP Server — open legal research for AI. Comprehensive search over US law: - 10.7M court opinions (1685-present, all 50 states + federal) - 1.9M statutory sections (all 50 states + DC + federal) - 876K definition pointers across 52 jurisdictions - Shepardizer v2 (3.7M citation treatments) Tools: caselaw_search — search court opinions (vector/bm25/hybrid) caselaw_case — load full opinion text caselaw_shepardize — check if a case is still good law statute_search — search statutory codes by text or citation statute_section — load full statute section text term_search — look up statutory definitions of a legal term jurisdiction_terms — list defined terms for a jurisdiction law_library_help — documentation and usage guide law_library_stats — index statistics """ import json import sys import urllib.request MAC_URL = "http://192.168.86.145:8031" def query_mac(endpoint: str, data: dict = None, timeout: int = 30) -> dict: url = MAC_URL + endpoint try: if data: req = urllib.request.Request( url, data=json.dumps(data).encode(), headers={"Content-Type": "application/json"}) else: req = urllib.request.Request(url) with urllib.request.urlopen(req, timeout=timeout) as resp: return json.loads(resp.read()) except Exception as e: return {"error": str(e)} TOOLS = [ { "name": "caselaw_search", "description": ( "Search the complete US case law corpus: 10.7M opinions, 77M passages, " "1685 to present, all 50 states + federal courts. Three retrieval modes: " "'hybrid' (default, best for most queries — fuses lexical BM25 + semantic vector), " "'bm25' (deterministic lexical — best for exact citations like '410 U.S. 113', " "party names, or specific legal phrases), " "'vector' (semantic similarity — best for conceptual queries). " "Returns ranked results with case name, date, court, citation count, " "and Shepardizer signals showing whether each case is still good law." ), "inputSchema": { "type": "object", "properties": { "query": { "type": "string", "description": "Search query — natural language, citation, or party name", }, "mode": { "type": "string", "enum": ["hybrid", "bm25", "vector"], "description": ( "Retrieval mode. 'hybrid' (default) fuses BM25 + vector via RRF. " "'bm25' for deterministic lexical search (exact citations, party names). " "'vector' for semantic similarity." ), "default": "hybrid", }, "top_k": { "type": "integer", "description": "Max results (default 10, max 25)", "default": 10, }, "court_id": { "type": "string", "description": "Filter by court ID (e.g. 'scotus', 'ca9', 'deld'). Only applies to bm25/hybrid modes.", }, "source": { "type": "string", "enum": ["opinion", "parenthetical"], "description": "Filter source type (vector mode only). Omit for both.", }, }, "required": ["query"], }, }, { "name": "caselaw_case", "description": ( "Load a full court opinion by ID. Returns case name, date, judges, " "citation count, Shepardizer treatment summary, syllabus, and the " "complete opinion text. Use after caselaw_search to read the full opinion." ), "inputSchema": { "type": "object", "properties": { "opinion_id": { "type": "integer", "description": "The opinion ID (from search results source_id or opinion_id field)", }, }, "required": ["opinion_id"], }, }, { "name": "caselaw_shepardize", "description": ( "Shepardize a court opinion: check whether it is still good law. " "Shows how subsequent courts treated this case — affirmed, distinguished, " "overruled, reversed, etc. — with counts. Built from the open CourtListener " "citation graph with 3.7M classified treatments. Includes both total and " "LLM-verified counts for negative treatments." ), "inputSchema": { "type": "object", "properties": { "opinion_id": { "type": "integer", "description": "The opinion ID to shepardize", }, }, "required": ["opinion_id"], }, }, { "name": "statute_search", "description": ( "Search US statutory codes — 1.9M sections across all 50 states, DC, and federal. " "BM25 deterministic lexical search. Finds statutes by citation (e.g. '42 U.S.C. § 1983'), " "topic (e.g. 'qualified immunity'), or catch line text. " "Filter by jurisdiction to search within a single state. " "Results include definition counts showing how many defined terms appear in each section." ), "inputSchema": { "type": "object", "properties": { "query": { "type": "string", "description": "Search query — citation, topic, or keyword", }, "jurisdiction": { "type": "string", "description": ( "Filter by jurisdiction. Format: 'federal' for US Code, " "'state:CA' for California, 'state:NY' for New York, 'state:TX' for Texas, etc. " "All 50 states + DC available. Omit to search all jurisdictions." ), }, "top_k": { "type": "integer", "description": "Max results (default 10, max 25)", "default": 10, }, "status": { "type": "string", "description": "Filter by status: 'active', 'repealed', 'renumbered', etc.", }, }, "required": ["query"], }, }, { "name": "statute_section", "description": ( "Load the full text of a statute section by citation or ID. Returns the complete " "section text, catch line, status, effective date, plus any statutory definitions " "found in the section and any 'red flags' (terms used in this section that have " "no statutory definition in this jurisdiction — important gaps to investigate)." ), "inputSchema": { "type": "object", "properties": { "citation": { "type": "string", "description": "Official citation (e.g. '42 U.S.C. § 1983', '11 Del. C. § 101')", }, "statute_version_id": { "type": "integer", "description": "Statute version ID (from statute_search results). Use citation OR this.", }, }, }, }, { "name": "term_search", "description": ( "Look up where a legal term is statutorily defined across US jurisdictions. " "Returns definition pointers — which sections define the term, in which states, " "with what scope (section/chapter/title), and how many court opinions cite that definition. " "Also shows 'red flags': jurisdictions where the term is heavily USED in statutes but " "has NO statutory definition (indicating reliance on common law meaning, a cross-reference " "gap, or a missing definition). This is critical for legal research — the dangerous case " "is a definition you don't know exists. " "Covers 876K definition pointers across 52 jurisdictions, 210K unique terms." ), "inputSchema": { "type": "object", "properties": { "term": { "type": "string", "description": "Legal term to look up (e.g. 'person', 'fiduciary', 'motor vehicle')", }, "jurisdiction": { "type": "string", "description": "Filter to one jurisdiction (e.g. 'federal', 'state:CA'). Omit for all.", }, "include_red_flags": { "type": "boolean", "description": "Include red flags — sections that USE this term but have no definition for it. Default false.", "default": False, }, "top_k": { "type": "integer", "description": "Max results (default 20, max 50)", "default": 20, }, }, "required": ["term"], }, }, { "name": "jurisdiction_terms", "description": ( "List all statutorily defined terms for a jurisdiction, ranked by how often " "court opinions cite the definitions. Shows the most legally important definitions " "in a state. Also reports red flag count (terms used but not defined)." ), "inputSchema": { "type": "object", "properties": { "jurisdiction": { "type": "string", "description": "Jurisdiction to list (e.g. 'federal', 'state:DE', 'state:CA')", }, "top_k": { "type": "integer", "description": "Max terms to return (default 50, max 200)", "default": 50, }, }, "required": ["jurisdiction"], }, }, { "name": "law_library_help", "description": ( "Show documentation for the LLM Law Library — what data is available, " "how to use each tool, what jurisdictions are covered, and key concepts " "like Shepardizer, red flags, and definition pointers." ), "inputSchema": {"type": "object", "properties": {}}, }, { "name": "law_library_stats", "description": ( "Show comprehensive index statistics: caselaw corpus size, statute coverage, " "definition counts, Shepardizer treatment counts, and per-jurisdiction section counts." ), "inputSchema": {"type": "object", "properties": {}}, }, ] def handle_jsonrpc(request: dict) -> dict | None: method = request.get("method", "") req_id = request.get("id") params = request.get("params", {}) if method == "initialize": return {"jsonrpc": "2.0", "id": req_id, "result": { "protocolVersion": "2024-11-05", "capabilities": {"tools": {"listChanged": False}}, "serverInfo": {"name": "llm-law-library", "version": "3.0.0"}, }} if method == "notifications/initialized": return None if method == "tools/list": return {"jsonrpc": "2.0", "id": req_id, "result": {"tools": TOOLS}} if method == "tools/call": tool_name = params.get("name", "") args = params.get("arguments", {}) handler = { "caselaw_search": handle_search, "caselaw_case": handle_case, "caselaw_shepardize": handle_shepardize, "statute_search": handle_statute_search, "statute_section": handle_statute_section, "term_search": handle_term_search, "jurisdiction_terms": handle_jurisdiction_terms, "law_library_help": handle_help, "law_library_stats": handle_stats, }.get(tool_name) if handler: return {"jsonrpc": "2.0", "id": req_id, "result": handler(args)} return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32601, "message": f"Unknown tool: {tool_name}"}} if req_id is not None: return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32601, "message": f"Unknown method: {method}"}} return None def handle_search(args: dict) -> dict: query = args.get("query", "").strip() if not query: return _err("query is required") req = {"query": query, "top_k": min(args.get("top_k", 10), 25)} mode = args.get("mode", "hybrid") req["mode"] = mode if args.get("court_id"): req["court_id"] = args["court_id"] if args.get("source"): req["source"] = args["source"] data = query_mac("/search", req) if "error" in data and "results" not in data: return _err(data["error"]) lines = [f"Mode: {data.get('mode', mode)} | Results: {data.get('count', 0)}\n"] for i, r in enumerate(data.get("results", []), 1): oid = r.get('opinion_id') or r.get('source_id') score_key = 'score' if 'score' in r else 'similarity' header = f"{i}. [opinion:{oid}] {score_key}={r.get(score_key, 0)}" if r.get('case_name'): header += f"\n {r['case_name']}" if r.get('date_filed'): header += f" ({r['date_filed']})" if r.get('precedential_status'): header += f" [{r['precedential_status']}]" if r.get('citation_count') and r['citation_count'] > 0: header += f" (cited {r['citation_count']}x)" if r.get('court_id'): header += f" [{r['court_id']}]" if r.get('citations'): header += f"\n Citations: {', '.join(r['citations'][:3])}" if r.get('shepardizer'): shep = r['shepardizer'] if isinstance(shep, dict): if 'worst' in shep: worst = shep.get('worst', '') treatments = shep.get('treatments', {}) parts = [] if worst: parts.append(f"WORST: {worst}") for k, v in treatments.items(): parts.append(f"{k}: {v}") header += f"\n Shepardizer: {', '.join(parts)}" else: parts = [f"{k}: {v}" for k, v in shep.items()] header += f"\n Shepardizer: {', '.join(parts)}" lines.append(header) if r.get('passage_text'): lines.append(f" {r['passage_text'][:500]}") lines.append("") return _ok("\n".join(lines) if lines else "No results.") def handle_case(args: dict) -> dict: opinion_id = args.get("opinion_id") if not opinion_id: return _err("opinion_id is required") data = query_mac("/case", {"opinion_id": opinion_id}, timeout=15) if "error" in data and "full_text" not in data: return _err(data["error"]) lines = [ f"Case: {data.get('case_name', 'Unknown')}", f"Filed: {data.get('date_filed', 'Unknown')}", f"Status: {data.get('precedential_status', 'Unknown')}", f"Judges: {data.get('judges', 'Unknown')}", f"Citations: {data.get('citation_count', 0)} in cluster, {data.get('citing_count', 0)} citing", f"Text length: {data.get('text_length', 0)} chars", ] if data.get('shepardizer'): shep = ', '.join(f"{k}: {v}" for k, v in data['shepardizer'].items()) lines.append(f"Shepardizer: {shep}") if data.get('syllabus'): lines.append(f"\nSyllabus: {data['syllabus']}") if data.get('full_text'): lines.append(f"\n{'='*60}\nFULL OPINION TEXT\n{'='*60}\n{data['full_text']}") return _ok("\n".join(lines)) def handle_shepardize(args: dict) -> dict: opinion_id = args.get("opinion_id") if not opinion_id: return _err("opinion_id is required") data = query_mac("/case", {"opinion_id": opinion_id}, timeout=15) if "error" in data and "case_name" not in data: return _err(data["error"]) lines = [f"Shepardizer: {data.get('case_name', 'Unknown')} ({data.get('date_filed', '')})"] if data.get('shepardizer'): lines.append("\nTreatment Summary:") for treatment, count in data['shepardizer'].items(): lines.append(f" {treatment}: {count}") else: lines.append("\nNo treatments found.") lines.append(f"\nCiting opinions: {data.get('citing_count', 0)}") return _ok("\n".join(lines)) def handle_statute_search(args: dict) -> dict: query = args.get("query", "").strip() if not query: return _err("query is required") req = { "query": query, "top_k": min(args.get("top_k", 10), 25), } if args.get("jurisdiction"): req["jurisdiction"] = args["jurisdiction"] if args.get("status"): req["status"] = args["status"] data = query_mac("/statute_search", req) if "error" in data and "results" not in data: return _err(data["error"]) lines = [f"Results: {data.get('count', 0)}\n"] for i, r in enumerate(data.get("results", []), 1): header = f"{i}. [{r.get('jurisdiction', '')}] {r.get('official_citation', '')}" header += f" (score={r.get('score', 0)})" if r.get('catch_line'): header += f"\n {r['catch_line']}" header += f"\n Code: {r.get('code_name', '')} | Status: {r.get('status', '')}" if r.get('definitions_count'): header += f" | Defines {r['definitions_count']} terms" lines.append(header) if r.get('snippet'): lines.append(f" {r['snippet'][:400]}") lines.append("") return _ok("\n".join(lines) if lines else "No results.") def handle_statute_section(args: dict) -> dict: req = {} if args.get("citation"): req["citation"] = args["citation"] elif args.get("statute_version_id"): req["statute_version_id"] = args["statute_version_id"] else: return _err("Provide citation or statute_version_id") data = query_mac("/statute_section", req, timeout=15) if "error" in data and "full_text" not in data: return _err(data["error"]) lines = [ f"Citation: {data.get('official_citation', 'Unknown')}", f"Jurisdiction: {data.get('jurisdiction', 'Unknown')}", f"Code: {data.get('code_name', 'Unknown')}", f"Catch line: {data.get('catch_line', '')}", f"Status: {data.get('status', 'Unknown')}", f"Kind: {data.get('kind', 'Unknown')}", ] if data.get('effective_date'): lines.append(f"Effective: {data['effective_date']}") if data.get('definitions'): lines.append(f"\n--- DEFINITIONS IN THIS SECTION ({len(data['definitions'])}) ---") for d in data['definitions']: entry = f" {d['term']}" if d.get('scope_hint'): entry += f" (scope: {d['scope_hint']})" if d.get('signal_type'): entry += f" [{d['signal_type']}]" if d.get('caselaw_cite_count') and d['caselaw_cite_count'] > 0: entry += f" (cited {d['caselaw_cite_count']}x in caselaw)" lines.append(entry) if d.get('snippet'): lines.append(f" {d['snippet'][:200]}") if data.get('red_flags'): lines.append(f"\n--- RED FLAGS ({len(data['red_flags'])}) ---") lines.append("Terms used in this section but NOT defined in this jurisdiction:") for f in data['red_flags']: entry = f" {f['term']}" if f.get('detail'): entry += f" — {f['detail']}" lines.append(entry) if data.get('full_text'): lines.append(f"\n{'='*60}\nFULL SECTION TEXT ({data.get('text_length', 0)} chars)\n{'='*60}") lines.append(data['full_text']) return _ok("\n".join(lines)) def handle_term_search(args: dict) -> dict: term = args.get("term", "").strip() if not term: return _err("term is required") req = { "term": term, "top_k": min(args.get("top_k", 20), 50), "include_red_flags": args.get("include_red_flags", False), } if args.get("jurisdiction"): req["jurisdiction"] = args["jurisdiction"] data = query_mac("/term_search", req) if "error" in data: return _err(data["error"]) lines = [ f"Term: \"{data.get('term', term)}\"", f"Defined in {data.get('jurisdictions_defining', 0)} jurisdictions", ] pointers = data.get("pointers", []) if pointers: lines.append(f"\n--- DEFINITION POINTERS ({len(pointers)}) ---") for p in pointers: entry = f" [{p['jurisdiction']}] {p['section_citation']}" if p.get('scope_hint'): entry += f" (scope: {p['scope_hint']})" if p.get('signal_type'): entry += f" [{p['signal_type']}]" if p.get('caselaw_cite_count') and p['caselaw_cite_count'] > 0: entry += f" (cited {p['caselaw_cite_count']}x)" lines.append(entry) if p.get('snippet'): lines.append(f" {p['snippet'][:200]}") if data.get("similar_terms"): lines.append(f"\nSimilar terms: {', '.join(data['similar_terms'])}") if data.get("red_flags"): lines.append(f"\n--- RED FLAGS ({len(data['red_flags'])}) ---") lines.append("Sections that USE this term but have NO statutory definition for it:") for f in data['red_flags']: entry = f" [{f['jurisdiction']}] {f['section_citation']}" if f.get('detail'): entry += f" — {f['detail']}" lines.append(entry) return _ok("\n".join(lines)) def handle_jurisdiction_terms(args: dict) -> dict: jurisdiction = args.get("jurisdiction", "").strip() if not jurisdiction: return _err("jurisdiction is required") req = { "jurisdiction": jurisdiction, "top_k": min(args.get("top_k", 50), 200), } data = query_mac("/jurisdiction_definitions", req) if "error" in data: return _err(data["error"]) lines = [ f"Jurisdiction: {data.get('jurisdiction', jurisdiction)}", f"Total defined terms: {data.get('total_defined_terms', 0)}", f"Red flag terms (used but undefined): {data.get('red_flag_count', 0)}", ] terms = data.get("top_terms", []) if terms: lines.append(f"\n--- TOP DEFINED TERMS (by caselaw importance) ---") for t in terms: entry = f" {t['term']} — {t['definition_count']} definition(s)" if t.get('max_caselaw_citations') and t['max_caselaw_citations'] > 0: entry += f", cited {t['max_caselaw_citations']}x in caselaw" lines.append(entry) return _ok("\n".join(lines)) def handle_help(args: dict) -> dict: return _ok("""LLM LAW LIBRARY — Open Legal Research Infrastructure ===================================================== FREE, OPEN, COURT-ADMISSIBLE legal research for any LLM. No Westlaw subscription needed. No hallucinated citations. DATA COVERAGE ------------- CASELAW: 10.7M court opinions, 77M embedded passages, 1685-present. All 50 states + federal courts. Source: CourtListener (Free Law Project), CC0. Shepardizer v2: 3.7M citation treatments (how later courts treated each case). STATUTES: 1.9M statutory sections across 52 jurisdictions. All 50 US states + District of Columbia + federal (US Code). BM25 deterministic search — same query always returns same results. DEFINITIONS: 876K definition pointers across 52 jurisdictions, 210K unique terms. Where each legal term is statutorily defined, in which section, with what scope. Ranked by caselaw importance (how often courts cite each definition). RED FLAGS: 6,600+ "used-but-undefined" signals. Terms heavily used in a jurisdiction's statutes but with NO statutory definition. Four possible causes: 1. Definition in a general definitions chapter we haven't linked (cross-reference gap) 2. Definition by cross-reference we missed 3. Legislature relying on common-law meaning (legitimate, but worth flagging) 4. Recall gap in our extraction The dangerous case for a lawyer isn't a definition they know about — it's one they don't. TOOLS ----- caselaw_search — Search opinions. Modes: hybrid (default), bm25 (deterministic), vector (semantic). Use bm25 mode for exact citations ('410 U.S. 113') or party names. Use hybrid for general legal research questions. caselaw_case — Load full opinion text by opinion_id (from search results). caselaw_shepardize — Check if a case is still good law. Shows treatment counts. statute_search — Search statutes. Filter by jurisdiction (e.g. 'state:CA', 'federal'). BM25 deterministic search. Finds by citation, topic, or catch line. statute_section — Load full statute section text. Shows definitions in the section and red flags (terms used but not defined in this jurisdiction). term_search — Look up a legal term across all jurisdictions. Shows where it's defined, with scope and caselaw importance. Use include_red_flags=true to also see where the term is used WITHOUT a definition. jurisdiction_terms — List all defined terms for a state. Shows the most legally important definitions ranked by caselaw citation count. law_library_stats — Detailed statistics on the index. JURISDICTION CODES ------------------ Federal: 'federal' States: 'state:XX' where XX is the two-letter state code. Examples: 'state:CA' (California), 'state:NY' (New York), 'state:TX' (Texas), 'state:DE' (Delaware), 'state:FL' (Florida), 'state:DC' (District of Columbia) SIGNAL TYPES (definition pointers) ----------------------------------- bm25_pattern — Found via BM25 phrase search in statute text xml_structured — From authoritative XML source (federal only, highest confidence) caselaw_cite — Found in court opinions citing the statute flash_classification — LLM-classified definition section red_flag — Term used in statute but NOT defined in this jurisdiction TRUST HIERARCHY: xml_structured > caselaw_cite > bm25_pattern > flash_classification KEY CONCEPTS ------------ Shepardizer: Named after Shepard's Citations (1873). Tracks how later courts treat earlier decisions. If a case is "overruled" or "reversed," it may no longer be good law. Severity hierarchy: overruled > abrogated > reversed > vacated > superseded > questioned > distinguished > limited > criticized > modified. Definition Pointers: We store WHERE terms are defined, not the definitions themselves. "Person" might have 40 different statutory definitions across 30 jurisdictions, each with different scope. We point you to the right section; you read it in context. Red Flags: The most dangerous gap in legal research is a definition you don't know exists. Red flags catch this: "this statute uses 'fiduciary' 12 times but we found no statutory definition for 'fiduciary' in this jurisdiction." SOURCE & LICENSE ---------------- Caselaw: CourtListener / Free Law Project. CC0 public domain. Statutes: Scraped from official state legislative websites. Public law. Built by the LLM Law Library project. Open infrastructure for AI legal research. """) def handle_stats(args: dict) -> dict: data = query_mac("/stats", timeout=15) if "error" in data: return _err(data["error"]) lines = ["=== LLM Law Library Statistics ===\n"] cl = data.get('caselaw', {}) lines.append("CASELAW:") lines.append(f" Passages indexed: {cl.get('total_passages', 0):,} ({cl.get('rag_index_size', '?')})") for st, cnt in cl.get('passage_counts', {}).items(): lines.append(f" {st}: {cnt:,}") lines.append(f" BM25 opinions: {cl.get('bm25_opinions', 0):,}") lines.append(f" Shepardizer treatments: {cl.get('shepardizer_treatments', 0):,}") st = data.get('statutes', {}) lines.append(f"\nSTATUTES:") lines.append(f" Total sections: {st.get('total_sections', 0):,}") lines.append(f" Jurisdictions: {st.get('jurisdictions', 0)}") if st.get('jurisdiction_detail'): lines.append(f" Top jurisdictions:") for j in st['jurisdiction_detail'][:10]: lines.append(f" {j['jurisdiction']}: {j['section_count']:,}") df = data.get('definitions', {}) lines.append(f"\nDEFINITIONS:") lines.append(f" Definition pointers: {df.get('total_pointers', 0):,}") lines.append(f" Unique terms: {df.get('unique_terms', 0):,}") lines.append(f" Jurisdictions: {df.get('jurisdictions', 0)}") lines.append(f" Red flags: {df.get('red_flags', 0):,}") lines.append(f" Extracted definitions: {df.get('extracted_definitions', 0):,}") return _ok("\n".join(lines)) def _ok(text: str) -> dict: return {"content": [{"type": "text", "text": text}], "isError": False} def _err(text: str) -> dict: return {"content": [{"type": "text", "text": f"Error: {text}"}], "isError": True} def main(): for line in sys.stdin: line = line.strip() if not line: continue try: request = json.loads(line) except json.JSONDecodeError: continue response = handle_jsonrpc(request) if response is not None: sys.stdout.write(json.dumps(response) + "\n") sys.stdout.flush() if __name__ == "__main__": main()