#!/usr/bin/env python3 """AI Daily Newsletter - Unified news fetcher. Fetches from 20+ AI sources: RSS feeds, Hacker News, GitHub Trending, and HuggingFace Papers. Outputs unified JSON to stdout. Usage: python3 fetch_ai_news.py [--hours 24] [--limit 20] [--outdir PATH] """ import argparse import concurrent.futures import json import os import re import subprocess import sys from datetime import datetime, timezone, timedelta from time import mktime from html import unescape import feedparser import requests from bs4 import BeautifulSoup # --------------------------------------------------------------------------- # AI keyword filter set # --------------------------------------------------------------------------- AI_KEYWORDS = re.compile( r"\b(" r"AI|LLM|GPT|Claude|Agent|RAG|DeepSeek|Gemini|Llama|" r"Transformer|Diffusion|RLHF|MCP|Anthropic|OpenAI|" r"Machine\s*Learning|Deep\s*Learning|Neural\s*Net|" r"Foundation\s*Model|Fine[\s-]?tun|Embedding|Vector\s*DB|" r"Copilot|Midjourney|Stable\s*Diffusion|ChatGPT|" r"Mistral|Qwen|Phi-|Groq|vLLM|GGUF|LoRA|" r"Computer\s*Vision|NLP|MLOps|GenAI|Generative" r")\b", re.IGNORECASE, ) def matches_ai(text: str) -> bool: """Return True if text contains an AI-related keyword.""" return bool(AI_KEYWORDS.search(text or "")) # --------------------------------------------------------------------------- # RSS source registry # --------------------------------------------------------------------------- RSS_SOURCES = [ # --- Tier 1: 主流 AI 媒体 --- { "url": "https://venturebeat.com/category/ai/feed/", "name": "VentureBeat AI", "category": "industry", "ai_filter": True, }, { "url": "https://techcrunch.com/category/artificial-intelligence/feed/", "name": "TechCrunch AI", "category": "industry", "ai_filter": True, }, { "url": "https://www.theverge.com/ai-artificial-intelligence/rss/index.xml", "name": "The Verge AI", "category": "announcements", }, { "url": "https://www.technologyreview.com/topic/artificial-intelligence/feed", "name": "MIT Technology Review AI", "category": "research", }, { "url": "https://artificialintelligence-news.com/feed/", "name": "AI News", "category": "announcements", }, # --- AI 公司博客 --- { "url": "https://openai.com/blog/rss.xml", "name": "OpenAI Blog", "category": "announcements", }, { "url": "https://www.anthropic.com/feed.xml", "name": "Anthropic", "category": "announcements", }, { "url": "https://blog.google/technology/ai/rss/", "name": "Google AI Blog", "category": "announcements", }, { "url": "https://deepmind.google/blog/rss.xml", "name": "DeepMind Blog", "category": "research", }, { "url": "https://blogs.microsoft.com/ai/feed/", "name": "Microsoft AI Blog", "category": "announcements", }, { "url": "https://ai.meta.com/blog/rss/", "name": "Meta AI Blog", "category": "tools", }, # --- AI Newsletters --- { "url": "https://www.latent.space/feed", "name": "Latent Space AINews", "category": "industry", "filter_prefix": "[AINews]", }, { "url": "https://www.interconnects.ai/feed", "name": "Interconnects", "category": "industry", }, { "url": "https://www.oneusefulthing.org/feed", "name": "One Useful Thing", "category": "announcements", }, { "url": "https://chinai.substack.com/feed", "name": "ChinAI", "category": "policy", }, { "url": "https://www.deeplearning.ai/the-batch/feed/", "name": "The Batch (Andrew Ng)", "category": "industry", }, # --- AI Bloggers --- { "url": "https://simonwillison.net/atom/everything/", "name": "Simon Willison", "category": "tools", }, { "url": "https://garymarcus.substack.com/feed", "name": "Gary Marcus", "category": "policy", }, # --- Papers (Arxiv) --- { "url": "http://export.arxiv.org/rss/cs.AI", "name": "Arxiv cs.AI", "category": "research", }, { "url": "http://export.arxiv.org/rss/cs.LG", "name": "Arxiv cs.LG", "category": "research", }, # --- Product --- { "url": "https://www.producthunt.com/feed", "name": "Product Hunt", "category": "tools", }, ] # --------------------------------------------------------------------------- # Date parsing # --------------------------------------------------------------------------- def parse_date(entry) -> datetime | None: """Extract datetime from a feedparser entry.""" for attr in ("published_parsed", "updated_parsed"): val = getattr(entry, attr, None) if val: try: return datetime.fromtimestamp(mktime(val), tz=timezone.utc) except Exception: pass return None # --------------------------------------------------------------------------- # RSS fetcher # --------------------------------------------------------------------------- def fetch_rss(source: dict, cutoff: datetime, limit: int) -> list[dict]: """Fetch a single RSS source and return entries within the time window.""" url = source["url"] results = [] try: feed = feedparser.parse(url) for entry in feed.entries: if len(results) >= limit: break title = entry.get("title", "No Title") # Optional prefix filter (e.g. Latent Space [AINews]) prefix = source.get("filter_prefix") if prefix and not title.startswith(prefix): continue # Optional AI keyword filter if source.get("ai_filter"): text = f"{title} {entry.get('summary', '')}" if not matches_ai(text): continue pub_date = parse_date(entry) if pub_date and pub_date < cutoff: continue summary_raw = entry.get("summary", "") or "" # Strip HTML tags from summary summary = BeautifulSoup(summary_raw, "html.parser").get_text()[:500] results.append({ "source": source["name"], "category": source["category"], "title": unescape(title), "url": entry.get("link", ""), "time": pub_date.isoformat() if pub_date else "", "summary": summary, }) except Exception as e: print(f"[RSS] Error fetching {source['name']} ({url}): {e}", file=sys.stderr) return results # --------------------------------------------------------------------------- # GitHub Trending # --------------------------------------------------------------------------- def fetch_readme(repo_path: str) -> str: """Fetch README text for a GitHub repo (first 1000 chars). Returns empty string on failure.""" headers = {"User-Agent": "Mozilla/5.0 AI-Daily-Newsletter/1.0"} for branch in ("main", "master"): try: url = f"https://raw.githubusercontent.com/{repo_path}/{branch}/README.md" resp = requests.get(url, headers=headers, timeout=10) if resp.status_code == 200: return resp.text[:1000] except Exception: pass return "" def fetch_github_trending(limit: int) -> list[dict]: """Scrape GitHub Trending, fetch README for AI candidates, filter by README content.""" candidates = [] try: resp = requests.get( "https://github.com/trending", params={"since": "daily"}, headers={"User-Agent": "Mozilla/5.0 AI-Daily-Newsletter/1.0"}, timeout=15, ) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") for article in soup.select("article.Box-row"): h2 = article.select_one("h2 a") if not h2: continue repo_path = h2.get("href", "").strip("/") repo_url = f"https://github.com/{repo_path}" repo_name = repo_path.split("/")[-1] if "/" in repo_path else repo_path p = article.select_one("p") desc = p.get_text(strip=True) if p else "" lang_span = article.select_one("[itemprop='programmingLanguage']") lang = lang_span.get_text(strip=True) if lang_span else "" stars_links = article.select("a.Link--muted") stars = stars_links[0].get_text(strip=True).replace(",", "") if stars_links else "" # 初步过滤:名字或描述命中 AI 关键词 if not matches_ai(f"{repo_name} {desc}"): continue candidates.append({ "repo_path": repo_path, "repo_url": repo_url, "desc": desc, "lang": lang, "stars": stars, }) except Exception as e: print(f"[GitHub] Error fetching trending page: {e}", file=sys.stderr) return [] # 并发抓取 README,二次验证 AI 相关性 results = [] with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: future_to_repo = { executor.submit(fetch_readme, c["repo_path"]): c for c in candidates } for future in concurrent.futures.as_completed(future_to_repo): c = future_to_repo[future] readme = future.result() combined = f"{c['repo_path']} {c['desc']} {readme}" if not matches_ai(combined): print(f" [GitHub] SKIP {c['repo_path']} (README 不含 AI 关键词)", file=sys.stderr) continue # 取 README 前 500 字作为 summary 补充 readme_snippet = readme[:500].strip() if readme else "" results.append({ "source": "GitHub Trending", "category": "tools", "title": c["repo_path"], "url": c["repo_url"], "time": datetime.now(timezone.utc).isoformat(), "summary": c["desc"], "readme": readme_snippet, "github_url": c["repo_url"], "lang": c["lang"], "stars": c["stars"], }) if len(results) >= limit: break return results # --------------------------------------------------------------------------- # HuggingFace Papers (via subprocess) # --------------------------------------------------------------------------- def fetch_hf_papers(limit: int) -> list[dict]: """Run fetch_hf_papers.py as a subprocess and parse its JSON output.""" script_dir = os.path.dirname(os.path.abspath(__file__)) script_path = os.path.join(script_dir, "fetch_hf_papers.py") if not os.path.exists(script_path): print("[HF] fetch_hf_papers.py not found, skipping", file=sys.stderr) return [] try: result = subprocess.run( [sys.executable, script_path], capture_output=True, text=True, timeout=120, ) if result.returncode != 0: print(f"[HF] fetch_hf_papers.py failed: {result.stderr}", file=sys.stderr) return [] papers = json.loads(result.stdout) return papers[:limit] except subprocess.TimeoutExpired: print("[HF] fetch_hf_papers.py timed out", file=sys.stderr) return [] except Exception as e: print(f"[HF] Error running fetch_hf_papers.py: {e}", file=sys.stderr) return [] # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def main(): parser = argparse.ArgumentParser(description="AI Daily Newsletter - News Fetcher") parser.add_argument("--hours", type=int, default=24, help="Time window in hours (default: 24)") parser.add_argument("--limit", type=int, default=20, help="Max entries per source (default: 20)") parser.add_argument("--outdir", type=str, help="Save JSON to directory instead of stdout") args = parser.parse_args() cutoff = datetime.now(timezone.utc) - timedelta(hours=args.hours) all_entries = [] source_count = 0 # --- Phase 1: Concurrent RSS fetch --- print(f"[INFO] Fetching {len(RSS_SOURCES)} RSS sources (window: {args.hours}h)...", file=sys.stderr) with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: futures = { executor.submit(fetch_rss, src, cutoff, args.limit): src["name"] for src in RSS_SOURCES } for future in concurrent.futures.as_completed(futures): name = futures[future] try: entries = future.result() if entries: source_count += 1 all_entries.extend(entries) print(f" [RSS] {name}: {len(entries)} entries", file=sys.stderr) else: print(f" [RSS] {name}: 0 entries", file=sys.stderr) except Exception as e: print(f" [RSS] {name}: error - {e}", file=sys.stderr) # --- Phase 2: GitHub, HF (concurrent) --- print("[INFO] Fetching GitHub Trending, HF Papers...", file=sys.stderr) with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: gh_future = executor.submit(fetch_github_trending, args.limit) hf_future = executor.submit(fetch_hf_papers, args.limit) for name, future in [("GitHub", gh_future), ("HF Papers", hf_future)]: try: entries = future.result() if entries: source_count += 1 all_entries.extend(entries) print(f" [{name}] {len(entries)} entries", file=sys.stderr) else: print(f" [{name}] 0 entries", file=sys.stderr) except Exception as e: print(f" [{name}] error - {e}", file=sys.stderr) # --- Sort by time descending --- def sort_key(entry): t = entry.get("time", "") if not t: return "" return t all_entries.sort(key=sort_key, reverse=True) # --- Output --- output = json.dumps(all_entries, ensure_ascii=False, indent=2) stats = f"[DONE] {source_count} sources | {len(all_entries)} entries" print(stats, file=sys.stderr) if args.outdir: os.makedirs(args.outdir, exist_ok=True) date_str = datetime.now().strftime("%Y-%m-%d") filepath = os.path.join(args.outdir, f"ai-news-{date_str}.json") with open(filepath, "w", encoding="utf-8") as f: f.write(output) print(f"[SAVED] {filepath}", file=sys.stderr) else: print(output) if __name__ == "__main__": main()