// ==UserScript== // @name SourceCapsule - Save X/Twitter Threads & Articles as Markdown for LLMs + Offline HTML // @namespace https://github.com/wolfgang-aura/SourceCapsule // @version 1.2.1 // @description One click saves an X (Twitter) thread, Article, or post as clean Markdown for LLM context (Claude, ChatGPT) plus a self-contained offline HTML archive - images, video, and quoted posts embedded, with honest completeness reporting. Local-first, with optional expiring share links. // @author wolfgang-aura // @license MIT // @match https://x.com/* // @match https://twitter.com/* // @match https://mobile.x.com/* // @match https://mobile.twitter.com/* // @icon https://abs.twimg.com/favicons/twitter.3.ico // @grant GM_xmlhttpRequest // @grant unsafeWindow // @grant GM_registerMenuCommand // @grant GM_unregisterMenuCommand // @connect pbs.twimg.com // @connect video.twimg.com // @connect abs.twimg.com // @connect cdn.syndication.twimg.com // @connect x.com // @connect twitter.com // @connect 127.0.0.1 // @connect localhost // @connect share.sourcecapsule.app // @connect sourcecapsule-share.wolfgang-aura.workers.dev // @run-at document-start // @noframes // @downloadURL https://raw.githubusercontent.com/wolfgang-aura/SourceCapsule/main/sourcecapsule.user.js // @updateURL https://raw.githubusercontent.com/wolfgang-aura/SourceCapsule/main/sourcecapsule.user.js // ==/UserScript== /* * SourceCapsule * ------------- * Saves an X (Twitter) Article or single post as ONE self-contained .html file * that opens fully offline: every image and short video is base64-inlined, and * quoted tweets are rebuilt as real, styled, selectable HTML (not screenshots). * * ARCHITECTURE (read this before editing) * ======================================= * The code is split into two layers with a deliberate seam between them: * * 1. FRAGILE LAYER - anything that reads X's DOM. X reshuffles its markup * often, so ALL of its selectors live in the CONFIG block below, and the * extraction functions produce a plain-object "model". When X breaks the * tool, the fix is almost always here, and almost always just a selector. * * 2. STABLE LAYER - the durable engine: privileged fetch -> base64 -> * assemble HTML -> download. It only ever touches the model, never X's DOM, * so it rarely needs to change. * * The model is the contract between the two. See buildModel* (producers) and * assembleHtml (consumer). * * WHY A USERSCRIPT? CORS. Reading the raw bytes of pbs.twimg.com / * video.twimg.com media to base64-encode them is blocked from a normal page * context. GM_xmlhttpRequest (with the @connect grants above) is the privileged * fetch that makes inlining possible. That single constraint is why this is a * userscript and not a plain content script. */ (function () { 'use strict'; // =========================================================================== // CONFIG - *** EDIT HERE WHEN X CHANGES *** // --------------------------------------------------------------------------- // If the tool stops finding part of the page, a selector below is almost // certainly stale. Update it here; the rest of the code should not need to // change. Each selector lists fallbacks (tried in order). // =========================================================================== const CONFIG = { selectors: { // The main content column of a status / article page. primaryColumn: ['div[data-testid="primaryColumn"]', 'main[role="main"]'], // A single tweet block (the primary post and any quoted/embedded tweets). tweet: ['article[data-testid="tweet"]', 'article[role="article"]'], // The rich text of a tweet. `div[lang]` is a fallback: X wraps tweet text // in a div carrying a `lang` attribute even if the testid changes. tweetText: ['div[data-testid="tweetText"]', 'div[lang]'], // Author name/handle block within a tweet. userName: ['div[data-testid="User-Name"]'], // Avatar image within a tweet. avatar: ['div[data-testid="Tweet-User-Avatar"] img', 'img[src*="profile_images"]'], // Photos within a tweet. tweetPhoto: [ 'div[data-testid="tweetPhoto"] img', 'a[href*="/photo/"] img', 'img[src*="pbs.twimg.com/media/"]', ], // Video container within a tweet. videoPlayer: ['div[data-testid="videoPlayer"]', 'div[data-testid="videoComponent"]'], // External link-preview card within a tweet (the payload of a link post). card: ['div[data-testid="card.wrapper"]', 'a[data-testid="card.wrapper"]'], // "Show more" link X renders on long-form (note) posts in timeline views; // its presence means the visible text is only a preview. showMore: ['[data-testid="tweet-text-show-more-link"]'], // Time element (carries the canonical permalink). timeLink: ['a[href*="/status/"] time'], // Long-form Article rich-text root. articleRoot: [ 'div[data-testid="twitterArticleReadView"]', 'div[data-testid="twitterArticleRichTextView"]', 'div[data-testid="twitterArticleReader"]', ], articleTextRoot: ['div[data-testid="longformRichTextComponent"]'], // Long-form Article title. articleTitle: [ 'div[data-testid="twitter-article-title"]', 'div[data-testid="twitterArticleTitle"]', 'h1[role="heading"]', 'h1', ], }, video: { inlineEnabled: true, inlineCapBytes: Infinity, // Fetch any discovered MP4; fallback only after preservation fails. minPlayableBytes: 32 * 1024, networkCaptureMaxChars: 2_000_000, }, image: { preferOriginal: true, // request the full-resolution pbs.twimg.com variant }, fetchTimeoutMs: 30000, buttonId: 'sourcecapsule-btn', // Per-post Export buttons attached to each post on status/article pages, so the // user picks exactly which post to export instead of relying on one page-level // button (avoids accidentally exporting the wrong tweet). Set false to disable. perPostButtons: true, postControlClass: 'sourcecapsule-post-ctl', postControlFlag: 'data-sourcecapsule-ctl', toastId: 'sourcecapsule-toast', styleId: 'sourcecapsule-style', debug: true, debugEmbed: true, // Scroll the page top-to-bottom before extracting so X's lazy/virtualized // media loads into the DOM. The #1 suspected cause of missing tweet images. forceLoad: true, forceLoadMaxMs: 45000, forceLoadSettleMs: 2500, videoNudgeTimeoutMs: 700, // Fetch each embedded/quoted tweet by id from X's public syndication endpoint // to get its authoritative text + media, instead of scraping the fragile, // virtualized article DOM. This is what makes quote media reliably correct. useSyndication: true, share: { // Hosted share service (Cloudflare Worker + R2). Point this at a local // `npm run dev:share` (http://127.0.0.1:8787) through the userscript-manager // menu when developing; new hosts also need an @connect grant above. defaultApiBase: 'https://sourcecapsule-share.wolfgang-aura.workers.dev', maxBytes: 25 * 1024 * 1024, expiryDays: [1, 7, 30], defaultExpiryDays: 7, }, }; const APP = 'SourceCapsule'; const VERSION = '1.2.1'; // =========================================================================== // Small utilities // =========================================================================== const log = (...a) => CONFIG.debug && console.log(`[${APP}]`, ...a); const warn = (...a) => console.warn(`[${APP}]`, ...a); const errlog = (...a) => console.error(`[${APP}]`, ...a); const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); const withTimeout = (promise, ms) => Promise.race([ Promise.resolve(promise).catch((error) => ({ error })), sleep(ms).then(() => ({ timedOut: true })), ]); /** Return the first element matching any selector in the list, or null. */ function pick(root, selectorList, { quiet = false } = {}) { const list = Array.isArray(selectorList) ? selectorList : [selectorList]; for (const sel of list) { const el = (root || document).querySelector(sel); if (el) return el; } if (!quiet) warn('selector miss (none matched):', list.join(' || ')); return null; } /** Return all elements matching the FIRST selector in the list that hits. */ function pickAll(root, selectorList) { const list = Array.isArray(selectorList) ? selectorList : [selectorList]; for (const sel of list) { const els = (root || document).querySelectorAll(sel); if (els.length) return Array.from(els); } return []; } /** Return matches for all selectors, including root, without stopping early. */ function pickAllMatchesIncludingRoot(root, selectorList) { const list = Array.isArray(selectorList) ? selectorList : [selectorList]; const seen = new Set(); const els = []; const add = (el) => { if (el && !seen.has(el)) { seen.add(el); els.push(el); } }; for (const sel of list) { if (root && root.matches && root.matches(sel)) add(root); (root || document).querySelectorAll(sel).forEach(add); } return els; } function escapeHtml(s) { return String(s == null ? '' : s) .replace(/&/g, '&') .replace(//g, '>') .replace(/"/g, '"') .replace(/'/g, '''); } function escapeJsonScript(s) { return escapeJsonForHtml(s); } function slugify(s) { const base = String(s || '') .toLowerCase() .replace(/[^\w\s-]/g, '') .trim() .replace(/\s+/g, '-') .replace(/-+/g, '-') .slice(0, 80) .replace(/^-+|-+$/g, ''); return base || 'x-export'; } function nowStamp() { return new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); } function humanBytes(n) { if (!n && n !== 0) return '?'; const u = ['B', 'KB', 'MB', 'GB']; let i = 0; while (n >= 1024 && i < u.length - 1) { n /= 1024; i++; } return `${n.toFixed(i ? 1 : 0)} ${u[i]}`; } function formatDuration(seconds) { const n = Number(seconds); if (!Number.isFinite(n) || n <= 0) return ''; const total = Math.round(n); const h = Math.floor(total / 3600); const m = Math.floor((total % 3600) / 60); const s = total % 60; if (h) return `${h}:${String(m).padStart(2, '0')}:${String(s).padStart(2, '0')}`; return `${m}:${String(s).padStart(2, '0')}`; } function escapeAttr(s) { return escapeHtml(s); } // Only http(s)/mailto URLs may become an href in the EXPORTED file, which opens in a // file:// context. This neutralizes javascript:/data:/vbscript: schemes that would // otherwise survive escaping and execute when a reader clicks a link in the archive. // Returns '' for anything not on the scheme allowlist; callers must drop the link then. function safeUrl(u) { const s = String(u == null ? '' : u).trim(); if (!s) return ''; return /^(?:https?:|mailto:)/i.test(s) ? s : ''; } // X's syndication API returns tweet text with &, <, > already HTML-encoded (the classic // Twitter behaviour). Decode those back to plain text before our own escaping, so we don't // double-encode and render a literal "&" in the archive. Decode & last so an // encoded "<" doesn't get turned into a real "<". function decodeBasicEntities(s) { return String(s == null ? '' : s) .replace(/</g, '<') .replace(/>/g, '>') .replace(/&/g, '&'); } function countBlocks(blocks, predicate) { let count = 0; const walk = (items) => { (items || []).forEach((b) => { if (predicate(b)) count += 1; if (b.kind === 'quote' || b.kind === 'blockquote') walk(b.blocks); }); }; walk(blocks); return count; } function normalizeExternalLinks(html) { return String(html || '').replace( /]*\bhref="(https?:\/\/[^"]+)"[^>]*)>/gi, (tag, attrs) => { let next = attrs; if (/\btarget\s*=/.test(next)) { next = next.replace(/\btarget\s*=\s*"[^"]*"/i, 'target="_blank"'); } else { next += ' target="_blank"'; } if (/\brel\s*=/.test(next)) { next = next.replace(/\brel\s*=\s*"([^"]*)"/i, (relTag, relValue) => { const rels = new Set( String(relValue || '') .split(/\s+/) .filter(Boolean) ); rels.add('noopener'); rels.add('noreferrer'); return `rel="${Array.from(rels).join(' ')}"`; }); } else { next += ' rel="noopener noreferrer"'; } return ``; } ); } function videoDimensionsFromUrl(url) { const match = String(url || '').match(/\/(\d{2,5})x(\d{2,5})(?:\/|[._-])/); if (!match) return {}; const width = Number(match[1]); const height = Number(match[2]); return Number.isFinite(width) && Number.isFinite(height) ? { width, height } : {}; } function applyVideoDimensions(block, dimensions) { const width = Number(dimensions && dimensions.width); const height = Number(dimensions && dimensions.height); if (Number.isFinite(width) && width > 0) block.width = Math.round(width); if (Number.isFinite(height) && height > 0) block.height = Math.round(height); } function escapeJsonForHtml(s) { return String(s) .replace(//g, '\\u003e') .replace(/&/g, '\\u0026') .replace(/[\s\S]/g, (c) => { const code = c.charCodeAt(0); if (code <= 0x7f) return c; return `\\u${code.toString(16).padStart(4, '0')}`; }); } function safeIsoTime(value) { const d = new Date(value); return Number.isNaN(d.getTime()) ? '' : d.toISOString(); } function readableUtcTime(value) { const iso = safeIsoTime(value); if (!iso) return 'Unknown time'; return iso.replace('T', ' ').replace(/\.\d{3}Z$/, ' UTC'); } function decodeHtmlCodePoint(match, code, radix = 10) { const n = parseInt(code, radix); try { return Number.isFinite(n) ? String.fromCodePoint(n) : match; } catch { return match; } } function textFromHtml(html) { return String(html || '') .replace(//gi, '\n') .replace(/<[^>]+>/g, ' ') .replace(/ /g, ' ') .replace(/&/g, '&') .replace(/</g, '<') .replace(/>/g, '>') .replace(/"/g, '"') .replace(/'/g, "'") .replace(/&#(\d+);/g, (match, code) => decodeHtmlCodePoint(match, code)) .replace(/&#x([0-9a-f]+);/gi, (match, code) => decodeHtmlCodePoint(match, code, 16)) .replace(/\s+/g, ' ') .trim(); } function blockTextForLanguage(block) { if (!block) return ''; if (block.kind === 'heading') return block.text || ''; if (block.kind === 'paragraph') return textFromHtml(block.html); if (block.kind === 'code') return block.text || ''; if (block.kind === 'list') return (block.items || []).map(textFromHtml).join(' '); if (block.kind === 'quote' || block.kind === 'blockquote') return (block.blocks || []).map(blockTextForLanguage).join(' '); return ''; } function inferDocumentLang(model) { const text = [model.title, model.heading, ...(model.blocks || []).map(blockTextForLanguage)] .join(' ') .slice(0, 12000); const cjk = (text.match(/[\u3400-\u9fff]/g) || []).length; const latin = (text.match(/[A-Za-z]/g) || []).length; if (cjk >= 12 && cjk >= latin * 0.25) return 'zh-CN'; return 'en'; } function statusIdFromSourceUrl(url) { const id = statusIdFromUrl(url); if (id) return id; const article = String(url || '').match(/\/article\/(\d+)/); return article ? article[1] : ''; } // Reserved first-path segments on x.com/twitter.com that are NOT user handles. const NON_HANDLE_SEGMENTS = new Set([ 'i', 'home', 'search', 'explore', 'notifications', 'messages', 'settings', 'compose', 'hashtag', 'intent', 'share', 'login', 'signup', 'about', 'tos', 'privacy', ]); /** * Best-effort author handle from a post/article URL (e.g. https://x.com/dingyi/status/123 -> * "@dingyi"). Used only as a fallback when the DOM author metadata is missing. Returns '' for * reserved paths (/i/, /home, ...) or anything that does not look like a handle. */ function handleFromSourceUrl(url) { const m = String(url || '').match( /^https?:\/\/(?:[\w-]+\.)*(?:x|twitter)\.com\/([A-Za-z0-9_]{1,15})(?:[/?#]|$)/i ); if (!m) return ''; if (NON_HANDLE_SEGMENTS.has(m[1].toLowerCase())) return ''; return `@${m[1]}`; } function publishedAtFromElement(root, expectedStatusId = '') { const times = Array.from( (root || document).querySelectorAll ? (root || document).querySelectorAll('time[datetime]') : [] ); if (!times.length) return ''; const normalizedExpected = String(expectedStatusId || ''); const matching = normalizedExpected ? times.find((time) => { const anchor = time.closest && time.closest('a[href*="/status/"]'); return anchor && statusIdFromUrl(anchor.href) === normalizedExpected; }) : null; const time = matching || times[0]; return safeIsoTime(time.getAttribute('datetime') || ''); } function normalizeVideoUrl(url) { if (!url) return ''; let value = String(url).trim(); if (!value || value.startsWith('blob:') || value.startsWith('data:')) return ''; value = value .replace(/\\u0026/g, '&') .replace(/\\\//g, '/') .replace(/&/g, '&'); try { return new URL(value, typeof location !== 'undefined' ? location.href : undefined).toString(); } catch { return /^https?:\/\//.test(value) ? value : ''; } } function videoUrlKind(url) { const lower = String(url || '').toLowerCase(); if (lower.includes('.mp4')) return 'mp4'; if (lower.includes('.m3u8')) return 'hls'; return ''; } function isInterestingVideoUrl(url) { const lower = String(url || '').toLowerCase(); return ( lower.includes('video.twimg.com') || lower.includes('.mp4') || lower.includes('.m3u8') || lower.includes('amplify_video') || lower.includes('ext_tw_video') || lower.includes('tweet_video') ); } function videoCandidate(url, source = 'unknown', extra = {}) { const normalized = normalizeVideoUrl(url); if (!normalized || !isInterestingVideoUrl(normalized)) return null; return { url: normalized, kind: videoUrlKind(normalized), source, bitrate: Number(extra.bitrate) > 0 ? Number(extra.bitrate) : undefined, ...videoDimensionsFromUrl(normalized), ...extra, }; } function addVideoCandidate(out, seen, candidate) { if (!candidate || !candidate.url || seen.has(candidate.url)) return; seen.add(candidate.url); out.push(candidate); } function videoCandidatesFromText(text, source = 'text') { const out = []; const seen = new Set(); const raw = String(text || ''); const patterns = [ /https?:\\\/\\\/video\.twimg\.com\\\/[^"'<>\\\s]+/g, /https?:\/\/video\.twimg\.com\/[^"' <>\s]+/g, ]; patterns.forEach((pattern) => { raw.replace(pattern, (url) => { addVideoCandidate(out, seen, videoCandidate(url, source)); return url; }); }); return out; } function xVideoMediaKey(url) { const value = String(url || ''); const match = value.match( /(?:amplify_video_thumb|amplify_video|ext_tw_video_thumb|ext_tw_video|tweet_video_thumb|tweet_video)\/(\d+)/i ); return match ? match[1] : ''; } function structuredPosterUrl(value) { if (!value) return ''; if (typeof value === 'string') return value; if (typeof value !== 'object') return ''; return ( value.original_img_url || value.url || value.media_url_https || value.media_url || value.preview_image_url || value.thumbnail_url || '' ); } function sortVideoCandidates(candidates) { return (candidates || []).slice().sort((a, b) => { if (a.kind !== b.kind) return a.kind === 'mp4' ? -1 : 1; const bitrateDelta = (Number(b.bitrate) || 0) - (Number(a.bitrate) || 0); if (bitrateDelta) return bitrateDelta; const pixelsB = (Number(b.width) || 0) * (Number(b.height) || 0); const pixelsA = (Number(a.width) || 0) * (Number(a.height) || 0); return pixelsB - pixelsA; }); } function videoCandidatesFromStructuredData(value, source = 'json') { const out = []; const seen = new Set(); const add = (url, candidateSource, extra) => addVideoCandidate(out, seen, videoCandidate(url, candidateSource, extra)); const walk = (item, itemSource) => { if (!item) return; if (typeof item === 'string') { videoCandidatesFromText(item, itemSource).forEach((candidate) => addVideoCandidate(out, seen, candidate) ); return; } if (Array.isArray(item)) { item.forEach((child) => walk(child, itemSource)); return; } if (typeof item !== 'object') return; const variants = item.video_info && Array.isArray(item.video_info.variants) ? item.video_info.variants : Array.isArray(item.variants) ? item.variants : []; const posterUrl = structuredPosterUrl(item.media_url_https) || structuredPosterUrl(item.media_url) || structuredPosterUrl(item.preview_image_url) || structuredPosterUrl(item.preview_image) || structuredPosterUrl(item.thumbnail_url); const mediaKey = item.media_key || item.id_str || item.id || xVideoMediaKey(posterUrl); variants.forEach((variant) => { if (!variant || !variant.url) return; add(variant.url, `${itemSource}:variant`, { bitrate: variant.bitrate, contentType: variant.content_type || variant.contentType || '', posterUrl, mediaKey, }); }); if ( item.url && (item.content_type === 'video/mp4' || item.contentType === 'video/mp4' || String(item.url).includes('.mp4') || String(item.url).includes('.m3u8')) ) { add(item.url, itemSource, { bitrate: item.bitrate, contentType: item.content_type || item.contentType || '', posterUrl, mediaKey, }); } Object.keys(item).forEach((key) => walk(item[key], itemSource)); }; walk(value, source); return sortVideoCandidates(out); } function videoCandidatesFromJsonText(text, source = 'json') { const raw = String(text || '').trim(); if (!raw || (raw[0] !== '{' && raw[0] !== '[')) return []; try { return videoCandidatesFromStructuredData(JSON.parse(raw), source); } catch { return []; } } // =========================================================================== // STABLE LAYER - privileged fetch + base64 inlining // =========================================================================== /** Hosts the privileged byte fetch is allowed to hit. All inlineable X media lives on * *.twimg.com; restricting here (in addition to the @connect grants) bounds SSRF so a * crafted media URL in a post cannot make the script fetch an arbitrary origin. */ function isAllowedMediaHost(url) { try { const host = new URL(url, location.href).hostname.toLowerCase(); return host === 'twimg.com' || host.endsWith('.twimg.com'); } catch { return false; } } /** Fetch raw bytes through the userscript manager (bypasses page CORS). */ function gmFetchBytes(url) { return new Promise((resolve, reject) => { if (typeof GM_xmlhttpRequest !== 'function') { reject(new Error('GM_xmlhttpRequest unavailable - is the userscript manager granting it?')); return; } if (!isAllowedMediaHost(url)) { reject(new Error(`Refusing to fetch non-twimg media host: ${url}`)); return; } GM_xmlhttpRequest({ method: 'GET', url, responseType: 'arraybuffer', timeout: CONFIG.fetchTimeoutMs, onload: (res) => { if (res.status >= 200 && res.status < 300 && res.response) { const header = (res.responseHeaders || '').match(/content-type:\s*([^\r\n;]+)/i); const mime = (header && header[1] ? header[1] : guessMime(url)).trim(); resolve({ bytes: new Uint8Array(res.response), mime }); } else { reject(new Error(`HTTP ${res.status} for ${url}`)); } }, onerror: () => reject(new Error(`Network error for ${url}`)), ontimeout: () => reject(new Error(`Timeout (${CONFIG.fetchTimeoutMs}ms) for ${url}`)), }); }); } function guessMime(url) { const u = url.split('?')[0].toLowerCase(); if (u.endsWith('.png')) return 'image/png'; if (u.endsWith('.gif')) return 'image/gif'; if (u.endsWith('.webp')) return 'image/webp'; if (u.endsWith('.mp4')) return 'video/mp4'; if (u.endsWith('.svg')) return 'image/svg+xml'; return 'image/jpeg'; } /** ArrayBuffer/Uint8Array -> base64 (chunked to avoid call-stack limits). */ function bytesToBase64(bytes) { let binary = ''; const chunk = 0x8000; for (let i = 0; i < bytes.length; i += chunk) { binary += String.fromCharCode.apply(null, bytes.subarray(i, i + chunk)); } return btoa(binary); } /** base64 string -> Uint8Array (inverse of bytesToBase64; atob exists in Node 18+). */ function base64ToBytes(b64) { const binary = atob(String(b64 || '')); const bytes = new Uint8Array(binary.length); for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i); return bytes; } /** "data:[;base64]," -> { bytes, mime }. */ function dataUriToBytes(dataUri) { const s = String(dataUri || ''); const comma = s.indexOf(','); if (comma === -1 || !s.startsWith('data:')) return { bytes: new Uint8Array(0), mime: '' }; const header = s.slice(5, comma); const mime = header.split(';')[0] || ''; const payload = s.slice(comma + 1); const bytes = /;base64/i.test(header) ? base64ToBytes(payload) : new TextEncoder().encode(decodeURIComponent(payload)); return { bytes, mime }; } /** MIME -> a sensible file extension for sidecar media files. */ function mimeToExt(mime) { switch (String(mime || '').toLowerCase()) { case 'image/jpeg': case 'image/jpg': return 'jpg'; case 'image/png': return 'png'; case 'image/gif': return 'gif'; case 'image/webp': return 'webp'; case 'image/svg+xml': return 'svg'; case 'video/mp4': return 'mp4'; default: return 'bin'; } } /** * PURE: decide the on-disk folder names for one export, given the user's layout pref. * `date` is a pre-formatted "YYYY-MM-DD" string (caller supplies the local date). Returns the * directory segments from the chosen root down to the per-post folder. The post-folder name is * stable (handle + status id) so re-exporting the same post overwrites instead of duplicating. */ function bundlePaths(model, prefs, date) { const layout = prefs && prefs.layout === 'flat' ? 'flat' : 'date'; const handle = String((model.author && model.author.handle) || '').replace(/^@/, ''); const statusId = statusIdFromSourceUrl(model.sourceUrl || ''); let postName; if (handle && statusId) postName = `${slugify(handle)}-${statusId}`; else if (statusId) postName = `post-${statusId}`; else postName = slugify(model.title || model.heading || 'x-export'); const dateFolder = String(date || ''); if (layout === 'flat') { const folder = dateFolder ? `${dateFolder}_${postName}` : postName; return { layout, dateFolder, postName, postFolder: folder, segments: [folder] }; } return { layout, dateFolder, postName, postFolder: postName, segments: dateFolder ? [dateFolder, postName] : [postName], }; } /** Local "YYYY-MM-DD" for date-grouped folders (the user's day, not UTC). */ function localDateStamp(d = new Date()) { const pad = (n) => String(n).padStart(2, '0'); return `${d.getFullYear()}-${pad(d.getMonth() + 1)}-${pad(d.getDate())}`; } function normalizeTags(value) { const input = Array.isArray(value) ? value : String(value || '').split(','); const seen = new Set(); return input .map((tag) => String(tag || '') .trim() .replace(/^#+/, '') .replace(/\s+/g, '-') ) .filter((tag) => { const key = tag.toLowerCase(); if (!tag || seen.has(key)) return false; seen.add(key); return true; }) .slice(0, 20); } function applyCaptureMetadata(model, metadata = {}) { model.userNote = String(metadata.note || '') .trim() .slice(0, 2000); model.tags = normalizeTags(metadata.tags); return model; } function libraryIndexEntry(model, paths, stats) { const id = statusIdFromSourceUrl(model.sourceUrl) || slugify(model.title || 'capture'); const relativePath = [...paths.segments, `${paths.postName}.llm.md`].join('/'); return { id, title: markdownLineText(model.heading || model.title || 'X capture'), author: markdownLineText( [model.author && model.author.name, model.author && model.author.handle] .filter(Boolean) .join(' ') ), type: model.thread ? `thread (${model.thread.capturedPosts} posts)` : model.type || 'post', sourceUrl: model.sourceUrl || '', savedAt: safeIsoTime(model.exportedAt), path: relativePath, note: markdownLineText(model.userNote || ''), tags: normalizeTags(model.tags), capture: `${stats.images} image(s), ${stats.videos} video(s), ${stats.incompleteMedia} incomplete, ${stats.missingMedia} missing`, }; } function renderLibraryIndexItem(entry) { const lines = [ ``, `## ${entry.title || 'X capture'}`, '', `- ID: ${entry.id}`, `- Type: ${entry.type}`, `- Author: ${entry.author || 'Unknown'}`, `- Source: ${entry.sourceUrl}`, `- Saved: ${entry.savedAt}`, `- File: ${entry.path}`, `- Capture: ${entry.capture}`, ]; if (entry.tags && entry.tags.length) lines.push(`- Tags: ${entry.tags.join(', ')}`); if (entry.note) lines.push(`- Saved because: ${entry.note}`); lines.push('', ``); return lines.join('\n'); } function updateLibraryIndexText(existing, entry) { const header = '# SourceCapsule Library Index\n\n\n'; const current = String(existing || '').trim() || header.trim(); const escapedId = String(entry.id).replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); const pattern = new RegExp( `[\\s\\S]*?`, 'g' ); const item = renderLibraryIndexItem(entry); if (pattern.test(current)) return `${current.replace(pattern, item).trim()}\n`; return `${current.trim()}\n\n${item}\n`; } // --------------------------------------------------------------------------- // Store-only ZIP writer (no dependency). Used only as the fallback delivery on // browsers without the File System Access API. Media is already compressed, so // we store (method 0) rather than deflate - simpler and effectively the same size. // --------------------------------------------------------------------------- const CRC32_TABLE = (() => { const table = new Uint32Array(256); for (let n = 0; n < 256; n++) { let c = n; for (let k = 0; k < 8; k++) c = c & 1 ? 0xedb88320 ^ (c >>> 1) : c >>> 1; table[n] = c >>> 0; } return table; })(); function crc32(bytes) { let crc = 0xffffffff; for (let i = 0; i < bytes.length; i++) crc = CRC32_TABLE[(crc ^ bytes[i]) & 0xff] ^ (crc >>> 8); return (crc ^ 0xffffffff) >>> 0; } /** * Build a store-only ZIP from `[{ name, bytes }]` -> Uint8Array. Names use forward slashes * (e.g. "media/image-001.jpg") and must be ASCII. No timestamps (set to 0). */ function buildZip(entries) { const enc = new TextEncoder(); const chunks = []; const central = []; let offset = 0; const u16 = (n) => [n & 0xff, (n >>> 8) & 0xff]; const u32 = (n) => [n & 0xff, (n >>> 8) & 0xff, (n >>> 16) & 0xff, (n >>> 24) & 0xff]; for (const entry of entries) { const nameBytes = enc.encode(entry.name); const data = entry.bytes instanceof Uint8Array ? entry.bytes : new Uint8Array(entry.bytes); const crc = crc32(data); const local = [ ...u32(0x04034b50), // local file header signature ...u16(20), // version needed ...u16(0), // flags ...u16(0), // method: store ...u16(0), // mod time ...u16(0), // mod date ...u32(crc), ...u32(data.length), // compressed size ...u32(data.length), // uncompressed size ...u16(nameBytes.length), ...u16(0), // extra length ]; chunks.push(new Uint8Array(local), nameBytes, data); central.push({ nameBytes, crc, size: data.length, offset }); offset += local.length + nameBytes.length + data.length; } const centralStart = offset; let centralSize = 0; for (const c of central) { const header = [ ...u32(0x02014b50), // central directory header signature ...u16(20), // version made by ...u16(20), // version needed ...u16(0), // flags ...u16(0), // method: store ...u16(0), // mod time ...u16(0), // mod date ...u32(c.crc), ...u32(c.size), ...u32(c.size), ...u16(c.nameBytes.length), ...u16(0), // extra length ...u16(0), // comment length ...u16(0), // disk number start ...u16(0), // internal attrs ...u32(0), // external attrs ...u32(c.offset), ]; chunks.push(new Uint8Array(header), c.nameBytes); centralSize += header.length + c.nameBytes.length; } const end = [ ...u32(0x06054b50), // end of central directory signature ...u16(0), // disk number ...u16(0), // disk with central dir ...u16(central.length), ...u16(central.length), ...u32(centralSize), ...u32(centralStart), ...u16(0), // comment length ]; chunks.push(new Uint8Array(end)); const total = chunks.reduce((sum, c) => sum + c.length, 0); const out = new Uint8Array(total); let pos = 0; for (const c of chunks) { out.set(c, pos); pos += c.length; } return out; } async function sha256Hex(bytes) { if (typeof crypto === 'undefined' || !crypto.subtle) return ''; const digest = await crypto.subtle.digest('SHA-256', bytes); return Array.from(new Uint8Array(digest)) .map((b) => b.toString(16).padStart(2, '0')) .join(''); } /** Fetch a URL and return it as a data: URI (plus size for the cap check). */ async function fetchAsDataUri(url) { const { bytes, mime } = await gmFetchBytes(url); const sha256 = await sha256Hex(bytes); return { dataUri: `data:${mime};base64,${bytesToBase64(bytes)}`, bytes, size: bytes.length, mime, sha256: sha256 ? `sha256:${sha256}` : '', }; } function bytesAscii(bytes, offset, length) { if (!bytes || bytes.length < offset + length) return ''; let out = ''; for (let i = offset; i < offset + length; i++) out += String.fromCharCode(bytes[i]); return out; } function mp4HasBox(bytes, boxType) { if (!bytes || !boxType) return false; const needle = String(boxType); for (let i = 4; i <= bytes.length - 4; i++) { if (bytesAscii(bytes, i, 4) === needle) return true; } return false; } function validateMp4Download({ bytes, size, mime, url }) { const actualSize = Number(size || (bytes && bytes.length) || 0); const isVideo = /^video\//i.test(mime || '') || /octet-stream/i.test(mime || ''); if (!isVideo) throw new Error(`unexpected content-type ${mime || 'unknown'}`); if (actualSize < CONFIG.video.minPlayableBytes) { throw new Error(`video response too small (${humanBytes(actualSize)})`); } if ( String(url || '') .toLowerCase() .includes('.mp4') && !mp4HasBox(bytes, 'mdat') ) { throw new Error('mp4 response has no media data box'); } } function imageFetchCandidates(url) { const out = []; const add = (candidate) => { if (candidate && !out.includes(candidate)) out.push(candidate); }; add(url); try { const u = new URL(url, typeof location !== 'undefined' ? location.href : undefined); if (u.hostname === 'pbs.twimg.com' && u.pathname.startsWith('/media/')) { const originalName = u.searchParams.get('name'); ['orig', '4096x4096', 'large', 'medium', 'small'].forEach((name) => { const next = new URL(u.toString()); next.searchParams.set('name', name); add(next.toString()); }); if (originalName) { const next = new URL(u.toString()); next.searchParams.set('name', originalName); add(next.toString()); } } } catch { // Keep the original candidate only. } return out; } async function fetchImageAsDataUri(url) { let lastError = null; for (const candidate of imageFetchCandidates(url)) { try { return await fetchAsDataUri(candidate); } catch (e) { lastError = e; } } throw lastError || new Error(`image fetch failed for ${url}`); } /** Rewrite a pbs.twimg.com image URL to its full-resolution variant. */ function highResImageUrl(url) { try { const base = typeof location !== 'undefined' ? location.href : undefined; const u = new URL(url, base); if (u.hostname === 'pbs.twimg.com' && u.pathname.startsWith('/media/')) { if (CONFIG.image.preferOriginal) { const fmt = u.searchParams.get('format') || 'jpg'; u.searchParams.set('format', fmt); u.searchParams.set('name', 'orig'); } return u.toString(); } // Bump avatar size where the URL uses the _normal/_bigger suffix convention. if ( u.hostname === 'pbs.twimg.com' && /_(normal|bigger|mini|x96|200x200)\./.test(u.pathname) ) { return u.toString().replace(/_(normal|bigger|mini|x96|200x200)\./, '_400x400.'); } return url; } catch { return url; } } // =========================================================================== // FRAGILE LAYER - read X's DOM, produce the normalized model // --------------------------------------------------------------------------- // MODEL SHAPE // { // type: 'article' | 'post', // title, sourceUrl, exportedAt, // author: { name, handle, avatarUrl, avatarDataUri? }, // blocks: Block[] // } // Block: // { kind: 'heading', level, text } // { kind: 'paragraph', html } // sanitized inline html // { kind: 'list', ordered, items: html[] } // { kind: 'divider' } // { kind: 'code', text } // { kind: 'blockquote', blocks } // { kind: 'image', url, alt, dataUri? } // { kind: 'video', posterUrl, mp4Url?, sourceUrl, mode?, dataUri?, posterDataUri? } // { kind: 'quote', author, blocks, sourceUrl } // a rebuilt quoted tweet // =========================================================================== function detectPageType() { const p = location.pathname; if (/\/i\/article\//.test(p) || /\/article\//.test(p)) return 'article'; if (pick(document, CONFIG.selectors.articleTextRoot, { quiet: true })) return 'article'; if (pick(document, CONFIG.selectors.articleRoot, { quiet: true })) return 'article'; if (/\/status\/\d+/.test(p)) return 'post'; return null; } function statusIdFromUrl(url) { const match = String(url || '').match(/\/status\/(\d+)/); return match ? match[1] : ''; } function normalizeStatusUrl(url) { if (!url) return ''; try { const u = new URL(url, location.origin); const match = u.pathname.match(/^(.*\/status\/\d+)/); return match ? `${u.origin}${match[1]}` : u.href.split('?')[0]; } catch { return String(url).split('?')[0]; } } function currentStatusId() { return statusIdFromUrl(location.pathname); } function canonicalUrl(root = document, expectedStatusId = currentStatusId()) { // Prefer the matching permalink carried by a