/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /** * This module handles the visit extraction data from browsing history */ import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs"; const MS_PER_DAY = 86_400_000; const MICROS_PER_MS = 1_000; const MS_PER_SEC = 1_000; const MICROS_PER_SEC = 1_000_000; const SECONDS_PER_DAY = 86_400; // History fetch defaults const DEFAULT_DAYS = 60; const DEFAULT_MAX_RESULTS = 3000; // Sessionization defaults const DEFAULT_GAP_SEC = 900; const DEFAULT_MAX_SESSION_SEC = 7200; // Recency defaults const DEFAULT_HALFLIFE_DAYS = 14; const DEFAULT_RECENCY_FLOOR = 0.5; const DEFAULT_SESSION_WEIGHT = 1.0; const SEARCH_ENGINE_DOMAINS = [ "google", "bing", "duckduckgo", "search.brave", "yahoo", "startpage", "ecosia", "baidu", "yandex", ]; function escapeRe(s) { return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } const SEARCH_ENGINE_PATTERN = new RegExp( `(^|\\.)(${SEARCH_ENGINE_DOMAINS.map(escapeRe).join("|")})\\.`, "i" ); /** * Fetch recent browsing history from Places (SQL), aggregate by URL, * tag "search" vs "history", and attach simple frequency percentiles. * * This API is designed to support both: * - Initial ("Day 0") backfills over a fixed time window, and * - Incremental reads using a visit_date watermark (`sinceMicros`). * * Callers can either: * 1. Pass `sinceMicros` (microseconds since epoch, Places visit_date-style) * to fetch visits with `visit_date >= sinceMicros`, or * 2. Omit `sinceMicros` and let `days` define a relative cutoff window * from "now" (e.g., last 60 days). * * Typical usage: * - Day 0: getRecentHistory({ sinceMicros: 0, maxResults: 3000 }) * // or: getRecentHistory({ days: 60, maxResults: 3000 }) * - Incremental: * const rows = await getRecentHistory({ sinceMicros: lastWatermark }); * const nextWatermark = Math.max(...rows.map(r => r.visitDateMicros)); * * NOTE: `visitDateMicros` in the returned objects is the raw Places * visit_date (microseconds since epoch, UTC). * * @param {object} [opts] * @param {number} [opts.sinceMicros=null] * Optional absolute cutoff in microseconds since epoch (Places * visit_date). If provided, this is used directly as the cutoff: * only visits with `visit_date >= sinceMicros` are returned. * * This is the recommended way to implement incremental reads: * store the max `visitDateMicros` from the previous run and pass * it (or max + 1) back in as `sinceMicros`. * * @param {number} [opts.days=DEFAULT_DAYS] * How far back to look if `sinceMicros` is not provided. * The cutoff is computed as: * cutoff = now() - days * MS_PER_DAY * * Ignored when `sinceMicros` is non-null. * * @param {number} [opts.maxResults=DEFAULT_MAX_RESULTS] * Maximum number of rows to return from the SQL query (after * sorting by most recent visit). Note that this caps the number * of visits, not distinct URLs. * * @returns {Promise>} */ export async function getRecentHistory(opts = {}) { // If provided, this is a Places visit_date-style cutoff in microseconds // When non-null, `days` is ignored and we use `sinceMicros` directly. const { sinceMicros = null, days = DEFAULT_DAYS, maxResults = DEFAULT_MAX_RESULTS, } = opts; // Places stores visit_date in microseconds since epoch. let cutoffMicros; if (sinceMicros != null) { cutoffMicros = Math.max(0, sinceMicros); } else { cutoffMicros = Math.max( 0, (Date.now() - days * MS_PER_DAY) * MICROS_PER_MS ); } const isSearchVisit = urlStr => { try { const { hostname, pathname, search } = new URL(urlStr); const isSearchEngine = SEARCH_ENGINE_PATTERN.test(hostname); const looksLikeSearch = /search|results|query/i.test(pathname) || /[?&](q|query|p)=/i.test(search); return isSearchEngine && looksLikeSearch; } catch (e) { console.error("isSearchVisit: failed to parse URL", { error: String(e), urlLength: typeof urlStr === "string" ? urlStr.length : -1, }); return false; } }; const SQL = ` WITH visit_info AS ( SELECT p.id AS place_id, p.url AS url, o.host AS host, p.title AS title, v.visit_date AS visit_date, p.frecency AS frecency, CASE WHEN o.frecency = -1 THEN 1 ELSE o.frecency END AS domain_frecency FROM moz_places p JOIN moz_historyvisits v ON v.place_id = p.id JOIN moz_origins o ON p.origin_id = o.id WHERE v.visit_date >= :cutoff AND p.title IS NOT NULL AND p.frecency IS NOT NULL ORDER BY v.visit_date DESC LIMIT :limit ), /* Collapse to one row per place to compute percentiles (like your groupby/place_id mean) */ per_place AS ( SELECT place_id, MAX(frecency) AS frecency, MAX(domain_frecency) AS domain_frecency FROM visit_info GROUP BY place_id ), /* Percentiles using window function CUME_DIST() */ per_place_with_pct AS ( SELECT place_id, ROUND(100.0 * CUME_DIST() OVER (ORDER BY frecency), 2) AS frecency_pct, ROUND(100.0 * CUME_DIST() OVER (ORDER BY domain_frecency), 2) AS domain_frecency_pct FROM per_place ) /* Final rows: original visits + joined percentiles + source label */ SELECT v.url, v.host, v.title, v.visit_date, p.frecency_pct, p.domain_frecency_pct FROM visit_info v JOIN per_place_with_pct p USING (place_id) ORDER BY v.visit_date DESC `; try { const rows = await PlacesUtils.withConnectionWrapper( "smartwindow-getRecentHistory", async db => { const stmt = await db.execute(SQL, { cutoff: cutoffMicros, limit: maxResults, }); const out = []; for (const row of stmt) { const url = row.getResultByName("url"); const host = row.getResultByName("host"); const title = row.getResultByName("title") || ""; const visitDateMicros = row.getResultByName("visit_date") || 0; const frequencyPct = row.getResultByName("frecency_pct") || 0; const domainFrequencyPct = row.getResultByName("domain_frecency_pct") || 0; out.push({ url, domain: host, title, visitDateMicros, frequencyPct, domainFrequencyPct, source: isSearchVisit(url) ? "search" : "history", }); } return out; } ); return rows; } catch (error) { console.error("Failed to fetch Places history via SQL:", error); return []; } } /** * Sessionize visits using a gap and max session length. * Returns a new array sorted by ascending time and adds: * - session_id * - session_start_ms * - session_start_iso * * @param {Array<{visitDateMicros:number,title?:string,domain?:string,frequencyPct?:number,domainFrequencyPct?:number,source?:'history'|'search'}>} rows * @param {object} [opts] * @param {number} [opts.gapSec=900] Max allowed gap between consecutive visits in a session (seconds) * @param {number} [opts.maxSessionSec=7200] Max session duration from first to current visit (seconds) * @returns {Array} */ export function sessionizeVisits(rows, opts = {}) { const GAP_MS = (opts.gapSec ?? DEFAULT_GAP_SEC) * MS_PER_SEC; const MAX_SESSION_MS = (opts.maxSessionSec ?? DEFAULT_MAX_SESSION_SEC) * MS_PER_SEC; // Normalize and keep only visits with a valid timestamp const normalized = rows // Keep only rows with a valid timestamp .filter(row => Number.isFinite(row.visitDateMicros)) .map(row => ({ ...row, visitTimeMs: Math.floor(row.visitDateMicros / MICROS_PER_MS), })) .sort((a, b) => a.visitTimeMs - b.visitTimeMs); let curStartMs = null; let prevMs = null; for (const row of normalized) { const timeMs = row.visitTimeMs; const startNew = prevMs === null || timeMs - prevMs > GAP_MS || timeMs - curStartMs > MAX_SESSION_MS; if (startNew) { curStartMs = timeMs; } row.session_start_ms = curStartMs; row.session_start_iso = new Date(curStartMs).toISOString(); row.session_id = curStartMs; prevMs = timeMs; } return normalized; } /** * Build per-session feature records from sessionized rows. * * Output record shape: * { * session_id: number, * title_scores: { [title: string]: number }, * domain_scores: { [domain: string]: number }, * session_start_time: number | null, // epoch seconds * session_end_time: number | null, // epoch seconds * search_events: { * session_id: number, * search_count: number, * search_titles: string[], * last_searched: number, // epoch micros * } | {} * } * * @param {Array} rows sessionized visits * @returns {Array} */ export function generateProfileInputs(rows) { const bySession = new Map(); for (const row of rows) { const sessionId = row.session_id; if (!bySession.has(sessionId)) { bySession.set(sessionId, []); } bySession.get(sessionId).push(row); } // session_id -> { title: frecency_pct } const titleScoresBySession = {}; for (const [sessionId, items] of bySession) { const m = {}; for (const r of items) { const title = r.title ?? ""; const pct = r.frequencyPct; if (title && isFiniteNumber(pct)) { m[title] = pct; } } if (Object.keys(m).length) { titleScoresBySession[sessionId] = m; } } // session_id -> { domain: domain_frecency_pct } const domainScoresBySession = {}; for (const [sessionId, items] of bySession) { const m = {}; for (const r of items) { const domain = r.domain ?? r.host ?? ""; const pct = r.domainFrequencyPct; if (domain && isFiniteNumber(pct)) { m[domain] = pct; } } if (Object.keys(m).length) { domainScoresBySession[sessionId] = m; } } // session_id -> { search_count, search_titles (unique), last_searched } const searchSummaryBySession = {}; for (const [sessionId, items] of bySession) { const searchItems = items.filter(r => r.source === "search"); if (!searchItems.length) { continue; } const search_titles = [ ...new Set(searchItems.map(r => r.title).filter(Boolean)), ]; const last_searched_raw = Math.max( ...searchItems.map(r => Number(r.visitDateMicros) || 0) ); searchSummaryBySession[sessionId] = { session_id: sessionId, search_count: searchItems.length, search_titles, last_searched: last_searched_raw, }; } // session start/end times const sessionTimes = { start_time: {}, end_time: {} }; for (const [sessionId, items] of bySession) { const tsList = items .filter(Number.isFinite) .map(r => Number(r.visitDateMicros)); if (tsList.length) { sessionTimes.start_time[sessionId] = Math.min(...tsList); sessionTimes.end_time[sessionId] = Math.max(...tsList); } else { sessionTimes.start_time[sessionId] = null; sessionTimes.end_time[sessionId] = null; } } // final prepared inputs const preparedInputs = []; for (const sessionId of bySession.keys()) { const rawRecord = { session_id: sessionId, title_scores: titleScoresBySession[sessionId] || {}, domain_scores: domainScoresBySession[sessionId] || {}, session_start_time: normalizeEpochSeconds( sessionTimes.start_time[sessionId] ), session_end_time: normalizeEpochSeconds(sessionTimes.end_time[sessionId]), search_events: searchSummaryBySession[sessionId] || {}, }; const record = {}; for (const [key, value] of Object.entries(rawRecord)) { if (value !== undefined) { record[key] = value; } } preparedInputs.push(record); } return preparedInputs; } /** * Aggregate over sessions into three dictionaries: * - agg_domains: domain -> { score, last_seen, num_sessions, session_importance } * - agg_titles: title -> { score, last_seen, num_sessions, session_importance } * - agg_searches: session_id -> { search_count, search_titles[], last_searched(sec) } * * Notes: * - "last value wins" semantics for scores (matches your Python loop) * - session_importance ~ (#sessions total / #sessions item appears in), rounded 2dp * * @param {Array} preparedInputs * @returns {[Record, Record, Record]} */ export function aggregateSessions(preparedInputs) { // domain -> { score, last_seen, sessions:Set } const domainAgg = Object.create(null); // title -> { score, last_seen, sessions:Set } const titleAgg = Object.create(null); // sid -> { search_count, search_titles:Set, last_searched } const searchAgg = Object.create(null); const nowSec = Date.now() / 1000; const totalSessions = preparedInputs.length; for (const session of preparedInputs) { const sessionId = session.session_id; const startSec = session.session_start_time; const endSec = session.session_end_time; const lastSeenSec = endSec ?? startSec ?? nowSec; // domains const domainScores = session.domain_scores || {}; for (const [domain, scoreVal] of Object.entries(domainScores)) { const rec = getOrInit(domainAgg, domain, () => ({ score: 0.0, last_seen: 0, sessions: new Set(), })); rec.score = Number(scoreVal); // last value wins rec.last_seen = Math.max(rec.last_seen, lastSeenSec); rec.sessions.add(sessionId); } // titles const titleScores = session.title_scores || {}; for (const [title, scoreVal] of Object.entries(titleScores)) { const rec = getOrInit(titleAgg, title, () => ({ score: 0.0, last_seen: 0, sessions: new Set(), })); rec.score = Number(scoreVal); // last value wins rec.last_seen = Math.max(rec.last_seen, lastSeenSec); rec.sessions.add(sessionId); } // searches const searchEvents = session.search_events || {}; const { search_count, search_titles, last_searched } = searchEvents; const hasSearchContent = (search_count && search_count > 0) || (Array.isArray(search_titles) && search_titles.length) || Number.isFinite(last_searched); if (hasSearchContent) { const rec = getOrInit(searchAgg, sessionId, () => ({ search_count: 0, search_titles: new Set(), last_searched: 0.0, })); rec.search_count += Number(search_count || 0); for (const title of search_titles || []) { rec.search_titles.add(title); } rec.last_searched = Math.max(rec.last_searched, toSeconds(last_searched)); } } for (const rec of Object.values(domainAgg)) { const n = rec.sessions.size; rec.num_sessions = n; rec.session_importance = n > 0 ? round2(totalSessions / n) : 0.0; delete rec.sessions; } for (const rec of Object.values(titleAgg)) { const n = rec.sessions.size; rec.num_sessions = n; rec.session_importance = n > 0 ? round2(totalSessions / n) : 0.0; delete rec.sessions; } for (const key of Object.keys(searchAgg)) { const rec = searchAgg[key]; rec.search_titles = [...rec.search_titles]; } return [domainAgg, titleAgg, searchAgg]; } /** * Compute top-k domains, titles, and searches from aggregate structures. * * Input shapes: * aggDomains: { * [domain: string]: { * score: number, * last_seen: number, * num_sessions: number, * session_importance: number, * } * } * * aggTitles: { * [title: string]: { * score: number, * last_seen: number, * num_sessions: number, * session_importance: number, * } * } * * aggSearches: { * [sessionId: string|number]: { * search_count: number, * search_titles: string[], * last_searched: number, * } * } * * Output shape: * [ * [ [domain, rank], ... ], // domains, length <= kDomains * [ [title, rank], ... ], // titles, length <= kTitles * [ { sid, cnt, q, ls, r }, ... ], // searches, length <= kSearches * ] * * @param {{[domain: string]: any}} aggDomains * @param {{[title: string]: any}} aggTitles * @param {{[sessionId: string]: any}} aggSearches * @param {object} [options] * @param {number} [options.k_domains=30] * @param {number} [options.k_titles=60] * @param {number} [options.k_searches=10] * @param {number} [options.now] Current time; seconds or ms, normalized internally. */ export function topkAggregates( aggDomains, aggTitles, aggSearches, { k_domains = 30, k_titles = 60, k_searches = 10, now = undefined } = {} ) { // Normalize `now` to epoch seconds. let nowSec; if (now == null) { nowSec = Date.now() / 1000; } else { const asNum = Number(now); // Heuristic: treat 1e12+ as ms, otherwise seconds. nowSec = asNum > 1e12 ? asNum / MS_PER_SEC : asNum; } // Domains: [{key, rank, num_sessions, last_seen}] const domainRanked = Object.entries(aggDomains).map(([domain, info]) => { const score = Number(info.score || 0); const importance = Number(info.session_importance || 0); const lastSeen = Number(info.last_seen || 0); const numSessions = Number(info.num_sessions || 0); const rank = withRecency(score, importance, lastSeen, { now: nowSec }); return { key: domain, rank, num_sessions: numSessions, last_seen: lastSeen, }; }); // Titles: [{key, rank, num_sessions, last_seen}] const titleRanked = Object.entries(aggTitles).map(([title, info]) => { const score = Number(info.score || 0); const importance = Number(info.session_importance || 0); const lastSeen = Number(info.last_seen || 0); const numSessions = Number(info.num_sessions || 0); const rank = withRecency(score, importance, lastSeen, { now: nowSec }); return { key: title, rank, num_sessions: numSessions, last_seen: lastSeen, }; }); // Searches: [{sid, cnt, q, ls, rank}] const searchRanked = Object.entries(aggSearches).map(([sidRaw, info]) => { const sid = Number.isFinite(Number(sidRaw)) ? Number(sidRaw) : sidRaw; const count = Number(info.search_count || 0); // `last_searched` is already seconds (aggregateSessions uses toSeconds). const lastSearchedSec = Number(info.last_searched || 0); const titles = Array.isArray(info.search_titles) ? info.search_titles : []; const rank = withRecency(count, 1.0, lastSearchedSec, { now: nowSec }); return { sid, cnt: count, q: titles, ls: lastSearchedSec, rank, }; }); // Sort with tie-breakers domainRanked.sort( (a, b) => b.rank - a.rank || b.num_sessions - a.num_sessions || b.last_seen - a.last_seen ); titleRanked.sort( (a, b) => b.rank - a.rank || b.num_sessions - a.num_sessions || b.last_seen - a.last_seen ); searchRanked.sort((a, b) => b.rank - a.rank || b.cnt - a.cnt || b.ls - a.ls); // Trim and emit compact structures const domainItems = domainRanked .slice(0, k_domains) .map(({ key, rank }) => [key, round2(rank)]); const titleItems = titleRanked .slice(0, k_titles) .map(({ key, rank }) => [key, round2(rank)]); const searchItems = searchRanked .slice(0, k_searches) .map(({ sid, cnt, q, ls, rank }) => ({ sid, cnt, q, ls, r: round2(rank), })); return [domainItems, titleItems, searchItems]; } /** * Blend a base score with session importance and a time-based decay. * * Intuition: * rank ≈ score * sessionImportance * sessionWeight * recencyFactor * * where recencyFactor is in [floor, 1], decaying over time with a * half-life in days. * * @param {number} score * Base score (e.g., frecency percentile). * @param {number} sessionImportance * Importance derived from how many sessions the item appears in. * @param {number} lastSeenSec * Last-seen timestamp (epoch seconds or micros/ms; normalized via toSeconds()). * @param {object} [options] * @param {number} [options.halfLifeDays=14] * Half-life in days for recency decay; smaller → recency matters more. * @param {number} [options.floor=0.5] * Minimum recency factor; keeps a base weight even for very old items. * @param {number} [options.sessionWeight=1.0] * Additional multiplier on sessionImportance. * @param {number} [options.now] * "Now" timestamp (sec/ms/µs); if omitted, Date.now() is used. * @returns {number} * Rounded rank score (2 decimal places). */ function withRecency( score, sessionImportance, lastSeenSec, { halfLifeDays = DEFAULT_HALFLIFE_DAYS, floor = DEFAULT_RECENCY_FLOOR, sessionWeight = DEFAULT_SESSION_WEIGHT, now = undefined, } = {} ) { const nowSec = now != null ? toSeconds(now) : Date.now() / 1000; const lastSec = toSeconds(lastSeenSec); const ageDays = Math.max(0, (nowSec - lastSec) / SECONDS_PER_DAY); const decay = Math.pow(0.5, ageDays / halfLifeDays); const importanceScore = Number(score) * (Number(sessionImportance) * Number(sessionWeight)); return round2(importanceScore * (floor + (1 - floor) * decay)); } function isFiniteNumber(n) { return typeof n === "number" && Number.isFinite(n); } /** * Convert epoch microseconds → integer epoch seconds. * If value is null/undefined/NaN, returns null. * * @param {number} micros */ function normalizeEpochSeconds(micros) { if (!Number.isFinite(micros)) { return null; } return Math.floor(micros / MICROS_PER_SEC); } function toSeconds(epochMicrosOrMs) { if (!Number.isFinite(epochMicrosOrMs)) { return 0; } const v = Number(epochMicrosOrMs); return v > 1e13 ? v / MICROS_PER_SEC : v / MS_PER_SEC; } function getOrInit(mapObj, key, initFn) { if (!(key in mapObj)) { mapObj[key] = initFn(); } return mapObj[key]; } function round2(x) { return Math.round(Number(x) * 100) / 100; }