/**
 * Definitions for content to self-host.
 */
const SCRIPT_URLS = [
  // Hosted libraries (usually CDNs for open source).
  '/ajax.aspnetcdn.com/',
  '/ajax.cloudflare.com/',
  '/ajax.googleapis.com/ajax/',
  '/cdn.jsdelivr.net/',
  '/cdnjs.com/',
  '/cdnjs.cloudflare.com/',
  '/code.jquery.com/',
  '/maxcdn.bootstrapcdn.com/',
  '/netdna.bootstrapcdn.com/',
  '/oss.maxcdn.com/',
  '/stackpath.bootstrapcdn.com/',
  // Popular scripts (can be site-specific)
  '/a.optmnstr.com/app/js/',
  '/cdn.onesignal.com/sdks/',
  '/cdn.optimizely.com/',
  '/cdn.shopify.com/s/',
  '/css3-mediaqueries-js.googlecode.com/svn/',
  '/html5shim.googlecode.com/svn/',
  '/html5shiv.googlecode.com/svn/',
  '/maps.google.com/maps/api/js',
  '/maps.googleapis.com/maps/api/js',
  '/pagead2.googlesyndication.com/pagead/js/',
  '/platform.twitter.com/widgets.js',
  '/platform-api.sharethis.com/js/',
  '/s7.addthis.com/js/',
  '/stats.wp.com/',
  '/ws.sharethis.com/button/',
  '/www.google.com/recaptcha/api.js',
  '/www.google-analytics.com/analytics.js',
  '/www.googletagmanager.com/gtag/js',
  '/www.googletagmanager.com/gtm.js',
  '/www.googletagservices.com/tag/js/gpt.js',
];

// Regex patterns for matching script and link tags
const SCRIPT_PRE = '<\\s*script[^>]+src\\s*=\\s*[\'"]\\s*((https?:)?/';
const PATTERN_POST = '[^\'" ]+)\\s*["\'][^>]*>';

/**
 * Main worker entry point. Looks for proxied script requests and requests
 * for HTML content. All major browsers explicitly send an accept: text/html
 * header for navigational requests, and the fallback is to pass the request
 * through unmodified (safe).
 */
addEventListener('fetch', event => {
  // Fail-safe in case of an unhandled exception
  event.passThroughOnException();
  const url = new URL(event.request.url);
  const bypass = url.searchParams.get('cf-worker') === 'bypass';
  if (!bypass) {
    const accept = event.request.headers.get('accept');
    if (event.request.method === 'GET' && isProxyRequest(url)) {
      event.respondWith(proxyUrl(url, event.request));
    } else if (accept && accept.indexOf('text/html') >= 0) {
      event.respondWith(processHtmlRequest(event.request));
    }
  }
});

// Workers can only decode utf-8, so keep a list of character encodings that can be decoded.
const VALID_CHARSETS = ['utf-8', 'utf8', 'iso-8859-1', 'us-ascii'];

/**
 * See if the requested resource is a proxy request to an overwritten origin
 * (something that starts with a prefix in one of our lists).
 *
 * @param {URL} url - Requested URL
 * @returns {boolean} - true if the URL matches one of the proxy paths
 */
function isProxyRequest(url) {
  let foundPrefix = false;
  const path = url.pathname + url.search;
  for (const prefix of SCRIPT_URLS) {
    if (path.startsWith(prefix) && path.indexOf('cf_hash=') >= 0) {
      foundPrefix = true;
      break;
    }
  }
  return foundPrefix;
}
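/*
 * Illustrative example of the URL shape isProxyRequest() accepts (the hash
 * value here is hypothetical). After the HTML rewrite, a tag such as
 *
 *   <script src="https://code.jquery.com/jquery-3.4.1.min.js"></script>
 *
 * becomes a same-origin URL with the content hash appended:
 *
 *   <script src="/code.jquery.com/jquery-3.4.1.min.js?cf_hash=05b85d96"></script>
 *
 * isProxyRequest() recognizes the second form because the path starts with
 * the '/code.jquery.com/' prefix from SCRIPT_URLS and the query string
 * carries 'cf_hash='.
 */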
/**
 * Generate a new request based on the original. Filter the request
 * headers to prevent leaking user data (cookies, etc.) and filter
 * the response headers to prevent the origin setting policy on
 * our origin.
 *
 * @param {URL} url - Unmodified request URL
 * @param {*} request - The original request
 * @returns {*} - fetch response
 */
async function proxyUrl(url, request) {
  // The original host is embedded in the path (e.g. '/code.jquery.com/...'),
  // so prepending 'https:/' reconstructs the full origin URL.
  let originUrl = 'https:/' + url.pathname + url.search;
  const hashOffset = originUrl.indexOf('cf_hash=');
  if (hashOffset >= 2) {
    // Strip the cf_hash parameter (and its leading '?' or '&').
    originUrl = originUrl.substring(0, hashOffset - 1);
  }

  // Filter the request headers
  let init = {
    method: request.method,
    headers: {},
  };
  const proxyHeaders = ['Accept', 'Accept-Encoding', 'Accept-Language', 'Referer', 'User-Agent'];
  for (const name of proxyHeaders) {
    const value = request.headers.get(name);
    if (value) {
      init.headers[name] = value;
    }
  }
  // Add an X-Forwarded-For with the client IP
  const clientAddr = request.headers.get('cf-connecting-ip');
  if (clientAddr) {
    init.headers['X-Forwarded-For'] = clientAddr;
  }

  // Filter the response headers
  const response = await fetch(originUrl, init);
  if (response) {
    const responseHeaders = [
      'Content-Type',
      'Cache-Control',
      'Expires',
      'Accept-Ranges',
      'Date',
      'Last-Modified',
      'ETag',
    ];
    let responseInit = {
      status: response.status,
      statusText: response.statusText,
      headers: {},
    };
    for (const name of responseHeaders) {
      const value = response.headers.get(name);
      if (value) {
        responseInit.headers[name] = value;
      }
    }
    // Extend the cache time for successful responses (since the URL is
    // specific to the hashed content).
    if (response.status === 200) {
      responseInit.headers['Cache-Control'] = 'private, max-age=315360000';
    }
    return new Response(response.body, responseInit);
  }
  return response;
}

/**
 * Handle all of the processing for a (likely) HTML request.
 * - Pass the request through to the origin and inspect the response.
 * - If the response is HTML, set up a streaming transform and hand it to
 *   modifyHtmlStream for processing.
 *
 * Extra care needs to be taken to make sure the character encoding of the
 * original HTML is extracted and decodable as utf-8, and that the downstream
 * response is identified as utf-8.
 *
 * @param {*} request - The original request
 */
async function processHtmlRequest(request) {
  // Fetch from the origin server.
  const response = await fetch(request);
  const contentType = response.headers.get('content-type');
  if (contentType && contentType.indexOf('text/html') !== -1) {
    // Workers can only decode utf-8. If it is anything else, pass the
    // response through unmodified.
    const charsetRegex = /charset\s*=\s*([^\s;]+)/gim;
    const match = charsetRegex.exec(contentType);
    if (match !== null) {
      const charset = match[1].toLowerCase();
      if (!VALID_CHARSETS.includes(charset)) {
        return response;
      }
    }

    // Create an identity TransformStream (a.k.a. a pipe).
    // The readable side will become our new response body.
    const { readable, writable } = new TransformStream();

    // Create a cloned response with our modified stream as the body.
    const newResponse = new Response(readable, response);

    // Start the async processing of the response stream (don't wait for it to finish).
    modifyHtmlStream(response.body, writable, request);

    // Return the in-process response so it can be streamed.
    return newResponse;
  } else {
    return response;
  }
}
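/*
 * Sketch of the streaming pattern used above (illustrative, not called by the
 * worker): the readable half of an identity TransformStream is handed to the
 * client immediately while the writable half is filled asynchronously.
 *
 *   const { readable, writable } = new TransformStream();
 *   const out = new Response(readable, { headers: { 'Content-Type': 'text/html' } });
 *   (async () => {
 *     const writer = writable.getWriter();
 *     await writer.write(new TextEncoder().encode('<html>...</html>'));
 *     await writer.close();
 *   })();
 *   // 'out' can be passed to event.respondWith() before the writer finishes.
 */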
/**
 * Check to see if the HTML chunk includes a meta tag for an unsupported charset.
 * @param {*} chunk - Chunk of HTML to scan
 * @returns {boolean} - true if the HTML chunk includes a meta tag for an unsupported charset
 */
function chunkContainsInvalidCharset(chunk) {
  let invalid = false;

  // meta charset
  const charsetRegex = /<\s*meta[^>]+charset\s*=\s*['"]([^'"]*)['"][^>]*>/gim;
  const charsetMatch = charsetRegex.exec(chunk);
  if (charsetMatch) {
    const docCharset = charsetMatch[1].toLowerCase();
    if (!VALID_CHARSETS.includes(docCharset)) {
      invalid = true;
    }
  }
  // content-type
  const contentTypeRegex = /<\s*meta[^>]+http-equiv\s*=\s*['"]\s*content-type[^>]*>/gim;
  const contentTypeMatch = contentTypeRegex.exec(chunk);
  if (contentTypeMatch) {
    const metaTag = contentTypeMatch[0];
    const metaRegex = /charset\s*=\s*([^\s"]*)/gim;
    const metaMatch = metaRegex.exec(metaTag);
    if (metaMatch) {
      const charset = metaMatch[1].toLowerCase();
      if (!VALID_CHARSETS.includes(charset)) {
        invalid = true;
      }
    }
  }
  return invalid;
}

/**
 * Process the streaming HTML response from the origin server.
 * - Attempt to buffer the full head to reduce the likelihood of the patterns
 *   spanning multiple response chunks.
 * - Scan the first response chunk for a charset meta tag (and bail if it isn't
 *   a supported charset).
 * - Pass the gathered head and each subsequent chunk to modifyHtmlChunk() for
 *   actual processing of the text.
 *
 * @param {*} readable - Input stream (from the origin).
 * @param {*} writable - Output stream (to the browser).
 * @param {*} request - Original request object for downstream use.
 */
async function modifyHtmlStream(readable, writable, request) {
  const reader = readable.getReader();
  const writer = writable.getWriter();
  const encoder = new TextEncoder();
  const decoder = new TextDecoder('utf-8', { fatal: true });

  let firstChunk = true;
  let unsupportedCharset = false;

  // Build the list of URL patterns we are going to look for.
  let patterns = [];
  for (const scriptUrl of SCRIPT_URLS) {
    patterns.push(new RegExp(SCRIPT_PRE + scriptUrl + PATTERN_POST, 'gi'));
  }

  let partial = '';
  let content = '';

  for (;;) {
    const { done, value } = await reader.read();
    if (done) {
      if (partial.length) {
        partial = await modifyHtmlChunk(partial, patterns, request);
        await writer.write(encoder.encode(partial));
      }
      partial = '';
      break;
    }

    let chunk = null;
    if (unsupportedCharset) {
      // Pass the data straight through
      await writer.write(value);
      continue;
    } else {
      try {
        chunk = decoder.decode(value, { stream: true });
      } catch (e) {
        // Decoding failed, switch to passthrough
        unsupportedCharset = true;
        if (partial.length) {
          await writer.write(encoder.encode(partial));
          partial = '';
        }
        await writer.write(value);
        continue;
      }
    }

    try {
      // Look inside of the first chunk for an HTML charset or content-type meta tag.
      if (firstChunk) {
        firstChunk = false;
        if (chunkContainsInvalidCharset(chunk)) {
          // Switch to passthrough
          unsupportedCharset = true;
          if (partial.length) {
            await writer.write(encoder.encode(partial));
            partial = '';
          }
          await writer.write(value);
          continue;
        }
      }

      // TODO: Optimize this so we aren't continuously adding strings together
      content = partial + chunk;
      partial = '';

      // See if there is an unclosed script tag at the end (and if so, carve
      // it out to complete when the remainder comes in).
      // This isn't perfect (case-sensitive and doesn't allow whitespace in the
      // tag), but it is good enough for our purpose and much faster than a regex.
      const scriptPos = content.lastIndexOf('<script');
      if (scriptPos >= 0) {
        const scriptClose = content.indexOf('>', scriptPos);
        if (scriptClose === -1) {
          partial = content.slice(scriptPos);
          content = content.slice(0, scriptPos);
        }
      }

      if (content.length) {
        content = await modifyHtmlChunk(content, patterns, request);
      }
    } catch (e) {
      // Ignore the exception
    }

    if (content.length) {
      await writer.write(encoder.encode(content));
      content = '';
    }
  }

  await writer.close();
}
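/*
 * Illustrative example of the carve-out above (hypothetical chunk boundary):
 * if a script tag is split across reads,
 *
 *   chunk 1: '...<p>hello</p><script src="https://code.jqu'
 *   chunk 2: 'ery.com/jquery-3.4.1.min.js"></script>...'
 *
 * the text from '<script' onward is held back in `partial` and prepended to
 * the next chunk, so the rewrite regexes always see the complete tag.
 */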
/**
 * Find any of the script tags we are looking for and replace them with hashed
 * versions that are proxied through the origin.
 *
 * @param {*} content - Text chunk from the streaming HTML (or accumulated head)
 * @param {*} patterns - Regex patterns to match
 * @param {*} request - Original request object for downstream use.
 */
async function modifyHtmlChunk(content, patterns, request) {
  // Fully tokenizing and parsing the HTML is expensive. This regex is much
  // faster and should be reasonably safe. It looks for the search patterns
  // and extracts the URL as match #1. It shouldn't match in-text content
  // because the < > brackets would be escaped in the HTML. There is some
  // potential risk of matching it in an inline script (unlikely but possible).
  const pageUrl = new URL(request.url);
  for (const pattern of patterns) {
    let match = pattern.exec(content);
    while (match !== null) {
      const originalUrl = match[1];
      let fetchUrl = originalUrl;
      if (fetchUrl.startsWith('//')) {
        fetchUrl = pageUrl.protocol + fetchUrl;
      }
      const proxyUrl = await hashContent(originalUrl, fetchUrl, request);
      if (proxyUrl) {
        content = content.split(originalUrl).join(proxyUrl);
        // Keep the regex cursor in sync with the changed string length.
        pattern.lastIndex -= originalUrl.length - proxyUrl.length;
      }
      match = pattern.exec(content);
    }
  }
  return content;
}
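/*
 * Sketch of the rewrite loop above (hypothetical hash value): for a chunk
 * containing
 *
 *   <script src='https://www.google-analytics.com/analytics.js'></script>
 *
 * the pattern built from SCRIPT_PRE + '/www.google-analytics.com/analytics.js'
 * + PATTERN_POST captures 'https://www.google-analytics.com/analytics.js' as
 * match #1, hashContent() fetches and hashes it, and every occurrence is
 * replaced with something like
 *
 *   /www.google-analytics.com/analytics.js?cf_hash=2c8113f1
 */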
/**
 * Fetch the original content and return a hash of the result (for detecting changes).
 * Use a local cache because some scripts use cache-control: private to prevent
 * proxies from caching.
 *
 * @param {*} originalUrl - Unmodified URL
 * @param {*} url - URL for the third-party request
 * @param {*} request - Original request for the page HTML so the user-agent can be passed through
 * @returns {*} - Proxy URL with the content hash appended (or null on failure)
 */
async function hashContent(originalUrl, url, request) {
  let proxyUrl = null;
  let hash = null;
  const userAgent = request.headers.get('user-agent');
  const clientAddr = request.headers.get('cf-connecting-ip');
  const hashCacheKey = new Request(url + 'cf-hash-key');
  let cache = null;

  let foundInCache = false;
  // Try pulling it from the cache API (wrap it in case it's not implemented)
  try {
    cache = caches.default;
    const response = await cache.match(hashCacheKey);
    if (response) {
      hash = await response.text();
      proxyUrl = constructProxyUrl(originalUrl, hash);
      foundInCache = true;
    }
  } catch (e) {
    // Ignore the exception
  }

  if (!foundInCache) {
    try {
      let headers = { Referer: request.url, 'User-Agent': userAgent };
      if (clientAddr) {
        headers['X-Forwarded-For'] = clientAddr;
      }
      const response = await fetch(url, { headers: headers });
      const content = await response.arrayBuffer();
      if (content) {
        const hashBuffer = await crypto.subtle.digest('SHA-1', content);
        hash = hex(hashBuffer);
        proxyUrl = constructProxyUrl(originalUrl, hash);

        // Add the hash to the local cache
        try {
          if (cache) {
            let ttl = 60;
            const cacheControl = response.headers.get('cache-control');
            const maxAgeRegex = /max-age\s*=\s*(\d+)/i;
            const match = maxAgeRegex.exec(cacheControl);
            if (match) {
              ttl = parseInt(match[1], 10);
            }
            // The Cache API derives freshness from the stored response's
            // Cache-Control header (ResponseInit has no ttl option).
            const hashCacheResponse = new Response(hash, {
              headers: { 'Cache-Control': 'max-age=' + ttl },
            });
            cache.put(hashCacheKey, hashCacheResponse);
          }
        } catch (e) {
          // Ignore the exception
        }
      }
    } catch (e) {
      // Ignore the exception
    }
  }

  return proxyUrl;
}

/**
 * Generate the proxy URL given the content hash and base URL.
 * @param {*} originalUrl - Original URL
 * @param {*} hash - Hash of content
 * @returns {*} - URL with content hash appended
 */
function constructProxyUrl(originalUrl, hash) {
  let proxyUrl = null;
  const pathStart = originalUrl.indexOf('//');
  if (pathStart >= 0) {
    // Strip the protocol so only '/host/path' remains (a same-origin URL).
    proxyUrl = originalUrl.substring(pathStart + 1);
    proxyUrl += proxyUrl.indexOf('?') >= 0 ? '&' : '?';
    proxyUrl += 'cf_hash=' + hash;
  }
  return proxyUrl;
}

/**
 * Convert a buffer into a hex string (for hashes).
 * From: https://developer.mozilla.org/en-US/docs/Web/API/SubtleCrypto/digest
 * @param {*} buffer - Binary buffer
 * @returns {*} - Hex string of the binary buffer
 */
function hex(buffer) {
  const hexCodes = [];
  const view = new DataView(buffer);
  for (let i = 0; i < view.byteLength; i += 4) {
    // Read 4 bytes at a time and zero-pad each to 8 hex characters.
    const value = view.getUint32(i);
    const stringValue = value.toString(16);
    const padding = '00000000';
    hexCodes.push((padding + stringValue).slice(-padding.length));
  }
  return hexCodes.join('');
}
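/*
 * Illustrative usage of the hashing helpers above (hypothetical digest value):
 *
 *   const data = new TextEncoder().encode('console.log("hi");');
 *   const digest = await crypto.subtle.digest('SHA-1', data);
 *   hex(digest);                                   // e.g. 'c9b4e583...'
 *   constructProxyUrl('https://code.jquery.com/jquery.min.js', 'c9b4e583');
 *   // -> '/code.jquery.com/jquery.min.js?cf_hash=c9b4e583'
 */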