//! Storage and retrieval for redirect and scriptlet resources. use std::collections::HashMap; use base64::{engine::Engine as _, prelude::BASE64_STANDARD}; use once_cell::sync::Lazy; use regex::Regex; use thiserror::Error; use super::{PermissionMask, Resource, ResourceType}; #[derive(Clone)] enum ResourceContent { /// A valid utf8 string. Used for text/* mime types or for ResourceType:Template Text(String), /// Raw content in the form of a byte array. Used for other mime types like /// "image/gif" or "audio/mp3" Raw(Vec), } impl ResourceContent { fn text_from_base64(base64: &str) -> Result { let decoded = BASE64_STANDARD.decode(base64)?; Ok(Self::Text(String::from_utf8(decoded)?)) } fn raw_from_base64(base64: &str) -> Result { let decoded = BASE64_STANDARD.decode(base64)?; Ok(Self::Raw(decoded)) } } #[derive(Clone)] /// A internal representation of a Resource to store. Stores the content /// in the decoded form to use less memory. /// See [Resource] for details pub struct ResourceImpl { name: String, kind: ResourceType, content: ResourceContent, dependencies: Vec, permission: PermissionMask, } /// Unified resource storage for both redirects and scriptlets. /// /// By default, this uses an in-memory storage implementation, however this can be changed using /// a custom [ResourceStorageBackend] if desired. pub struct ResourceStorage { #[cfg(not(feature = "single-thread"))] backend: Box, #[cfg(feature = "single-thread")] backend: Box, } /// Loads an empty `InMemoryResourceStorage` backend. impl Default for ResourceStorage { fn default() -> Self { Self { backend: Box::new(InMemoryResourceStorage::default()), } } } impl ResourceStorage { #[cfg(not(feature = "single-thread"))] pub fn from_backend(backend: S) -> Self { Self { backend: Box::new(backend), } } #[cfg(feature = "single-thread")] pub fn from_backend(backend: S) -> Self { Self { backend: Box::new(backend), } } /// Constructor using an `InMemoryResourceStorage` as the backend with the given resources. #[cfg(test)] pub fn in_memory_from_resources(resources: impl IntoIterator) -> Self { Self::from_backend(InMemoryResourceStorage::from_resources(resources)) } } /// Customizable backend for [Resource] storage. /// Custom implementations could be used to enable (for example) sharing of resources between /// multiple [crate::Engine]s, an on-disk backend, or special caching behavior. pub trait ResourceStorageBackend { /// Gets the resource associated with `resource_ident`, respecting aliases if necessary. fn get_resource(&self, resource_ident: &str) -> Option; } /// Default implementation of [ResourceStorageBackend] that stores all resources in memory. #[derive(Default, Clone)] pub struct InMemoryResourceStorage { /// Stores each resource by its canonical name resources: HashMap, /// Stores mappings from aliases to their canonical resource names aliases: HashMap, } impl ResourceStorageBackend for InMemoryResourceStorage { fn get_resource(&self, resource_ident: &str) -> Option { let resource = if let Some(resource) = self.resources.get(resource_ident) { Some(resource) } else if let Some(canonical_name) = self.aliases.get(resource_ident) { self.resources.get(canonical_name) } else { None }; resource.cloned() } } impl InMemoryResourceStorage { /// Convenience constructor that allows building storage for many resources at once. Errors are /// silently consumed. pub fn from_resources(resources: impl IntoIterator) -> Self { let mut self_ = Self::default(); resources.into_iter().for_each(|resource| { #[allow(clippy::unnecessary_lazy_evaluations)] self_.add_resource(resource).unwrap_or_else(|_e| { #[cfg(test)] eprintln!("Failed to add resource: {_e:?}") }) }); self_ } /// Adds a resource to storage so that it can be retrieved later. pub fn add_resource(&mut self, resource: Resource) -> Result<(), AddResourceError> { let resource_content: ResourceContent; if let ResourceType::Mime(content_type) = &resource.kind { if !resource.dependencies.is_empty() && !content_type.supports_dependencies() { return Err(AddResourceError::ContentTypeDoesNotSupportDependencies); } if content_type.is_textual() { resource_content = ResourceContent::text_from_base64(&resource.content)?; } else { resource_content = ResourceContent::raw_from_base64(&resource.content)?; } } else { resource_content = ResourceContent::text_from_base64(&resource.content)?; } for ident in std::iter::once(&resource.name).chain(resource.aliases.iter()) { if self.resources.contains_key(ident) || self.aliases.contains_key(ident) { return Err(AddResourceError::NameAlreadyAdded); } } resource.aliases.iter().for_each(|alias| { self.aliases.insert(alias.clone(), resource.name.clone()); }); let resource_impl = ResourceImpl { name: resource.name.clone(), kind: resource.kind, content: resource_content, dependencies: resource.dependencies, permission: resource.permission, }; self.resources.insert(resource.name, resource_impl); Ok(()) } pub fn take_resources(&mut self) -> HashMap { std::mem::take(&mut self.resources) } } /// Formats `arg` such that it either is a JSON string, or is safe to insert within a JSON string, /// depending on `QUOTED`. /// /// Implementation modified from `json-rust` (MIT license). /// https://github.com/maciejhirsz/json-rust #[inline(always)] fn stringify_arg(arg: &str) -> String { const QU: u8 = b'"'; const BS: u8 = b'\\'; const BB: u8 = b'b'; const TT: u8 = b't'; const NN: u8 = b'n'; const FF: u8 = b'f'; const RR: u8 = b'r'; const UU: u8 = b'u'; const __: u8 = 0; // Look up table for characters that need escaping in a product string static ESCAPED: [u8; 256] = [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F ]; #[inline(never)] fn write_string_complex(output: &mut Vec, string: &str, mut start: usize) { output.extend_from_slice(&string.as_bytes()[..start]); for (index, ch) in string.bytes().enumerate().skip(start) { let escape = ESCAPED[ch as usize]; if escape > 0 { output.extend_from_slice(&string.as_bytes()[start..index]); output.extend_from_slice(&[b'\\', escape]); start = index + 1; } if escape == b'u' { output.extend_from_slice(format!("{ch:04x}").as_bytes()); } } output.extend_from_slice(&string.as_bytes()[start..]); } let mut output = Vec::with_capacity(arg.len() + 2); if QUOTED { output.push(b'"'); } 'process: { for (index, ch) in arg.bytes().enumerate() { if ESCAPED[ch as usize] > 0 { write_string_complex(&mut output, arg, index); break 'process; } } output.extend_from_slice(arg.as_bytes()); } if QUOTED { output.push(b'"'); } // unwrap safety: input is always valid UTF8; output processing only replaces some ASCII // characters with other valid ones String::from_utf8(output).unwrap() } /// Gets the function name from a JS function definition fn extract_function_name(fn_def: &str) -> Option<&str> { // This is not bulletproof, but should be robust against most issues. static FUNCTION_NAME_RE: Lazy = Lazy::new(|| Regex::new(r#"^function\s+([^\(\)\{\}\s]+)\s*\("#).unwrap()); FUNCTION_NAME_RE.captures(fn_def).map(|captures| { // capture 1 is always present in the above regex if any match was made captures.get(1).unwrap().as_str() }) } impl ResourceStorage { /// Given the contents of the `+js(...)` parts of multiple filters, return a script string /// appropriate for injection in a page. pub fn get_scriptlet_resources<'a>( &self, script_injections: impl IntoIterator, ) -> String { let mut deps = vec![]; let mut invokations = String::new(); script_injections.into_iter().for_each(|(s, mask)| { if let Ok(invokation) = self.get_scriptlet_resource(s, mask, &mut deps) { invokations += "try {\n"; invokations += &invokation; invokations += "\n} catch ( e ) { }\n"; } }); let mut result = String::new(); for dep in deps.iter() { if let ResourceContent::Text(content) = &dep.content { result += content; result += "\n"; } } result += &invokations; result } /// Add all dependencies of `new_dep` to `prev_deps`, recursively and uniquely. If the given /// permission is insufficient for any dependency, this will return an `Error`. /// /// Note that no ordering is guaranteed; function definitions in JS can appear after they are /// used. fn recursive_dependencies( &self, new_dep: &str, prev_deps: &mut Vec, filter_permission: PermissionMask, ) -> Result<(), ScriptletResourceError> { if prev_deps.iter().any(|dep| dep.name == new_dep) { return Ok(()); } let resource = self.get_permissioned_resource(new_dep, filter_permission)?; let deps = resource.dependencies.clone(); prev_deps.push(resource); for dep in deps.iter() { self.recursive_dependencies(dep, prev_deps, filter_permission)?; } Ok(()) } /// Given the contents of a single `+js(...)` filter part, return a scriptlet string /// appropriate for injection in a page. fn get_scriptlet_resource( &self, scriptlet_args: &str, filter_permission: PermissionMask, required_deps: &mut Vec, ) -> Result { // `unwrap` is safe because these are guaranteed valid at filter parsing. let scriptlet_args = parse_scriptlet_args(scriptlet_args).unwrap(); if scriptlet_args.is_empty() { return Err(ScriptletResourceError::MissingScriptletName); } let scriptlet_name = with_js_extension(scriptlet_args[0].as_ref()); let args = &scriptlet_args[1..]; if args.len() == 1 && args[0].starts_with('{') && args[0].ends_with('}') { return Err(ScriptletResourceError::ScriptletArgObjectSyntaxUnsupported); } let resource = self.get_permissioned_resource(&scriptlet_name, filter_permission)?; if !resource.kind.supports_scriptlet_injection() { return Err(ScriptletResourceError::ContentTypeNotInjectable); } for dep in resource.dependencies.iter() { self.recursive_dependencies(dep, required_deps, filter_permission)?; } let template = match &resource.content { ResourceContent::Raw(_content) => { return Err(ScriptletResourceError::ContentTypeNotInjectable); } ResourceContent::Text(content) => content.clone(), }; if let Some(function_name) = extract_function_name(&template) { // newer function-style resource: pass args using function call syntax // add the scriptlet itself as a dependency and invoke via function name if !required_deps.iter().any(|dep| dep.name == resource.name) { required_deps.push(resource); } use itertools::Itertools as _; Ok(format!( "{}({})", function_name, args.iter().map(|arg| stringify_arg::(arg)).join(", ") )) } else { // older template-style resource: replace first instances with args Ok(patch_template_scriptlet( template, args.iter().map(|arg| stringify_arg::(arg)), )) } } /// Get a data-URL formatted resource appropriate for a `$redirect` response. pub fn get_redirect_resource(&self, resource_ident: &str) -> Option { let resource = self.backend.get_resource(resource_ident); resource.and_then(|resource| { if !resource.permission.is_default() { return None; } if !resource.kind.supports_redirect() { return None; } if let ResourceType::Mime(mime) = &resource.kind { let bytes = match &resource.content { ResourceContent::Raw(content) => content, ResourceContent::Text(content) => content.as_bytes(), }; let encoded = BASE64_STANDARD.encode(bytes); Some(format!("data:{mime};base64,{encoded}")) } else { None } }) } fn get_permissioned_resource( &self, scriptlet_name: &str, filter_permission: PermissionMask, ) -> Result { let resource = self .backend .get_resource(scriptlet_name) .ok_or(ScriptletResourceError::NoMatchingScriptlet)?; if !resource.permission.is_injectable_by(filter_permission) { return Err(ScriptletResourceError::InsufficientPermissions); } Ok(resource) } } /// Describes failure cases when preparing [`Resource`]s to be used for adblocking. #[derive(Debug, Error, PartialEq)] pub enum AddResourceError { #[error("invalid base64 content")] InvalidBase64Content, #[error("invalid utf-8 content")] InvalidUtf8Content, #[error("resource name already added")] NameAlreadyAdded, #[error("resource content type does not support dependencies")] ContentTypeDoesNotSupportDependencies, } impl From for AddResourceError { fn from(_: base64::DecodeError) -> Self { AddResourceError::InvalidBase64Content } } impl From for AddResourceError { fn from(_: std::string::FromUtf8Error) -> Self { AddResourceError::InvalidUtf8Content } } /// Describes failure cases when attempting to retrieve a resource for scriptlet injection. #[derive(Debug, Error, PartialEq)] pub enum ScriptletResourceError { #[error("no scriptlet has the provided name")] NoMatchingScriptlet, #[error("no scriptlet name was provided")] MissingScriptletName, #[error("object syntax for scriptlet arguments is unsupported")] ScriptletArgObjectSyntaxUnsupported, #[error("scriptlet content was corrupted")] CorruptScriptletContent, #[error("resource content type cannot be used for a scriptlet injection")] ContentTypeNotInjectable, #[error("filter rule is not authorized to inject the intended scriptlet")] InsufficientPermissions, } impl From for ScriptletResourceError { fn from(_: base64::DecodeError) -> Self { Self::CorruptScriptletContent } } impl From for ScriptletResourceError { fn from(_: std::string::FromUtf8Error) -> Self { Self::CorruptScriptletContent } } static TEMPLATE_ARGUMENT_RE: [Lazy; 9] = [ Lazy::new(|| template_argument_regex(1)), Lazy::new(|| template_argument_regex(2)), Lazy::new(|| template_argument_regex(3)), Lazy::new(|| template_argument_regex(4)), Lazy::new(|| template_argument_regex(5)), Lazy::new(|| template_argument_regex(6)), Lazy::new(|| template_argument_regex(7)), Lazy::new(|| template_argument_regex(8)), Lazy::new(|| template_argument_regex(9)), ]; fn template_argument_regex(i: usize) -> Regex { Regex::new(&format!(r"\{{\{{{i}\}}\}}")).unwrap() } /// Omit the 0th element of `args` (the scriptlet name) when calling this method. fn patch_template_scriptlet( mut template: String, args: impl IntoIterator>, ) -> String { // `regex` treats `$` as a special character. Instead, `$$` is interpreted as a literal `$` // character. args.into_iter() .take(TEMPLATE_ARGUMENT_RE.len()) .enumerate() .for_each(|(i, arg)| { template = TEMPLATE_ARGUMENT_RE[i] .replace(&template, arg.as_ref().replace('$', "$$")) .to_string(); }); template } /// Scriptlet injections must be JS resources. However, the `.js` extension may need to be added as /// a canonicalization step, since it can be omitted in filter rules. fn with_js_extension(scriptlet_name: &str) -> String { if scriptlet_name.ends_with(".js") { scriptlet_name.to_string() } else { format!("{scriptlet_name}.js") } } /// Returns the index of the next unescaped separator, as well as a boolean indicating whether or /// not the string must be postprocessed to normalize any separators along the way. fn index_next_unescaped_separator(s: &str, separator: char) -> (Option, bool) { assert!(separator != '\\'); let mut new_arg_end = 0; let mut needs_transform = false; // guaranteed to terminate: // - loop only proceeds if there is an odd number of escape characters // - new_arg_end increases by at least 1 in that case // - s has finite length while new_arg_end < s.len() { let rest = &s[new_arg_end..]; if let Some(i) = rest.find(separator) { // check how many escape characters there are before the matched separator let mut trailing_escapes = 0; while trailing_escapes < i && rest[..i - trailing_escapes].ends_with('\\') { trailing_escapes += 1; } if trailing_escapes % 2 == 0 { // even number; all escape characters are literal backslashes new_arg_end += i; break; } else { // odd number; the last escape character is escaping this separator new_arg_end += i + 1; needs_transform = true; continue; } } else { // no match return (None, needs_transform); } } // don't index beyond the end of the string let new_arg_end = if new_arg_end >= s.len() { None } else { Some(new_arg_end) }; (new_arg_end, needs_transform) } /// Replaces escaped instances of `separator` in `arg` with unescaped characters. fn normalize_arg(arg: &str, separator: char) -> String { assert!(separator != '\\'); let mut output = String::with_capacity(arg.len()); let mut escaped = false; for i in arg.chars() { if i == '\\' { if escaped { escaped = false; output += "\\\\"; } else { escaped = true; } continue; } if escaped { if i != separator { output.push('\\'); } escaped = false; } output.push(i); } output } /// Parses the inner contents of a `+js(...)` operator of a cosmetic filter. /// /// Returns `None` if the contents are malformed. pub(crate) fn parse_scriptlet_args(mut args: &str) -> Option> { let mut args_vec = vec![]; if args.trim().is_empty() { return Some(args_vec); } // guaranteed to terminate: // - each branch of the `match` consumes at least 1 character from the beginning of `args` // - loop exits if `args` is empty loop { // n.b. `args.trim_start()` leaves an empty string if it's only whitespace if let Some(i) = args.find(|c: char| !c.is_whitespace()) { args = &args[i..]; } let (arg, needs_transform); match args.chars().next() { Some(qc) if qc == '"' || qc == '\'' || qc == '`' => { args = &args[1..]; let i; (i, needs_transform) = index_next_unescaped_separator(args, qc); if let Some(i) = i { arg = &args[..i]; args = &args[i + 1..]; // consume whitespace following the quote if let Some(i) = args.find(|c: char| !c.is_whitespace()) { args = &args[i..]; } // consume comma separator if args.starts_with(',') { args = &args[1..]; } else if !args.is_empty() { // uBO pushes everything up to the next comma without escapes, but it's // very weird and probably not what the filter list author intended. // Treating it as an error for now. return None; } } else { // uBO pushes the entire argument, including the unmatched quote. Again, weird // and probably not intended. return None; } } Some(_) => { let i; (i, needs_transform) = index_next_unescaped_separator(args, ','); arg = args[..i.unwrap_or(args.len())].trim_end(); args = &args[i.map(|i| i + 1).unwrap_or(args.len())..]; } None => { // `args` is empty break; } } let arg = if needs_transform { normalize_arg(arg, ',') } else { arg.to_string() }; args_vec.push(arg); } Some(args_vec) } #[cfg(test)] #[path = "../../tests/unit/resources/resource_storage.rs"] mod unit_tests;