//! Holds [`Blocker`], which handles all network-based adblocking queries. use memchr::{memchr as find_char, memrchr as find_char_reverse}; use once_cell::sync::Lazy; use serde::Serialize; use std::collections::HashSet; use std::ops::DerefMut; use crate::filters::fb_network_builder::NetworkFilterListId; use crate::filters::filter_data_context::FilterDataContextRef; use crate::filters::network::NetworkFilterMaskHelper; use crate::network_filter_list::NetworkFilterList; use crate::regex_manager::{RegexManager, RegexManagerDiscardPolicy}; use crate::request::Request; use crate::resources::ResourceStorage; /// Options used when constructing a [`Blocker`]. pub struct BlockerOptions { pub enable_optimizations: bool, } /// Describes how a particular network request should be handled. #[derive(Debug, Serialize, Default)] pub struct BlockerResult { /// Was a blocking filter matched for this request? pub matched: bool, /// Important is used to signal that a rule with the `important` option /// matched. An `important` match means that exceptions should not apply /// and no further checking is neccesary--the request should be blocked /// (empty body or cancelled). /// /// Brave Browser keeps multiple instances of [`Blocker`], so `important` /// here is used to correct behaviour between them: checking should stop /// instead of moving to the next instance iff an `important` rule matched. pub important: bool, /// Specifies what to load instead of the original request, rather than /// just blocking it outright. This can come from a filter with a `redirect` /// or `redirect-rule` option. If present, the field will contain the body /// of the redirect to be injected. /// /// Note that the presence of a redirect does _not_ imply that the request /// should be blocked. The `redirect-rule` option can produce a redirection /// that's only applied if another blocking filter matches a request. pub redirect: Option, /// `removeparam` may remove URL parameters. If the original request URL was /// modified at all, the new version will be here. This should be used /// as long as the request is not blocked. pub rewritten_url: Option, /// Contains a string representation of any matched exception rule. /// Effectively this means that there was a match, but the request should /// not be blocked. /// /// If debugging was _not_ enabled (see [`crate::FilterSet::new`]), this /// will only contain a constant `"NetworkFilter"` placeholder string. pub exception: Option, /// When `matched` is true, this contains a string representation of the /// matched blocking rule. /// /// If debugging was _not_ enabled (see [`crate::FilterSet::new`]), this /// will only contain a constant `"NetworkFilter"` placeholder string. pub filter: Option, } // only check for tags in tagged and exception rule buckets, // pass empty set for the rest static NO_TAGS: Lazy> = Lazy::new(HashSet::new); /// Stores network filters for efficient querying. pub struct Blocker { // Enabled tags are not serialized - when deserializing, tags of the existing // instance (the one we are recreating lists into) are maintained pub(crate) tags_enabled: HashSet, // Not serialized #[cfg(feature = "single-thread")] pub(crate) regex_manager: std::cell::RefCell, #[cfg(not(feature = "single-thread"))] pub(crate) regex_manager: std::sync::Mutex, pub(crate) filter_data_context: FilterDataContextRef, } #[cfg(feature = "single-thread")] pub(crate) type RegexManagerRef<'a> = std::cell::RefMut<'a, RegexManager>; #[cfg(not(feature = "single-thread"))] pub(crate) type RegexManagerRef<'a> = std::sync::MutexGuard<'a, RegexManager>; impl Blocker { /// Decide if a network request (usually from WebRequest API) should be /// blocked, redirected or allowed. pub fn check(&self, request: &Request, resources: &ResourceStorage) -> BlockerResult { self.check_parameterised(request, resources, false, false) } pub(crate) fn get_list(&self, id: NetworkFilterListId) -> NetworkFilterList<'_> { NetworkFilterList { list: self .filter_data_context .memory .root() .network_rules() .get(id as usize), filter_data_context: &self.filter_data_context, } } pub(crate) fn csp(&self) -> NetworkFilterList<'_> { self.get_list(NetworkFilterListId::Csp) } pub(crate) fn exceptions(&self) -> NetworkFilterList<'_> { self.get_list(NetworkFilterListId::Exceptions) } pub(crate) fn importants(&self) -> NetworkFilterList<'_> { self.get_list(NetworkFilterListId::Importants) } pub(crate) fn redirects(&self) -> NetworkFilterList<'_> { self.get_list(NetworkFilterListId::Redirects) } pub(crate) fn removeparam(&self) -> NetworkFilterList<'_> { self.get_list(NetworkFilterListId::RemoveParam) } pub(crate) fn filters(&self) -> NetworkFilterList<'_> { self.get_list(NetworkFilterListId::Filters) } pub(crate) fn generic_hide(&self) -> NetworkFilterList<'_> { self.get_list(NetworkFilterListId::GenericHide) } pub(crate) fn tagged_filters_all(&self) -> NetworkFilterList<'_> { self.get_list(NetworkFilterListId::TaggedFiltersAll) } /// Borrow mutable reference to the regex manager for the ['Blocker`]. /// Only one caller can borrow the regex manager at a time. pub(crate) fn borrow_regex_manager(&self) -> RegexManagerRef<'_> { #[cfg(feature = "single-thread")] #[allow(unused_mut)] let mut manager = self.regex_manager.borrow_mut(); #[cfg(not(feature = "single-thread"))] let mut manager = self.regex_manager.lock().unwrap(); #[cfg(not(target_arch = "wasm32"))] manager.update_time(); manager } pub fn check_generic_hide(&self, hostname_request: &Request) -> bool { let mut regex_manager = self.borrow_regex_manager(); self.generic_hide() .check(hostname_request, &HashSet::new(), &mut regex_manager) .is_some() } #[cfg(test)] pub(crate) fn check_exceptions(&self, request: &Request) -> bool { let mut regex_manager = self.borrow_regex_manager(); self.exceptions() .check(request, &HashSet::new(), &mut regex_manager) .is_some() } pub fn check_parameterised( &self, request: &Request, resources: &ResourceStorage, matched_rule: bool, force_check_exceptions: bool, ) -> BlockerResult { let mut regex_manager = self.borrow_regex_manager(); if !request.is_supported { return BlockerResult::default(); } // Check the filters in the following order: // 1. $important (not subject to exceptions) // 2. redirection ($redirect=resource) // 3. normal filters - if no match by then // 4. exceptions - if any non-important match of forced // Always check important filters let important_filter = self .importants() .check(request, &NO_TAGS, &mut regex_manager); // only check the rest of the rules if not previously matched let filter = if important_filter.is_none() && !matched_rule { self.tagged_filters_all() .check(request, &self.tags_enabled, &mut regex_manager) .or_else(|| self.filters().check(request, &NO_TAGS, &mut regex_manager)) } else { important_filter }; let exception = match filter.as_ref() { // if no other rule matches, only check exceptions if forced to None if matched_rule || force_check_exceptions => { self.exceptions() .check(request, &self.tags_enabled, &mut regex_manager) } None => None, // If matched an important filter, exceptions don't atter Some(f) if f.is_important() => None, Some(_) => self .exceptions() .check(request, &self.tags_enabled, &mut regex_manager), }; let redirect_filters = self.redirects() .check_all(request, &NO_TAGS, regex_manager.deref_mut()); // Extract the highest priority redirect directive. // 1. Exceptions - can bail immediately if found // 2. Find highest priority non-exception redirect let redirect_resource = { let mut exceptions = vec![]; for redirect_filter in redirect_filters.iter() { if redirect_filter.is_exception() { if let Some(redirect) = redirect_filter.modifier_option.as_ref() { exceptions.push(redirect); } } } let mut resource_and_priority = None; for redirect_filter in redirect_filters.iter() { if !redirect_filter.is_exception() { if let Some(redirect) = redirect_filter.modifier_option.as_ref() { if !exceptions.contains(&redirect) { // parse redirect + priority let (resource, priority) = if let Some(idx) = find_char_reverse(b':', redirect.as_bytes()) { let priority_str = &redirect[idx + 1..]; let resource = &redirect[..idx]; if let Ok(priority) = priority_str.parse::() { (resource, priority) } else { (&redirect[..], 0) } } else { (&redirect[..], 0) }; if let Some((_, p1)) = resource_and_priority { if priority > p1 { resource_and_priority = Some((resource, priority)); } } else { resource_and_priority = Some((resource, priority)); } } } } } resource_and_priority.map(|(r, _)| r) }; let redirect: Option = redirect_resource.and_then(|resource_name| { resources.get_redirect_resource(resource_name).or({ // It's acceptable to pass no redirection if no matching resource is loaded. // TODO - it may be useful to return a status flag to indicate that this occurred. #[cfg(test)] eprintln!("Matched rule with redirect option but did not find corresponding resource to send"); None }) }); let important = filter.is_some() && filter .as_ref() .map(|f| f.is_important()) .unwrap_or_else(|| false); let rewritten_url = if important { None } else { Self::apply_removeparam(&self.removeparam(), request, regex_manager.deref_mut()) }; // If something has already matched before but we don't know what, still return a match let matched = exception.is_none() && (filter.is_some() || matched_rule); BlockerResult { matched, important, redirect, rewritten_url, exception: exception.as_ref().map(|f| f.to_string()), // copy the exception filter: filter.as_ref().map(|f| f.to_string()), // copy the filter } } fn apply_removeparam( removeparam_filters: &NetworkFilterList, request: &Request, regex_manager: &mut RegexManager, ) -> Option { /// Represents an `&`-separated argument from a URL query parameter string enum QParam<'a> { /// Just a key, e.g. `...&key&...` KeyOnly(&'a str), /// Key-value pair separated by an equal sign, e.g. `...&key=value&...` KeyValue(&'a str, &'a str), } impl std::fmt::Display for QParam<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::KeyOnly(k) => write!(f, "{k}"), Self::KeyValue(k, v) => write!(f, "{k}={v}"), } } } let url = &request.original_url; // Only check for removeparam if there's a query string in the request URL if let Some(i) = find_char(b'?', url.as_bytes()) { // String indexing safety: indices come from `.len()` or `find_char` on individual ASCII // characters (1 byte each), some plus 1. let params_start = i + 1; let hash_index = if let Some(j) = find_char(b'#', &url.as_bytes()[params_start..]) { params_start + j } else { url.len() }; let qparams = &url[params_start..hash_index]; let mut params: Vec<(QParam, bool)> = qparams .split('&') .map(|pair| { if let Some((k, v)) = pair.split_once('=') { QParam::KeyValue(k, v) } else { QParam::KeyOnly(pair) } }) .map(|param| (param, true)) .collect(); let filters = removeparam_filters.check_all(request, &NO_TAGS, regex_manager); let mut rewrite = false; for removeparam_filter in filters { if let Some(removeparam) = &removeparam_filter.modifier_option { params.iter_mut().for_each(|(param, include)| { if let QParam::KeyValue(k, v) = param { if !v.is_empty() && k == removeparam { *include = false; rewrite = true; } } }); } } if rewrite { let p = itertools::join( params .into_iter() .filter(|(_, include)| *include) .map(|(param, _)| param.to_string()), "&", ); let new_param_str = if p.is_empty() { String::from("") } else { format!("?{p}") }; Some(format!( "{}{}{}", &url[0..i], new_param_str, &url[hash_index..] )) } else { None } } else { None } } /// Given a "main_frame" or "subdocument" request, check if some content security policies /// should be injected in the page. pub fn get_csp_directives(&self, request: &Request) -> Option { use crate::request::RequestType; if request.request_type != RequestType::Document && request.request_type != RequestType::Subdocument { return None; } let mut regex_manager = self.borrow_regex_manager(); let filters = self .csp() .check_all(request, &self.tags_enabled, &mut regex_manager); if filters.is_empty() { return None; } let mut disabled_directives: HashSet<&str> = HashSet::new(); let mut enabled_directives: HashSet<&str> = HashSet::new(); for filter in filters.iter() { if filter.is_exception() { if filter.is_csp() { if let Some(csp_directive) = &filter.modifier_option { disabled_directives.insert(csp_directive); } else { // Exception filters with empty `csp` options will disable all CSP // injections for matching pages. return None; } } } else if filter.is_csp() { if let Some(csp_directive) = &filter.modifier_option { enabled_directives.insert(csp_directive); } } } let mut remaining_directives = enabled_directives.difference(&disabled_directives); let mut merged = if let Some(directive) = remaining_directives.next() { String::from(*directive) } else { return None; }; remaining_directives.for_each(|directive| { merged.push(','); merged.push_str(directive); }); Some(merged) } pub(crate) fn from_context(filter_data_context: FilterDataContextRef) -> Self { Self { filter_data_context, tags_enabled: HashSet::new(), regex_manager: Default::default(), } } #[cfg(test)] pub fn new( network_filters: Vec, options: &BlockerOptions, ) -> Self { use crate::engine::Engine; use crate::FilterSet; let mut filter_set = FilterSet::new(true); filter_set.network_filters = network_filters; let engine = Engine::from_filter_set(filter_set, options.enable_optimizations); Self::from_context(engine.filter_data_context()) } pub fn use_tags(&mut self, tags: &[&str]) { let tag_set: HashSet = tags.iter().map(|&t| String::from(t)).collect(); self.tags_with_set(tag_set); } pub fn enable_tags(&mut self, tags: &[&str]) { let tag_set: HashSet = tags .iter() .map(|&t| String::from(t)) .collect::>() .union(&self.tags_enabled) .cloned() .collect(); self.tags_with_set(tag_set); } pub fn disable_tags(&mut self, tags: &[&str]) { let tag_set: HashSet = self .tags_enabled .difference(&tags.iter().map(|&t| String::from(t)).collect()) .cloned() .collect(); self.tags_with_set(tag_set); } fn tags_with_set(&mut self, tags_enabled: HashSet) { self.tags_enabled = tags_enabled; } pub fn tags_enabled(&self) -> Vec { self.tags_enabled.iter().cloned().collect() } pub fn set_regex_discard_policy(&self, new_discard_policy: RegexManagerDiscardPolicy) { let mut regex_manager = self.borrow_regex_manager(); regex_manager.set_discard_policy(new_discard_policy); } #[cfg(feature = "debug-info")] pub fn discard_regex(&self, regex_id: u64) { let mut regex_manager = self.borrow_regex_manager(); regex_manager.discard_regex(regex_id); } #[cfg(feature = "debug-info")] pub fn get_regex_debug_info(&self) -> crate::regex_manager::RegexDebugInfo { let regex_manager = self.borrow_regex_manager(); regex_manager.get_debug_info() } } #[cfg(test)] #[path = "../tests/unit/blocker.rs"] mod unit_tests;