// Copyright 2015 Ted Mielczarek. See the COPYRIGHT
// file at the top-level directory of this distribution.
use crate::{FrameSymbolizer, FrameWalker, Module, SymbolError};

pub use crate::sym_file::types::*;
pub use parser::SymbolParser;
use std::fs::File;
use std::io::Read;
use std::ops::Deref;
use std::path::Path;
use tracing::trace;

mod parser;
mod types;
pub mod walker;

// # Sync VS Async
//
// There is both a sync and an async entry-point to the parser.
// The two impls should be essentially identical, except for how they
// read bytes from the input reader into our circular buffer.
//
//
// # Streaming
//
// This parser streams the input to avoid the need to materialize all of
// it into memory at once (symbol files can be a gigabyte!). As a result,
// we need to iteratively parse.
//
// We do this by repeatedly filling up a buffer with input and asking the
// parser to parse it. The parser will return how much of the input it
// consumed, which we can use to clear space in our buffer and to tell
// if it successfully consumed the whole input when the Reader runs dry.
//
//
// # Handling EOF / Capacity
//
// Having a fixed-size buffer has one fatal issue: if one atomic step
// of the parser needs more than this amount of data, then we won't
// be able to parse it.
//
// This can result in `buf` filling up and `buf.space()` becoming an
// empty slice. This in turn will make the reader yield 0 bytes, and
// we'll treat it like EOF and fail the parse. When this happens, we
// try to double the buffer's size and request more bytes. If we get
// more, hooray! If we don't, then it's a "real" EOF.
//
// The "atom" of our parser is a line, so we need our buffer to be able
// to fit any line. However we actually only have roughly
// *half* this value as our limit, as circular::Buffer will only
// `shift` the buffer's contents if over half of its capacity has been
// drained by `consume` -- and `space()` only grows when a `shift` happens.
//
// I have in fact seen 8kb function names from Rust (thanks generic combinators!)
// and 82kb function names from C++ (thanks 'auto' returns!), so we
// need a buffer size that can grow to at least 200KB. This is a *very* large
// amount to backshift repeatedly, so to keep this under control, we start
// with only a 10KB buffer, which is generous but tolerable.
//
// We should still have *SOME* limit on this to avoid nasty death spirals,
// so let's go with 2MB (MAX_BUFFER_CAPACITY), letting you have a horrifying 1MB symbol.
//
// But just *dying* when we hit this point is terrible, so let's have an
// extra layer of robustness: if we ever hit the limit, enter "panic recovery"
// and just start discarding bytes until we hit a newline. Then resume normal
// parsing. The net effect of this is that we just treat this one line as
// corrupt (because statistically it won't even be needed!).

// Hard cap on the streaming buffer's capacity (2MB). The parsers double the buffer,
// starting from INITIAL_BUFFER_CAPACITY, and switch to discarding the current line
// once another doubling would exceed this cap. Because circular::Buffer only
// reclaims space after more than half of it has been consumed, the longest line
// that can be parsed is roughly half of the largest capacity actually reached.
//
// The previous cap of 160KB could not hold the 82KB function names that have been
// seen in the wild, so those lines were being discarded.
static MAX_BUFFER_CAPACITY: usize = 1024 * 1024 * 2;
// Starting capacity of the streaming buffer (10KB).
static INITIAL_BUFFER_CAPACITY: usize = 1024 * 10;

impl SymbolFile {
    /// Parse a SymbolFile from the given Reader.
    ///
    /// The input is streamed: bytes are read into an internal circular buffer
    /// (initially `INITIAL_BUFFER_CAPACITY` bytes), handed to a `SymbolParser`,
    /// and released from the buffer once the parser reports them consumed.
    /// If a read yields no bytes while unparsed data remains, the buffer is
    /// doubled; once doubling would exceed `MAX_BUFFER_CAPACITY`, bytes are
    /// discarded up to and including the next newline and parsing resumes.
    ///
    /// Every time a chunk of the input is parsed, that chunk will
    /// be passed to `callback` to allow you to do something else
    /// with the data as it's streamed in (e.g. you can save the
    /// input to a cache). Bytes discarded while skipping an over-long
    /// line are passed to `callback` as well.
    ///
    /// The input is staged in the internal buffer described above, so you
    /// shouldn't buffer the input yourself.
    ///
    /// # Errors
    ///
    /// * Errors from `input_reader.read` and from the parser are propagated.
    /// * `SymbolError::ParseError` is returned if the reader runs dry before any
    ///   byte was consumed (e.g. empty input), or if it runs dry while the last
    ///   parse step left bytes in the buffer.
    pub fn parse<R: Read>(
        mut input_reader: R,
        mut callback: impl FnMut(&[u8]),
    ) -> Result<SymbolFile, SymbolError> {
        let mut buf = circular::Buffer::with_capacity(INITIAL_BUFFER_CAPACITY);
        let mut parser = SymbolParser::new();
        // True when the previous step emptied the buffer; a 0-byte read then means a clean EOF.
        let mut fully_consumed = false;
        // True when the buffer was just grown and no bytes have arrived since.
        let mut tried_to_grow = false;
        // True while bytes are being discarded up to the next newline.
        let mut in_panic_recovery = false;
        // True from the moment recovery finds its newline until the parser next runs.
        let mut just_finished_recovering = false;
        // Number of bytes released from the buffer so far (parsed or discarded).
        let mut total_consumed = 0u64;
        loop {
            if in_panic_recovery {
                // PANIC RECOVERY MODE! DISCARD BYTES UNTIL NEWLINE.
                let input = buf.data();
                if let Some(new_line_idx) = input.iter().position(|&byte| byte == b'\n') {
                    // Hooray, we found a newline! Consume up to and including it, and resume.
                    let amount = new_line_idx + 1;
                    callback(&input[..amount]);
                    buf.consume(amount);
                    total_consumed += amount as u64;

                    // Back to normal! The discarded line still counts as a line.
                    in_panic_recovery = false;
                    fully_consumed = false;
                    just_finished_recovering = true;
                    parser.lines += 1;
                    trace!("RECOVERY: complete!");
                } else {
                    // No newline, discard everything
                    let amount = input.len();
                    callback(&input[..amount]);
                    buf.consume(amount);
                    total_consumed += amount as u64;

                    // If the next read returns 0 bytes, then that's a proper EOF!
                    fully_consumed = true;
                }
            }

            // Read the data in, and tell the circular buffer about the new data
            let size = input_reader.read(buf.space())?;
            buf.fill(size);

            if size == 0 {
                // If the reader returned no more bytes, this can mean either
                // EOF or that the buffer is out of capacity. There are a lot of
                // cases to consider, so let's go through them one at a time...
                if just_finished_recovering && !buf.data().is_empty() {
                    // We just finished PANIC RECOVERY, but there are still bytes in
                    // the buffer. Assume they are parseable and resume normal parsing
                    // (do nothing, fallthrough to normal path).
                } else if fully_consumed {
                    // Success! The last iteration cleared the buffer and we still got
                    // no more bytes, so that's a proper EOF with a complete parse!
                    return Ok(parser.finish());
                } else if !tried_to_grow {
                    // We still have some stuff in the buffer, assume this is because
                    // the buffer is full, and try to make it BIGGER and ask for more again.
                    let new_cap = buf.capacity().saturating_mul(2);
                    if new_cap > MAX_BUFFER_CAPACITY {
                        // TIME TO PANIC!!! This line is catastrophically big, just start
                        // discarding bytes until we hit a newline.
                        trace!("RECOVERY: discarding enormous line {}", parser.lines);
                        in_panic_recovery = true;
                        continue;
                    }
                    trace!("parser out of space? trying more ({}KB)", new_cap / 1024);
                    buf.grow(new_cap);
                    tried_to_grow = true;
                    continue;
                } else if total_consumed == 0 {
                    // We grew the buffer and still got no more bytes, so it's a proper EOF.
                    // But actually, we never consumed any bytes, so this is an empty file?
                    // Give a better error message for that.
                    return Err(SymbolError::ParseError(
                        "empty SymbolFile (probably something wrong with your debuginfo tooling?)",
                        0,
                    ));
                } else {
                    // Ok give up, this input is just impossible.
                    return Err(SymbolError::ParseError(
                        "unexpected EOF during parsing of SymbolFile (or a line was too long?)",
                        parser.lines,
                    ));
                }
            } else {
                // New bytes arrived, so a later 0-byte read may try growing again.
                tried_to_grow = false;
            }

            if in_panic_recovery {
                // Don't run the normal parser while we're still recovering!
                continue;
            }
            just_finished_recovering = false;

            // Ask the parser to parse more of the input
            let input = buf.data();
            let consumed = parser.parse_more(input)?;
            total_consumed += consumed as u64;

            // Give the other consumer of this Reader a chance to use this data.
            callback(&input[..consumed]);

            // Remember for the next iteration if all the input was consumed.
            fully_consumed = input.len() == consumed;
            buf.consume(consumed);
        }
    }

    /// `parse` but async: parse a SymbolFile from the body of an HTTP response.
    ///
    /// Only compiled when the `http` feature is enabled. The body is pulled
    /// chunk by chunk with `response.chunk()`, and each chunk is copied into the
    /// same kind of circular buffer that `parse` uses; the buffer growth,
    /// over-long-line recovery and EOF handling below mirror `parse`.
    ///
    /// `callback` receives every span of bytes released from the buffer,
    /// whether it was parsed or discarded during recovery.
    ///
    /// # Errors
    ///
    /// * A failure to fetch a chunk is wrapped in a `std::io::Error` and propagated.
    /// * Errors from the parser are propagated.
    /// * `SymbolError::ParseError` is returned if the body ends before any byte
    ///   was consumed (e.g. empty body), or if it ends while the last parse step
    ///   left bytes in the buffer.
    #[cfg(feature = "http")]
    pub async fn parse_async(
        mut response: reqwest::Response,
        mut callback: impl FnMut(&[u8]),
    ) -> Result<SymbolFile, SymbolError> {
        // `chunk` owns the current piece of the body, `slice` is the part of it
        // not yet copied into `buf`, and `input_reader` reads from `slice`.
        let mut chunk;
        let mut slice = &[][..];
        let mut input_reader = &mut slice;
        let mut buf = circular::Buffer::with_capacity(INITIAL_BUFFER_CAPACITY);
        let mut parser = SymbolParser::new();

        // True when the previous step emptied the buffer; a 0-byte read then means a clean EOF.
        let mut fully_consumed = false;
        // True when the buffer was just grown and no bytes have arrived since.
        let mut tried_to_grow = false;
        // True while bytes are being discarded up to the next newline.
        let mut in_panic_recovery = false;
        // True from the moment recovery finds its newline until the parser next runs.
        let mut just_finished_recovering = false;
        // Number of bytes released from the buffer so far (parsed or discarded).
        let mut total_consumed = 0u64;
        loop {
            if in_panic_recovery {
                // PANIC RECOVERY MODE! DISCARD BYTES UNTIL NEWLINE.
                let input = buf.data();
                if let Some(new_line_idx) = input.iter().position(|&byte| byte == b'\n') {
                    // Hooray, we found a newline! Consume up to and including it, and resume.
                    let amount = new_line_idx + 1;
                    callback(&input[..amount]);
                    buf.consume(amount);
                    total_consumed += amount as u64;

                    // Back to normal! The discarded line still counts as a line.
                    in_panic_recovery = false;
                    fully_consumed = false;
                    just_finished_recovering = true;
                    parser.lines += 1;
                    trace!("PANIC RECOVERY: complete!");
                } else {
                    // No newline, discard everything
                    let amount = input.len();
                    callback(&input[..amount]);
                    buf.consume(amount);
                    total_consumed += amount as u64;

                    // If the next read returns 0 bytes, then that's a proper EOF!
                    fully_consumed = true;
                }
            }

            // Little rube-goldberg machine to stream the contents:
            // * get a chunk (Bytes) from the Response
            // * get its underlying slice
            // * then get a mutable reference to that slice
            // * then Read that mutable reference in our circular buffer
            // * when the slice runs out, get the next chunk and repeat
            //
            // When the response has no more chunks, the default (empty) chunk is
            // used, so the read below yields 0 bytes.
            if input_reader.is_empty() {
                chunk = response
                    .chunk()
                    .await
                    .map_err(std::io::Error::other)?
                    .unwrap_or_default();
                slice = &chunk[..];
                input_reader = &mut slice;
            }

            // Read the data in, and tell the circular buffer about the new data
            let size = input_reader.read(buf.space())?;
            buf.fill(size);

            if size == 0 {
                // If the reader returned no more bytes, this can mean either
                // EOF or that the buffer is out of capacity. There are a lot of
                // cases to consider, so let's go through them one at a time...
                if just_finished_recovering && !buf.data().is_empty() {
                    // We just finished PANIC RECOVERY, but there are still bytes in
                    // the buffer. Assume they are parseable and resume normal parsing
                    // (do nothing, fallthrough to normal path).
                } else if fully_consumed {
                    // Success! The last iteration cleared the buffer and we still got
                    // no more bytes, so that's a proper EOF with a complete parse!
                    return Ok(parser.finish());
                } else if !tried_to_grow {
                    // We still have some stuff in the buffer, assume this is because
                    // the buffer is full, and try to make it BIGGER and ask for more again.
                    let new_cap = buf.capacity().saturating_mul(2);
                    if new_cap > MAX_BUFFER_CAPACITY {
                        // TIME TO PANIC!!! This line is catastrophically big, just start
                        // discarding bytes until we hit a newline.
                        trace!("RECOVERY: discarding enormous line {}", parser.lines);
                        in_panic_recovery = true;
                        continue;
                    }
                    trace!("parser out of space? trying more ({}KB)", new_cap / 1024);
                    buf.grow(new_cap);
                    tried_to_grow = true;
                    continue;
                } else if total_consumed == 0 {
                    // We grew the buffer and still got no more bytes, so it's a proper EOF.
                    // But actually, we never consumed any bytes, so this is an empty file?
                    // Give a better error message for that.
                    return Err(SymbolError::ParseError(
                        "empty SymbolFile (probably something wrong with your debuginfo tooling?)",
                        0,
                    ));
                } else {
                    // Ok give up, this input is just impossible.
                    return Err(SymbolError::ParseError(
                        "unexpected EOF during parsing of SymbolFile (or a line was too long?)",
                        parser.lines,
                    ));
                }
            } else {
                // New bytes arrived, so a later 0-byte read may try growing again.
                tried_to_grow = false;
            }

            if in_panic_recovery {
                // Don't run the normal parser while we're still recovering!
                continue;
            }
            just_finished_recovering = false;

            // Ask the parser to parse more of the input
            let input = buf.data();
            let consumed = parser.parse_more(input)?;
            total_consumed += consumed as u64;

            // Give the other consumer of this Reader a chance to use this data.
            callback(&input[..consumed]);

            // Remember for the next iteration if all the input was consumed.
            fully_consumed = input.len() == consumed;
            buf.consume(consumed);
        }
    }

    // Parse a SymbolFile from bytes.
    pub fn from_bytes(bytes: &[u8]) -> Result<SymbolFile, SymbolError> {
        Self::parse(bytes, |_| ())
    }

    // Parse a SymbolFile from a file.
    pub fn from_file(path: &Path) -> Result<SymbolFile, SymbolError> {
        let file = File::open(path)?;
        Self::parse(file, |_| ())
    }

    /// Fill in as much source information for `frame` as possible.
    ///
    /// The frame's instruction is converted to a module-relative address and
    /// looked up in this order:
    ///
    /// 1. A FUNC record covering the address: the function is reported, followed
    ///    by its source location and any chain of inlined calls at that address.
    /// 2. Otherwise the nearest PUBLIC record at or below the address, unless the
    ///    closest FUNC record starting below the address begins at or after that
    ///    PUBLIC.
    ///
    /// Nothing is reported for instructions below the module's base address.
    pub fn fill_symbol(&self, module: &dyn Module, frame: &mut dyn FrameSymbolizer) {
        let instruction = frame.get_instruction();
        let module_base = module.base_address();
        let Some(addr) = instruction.checked_sub(module_base) else {
            // The instruction lies below this module's base address.
            return;
        };

        // A FUNC record covering the address takes priority over everything else.
        if let Some(func) = self.functions.get(addr) {
            // TODO: FUNC records carry a parameter size, but it doesn't seem to be
            // trustworthy. STACK WIN records are more reliable when present, and
            // that matters because the value is used to unwind subsequent
            // STACK WIN frames (some calling conventions have the caller push the
            // callee's arguments, which changes the stack's size!).
            //
            // Whether this is the right approach still needs more thought.
            let parameter_size = self
                .win_stack_framedata_info
                .get(addr)
                .map(|info| info.parameter_size)
                .or_else(|| {
                    self.win_stack_fpo_info
                        .get(addr)
                        .map(|info| info.parameter_size)
                })
                .unwrap_or(func.parameter_size);

            frame.set_function(&func.name, func.address + module_base, parameter_size);

            // Next, source line and inline information.
            //
            // Inline calls are stored in one shape and reported in another. In the
            // stored shape each function name is paired with the location of the
            // *call* to that function; in the reported shape each function name is
            // paired with a location *inside* that function.
            //
            // Stored:
            //
            //   (
            //       outer_name,
            //       inline_calls: [ // Each location is the line of the *call* to the function
            //           (inline_call_location[0], inline_name[0]),
            //           (inline_call_location[1], inline_name[1]),
            //           (inline_call_location[2], inline_name[2]),
            //       ]
            //       innermost_location,
            //   )
            //
            // Reported:
            //
            //   ( // Each location is the line *inside* the function
            //       (outer_name, inline_call_location[0]),
            //       inlines: [
            //           (inline_name[0], inline_call_location[1]),
            //           (inline_name[1], inline_call_location[2]),
            //           (inline_name[2], innermost_location),
            //       ]
            //   )
            let Some((file_id, line, address, first_inline_origin)) =
                func.get_outermost_sourceloc(addr)
            else {
                return;
            };
            if let Some(file) = self.files.get(&file_id) {
                frame.set_source_file(file, line, address + module_base);
            }

            // Without an inline origin there is no inline call at this address.
            let Some(mut inline_origin) = first_inline_origin else {
                return;
            };

            // Walk the inline calls from the outside in. `get_outermost_sourceloc`
            // above covered depth 0, so the walk starts at depth 1 and stops at
            // the first depth that has no inlinee.
            let mut depth = 1;
            while let Some((call_file_id, call_line, _address, deeper_origin)) =
                func.get_inlinee_at_depth(depth, addr)
            {
                let call_file = self.files.get(&call_file_id).map(Deref::deref);
                if let Some(name) = self.inline_origins.get(&inline_origin) {
                    frame.add_inline_frame(name, call_file, Some(call_line));
                }
                inline_origin = deeper_origin;
                depth += 1;
            }

            // The deepest inlinee is still pending; it pairs with the innermost
            // source location. A line number of 0 is reported as "no line".
            let (file, line) = match func.get_innermost_sourceloc(addr) {
                None => (None, None),
                Some((file_id, line, _)) => {
                    let file = self.files.get(&file_id).map(Deref::deref);
                    let line = if line == 0 { None } else { Some(line) };
                    (file, line)
                }
            };
            if let Some(name) = self.inline_origins.get(&inline_origin) {
                frame.add_inline_frame(name, file, line);
            }
            return;
        }

        // No FUNC record covers the address; see whether a PUBLIC record can be used.
        let Some(public) = self.find_nearest_public(addr) else {
            return;
        };

        // Unfortunately, PUBLIC records have no end-point, so the one we found
        // could be a random PUBLIC from the start of the module that has nothing
        // to do with `addr`. To limit this problem, we look at the nearest FUNC
        // record that comes *before* the address.
        //
        // It is reasonable to assume a PUBLIC cannot extend *past* a FUNC. So if
        // the PUBLIC does not start after the nearest previous FUNC, it must end
        // before that FUNC and therefore cannot apply to the address.
        //
        // The nearest previous FUNC is found by binary searching the raw slice of
        // ranges for `addr`. The search fails, since it looks for an exact match,
        // *but* its Err carries the index at which `addr` would have to be
        // inserted to keep the slice sorted. The element just before that index is
        // the nearest previous FUNC.
        //
        // Case analysis for the -1, because binary search is an off-by-one minefield:
        //
        // * `addr` comes *before* every FUNC: the insertion index is 0, the
        //   checked_sub fails, and the PUBLIC is accepted, which is correct.
        //
        // * the insertion index is 1: the FUNC we want is element 0, so -1 is
        //   correct. (This generalizes to all other "reasonable" values.)
        //
        // * `addr` comes *after* every FUNC: the insertion index is `slice.len()`
        //   and the nearest FUNC is indeed at `len - 1`, so again correct.
        let funcs_slice = self.functions.ranges_values().as_slice();
        let prev_func = match funcs_slice.binary_search_by_key(&addr, |(range, _)| range.start) {
            Ok(_) => None,
            Err(insert_at) => insert_at
                .checked_sub(1)
                .and_then(|idx| funcs_slice.get(idx)),
        };

        // A FUNC that starts at or after the PUBLIC truncates it before `addr`.
        let truncated_by_func =
            prev_func.map_or(false, |(_, func)| public.address <= func.address);
        if truncated_by_func {
            return;
        }

        // Settle for a PUBLIC.
        frame.set_function(
            &public.name,
            public.address + module_base,
            public.parameter_size,
        );
    }

    /// Try to unwind one stack frame using this file's STACK records.
    ///
    /// A STACK WIN record covering the walker's instruction is tried first
    /// (framedata in preference to FPO). If that yields nothing, the STACK CFI
    /// record covering the instruction is tried.
    ///
    /// Returns `None` when the instruction is below the module's base address
    /// or when none of the applicable records produces a result.
    pub fn walk_frame(&self, module: &dyn Module, walker: &mut dyn FrameWalker) -> Option<()> {
        let instruction = walker.get_instruction();
        // `None` here means the instruction lies below the module base.
        let addr = instruction.checked_sub(module.base_address())?;

        // Breakpad heuristic: when both kinds of STACK WIN record are present,
        // framedata tends to be the more precise one, so it goes first.
        let win_stack_result = match self.win_stack_framedata_info.get(addr) {
            Some(framedata) => walker::walk_with_stack_win_framedata(framedata, walker),
            None => match self.win_stack_fpo_info.get(addr) {
                Some(fpo) => walker::walk_with_stack_win_fpo(fpo, walker),
                None => None,
            },
        };
        if win_stack_result.is_some() {
            return win_stack_result;
        }

        // STACK WIN produced nothing, so fall back to STACK CFI.
        let cfi = self.cfi_stack_info.get(addr)?;

        // Only the leading rules at or before this address apply.
        let applicable = cfi
            .add_rules
            .iter()
            .take_while(|rule| rule.address <= addr)
            .count();

        walker::walk_with_stack_cfi(&cfi.init, &cfi.add_rules[..applicable], walker)
    }

    /// Find the nearest `PublicSymbol` whose address is less than or equal to `addr`.
    ///
    /// The PUBLIC records are scanned from the back, and the first one that does
    /// not start after `addr` is returned; `None` means every record starts
    /// after `addr` (or there are no records).
    pub fn find_nearest_public(&self, addr: u64) -> Option<&PublicSymbol> {
        self.publics
            .iter()
            .rfind(|candidate| candidate.address <= addr)
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use std::ffi::OsStr;
    /// Load the symbol file at `rel_path` and check the record counts and
    /// lookups expected of the `test_app` fixture.
    ///
    /// `rel_path` is resolved against the current directory, stepping into
    /// `breakpad-symbols` first when the current directory is named
    /// `rust-minidump`.
    fn test_symbolfile_from_file(rel_path: &str) {
        let cwd = std::env::current_dir().unwrap();
        let crate_dir = if cwd.file_name() == Some(OsStr::new("rust-minidump")) {
            cwd.join("breakpad-symbols")
        } else {
            cwd
        };
        let symbols = SymbolFile::from_file(&crate_dir.join(rel_path)).unwrap();

        // FILE and PUBLIC records.
        assert_eq!(symbols.files.len(), 6661);
        assert_eq!(symbols.publics.len(), 5);
        let public_lookups = [
            (0x9b07, "_NLG_Return"),
            (0x142e7, "_NLG_Return"),
            (0x23b06, "__from_strstr_to_strchr"),
            (0xFFFFFFFF, "__from_strstr_to_strchr"),
        ];
        for (addr, expected_name) in public_lookups {
            assert_eq!(
                symbols.find_nearest_public(addr).unwrap().name,
                expected_name
            );
        }

        // FUNC records: `vswprintf` is found at 0x1000 and 0x1012 but not at 0x1013.
        assert_eq!(symbols.functions.ranges_values().count(), 1065);
        for covered in [0x1000, 0x1012] {
            assert_eq!(symbols.functions.get(covered).unwrap().name, "vswprintf");
        }
        assert!(symbols.functions.get(0x1013).is_none());

        // There are 1556 `STACK WIN 4` lines in the symbol file, of which only
        // 856 don't overlap. All of them are kept, because overlaps like these
        // have to be handled in the wild.
        let framedata = &symbols.win_stack_framedata_info;
        assert_eq!(framedata.ranges_values().count(), 1556);
        assert_eq!(symbols.win_stack_fpo_info.ranges_values().count(), 259);
        assert_eq!(framedata.get(0x41b0).unwrap().address, 0x41b0);
    }

    /// Runs the shared file checks against the fixture stored under the
    /// `5A98…` debug id (the LF variant, going by the test name).
    #[test]
    fn test_symbolfile_from_lf_file() {
        let lf_fixture =
            "testdata/symbols/test_app.pdb/5A9832E5287241C1838ED98914E9B7FF1/test_app.sym";
        test_symbolfile_from_file(lf_fixture);
    }

    /// Runs the shared file checks against the fixture stored under the
    /// `6A98…` debug id (the CRLF variant, going by the test name).
    #[test]
    fn test_symbolfile_from_crlf_file() {
        let crlf_fixture =
            "testdata/symbols/test_app.pdb/6A9832E5287241C1838ED98914E9B7FF1/test_app.sym";
        test_symbolfile_from_file(crlf_fixture);
    }

    /// Parse `symbolfile_bytes` and check the result against the small
    /// hand-written fixture used by the tests below: one FILE, one PUBLIC and
    /// one FUNC named "another func" with a single line record.
    fn test_symbolfile_from_bytes(symbolfile_bytes: &[u8]) {
        let parsed = SymbolFile::from_bytes(symbolfile_bytes).unwrap();

        assert_eq!(parsed.files.len(), 1);
        assert_eq!(parsed.publics.len(), 1);
        assert_eq!(parsed.functions.ranges_values().count(), 1);

        let at_start = parsed.functions.get(0x1000).unwrap();
        assert_eq!(at_start.name, "another func");
        assert_eq!(at_start.lines.ranges_values().count(), 1);

        // test fallback: an address just past the start resolves to the same FUNC
        let past_start = parsed.functions.get(0x1001).unwrap();
        assert_eq!(past_start.name, "another func");
    }

    /// Parses the in-memory fixture whose lines end in a bare `\n`.
    #[test]
    fn test_symbolfile_from_bytes_with_lf() {
        // The continuation lines start at column 0 on purpose: any leading
        // whitespace would become part of the fixture.
        let lf_symbols: &[u8] = b"MODULE Linux x86 ffff0000 bar
FILE 53 bar.c
PUBLIC 1234 10 some public
FUNC 1000 30 10 another func
1000 30 7 53
";
        test_symbolfile_from_bytes(lf_symbols);
    }

    /// Parses the in-memory fixture whose lines end in `\r\n`.
    ///
    /// The line endings are written as explicit `\r\n` escapes (with a trailing
    /// `\` to join the source lines) so the carriage returns are visible and
    /// cannot be lost to editors or line-ending normalization. Previously the
    /// fixture contained no `\r` at all and was identical to the LF fixture.
    #[test]
    fn test_symbolfile_from_bytes_with_crlf() {
        test_symbolfile_from_bytes(
            b"MODULE Linux x86 ffff0000 bar\r\n\
              FILE 53 bar.c\r\n\
              PUBLIC 1234 10 some public\r\n\
              FUNC 1000 30 10 another func\r\n\
              1000 30 7 53\r\n",
        );
    }
}