import logging
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementClickInterceptedException,
    ElementNotInteractableException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Show the logging.info/logging.error calls below (the root logger's default
# WARNING level would otherwise hide them).
logging.basicConfig(level=logging.INFO)

# Create an empty DataFrame to store the scraped data
reviews_data_df = pd.DataFrame()


# Function to save data to a CSV file
def save_to_file(data_frame, file_path):
    data_frame.to_csv(file_path, index=False)


def extract_book_info(book):
    book_data = {}
    try:
        book_data['title'] = book.find_element(By.CSS_SELECTOR, 'h3.bc-heading a').text
    except NoSuchElementException:
        book_data['title'] = None
    try:
        book_data['subtitle'] = book.find_element(By.CSS_SELECTOR, 'li.subtitle span').text
    except NoSuchElementException:
        book_data['subtitle'] = None

    # find_elements returns an empty list instead of raising, so the
    # multi-valued fields need no exception handling.
    book_data['authors'] = [author.text for author in
                            book.find_elements(By.CSS_SELECTOR, 'li.authorLabel span a')]
    book_data['narrators'] = [narrator.text for narrator in
                              book.find_elements(By.CSS_SELECTOR, 'li.narratorLabel span a')]

    try:
        book_data['series'] = book.find_element(By.CSS_SELECTOR, 'li.seriesLabel span a').text
    except NoSuchElementException:
        book_data['series'] = None
    try:
        book_data['length'] = book.find_element(By.CSS_SELECTOR, 'li.runtimeLabel span').text
    except NoSuchElementException:
        book_data['length'] = None
    try:
        book_data['release_date'] = book.find_element(By.CSS_SELECTOR, 'li.releaseDateLabel span').text
    except NoSuchElementException:
        book_data['release_date'] = None
    try:
        book_data['language'] = book.find_element(By.CSS_SELECTOR, 'li.languageLabel span').text
    except NoSuchElementException:
        book_data['language'] = None
    try:
        book_data['rating'] = book.find_element(By.CSS_SELECTOR, 'li.ratingsLabel .bc-pub-offscreen').text
    except NoSuchElementException:
        book_data['rating'] = None
    try:
        book_data['no_of_ratings'] = book.find_element(By.CSS_SELECTOR, 'li.ratingsLabel .bc-size-small').text
    except NoSuchElementException:
        book_data['no_of_ratings'] = None
    try:
        book_data['regular_price'] = book.find_element(By.CSS_SELECTOR, 'p.buybox-regular-price span:nth-child(2)').text
    except NoSuchElementException:
        book_data['regular_price'] = None
    try:
        book_data['sales_price'] = book.find_element(By.CSS_SELECTOR, 'p.buybox-member-price span:nth-child(2)').text
    except NoSuchElementException:
        book_data['sales_price'] = None
    try:
        book_data['category'] = book.find_element(By.CSS_SELECTOR, 'li.categoriesLabel a').text
    except NoSuchElementException:
        book_data['category'] = None

    book_data['genres'] = [genre.text for genre in
                           book.find_elements(By.CSS_SELECTOR, 'div.bc-section span.bc-chip-text')]

    try:
        book_data['url'] = book.find_element(By.CSS_SELECTOR, 'h3.bc-heading a').get_attribute('href')
    except NoSuchElementException:
        book_data['url'] = None
    return book_data
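

# The repeated try/except pattern above could be collapsed with a small
# helper. A minimal sketch, not part of the original scraper; the name
# `first_text_or_none` is illustrative:
def first_text_or_none(element, selector):
    """Return the text of the first CSS match under `element`, or None."""
    try:
        return element.find_element(By.CSS_SELECTOR, selector).text
    except NoSuchElementException:
        return None
# Example: book_data['title'] = first_text_or_none(book, 'h3.bc-heading a')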


def extract_review_info(review_elements, reviewer_id_elements):
    review_data = []
    for review_element, reviewer_id_element in zip(review_elements, reviewer_id_elements):
        review_data.append({
            'reviewer_id': reviewer_id_element.text,
            'review': review_element.text
        })
    return review_data


def save_data(data, csv_filename, json_filename):
    # Convert data to DataFrame
    df = pd.DataFrame(data)
    # Save data to CSV
    df.to_csv(csv_filename, index=False)
    # Save data to JSON
    df.to_json(json_filename, orient='records', lines=True)


class AudibleScraper:
    def __init__(self):
        chrome_options = Options()
        # chrome_options.add_experimental_option('detach', True)

        # Capture browser console logs (Selenium 4 capability syntax; the old
        # DesiredCapabilities loggingPrefs dict was built but never passed to
        # the driver, so it had no effect).
        chrome_options.set_capability('goog:loggingPrefs', {'browser': 'ALL'})

        # Specify the path to the ChromeDriver executable
        chrome_driver_path = r'C:\Users\RemoteUser\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe'

        # Initialize Chrome WebDriver with the specified service
        chrome_service = ChromeService(executable_path=chrome_driver_path)
        self.driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

        # Maximize the browser window
        self.driver.maximize_window()

        self.url = "https://www.audible.com/search?feature_six_browse-bin=18685580011&feature_twelve_browse-bin=18685552011&pageSize=50&sort=review-rank&ref_pageloadid=lLpatCZ3I2jVcbaV&ref=a_search_l1_feature_twelve_browse-bin_0&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=EZ1D5488G5TQE1V4G8BF&pageLoadId=lKeM4jw2D71tFz7T&ref_plink=not_applicable&creativeId=9648f6bf-4f29-4fb4-9489-33163c0bb63e"
        self.data = []

    def wait_for_element(self, by, value, timeout=10):
        try:
            element_present = EC.presence_of_element_located((by, value))
            WebDriverWait(self.driver, timeout).until(element_present)
        except TimeoutException:
            print(f"Timed out waiting for page to load element {value}")
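
    # The hard-coded chromedriver path in __init__ ties the scraper to one
    # Windows machine. The method below is an illustrative alternative (it is
    # not called anywhere); it assumes Selenium >= 4.6, where Selenium Manager
    # resolves a matching driver automatically. The '--headless=new' flag is
    # optional.
    @staticmethod
    def _build_driver_with_selenium_manager():
        options = Options()
        options.add_argument('--headless=new')  # run without opening a window
        return webdriver.Chrome(options=options)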

    def scrape_reviews(self, book_url):
        self.driver.get(book_url)
        print(book_url)

        # First, extract the book information, including the category. Passing
        # the driver itself reuses the listing-page selectors against the
        # whole detail page.
        book_info = extract_book_info(self.driver)

        def wait_for_overlay_disappearance():
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.invisibility_of_element((By.CSS_SELECTOR, "div.att_lightbox_background"))
                )
            except TimeoutException:
                print('Overlay did not disappear within the timeout')

        # Initial wait for the overlay to disappear
        wait_for_overlay_disappearance()

        reviews_data = []

        # Find all review tabs and iterate over them if they exist
        review_tabs = self.driver.find_elements(
            By.XPATH, '//div[contains(@class, "bc-tab-set")]/a[contains(@class, "bc-tab-heading")]')
        if not review_tabs:
            print("No review tabs available on this page.")
            return pd.DataFrame(reviews_data)

        for tab in review_tabs:
            try:
                # Extract the label of the reviews tab
                tab_label = tab.text.strip()
                logging.info(f"Reviews Tab: {tab_label}")

                # Scroll back to the reviews tab before clicking it
                self.driver.execute_script("arguments[0].scrollIntoView(true);", tab)
                time.sleep(5)  # Allow time for the page to settle after scrolling

                # Now use click_with_retry to handle the click
                self.click_with_retry(tab)

                if tab_label != "Amazon reviews":
                    self.expand_reviews_on_current_page()
                else:
                    self.expand_reviews_to_amazon_page()

                # After clicking the tab, scrape this page, then keep scraping
                # while pagination succeeds
                self.scrape_reviews_on_current_page(reviews_data, book_info['category'], book_info['genres'])
                while self.go_to_next_reviews_page():
                    self.scrape_reviews_on_current_page(reviews_data, book_info['category'], book_info['genres'])
            except TimeoutException:
                logging.error("Timeout occurred while waiting for a review tab to become clickable.")
            except Exception as e:
                logging.error(f"Failed to click on a review tab due to an exception: {e}")

        return pd.DataFrame(reviews_data)

    # Review sections observed in the page markup:
    # //div[contains(@class, 'USreviews')]
    # //div[contains(@class, 'AUreviews')]

    def scrape_reviews_on_current_page(self, reviews_data, book_category, book_genres):
        try:
            # Wait for the reviews to load on this page
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.reviewText span'))
            )
            logging.info("Reviews are present on this page.")

            # Log the HTML content to inspect the structure
            logging.debug("HTML Content:")
            logging.debug(self.driver.page_source)

            review_elements = self.driver.find_elements(
                By.XPATH,
                '//div[contains(@class, "USreviews")]/h3[contains(@class, "bc-heading")]'
                '/following-sibling::p[contains(@class, "bc-size-body")]')
            for review_element in review_elements:
                reviews_data.append({
                    'review': review_element.text,
                    'category': book_category,        # Add the category to each review's data
                    'genres': ', '.join(book_genres)  # Convert list of genres to a string
                })
            print("End of Reviews on Current Page")
        except Exception as e:
            # Log the exception information for debugging
            logging.error(f"An exception occurred: {e}")
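
    # Note: extract_review_info (module level, above) is defined but never
    # called, so reviewer IDs are not captured. A sketch of wiring it in;
    # 'div.bc-review-author' is a placeholder selector, not verified against
    # Audible's live markup:
    #
    #     review_elements = self.driver.find_elements(By.CSS_SELECTOR, 'div.reviewText span')
    #     reviewer_ids = self.driver.find_elements(By.CSS_SELECTOR, 'div.bc-review-author')
    #     reviews_data.extend(extract_review_info(review_elements, reviewer_ids))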

    def go_to_next_reviews_page(self):
        try:
            # Locate the pagination button
            next_button = self.driver.find_element(By.CSS_SELECTOR, 'ul.a-pagination li.a-last a')
            if next_button and next_button.is_displayed():
                # Scroll the pagination button into the center of the view before clicking
                self.driver.execute_script(
                    "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
                    next_button)
                time.sleep(2)  # Wait for the scrolling to complete and the page to settle
                next_button.click()

                # Wait for the next page of reviews to load
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.reviewText span'))
                )
                return True
            # Button present but not visible: treat it as the last page
            return False
        except NoSuchElementException:
            # Log a message indicating no more pages
            logging.info("No more pages of reviews.")
            return False
        except ElementClickInterceptedException:
            # Log a message indicating the click is intercepted
            logging.warning("Click intercepted by another element.")
            self.close_overlay_if_present()
            return False
        except TimeoutException:
            # Log a message indicating a timeout
            logging.error("Timed out waiting for next page of reviews to load.")
            return False

    def expand_reviews_on_current_page(self):
        while True:
            try:
                # Wait for the "Show More Reviews" button to be clickable
                show_more_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Show More Reviews']"))
                )
                # Scroll to the "Show More Reviews" button and click it
                self.driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
                show_more_button.click()
                # Wait for the button to become invisible after clicking,
                # indicating it has processed
                WebDriverWait(self.driver, 10).until(
                    EC.invisibility_of_element(show_more_button)
                )
                # Short delay to allow any page updates to occur
                time.sleep(2)
            except TimeoutException:
                # If the button doesn't appear or doesn't become invisible
                # within the timeout, assume there are no more reviews
                break
            except NoSuchElementException:
                # If the button isn't found at all, also assume no more reviews
                break

    def expand_reviews_to_amazon_page(self):
        try:
            # Wait for the container that should contain the 'See all reviews' link to load
            container = WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.ID, "adbl-amzn-portlet-reviews-in-iframe"))
            )
            # If the container is actually an iframe, you would need to switch to it:
            # self.driver.switch_to.frame("adbl-amzn-portlet-reviews-in-iframe")

            # Scroll within the container to the bottom
            self.driver.execute_script(
                "var container = arguments[0];"
                "container.scrollTop = container.scrollHeight;",
                container
            )

            # Wait for the 'See all reviews' link to become visible after scrolling
            see_all_reviews_link = WebDriverWait(self.driver, 20).until(
                EC.visibility_of_element_located((By.XPATH, "//a[contains(text(), 'See all reviews')]"))
            )
            # Click the 'See all reviews' link
            see_all_reviews_link.click()

            # Wait for navigation to complete before taking the next action
            WebDriverWait(self.driver, 10).until(
                lambda d: d.execute_script('return document.readyState') == 'complete'
            )
        except TimeoutException:
            print("Timed out waiting for the reviews container or 'See all reviews' link.")
            self.driver.save_screenshot('debug_timeout.png')
            return False
        except NoSuchElementException:
            print("Could not find the reviews container or 'See all reviews' link.")
            self.driver.save_screenshot('debug_no_element.png')
            return False

        # If the container is an iframe, switch back to the main document
        # self.driver.switch_to.default_content()
        return True
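
    # expand_reviews_to_amazon_page notes that the reviews container may be an
    # iframe. Below is a sketch of the switch pattern those comments allude
    # to; the frame id comes from the code above, but whether the live page
    # really renders it as an iframe is unverified, and this method is
    # illustrative only (it is never called).
    def _work_inside_reviews_iframe(self):
        self.driver.switch_to.frame("adbl-amzn-portlet-reviews-in-iframe")
        try:
            pass  # interact with elements inside the frame here
        finally:
            # Always return to the main document, even if something fails
            self.driver.switch_to.default_content()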

    def close_overlay_if_present(self):
        try:
            close_button = self.driver.find_element(By.CSS_SELECTOR, "span#att_lightbox_close_x")
            close_button.click()
            # Wait until the overlay is gone before proceeding
            WebDriverWait(self.driver, 10).until(
                EC.invisibility_of_element((By.CSS_SELECTOR, "div.att_lightbox_background"))
            )
        except NoSuchElementException:
            # If the close button doesn't exist, ignore it
            pass
        except TimeoutException:
            # If the overlay didn't disappear in time, handle the timeout
            print("The overlay did not disappear in time.")

    def click_with_retry(self, element, retries=3):
        attempt = 0
        while attempt < retries:
            try:
                # Scroll the element into view before attempting to click
                self.driver.execute_script(
                    "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
                    element)
                time.sleep(5)  # Add a short delay to ensure the page has settled
                WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable(element))
                element.click()
                return  # Click successful, exit the function
            except ElementClickInterceptedException:
                self.close_overlay_if_present()
                attempt += 1
            except ElementNotInteractableException:
                time.sleep(1)
                attempt += 1
        # The loop only exits once every retry has failed
        raise Exception(f"Failed to click element after {retries} attempts.")

    def go_to_next_page(self):
        try:
            next_button = self.driver.find_element(
                By.XPATH,
                "//span[contains(@class, 'nextButton') and not(contains(@class, 'bc-button-disabled'))]/a")
            next_page_url = next_button.get_attribute('href')
            if next_page_url:
                self.driver.get(next_page_url)
                # Scroll to the bottom of the page to ensure all lazy-loaded elements are loaded
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait a bit for the page to load after scrolling
                time.sleep(5)
                return True
            else:
                print("Reached the last page or no 'Next' button found. Stopping the spider.")
                return False
        except NoSuchElementException:
            print("Reached the last page or no 'Next' button found. Stopping the spider.")
            return False

    def scrape(self):
        self.driver.get(self.url)

        all_book_data = []
        all_reviews_data = []

        # Walk the search result pages, collecting book metadata
        while True:
            books = self.driver.find_elements(By.CLASS_NAME, 'productListItem')
            for book in books:
                book_data = extract_book_info(book)
                print(book_data)
                all_book_data.append(book_data)
            if not self.go_to_next_page():
                break

        # Save the book data
        save_data(all_book_data, 'audible_books.csv', 'audible_books.json')

        # Scrape reviews for each book
        for book_data in all_book_data:
            if book_data['url']:
                book_reviews_df = self.scrape_reviews(book_data['url'])
                all_reviews_data.append(book_reviews_df)

        # scrape_reviews returns one DataFrame per book, so concatenate them
        # before saving (pd.DataFrame(list_of_frames) would not yield rows)
        all_reviews_df = (pd.concat(all_reviews_data, ignore_index=True)
                          if all_reviews_data else pd.DataFrame())
        save_data(all_reviews_df, 'audible_reviews.csv', 'audible_reviews.json')

        # Close the driver
        self.driver.quit()


if __name__ == "__main__":
    scraper = AudibleScraper()
    scraper.scrape()
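
# scrape() only quits the browser on a clean run; an exception mid-scrape
# leaves Chrome open. A sketch of a more defensive entry point (an
# illustrative alternative to the __main__ block above):
#
#     if __name__ == "__main__":
#         scraper = AudibleScraper()
#         try:
#             scraper.scrape()
#         except Exception:
#             logging.exception("Scrape failed; closing the browser.")
#             scraper.driver.quit()
#             raise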