import scrapy
import pandas as pd
from scrapy import signals
from scrapy.signalmanager import dispatcher


class AudibleSpider(scrapy.Spider):
    name = "audible"
    # www.amazon.com is included so the offsite middleware does not filter
    # the Amazon review requests issued from parse_title / parse_amazon_reviews.
    allowed_domains = ["www.audible.com", "www.amazon.com"]
    start_urls = ["https://www.audible.com/search"]

    # Single desktop User-Agent reused for every request.
    USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.46'
    )

    custom_settings = {
        'FEEDS': {
            'audible.csv': {'format': 'csv', 'overwrite': True},
            'audible.json': {'format': 'json', 'overwrite': True, 'indent': 4},
        }
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # List that collects Amazon review items; written to disk in spider_closed.
        self.reviews_data = []
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers={'User-Agent': self.USER_AGENT}
            )

    def parse(self, response):
        books = response.xpath(
            '(//div[@class="adbl-impression-container "])/div'
            '//li[contains(@class, "productListItem")]')
        for book in books:
            title = book.xpath('.//h3[contains(@class, "bc-heading")]/a/text()').get()
            title_link = book.xpath('.//h3[contains(@class, "bc-heading")]/a/@href').get()
            subtitle = book.xpath('.//li[contains(@class, "subtitle")]/span/text()').get()
            authors = book.xpath('.//li[contains(@class, "authorLabel")]/span/a/text()').getall()
            narrators = book.xpath('.//li[contains(@class, "narratorLabel")]/span/a/text()').getall()
            series = book.xpath('.//li[contains(@class, "seriesLabel")]/span/a/text()').get()
            length = book.xpath('.//li[contains(@class, "runtimeLabel")]/span/text()').get()
            release_date = book.xpath('.//li[contains(@class, "releaseDateLabel")]/span/text()').get()
            language = book.xpath('.//li[contains(@class, "languageLabel")]/span/text()').get()
            rating = book.xpath(
                './/li[contains(@class, "ratingsLabel")]'
                '/span[contains(@class, "bc-pub-offscreen")]/text()').get()
            no_of_ratings = book.xpath(
                './/li[contains(@class, "ratingsLabel")]'
                '/span[contains(@class, "bc-size-small")]/text()').get()
            regular_price = book.xpath('.//p[contains(@id, "buybox-regular-price")]/span[2]/text()').get()
            sales_price = book.xpath('.//p[contains(@id, "buybox-member-price")]/span[2]/text()').get()

            # Strip the label prefixes ("Length:", "Release date:", ...) from the raw text.
            if length:
                length = length.split(':')[1].strip()
            if release_date:
                release_date = release_date.split('\n')[1].strip()
            if language:
                language = language.split('\n')[1].strip()
            if regular_price:
                regular_price = regular_price.split('\n')[1].strip()

            item = {
                'title': title,
                'subtitle': subtitle,
                'authors': authors,
                'narrators': narrators,
                'series': series,
                'length': length,
                'release_date': release_date,
                'language': language,
                'rating': rating,
                'no_of_ratings': no_of_ratings,
                'regular_price': regular_price,
                'sales_price': sales_price,
                'title_link': title_link
            }

            # To collect genres, follow the book detail page when a title link exists.
            if title_link:
                yield response.follow(
                    url=title_link,
                    callback=self.parse_title,
                    meta={'item': item}  # Pass the current book item along
                )
            else:
                yield item  # No detail link, so yield the item as it is

        pagination = response.xpath('//ul[contains(@class, "pagingElements")]')
        next_button = pagination.xpath(
            './/span[contains(@class, "nextButton") '
            'and not(contains(@class, "bc-button-disabled"))]/a/@href')
        next_page_url = next_button.get()
        if next_page_url:
            yield response.follow(
                next_page_url,
                callback=self.parse,
                headers={'User-Agent': self.USER_AGENT}
            )
        else:
            self.logger.info("Reached the last page or no 'Next' button found. Stopping the spider.")
    def parse_title(self, response):
        reviews_data = []

        # Genres and categories from the book detail page
        genres = response.xpath(
            '//div[contains(@class, "bc-expander")]//span[@class="bc-chip-text"]/text()').getall()
        categories = response.xpath('//li[contains(@class, "categoriesLabel")]/a/text()').getall()

        # Helper that appends stripped reviews when the list is non-empty
        def add_reviews(review_list):
            if review_list:
                reviews_data.extend([review.strip() for review in review_list])

        # US reviews
        usreviews = response.xpath(
            '//div[contains(@class, "USreviews")]/h3[contains(@class, "bc-heading")]'
            '/following-sibling::p[contains(@class, "bc-size-body")]/text()').getall()
        add_reviews(usreviews)

        # UK reviews
        ukreviews = response.xpath(
            '//div[contains(@class, "UKreviews")]/h3[contains(@class, "bc-heading")]'
            '/following-sibling::p[contains(@class, "bc-size-body")]/text()').getall()
        add_reviews(ukreviews)

        # AU reviews
        aureviews = response.xpath(
            '//div[contains(@class, "AUreviews")]/h3[contains(@class, "bc-heading")]'
            '/following-sibling::p[contains(@class, "bc-size-body")]/text()').getall()
        add_reviews(aureviews)

        # Amazon reviews embedded on the detail page
        amazonreviews = response.xpath(
            '//div[contains(@class, "review-text-content")]/span/text()').getall()
        add_reviews(amazonreviews)

        # Use a placeholder when no reviews were found at all
        if not reviews_data:
            reviews_data = ["no reviews"]

        item = response.meta['item']
        item['genres'] = [genre.strip() for genre in genres]  # Clean the genres
        item['categories'] = [category.strip() for category in categories]  # Clean the categories
        item['reviews'] = reviews_data  # Add the cleaned and consolidated reviews

        # Extract the Amazon URL from the book detail page
        amazon_url = response.xpath('//a[@data-hook="see-all-reviews-link-foot"]/@href').get()

        if amazon_url:
            # Follow the Amazon URL to scrape the full review pages
            self.logger.debug(f"Following Amazon reviews URL: {amazon_url}")
            yield response.follow(
                url=amazon_url,
                callback=self.parse_amazon_reviews,
                meta={'item': item},  # Pass along the existing item
                headers={'User-Agent': self.USER_AGENT}
            )
        else:
            # No Amazon URL, so yield the item with the details scraped so far
            yield item

    def parse_amazon_reviews(self, response):
        # Retrieve the item passed via meta
        item = response.meta['item']

        # Scrape the review texts
        reviews = response.xpath(
            '//div[contains(@class, "review-data")]'
            '/span[contains(@class, "review-text-content")]/span')
        for review in reviews:
            review_text = review.xpath('.//text()').get()
            if not review_text:  # Skip empty review nodes instead of crashing on None
                continue
            review_text = review_text.strip()
            self.logger.debug(f"Scraped review: {review_text}")
            review_item = {
                'title_link': item['title_link'],
                'review_text': review_text
            }
            self.reviews_data.append(review_item)
            yield review_item

        # Follow the 'Next' page link if one exists
        next_page_url = response.xpath(
            '//ul[@class="a-pagination"]/li[@class="a-last"]/a/@href').get()
        if next_page_url:
            yield response.follow(
                url=next_page_url,
                callback=self.parse_amazon_reviews,
                meta={'item': item},  # Keep passing the item along
                headers={'User-Agent': self.USER_AGENT}
            )

    # Called when the spider closes; writes the collected Amazon reviews to disk.
    def spider_closed(self, spider):
        if not self.reviews_data:
            self.logger.info("No Amazon reviews collected; skipping export.")
            return
        # Convert the list of reviews to a DataFrame
        reviews_df = pd.DataFrame(self.reviews_data)
        # Save to CSV
        reviews_df.to_csv('amazon_reviews.csv', index=False)
        # Save to JSON ('indent' is not compatible with line-delimited output,
        # so a plain records-oriented JSON array is written instead)
        reviews_df.to_json('amazon_reviews.json', orient='records', indent=4)
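

# A minimal usage sketch (assumption: this file is saved as audible_spider.py
# outside a full Scrapy project). CrawlerProcess lets the spider run standalone
# with `python audible_spider.py`; inside a generated project, the usual
# `scrapy crawl audible` works instead.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(AudibleSpider)
    process.start()  # Blocks until the crawl finishes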