import sys
import os
import requests
import time
import shutil
from selenium import webdriver
import threading
from queue import Queue


dataset_path = "/persistent/dataset"
thread_count = 50
threads = []
# Initial queue
queue = Queue(0)


class Downloader(threading.Thread):
    def __init__(self, queue, thread):
        threading.Thread.__init__(self)
        self.queue = queue
        self.thread = thread

    def run(self):
        while self.queue.empty() == False:
            item = self.queue.get()

            # print("Thread:",self.thread,item)
            # time.sleep(3)
            download_from_url(item["url"], item["img_dir"], item["file_path"])

            self.queue.task_done()


def download_from_url(url, img_dir, file_path):
    """Downloading the actual image from Google

    INPUTS
    =======
    url: unifrom resource locator; the address of a given unique resource on the Web, in this case an image of a pair of Jordans
    img_dir: Not accessed
    file_path:  where to save the Jordans image

    """
    try:
        img_path = os.path.basename(url)
        #file_path = os.path.join(img_dir, img_path)
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(file_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except Exception as e:
        print("Error in url:", url)
        print(e)


def download_google_images(num_images_requested, search_term_list):
    print("download_google_images...")
    start_time = time.time()

    # Setup dataset folder
    if os.path.exists(dataset_path):
        shutil.rmtree(dataset_path)
    os.mkdir(dataset_path)

    # Each scrolls provides 400 image approximately
    number_of_scrolls = int(num_images_requested / 400) + 1

    # Firefox Options
    # set up a webdriver for Firefox
    options = webdriver.FirefoxOptions()
    options.headless = True
    browser = webdriver.Firefox(options=options)

    images_link = None
    for search_term in search_term_list:
        print("Searching for :", search_term)
        browse_link = 'https://www.google.com/search?q=' + search_term
        print("link:", browse_link)
        browser.get(browse_link)

        # Go to Google Images
        # we search Google by using the CSS selector (like regex for HTML)
        # we find an element ‘a’ that contains the class with “hide-focus-ring”; this will give us the Google images
        # migrate through each of the links b/c there’s one for shopping, news, books, etc., and find the one that shows it to be =isch&; this will identify it as Google Images

        # images_links = browser.find_elements_by_xpath(
        #     '//a[contains(@class, "hide-focus-ring")]')
        images_links = browser.find_elements_by_xpath(
            '//a[contains(@data-hveid, "CAEQAw")]')
        for link in images_links:
            # print(link)
            # on the element we get the href
            link_href = link.get_attribute("href")
            print(link_href)

            # Find images link
            if "&tbm=isch" in link_href:
                images_link = link
                break

        if images_link is None:
            raise ValueError('Google Images link was not found')

        # Wait to make sure that Google knows this is not automated
        time.sleep(5)

        # Go to images
        #images_link = images_links[0]
        print("Going to link:", images_link.get_attribute("href"))
        # Seleniuam was originally a front-end testing tool where we write out test scenarios for 50 cases, such as a user clicking on certain things; Facebook and Instagram use Selenium for fake likes and fake comments
        # since Selenium can do anything that a human is doing, we can click on the "Images" link for Google
        images_link.click()

        # Scroll to get more images
        # we want to keep on scrolling and clicking the "show more results" button so that we can capture as many thumbnails as possible to get as many Jordan shoe images as possible
        print("number_of_scrolls:", number_of_scrolls)
        for _ in range(number_of_scrolls):
            for __ in range(10):
                # multiple scrolls needed to show all 400 images
                browser.execute_script("window.scrollBy(0, 1000000)")
                time.sleep(2)
            # to load next 400 images
            # we've scrolled to the bottom of this page, so now we'd like to show more results again
            time.sleep(5)
            # try to find show more results bottom
            try:
                # if found click to load more image
                browser.find_element_by_xpath(
                    "//input[@value='Show more results']").click()
            except Exception as e:
                print(e)
                # if not exit
                print("End of page")
                break

        # Image link store
        # we’re at first collecting the urls in a set, and then actually saving images
        # we're using a set because soemtimes there are duplicate urls
        imgs_urls = set()
        # Find the thumbnail images
        # -find all “a” elements to get the image, and that will allow us to get the direct link to the image
        thumbnails = browser.find_elements_by_xpath(
            '//a[@class="wXeWr islib nfEiy"]')
        print("Number of thumbnails:", len(thumbnails))
        # loop over the thumbs to retrive the links
        for thumbnail in thumbnails:
            # check if reached the request number of links
            if len(imgs_urls) >= num_images_requested:
                break
            try:
                thumbnail.click()
                time.sleep(2)
            except Exception as error:
                print("Error clicking one thumbnail : ", error)
            # Find the image url
            url_elements = browser.find_elements_by_xpath(
                '//img[@class="n3VNCb"]')
            # check for the correct url
            for url_element in url_elements:
                try:
                    url = url_element.get_attribute('src')
                except e:
                    print("Error getting url")
                if url.startswith('http') and not url.startswith('https://encrypted-tbn0.gstatic.com'):
                    #print("Found image url:", url)
                    imgs_urls.add(url)

        print('Number of image urls found:', len(imgs_urls))

        # Wait 5 seconds
        time.sleep(5)

        # Save the images
        # creating the path for where to save the images
        img_dir = os.path.join(
            dataset_path, search_term.lower().replace(" ", "_"))
        if not os.path.exists(img_dir):
            os.makedirs(img_dir)

        count = 0
        if len(imgs_urls) > 0:
            for url in imgs_urls:
                file_path = os.path.join(img_dir, '{0}.jpg'.format(count))
                count += 1
                queue.put({"url": url, "img_dir": img_dir,
                          "file_path": file_path})

            # Execute downloads from queue in a thread
            # a Queue here makes it faster
            # put all the links in a queue, and then performing the downloads in parallel using the threading library (e.g. we have 50 threads, and can download them all at once)
            for i in range(thread_count):
                thread = Downloader(queue, i)
                thread.start()
                threads.append(thread)
            for thread in threads:
                thread.join()

    # Quit the browser
    browser.quit()

    execution_time = (time.time() - start_time) / 60.0
    print("Download execution time (mins)", execution_time)