# -*- coding: utf-8 -*- # VERSION: 1.13 # AUTHORS: alexandre-eliot # INSPIRED BY THE WORK OF # sa3dany, Alyetama, BurningMop, scadams # Yun (chenzm39@gmail.com) # LICENSING INFORMATION # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import re from html.parser import HTMLParser from novaprinter import prettyPrinter from helpers import retrieve_url, download_file class zetorrents(object): """ `url`, `name`, `supported_categories` should be static variables of the engine_name class, otherwise qbt won't install the plugin. `url`: The URL of the search engine. `name`: The name of the search engine, spaces and special characters are allowed here. `supported_categories`: What categories are supported by the search engine and their corresponding id, possible categories are ('all', 'anime', 'books', 'games', 'movies', 'music', 'pictures', 'software', 'tv'). """ url = 'https://www.zetorrents.be' name = 'ZeTorrents (french)' supported_categories = { 'all': [None], 'anime': ['animation'], 'books': ['ebooks'], 'games': ['jeux-pc', 'jeux-consoles'], 'movies': ['films'], 'music': ['musique'], 'tv': ['series'], } RESULTS_PER_PAGE = 100 class zeTorrentsParser(HTMLParser): """Parses zetorrents.com browse page for search results and stores them.""" def __init__(self, infos, url): """ Construct a zeTorrents html parser. Parameters: :param list res: a list to store the results in @deprecated :param infos: an object to retrieve informations about the page :param str url: the base url of the search engine """ try: super().__init__() except TypeError: # See: http://stackoverflow.com/questions/9698614/ HTMLParser.__init__(self) self.NB_OF_COLUMNS = 5 self.page_infos = infos self.engine_url = url self.results = [] self.torrent_infos = {} self.is_found_content = False self.td_counter = -1 self.span_counter = -1 self.a_counter = -1 def get_torrent_url_from_page_url(self, page_url): torrent_page = retrieve_url(page_url) torrent_regex = r'href="\/downloads\/torrentFile\/.*\.torrent"' matches = re.finditer(torrent_regex, torrent_page, re.MULTILINE) torrent_urls = [x.group() for x in matches] if len(torrent_urls) > 0: return torrent_urls[0].split('"')[1] return None def handle_starttag(self, tag, attrs): params = dict(attrs) if params.get('class') == 'content-list-torrent': self.is_found_content = True elif self.is_found_content: if tag == 'tr': self.print_torrent_infos_and_reinit_row() elif tag == 'td': self.td_counter += 1 elif self.td_counter > -1: if tag == 'span': self.span_counter += 1 elif tag == 'a': self.a_counter += 1 if self.td_counter == 0: if 'href' not in params: return href = params['href'] if href.startswith('/torrents/'): link = f'{self.engine_url}{href}' torrent_url = self.get_torrent_url_from_page_url(link) if torrent_url: self.torrent_infos['link'] = self.engine_url + torrent_url self.torrent_infos['engine_url'] = self.engine_url self.torrent_infos['desc_link'] = link def handle_torrent_data(self, data): if ( self.td_counter > 0 # We skip the first "td" and self.td_counter < self.NB_OF_COLUMNS ): match self.td_counter: # Catch the name case 1: if self.a_counter == 0: self.torrent_infos['name'] = data.strip() # Catch the size case 2: if self.span_counter == 0: self.torrent_infos['size'] = unit_fr2en(data.strip()) # Catch the seeds case 3: if self.span_counter == 0: try: self.torrent_infos['seeds'] = int(data.strip()) except ValueError: self.torrent_infos['seeds'] = -1 # Catch the leeches case 4: if self.span_counter == 0: try: self.torrent_infos['leech'] = int(data.strip()) except ValueError: self.torrent_infos['leech'] = -1 def handle_data(self, data): self.handle_torrent_data(data) def print_torrent_infos_and_reinit_row(self): self.td_counter = -1 array_length = len(self.torrent_infos) if array_length < 1: return self.page_infos['hit_count'] += 1 prettyPrinter(self.torrent_infos) self.torrent_infos = {} def handle_endtag(self, tag): if self.is_found_content and tag == 'table': # Because we are printing out the previous torrent infos on # detecting a `td` tag, we need to try and print out the last # torrent's infos right before the end of the table self.print_torrent_infos_and_reinit_row() self.is_found_content = False elif self.td_counter > -1: if self.span_counter > -1 and tag == 'span': self.span_counter -= 1 elif self.a_counter > -1 and tag == 'a': self.a_counter -= 1 def build_url(self, url, query, category=None, page=1): page_url = f'{url}/torrents/find/' if category: page_url += f'{category}/' return f'{page_url}:{page}?title={query}' # DO NOT CHANGE the name and parameters of this function # This function will be the one called by nova2.py def search(self, what, cat='all'): """ Retreive and parse engine search results by category and query. Parameters: `what` is a string with the search tokens, already escaped (e.g. "Ubuntu+Linux") `cat` is the name of a search category in ('all', 'anime', 'books', 'games', 'movies', 'music', 'pictures', 'software', 'tv') """ categories = self.supported_categories[cat] page_infos = { 'hit_count': 0, } parser = self.zeTorrentsParser(page_infos, self.url) for category in categories: page_index = 1 while True: page_url = self.build_url(self.url, what, category, page_index) html = retrieve_url(page_url) # Trying to find the page arrow to know if # we should carry on iterating right_arrow_regex = r'><\/a>' is_last_page = len(re.findall(right_arrow_regex, html)) > 0 parser.feed(html) if ( not is_last_page or page_infos['hit_count'] < self.RESULTS_PER_PAGE ): break page_infos['hit_count'] = 0 page_index += 1 parser.close() def unit_fr2en(size): """Convert french size unit to english unit""" return re.sub( r'([KMGTP])o', lambda match: match.group(1) + 'B', size, flags=re.IGNORECASE )