#VERSION: 1.1

import re
from enum import Enum
from html.parser import HTMLParser

from helpers import (
    # download_file,
    retrieve_url,
)
from novaprinter import prettyPrinter

ENGINE_BASEURL = 'http://dmhy.org'
MAGNET_PATTERN = r'magnet:\?xt=urn:btih:[a-zA-Z0-9]*'

# 1-based positions of the <td> cells inside a result row.
TITLE = 3
MAGLINK = 4
SIZE = 5
SEEDER = 6
LEECH = 7


class dmhy(object):
    """
    Search engine plugin for dmhy.org.

    `url`, `name`, `supported_categories` should be static variables of the
    engine_name class, otherwise qbt won't install the plugin.

    `url`: The URL of the search engine.
    `name`: The name of the search engine, spaces and special characters are
            allowed here.
    `supported_categories`: What categories are supported by the search engine
            and their corresponding id, possible categories are ('all',
            'anime', 'books', 'games', 'movies', 'music', 'pictures',
            'software', 'tv').
    """

    url = ENGINE_BASEURL
    name = 'dmhy'
    supported_categories = {
        'all': '0',
    }

    class DmhyParser(HTMLParser):
        """
        Streaming parser for a search result page.

        It walks the table whose id is "topic_list" and, for every row of its
        <tbody>, collects the fields of one result into `result_dict`. When
        the row ends, the result is handed to `prettyPrinter`.
        """

        def __init__(self, outer_class):
            """
            `outer_class` is the engine instance that created the parser; it
            is stored but not otherwise used by the parser.
            """
            super().__init__()
            self.outer_class = outer_class
            self.in_table = False
            self.in_tbody = False
            self.in_row = False
            self.in_cell_num = 0
            self.in_cell = False
            self.result_dict = {}

        def handle_starttag(self, tag, attrs):
            """Track the position inside the result table and pick up links."""
            # Find the table that contains the search results.
            # The table has an id of "topic_list".
            if tag == 'table':
                if ('id', 'topic_list') in attrs:
                    self.in_table = True
                return

            if tag == 'tbody' and self.in_table:
                self.in_tbody = True
                return

            # Every row of the body starts a fresh result.
            if tag == 'tr' and self.in_tbody:
                self.in_row = True
                self.in_cell_num = 0
                self.result_dict = {
                    "link": "-1",
                    "name": "",
                    "size": "-1",
                    "seeds": "-1",
                    "leech": "-1",
                    "engine_url": ENGINE_BASEURL,
                    "desc_link": "-1",
                }
                return

            # Count the cells of the row so the handlers know which column
            # they are looking at.
            if tag == 'td' and self.in_row:
                self.in_cell_num += 1
                self.in_cell = True
                return

            if tag != 'a':
                return

            # An anchor in the third cell carries the (site-relative) url of
            # the description page. If the cell holds several anchors, the
            # last one wins.
            if self.in_cell_num == TITLE:
                for attr_name, attr_value in attrs:
                    # A value-less `href` is reported as None; skip it rather
                    # than fail on the concatenation.
                    if attr_name == 'href' and attr_value:
                        self.result_dict["desc_link"] = ENGINE_BASEURL + attr_value
                return

            # Only the first (of two) anchors in the fourth cell contains the
            # magnet link, so a regular expression check makes sure that the
            # href really is a magnet link.
            if self.in_cell_num == MAGLINK:
                for attr_name, attr_value in attrs:
                    if (
                        attr_name == 'href'
                        and attr_value
                        and re.match(MAGNET_PATTERN, attr_value)
                    ):
                        self.result_dict["link"] = attr_value

        def handle_data(self, data):
            """Store the text of the title, size, seeder and leech cells."""
            if not self.in_cell:
                return

            column = self.in_cell_num

            # The third cell contains the name of the torrent, but it may be
            # split into multiple parts. Concatenate them.
            if column == TITLE:
                self.result_dict["name"] += re.sub(r"[\t\n]", "", data)
                return

            # The remaining cells hold a single short value. Whitespace-only
            # text nodes must not overwrite a value that was already read.
            text = data.strip()
            if not text:
                return

            # The fifth cell contains the size of the torrent.
            if column == SIZE:
                self.result_dict["size"] = text
                return

            # The sixth cell contains the number of seeders.
            # This data is not always available, so check for a dash.
            if column == SEEDER and text != '-':
                self.result_dict["seeds"] = text
                return

            # The seventh cell contains the number of leechers.
            # Same as the seeders, check for a dash.
            if column == LEECH and text != '-':
                self.result_dict["leech"] = text
                return

        def handle_endtag(self, tag):
            """Leave cells/rows/tables and emit the result when a row ends."""
            # Reset the cell flag when the cell ends.
            if tag == 'td' and self.in_cell:
                self.in_cell = False
                return

            # Reset the row and cell state when the row ends.
            if tag == 'tr' and self.in_row:
                self.in_row = False
                self.in_cell_num = 0

                # It may happen that a magnet link is not directly available
                # from the table. In that case, the description page is
                # retrieved to get the magnet link.
                if self.result_dict["link"] == "-1":
                    self.result_dict["link"] = self._magnet_from_description(
                        self.result_dict["desc_link"]
                    )

                # A row without any usable magnet link is skipped instead of
                # aborting the parsing of the remaining rows.
                if self.result_dict["link"] != "-1":
                    prettyPrinter(self.result_dict)
                return

            # Reset the tbody flag when the tbody ends.
            if tag == 'tbody' and self.in_tbody:
                self.in_tbody = False
                return

            # Reset the table flag when the table ends.
            if tag == 'table' and self.in_table:
                self.in_table = False
                return

        @staticmethod
        def _magnet_from_description(desc_link):
            """
            Return the first magnet link found on the description page, or
            "-1" when there is no description page or it has no magnet link.
            """
            if desc_link == "-1":
                return "-1"
            page = retrieve_url(desc_link) or ""
            found = re.search(MAGNET_PATTERN, page)
            return found.group(0) if found else "-1"

    def __init__(self):
        """
        Some initialization
        """
        self.result_dicts = []

    # def download_torrent(self, info):
    #     """
    #     Providing this function is optional.
    #     It can however be interesting to provide your own torrent download
    #     implementation in case the search engine in question does not allow
    #     traditional downloads (for example, cookie-based download).
    #     """
    #     print(download_file(info))

    # DO NOT CHANGE the name and parameters of this function
    # This function will be the one called by nova2.py
    def search(self, what, cat='all'):
        """
        Fetch every result page for `what` and print each result line through
        `prettyPrinter` (done by `DmhyParser` while it parses a page).

        `what` is a string with the search tokens, already escaped (e.g. "Ubuntu+Linux")
        `cat` is the name of a search category in ('all', 'anime', 'books', 'games',
        'movies', 'music', 'pictures', 'software', 'tv'); it is not used because
        only 'all' is supported.
        """
        # Link to the following page; its text is the site's "next page" label.
        # The pattern only depends on `what`, so build it once.
        next_page = re.compile(
            fr'<a\s+href="/topics/list/page/(\d+)\?keyword={re.escape(what)}">下一頁</a>'
        )

        search_url = f"{ENGINE_BASEURL}/topics/list?keyword={what}"
        while True:
            result_page = retrieve_url(search_url)

            parser = self.DmhyParser(outer_class=self)
            parser.feed(result_page)
            # Flush whatever the parser still holds in its buffer.
            parser.close()

            match = next_page.search(result_page)
            if not match:
                break
            search_url = f"{ENGINE_BASEURL}/topics/list/page/{match.group(1)}?keyword={what}"