#VERSION: 0.3 #AUTHORS: Henrik Asp (solenskiner@gmail.com) # Copyright (c) 2019, Henrik Asp # All rights reserved. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
from novaprinter import prettyPrinter
from helpers import download_file, retrieve_url
from html.parser import HTMLParser
from functools import partial

# Base address of the site; every absolute URL built below starts with it.
_BASE_URL = "http://academictorrents.com"


class MyHTMLParser(HTMLParser):
    """Collect torrent rows and the "Next" page link from a result page.

    While parsing, three parallel stacks describe the chain of currently
    open elements (index 0 is a synthetic ``'root'`` entry):

    * ``tag_stack``      -- the tag names,
    * ``attrs_stack``    -- the attributes of each tag, as a dict,
    * ``numchild_stack`` -- how many start tags have been seen directly
      inside each of those elements so far.

    Start and end tags are dispatched to ``handle_start_tag_<tag>`` /
    ``handle_end_tag_<tag>`` methods when such a method exists.

    NOTE: elements are only removed from the stacks by an end tag, so void
    elements (``<br>``, ``<img>``, ...) stay on the stack until an enclosing
    end tag unwinds past them.  The child-count patterns in ``handle_data``
    rely on this exact bookkeeping; do not "fix" it without re-deriving them.
    """

    # Template for one result row; -1 marks a field that was never filled in.
    defaults = {
        "link": -1,
        "name": -1,
        "size": -1,
        "seeds": 0,
        "leech": 0,
        "engine_url": _BASE_URL,
        "desc_link": -1,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.current = self.defaults.copy()
        self.tag_stack = ['root']
        self.attrs_stack = [{}]
        self.numchild_stack = [0]
        self.data = {
            'torrents': [],
            'next_page': None,
        }

    def _find_handler(self, prefix, tag, fallback):
        """Return the ``<prefix>_<tag>`` method, or *fallback* bound to *tag*.

        A tag literally named ``default`` would otherwise resolve to the
        fallback methods themselves, which take an extra ``tag`` argument and
        would be called with the wrong arity; it is routed to the fallback.
        """
        handler = None
        if tag != "default":
            handler = getattr(self, "_".join((prefix, tag)), None)
        if handler is None:
            handler = partial(fallback, tag)
        return handler

    def handle_starttag(self, tag, attrs):
        """ Parser's start tag handler """
        dispatcher = self._find_handler(
            "handle_start_tag", tag, self.handle_start_tag_default
        )
        self.pre_handle_start_tag(tag, attrs)
        dispatcher(attrs)
        self.post_handle_start_tag(tag, attrs)

    def handle_start_tag_default(self, tag, attrs):
        """Fallback for start tags without a dedicated handler; does nothing."""
        pass

    def pre_handle_start_tag(self, tag, attrs):
        """Count the new element as a child of the innermost open element."""
        self.numchild_stack[-1] += 1

    def post_handle_start_tag(self, tag, attrs):
        """Push the new element onto the three stacks."""
        self.tag_stack.append(tag)
        self.attrs_stack.append(dict(attrs))
        self.numchild_stack.append(0)

    def handle_endtag(self, tag):
        """ Parser's end tag handler """
        dispatcher = self._find_handler(
            "handle_end_tag", tag, self.handle_end_tag_default
        )
        # The stacks are unwound *before* the tag-specific handler runs, so
        # that handler sees the state of the parent element.
        self.pre_handle_end_tag(tag)
        dispatcher()
        self.post_handle_end_tag(tag)

    def handle_end_tag_default(self, tag):
        """Fallback for end tags without a dedicated handler; does nothing."""
        pass

    def pre_handle_end_tag(self, tag):
        """Pop the stacks up to and including the element being closed.

        Sometimes website authors are not too careful to write valid markup,
        hence we keep popping until the matching start tag is found, and
        explicitly stop at the synthetic root entry if there is none.
        """
        while self.tag_stack[-1] != "root":
            popped = self.tag_stack.pop()
            self.attrs_stack.pop()
            self.numchild_stack.pop()
            if popped == tag:
                break

    def post_handle_end_tag(self, tag):
        """Hook called after an end tag has been handled; does nothing."""
        pass

    def handle_start_tag_tr(self, attrs):
        """Start collecting a fresh result whenever a ``<tr>`` opens."""
        self.current = self.defaults.copy()

    def handle_end_tag_tr(self):
        """Store the collected row when a ``<tr>`` closes inside a result row.

        This runs after the closed ``<tr>`` has been popped, so the path check
        only succeeds when another ``tr`` is still open at depth 4.
        """
        match_torrent = ['root', 'html', 'body', 'table', 'tr']
        if self.tag_stack[:len(match_torrent)] == match_torrent:
            self.data["torrents"].append(self.current.copy())

    def handle_data(self, data):
        """Pick result fields and the next-page link out of text nodes.

        Which field a text node belongs to is decided from the open-element
        path plus the child counters from depth 5 downwards.
        """
        match_next = ['root', 'html', 'body', 'center', 'p', 'ul', 'li', 'a']
        match_torrent = ['root', 'html', 'body', 'table', 'tr']
        url = _BASE_URL

        if self.tag_stack[:len(match_next)] == match_next and "Next" in data:
            # Index 7 is the ``a`` element of match_next.
            href = self.attrs_stack[7].get("href")
            if href:  # an anchor without a usable href is not a next page
                self.data["next_page"] = url + "/" + href
        elif self.tag_stack[:len(match_torrent)] == match_torrent:
            shape = self.numchild_stack[5:]
            if shape == [2, 1, 1, 0]:
                # The element at depth 7 carries the details link; its third
                # "/"-separated component is used as the torrent identifier.
                href = self.attrs_stack[7].get("href")
                parts = href.split("/") if href else []
                if len(parts) > 2:
                    download_link = url + "/download/{}.torrent"
                    self.current["name"] = data
                    self.current["desc_link"] = url + href
                    self.current["link"] = download_link.format(parts[2])
            elif shape == [5, 0]:
                self.current["size"] = data
            elif shape == [6, 1, 1, 1, 0]:
                self.current["seeds"] = data.strip("+")
            elif shape == [6, 2, 1, 1, 0]:
                self.current["leech"] = data.strip("+")

    def get_results(self):
        """Return ``{'torrents': [row, ...], 'next_page': url or None}``."""
        return self.data


class academictorrents(object):
    """ Search engine class """
    url = _BASE_URL
    name = "Academic Torrents"
    supported_categories = {'all': ''}

    def download_torrent(self, info):
        """ Downloader """
        print(download_file(info))

    def search(self, what, cat='all'):
        """ Performs search

        Fetches the result page for *what*, prints every torrent found on it
        and follows the "Next" link until there is none.  Only the 'all'
        category is supported; any other value returns without searching.
        """
        if cat != 'all':
            return

        search_url = _BASE_URL + "/browse.php?search={what}"
        url = search_url.format(what=what)
        # Pages already fetched; guards against a "Next" link that points
        # back to an earlier page, which would otherwise loop forever.
        visited = set()
        while url and url not in visited:
            visited.add(url)
            response = retrieve_url(url)
            parser = MyHTMLParser()
            parser.feed(response)
            results = parser.get_results()
            for match in results["torrents"]:
                # Rows that never received a link still hold the placeholder
                # and describe nothing downloadable, so they are not reported.
                if match["link"] == MyHTMLParser.defaults["link"]:
                    continue
                prettyPrinter(match)
            url = results["next_page"]