# VERSION: 1.0 # AUTHORS: BurningMop (burning.mop@yandex.com) # LICENSING INFORMATION # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import re from html.parser import HTMLParser from helpers import download_file, retrieve_url from novaprinter import prettyPrinter class pornrips(object): url = "https://pornrips.to" name = "PornRips.To" supported_categories = {"all": "all"} next_page_regex = r"next page-numbers" pagination_regex = r"posts-pagination-wrapper wrapper-with-padding" nothing_found_regex = r"Nothing Found" class MyHtmlParser(HTMLParser): def error(self, message): pass SEC, ART, HEA, H2, A, DIV, P, STRONG = ("section", "article", "header", "h2", "a", "div", "p", "strong") def __init__(self, url, referer): HTMLParser.__init__(self) self.headers = { 'Referer': referer } self.url = url self.row = {} self.rows = [] self.insideResults = False self.insideResult = False self.insideWrapper = False self.insideHeader = False self.insideTitle = False self.insideMetadata = False self.shouldParseName = False self.shouldSkipTorrent = False self.wrapperExcerptContentClass = "wrapper-excerpt-content" self.torrent_regex = r'https:\/\/.+?\.torrent' self.size_regex = r'\d+ [MB|MiB]' def handle_starttag(self, tag, attrs): params = dict(attrs) cssClasses = params.get("class", "") elementId = params.get("id", "") if tag == self.SEC and elementId == "primary": self.insideResults = True return if tag == self.ART and self.insideResults: self.insideResult = True return if tag == self.DIV and self.wrapperExcerptContentClass in cssClasses and self.insideResult: self.insideWrapper = True return if tag == self.HEA and self.insideWrapper: self.insideHeader = True return if tag == self.H2 and self.insideHeader: self.insideTitle = True return if tag == self.A and self.insideTitle: self.shouldParseName = True href = params.get('href') self.row['desc_link'] = href torrent_page = retrieve_url(href, self.headers) matches = re.finditer(self.torrent_regex, torrent_page, re.MULTILINE) torrent_urls = [x.group() for x in matches] if(len(torrent_urls) > 0): self.row['link'] = torrent_urls[0] else: self.shouldSkipTorrent = True return if tag == self.P and self.insideWrapper: self.insideMetadata = True return def handle_data(self, data): if self.shouldParseName: self.row['name'] = data self.shouldParseName = False return if self.insideMetadata: size_matches = re.finditer(self.size_regex, data, re.MULTILINE) size = [x.group() for x in size_matches] if len(size) > 0: self.row['size'] = size[0] return def handle_endtag(self, tag): if tag == self.A and self.shouldParseName: self.shouldParseName = False return if tag == self.H2 and self.insideTitle: self.insideTitle = False return if tag == self.HEA and self.insideHeader: self.insideHeader = False return if tag == self.P and self.insideMetadata: self.insideMetadata = False return if tag == self.ART and self.insideResult: self.row['seeds'] = -1 self.row['leech'] = -1 self.row['engine_url'] = self.url self.insideResult = True if not self.shouldSkipTorrent: prettyPrinter(self.row) self.insideResult = False self.insideWrapper = False self.insideHeader = False self.insideTitle = False self.insideMetadata = False self.shouldParseName = False self.shouldSkipTorrent = False return if tag == self.SEC and self.insideResults: self.insideResults = False return def download_torrent(self, info): return # print(download_file(info)) def get_page_url(self, page, what): return f"{self.url}/page/{page}/?s={what}" def search(self, what, cat="all"): what = what.replace("%20", "+") page = 1 retrieved_html = retrieve_url(self.get_page_url(page, what)) has_results = (len([x.group() for x in re.finditer( self.nothing_found_regex, retrieved_html, re.MULTILINE )]) == 0) if has_results: parser = self.MyHtmlParser(self.url, self.get_page_url(page, what)) parser.feed(retrieved_html) parser.close() has_next_page = len([x.group() for x in re.finditer( self.next_page_regex, retrieved_html, re.MULTILINE )]) > 0 while has_next_page: page += 1 retrieved_html = retrieve_url(self.get_page_url(page, what)) parser.feed(retrieved_html) parser = self.MyHtmlParser(self.url, self.get_page_url(page, what)) parser.close() has_next_page = len([x.group() for x in re.finditer( self.next_page_regex, retrieved_html, re.MULTILINE )]) > 0