# -*- coding: utf-8 -*-

# Copyright 2013 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re
import json

from bs4.element import Comment

from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions

# py2 vs py3 transition
from ..six import text_type as unicode
from ..six.moves.urllib import parse as urlparse

from .base_adapter import BaseSiteAdapter, makeDate

LANG_LIST = ('www','german','spanish','french','dutch','italian','romanian','portuguese','other')
LANG_RE = r"(?P<lang>" + r"|".join(LANG_LIST) + r")"

class LiteroticaSiteAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        #logger.debug("LiteroticaComAdapter:__init__ - url='%s'" % url)

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','litero')

        # Used to try to normalize storyId to the first chapter, but
        # there are stories where the first chapter has '-ch-01' and
        # stories where the first chapter doesn't have '-ch-'.
        # Now just rely on extractChapterUrlsAndMetadata to reset
        # storyId to the first chapter link.

        ## DON'T normalize to www.literotica.com--keep for language,
        ## which will be set in _setURL(url).  Also, multi-chapter
        ## stories have been keeping the language when 'normalizing'
        ## to the first chapter.
        url = re.sub(r"^(https?://)"+LANG_RE+r"(\.i)?", r"https://\2", url)
        url = url.replace('/beta/','/') # to allow beta site URLs.

        ## strip ?page=...
        url = re.sub(r"\?page=.*$", "", url)

        ## set url
        self._setURL(url)

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%Y"
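
    ## Illustrative examples (not from the original source) of the URL
    ## cleanup done in __init__ above:
    ##   http://german.i.literotica.com/beta/s/some-title?page=3
    ##       -> https://german.literotica.com/s/some-title
    ##   https://www.literotica.com/series/se/12345
    ##       -> unchanged (already https; no '.i', '/beta/' or ?page=)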

    @staticmethod
    def getSiteDomain():
        return 'literotica.com'

    @classmethod
    def getAcceptDomains(cls):
        return [ x + '.' + cls.getSiteDomain() for x in LANG_LIST ] + \
               [ x + '.i.' + cls.getSiteDomain() for x in LANG_LIST ]

    @classmethod
    def getSiteExampleURLs(cls):
        return "https://www.literotica.com/s/story-title https://www.literotica.com/series/se/9999999 https://www.literotica.com/i/image-or-comic-title https://www.literotica.com/p/poem-title https://portuguese.literotica.com/s/story-title https://german.literotica.com/s/story-title"

    def getSiteURLPattern(self):
        # also https://www.literotica.com/series/se/80075773
        # /s/ for story, /i/ for image/comic, /p/ for poem
        return r"https?://"+LANG_RE+r"(\.i)?\.literotica\.com/((beta/)?[sip]/([a-zA-Z0-9_-]+)|series/se/(?P<storyseriesid>[a-zA-Z0-9_-]+))"

    def _setURL(self,url):
        # logger.debug("set URL:%s"%url)
        super(LiteroticaSiteAdapter, self)._setURL(url)
        m = re.match(self.getSiteURLPattern(),url)
        lang = m.group('lang')
        if lang not in ('www','other'):
            self.story.setMetadata('language',lang.capitalize())
        # reset storyId
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[-1])
        # logger.debug("language:%s"%self.story.getMetadata('language'))

    ## apply clean_chapter_titles
    def add_chapter(self,chapter_title,url,othermeta={}):
        if self.getConfig("clean_chapter_titles"):
            storytitle = self.story.getMetadataRaw('title').lower()
            chapter_name_type = None
            # strip trailing ch or pt before doing the chapter clean.
            # doesn't remove from story title metadata
            storytitle = re.sub(r'^(.*?)( (ch|pt))?$',r'\1',storytitle)
            if chapter_title.lower().startswith(storytitle):
                chapter = chapter_title[len(storytitle):].strip()
                # logger.debug('\tChapter: "%s"' % chapter)
                if chapter == '':
                    chapter_title = 'Chapter %d' % (self.num_chapters() + 1)
                    # Sometimes the first chapter does not have a chapter type
                    if self.num_chapters() == 0:
                        # logger.debug('\tChapter: first chapter without chapter type')
                        chapter_name_type = None
                else:
                    separator_char = chapter[0]
                    # logger.debug('\tseparator_char: "%s"' % separator_char)
                    chapter = chapter[1:].strip() if separator_char in [":", "-"] else chapter
                    # logger.debug('\tChapter: "%s"' % chapter)
                    if chapter.lower().startswith('ch.'):
                        chapter = chapter[len('ch.'):].strip()
                        try:
                            chapter_title = 'Chapter %d' % int(chapter)
                        except:
                            chapter_title = 'Chapter %s' % chapter
                        chapter_name_type = 'Chapter' if chapter_name_type is None else chapter_name_type
                        # logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
                    elif chapter.lower().startswith('pt.'):
                        chapter = chapter[len('pt.'):].strip()
                        try:
                            chapter_title = 'Part %d' % int(chapter)
                        except:
                            chapter_title = 'Part %s' % chapter
                        chapter_name_type = 'Part' if chapter_name_type is None else chapter_name_type
                        # logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
                    elif separator_char in [":", "-"]:
                        chapter_title = chapter
                        # logger.debug('\tChapter: taking chapter text as whole')
        super(LiteroticaSiteAdapter, self).add_chapter(chapter_title,url,othermeta)
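
    ## Illustrative examples (not from the original source) of what
    ## clean_chapter_titles does for a story titled "My Story":
    ##   "My Story Ch. 03"    -> "Chapter 3"
    ##   "My Story Pt. 2"     -> "Part 2"
    ##   "My Story: The End"  -> "The End"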
""" logger.debug("Chapter/Story URL: <%s> " % self.url) if not (self.is_adult or self.getConfig("is_adult")): raise exceptions.AdultCheckRequired(self.url) (data,rurl) = self.get_request_redirected(self.url) # logger.debug(data) ## for language domains self._setURL(rurl) logger.debug("set opened url:%s"%self.url) soup = self.make_soup(data) if "This submission is awaiting moderator's approval" in data: raise exceptions.StoryDoesNotExist("This submission is awaiting moderator's approval. %s"%self.url) ## 2025Feb - domains other than www now use different HTML. ## Need to look for two different versions of basically ## everything. ## not series URL, assumed to be a chapter. Look for Story ## Info block of post-beta page. I don't think it should happen? if '/series/se' not in self.url: #logger.debug(data) ## looking for /series/se URL to indicate this is a ## chapter. if not soup.select_one('div.page__aside') and not soup.select_one('div.sidebar') and not soup.select_one('div[class^="_sidebar_"]'): raise exceptions.FailedToDownload("Missing Story Info block, Beta turned off?") storyseriestag = soup.select_one('a.bn_av') if not storyseriestag: storyseriestag = soup.select_one('a[class^="_files__link_"]') # logger.debug("Story Series Tag:%s"%storyseriestag) if storyseriestag: self._setURL(storyseriestag['href']) data = self.get_request(storyseriestag['href']) # logger.debug(data) soup = self.make_soup(data) # logger.debug(soup) else: logger.debug("One-shot") isSingleStory = '/series/se' not in self.url if not isSingleStory: # Normilize the url? state = re.findall(r"prefix\=\"/series/\",state='(.+?)'", data) json_state = json.loads(state[0].replace("\\'","'").replace("\\\\","\\")) url_series_id = unicode(re.match(self.getSiteURLPattern(),self.url).group('storyseriesid')) json_series_id = unicode(json_state['series']['data']['id']) if json_series_id != url_series_id: res = re.sub(url_series_id, json_series_id, unicode(self.url)) logger.debug("Normalized url: %s"%res) self._setURL(res) ## common between one-shots and multi-chapters # title self.story.setMetadata('title', stripHTML(soup.select_one('h1'))) # logger.debug(self.story.getMetadata('title')) # author ## XXX This is still the author URL like: ## https://www.literotica.com/stories/memberpage.php?uid=999999&page=submissions ## because that's what's on the page. It redirects to the /authors/ page. ## Only way I know right now to get the /authors/ is to make ## the req and look at the redirect. ## Should change to /authors/ if/when it starts appearing. ## Assuming it's in the same place. authora = soup.find("a", class_="y_eU") if not authora: authora = soup.select_one('a[class^="_author__title"]') authorurl = authora['href'] if authorurl.startswith('//'): authorurl = self.parsedUrl.scheme+':'+authorurl # logger.debug(authora) # logger.debug(authorurl) self.story.setMetadata('author', stripHTML(authora)) self.story.setMetadata('authorUrl', authorurl) if '?' 

        ## common between one-shots and multi-chapter stories

        # title
        self.story.setMetadata('title', stripHTML(soup.select_one('h1')))
        # logger.debug(self.story.getMetadata('title'))

        # author
        ## XXX This is still the author URL like:
        ## https://www.literotica.com/stories/memberpage.php?uid=999999&page=submissions
        ## because that's what's on the page.  It redirects to the /authors/ page.
        ## The only way I know right now to get the /authors/ URL is to
        ## make the request and look at the redirect.
        ## Should change to /authors/ if/when it starts appearing.
        ## Assuming it's in the same place.
        authora = soup.find("a", class_="y_eU")
        if not authora:
            authora = soup.select_one('a[class^="_author__title"]')
        authorurl = authora['href']
        if authorurl.startswith('//'):
            authorurl = self.parsedUrl.scheme+':'+authorurl
        # logger.debug(authora)
        # logger.debug(authorurl)
        self.story.setMetadata('author', stripHTML(authora))
        self.story.setMetadata('authorUrl', authorurl)
        if '?' in authorurl:
            self.story.setMetadata('authorId', urlparse.parse_qs(authorurl.split('?')[1])['uid'][0])
        elif '/authors/' in authorurl:
            self.story.setMetadata('authorId', authorurl.split('/')[-1])
        else:
            # if all else fails
            self.story.setMetadata('authorId', stripHTML(authora))

        if soup.select('div#tabpanel-tags'):
            # logger.debug("tags1")
            self.story.extendList('eroticatags', [ stripHTML(t).title() for t in soup.select('div#tabpanel-tags a.av_as') ])
        if soup.select('div[class^="_widget__tags_"]'):
            # logger.debug("tags2")
            self.story.extendList('eroticatags', [ stripHTML(t).title() for t in soup.select('div[class^="_widget__tags_"] a[class^="_tag_item_"]') ])
        # logger.debug(self.story.getList('eroticatags'))

        ## Look first for 'Series Introduction', then the Info panel
        ## short desc.  A series can have either, so put in common code.
        desc = []
        introtag = soup.select_one('div.bp_rh')
        descdiv = soup.select_one('div#tabpanel-info div.bn_B') or \
                  soup.select_one('div[class^="_tab__pane_"] div[class^="_widget__info_"]')
        if introtag and stripHTML(introtag):
            # make sure there's something in the tag.
            # logger.debug("intro %s"%introtag)
            desc.append(unicode(introtag))
        elif descdiv and stripHTML(descdiv):
            # make sure there's something in the tag.
            # logger.debug("desc %s"%descdiv)
            desc.append(unicode(descdiv))
        if not desc or self.getConfig("include_chapter_descriptions_in_summary"):
            ## Only for backward compatibility with 'stories' that
            ## don't have an intro or short desc.
            descriptions = []
            for i, chapterdesctag in enumerate(soup.select('p.br_rk')):
                # remove category link, but only temporarily
                a = chapterdesctag.a.extract()
                descriptions.append("%d. %s" % (i + 1, stripHTML(chapterdesctag)))
                # now put it back--it's used below
                chapterdesctag.append(a)
            desc.append(unicode("<p>"+"</p>\n<p>".join(descriptions)+"</p>"))

        self.setDescription(self.url,u''.join(desc))

        if isSingleStory:
            ## one-shots don't *display* date info, but they have it
            ## hidden in <script> state data.
            self.add_chapter(self.story.getMetadata('title'), self.url)

        try:
            state_start = "state='"
            state_end = "'</script>"
            i = data.index(state_start)
            if i:
                state = data[i+len(state_start):data.index(state_end,i)].replace("\\'","'").replace("\\\\","\\")
                if state:
                    # logger.debug(state)
                    json_state = json.loads(state)
                    # logger.debug(json.dumps(json_state, sort_keys=True,indent=2, separators=(',', ':')))
                    all_rates = []
                    if 'series' in json_state:
                        all_rates = [ float(x['rate_all']) for x in json_state['series']['works'] ]
                        ## Extract dates from chapter approval dates if
                        ## dates_from_chapters is enabled
                        if self.getConfig("dates_from_chapters"):
                            date_approvals = []
                            for work in json_state['series']['works']:
                                if 'date_approve' in work:
                                    try:
                                        date_approvals.append(makeDate(work['date_approve'], self.dateformat))
                                    except:
                                        pass
                            if date_approvals:
                                # Oldest date is published, newest is updated
                                date_approvals.sort()
                                self.story.setMetadata('datePublished', date_approvals[0])
                                self.story.setMetadata('dateUpdated', date_approvals[-1])
                    if all_rates:
                        self.story.setMetadata('averrating', '%4.2f' % (sum(all_rates) / float(len(all_rates))))
                    ## alternate chapters from JSON
                    if self.num_chapters() < 1:
                        logger.debug("Getting Chapters from series JSON")
                        seriesid = json_state.get('series',{}).get('data',{}).get('id',None)
                        if seriesid:
                            logger.info("Fetching chapter data from JSON")
                            logger.debug(seriesid)
                            series_json = json.loads(self.get_request('https://literotica.com/api/3/series/%s/works'%seriesid))
                            # logger.debug(json.dumps(series_json, sort_keys=True,indent=2, separators=(',', ':')))
                            for chap in series_json:
                                self.add_chapter(chap['title'], 'https://www.literotica.com/s/'+chap['url'])
                                ## Collect tags from the series/story
                                ## page if tags_from_chapters is enabled
                                if self.getConfig("tags_from_chapters"):
                                    self.story.extendList('eroticatags', [ unicode(t['tag']).title() for t in chap['tags'] ])
        except Exception as e:
            logger.warning("Processing JSON failed. (%s)"%e)

        ## Features removed because not supportable by the new site form:
        ## averrating metadata entry
        ## order_chapters_by_date option
        ## use_meta_keywords option
        return

    def getPageText(self, raw_page, url):
        logger.debug('Getting page text')
        raw_page = raw_page.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>')
        # logger.debug("\tChapter text: %s" % raw_page)
        page_soup = self.make_soup(raw_page)
        # strip HTML comments
        [comment.extract() for comment in page_soup.find_all(string=lambda text:isinstance(text, Comment))]
        fullhtml = ""
        for aa_ht_div in page_soup.find_all('div', 'aa_ht') + page_soup.select('div[class^="_article__content_"]'):
            if aa_ht_div.div:
                html = unicode(aa_ht_div.div)
                # Strip some starting and ending tags,
                html = re.sub(r'^<div>', r'', html)
                html = re.sub(r'</div>$', r'', html)
                html = re.sub(r'<p></p>$', r'', html)
                fullhtml = fullhtml + html
        # logger.debug('getPageText - fullhtml: %s' % fullhtml)
        return fullhtml
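
    ## Chapters can span multiple pages (?page=N).  getChapterText
    ## below fetches page 1, reads the highest page number from the
    ## pagination block, then appends each remaining page's text via
    ## getPageText().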

    def getChapterText(self, url):
        logger.debug('Getting chapter text from: %s' % url)
        raw_page = self.get_request(url)
        page_soup = self.make_soup(raw_page)
        pages = page_soup.find('div',class_='l_bH')
        if not pages:
            pages = page_soup.select_one('div._pagination_h0sum_1')
        if not pages:
            pages = page_soup.select_one('div.clearfix.panel._pagination_1400x_1')
        if not pages:
            pages = page_soup.select_one('div[class^="panel clearfix _pagination_"]')
        # logger.debug(pages)

        fullhtml = ""
        chapter_description = ''
        if self.getConfig("description_in_chapter"):
            chapter_description = page_soup.find("meta", {"name" : "description"})['content']
            # logger.debug("\tChapter description: %s" % chapter_description)
            chapter_description = '<p><b>Description:</b> %s</p><hr />' % chapter_description

        fullhtml += self.getPageText(raw_page, url)

        if pages:
            ## look for the highest numbered page; they're not all
            ## listed when there are many.
            last_page_links = pages.find_all('a', class_='l_bJ')
            if not last_page_links:
                last_page_links = pages.select('a[class^="_pagination__item_"]')
            last_page_link = last_page_links[-1]
            last_page_no = int(urlparse.parse_qs(last_page_link['href'].split('?')[1])['page'][0])
            # logger.debug(last_page_no)
            for page_no in range(2, last_page_no+1):
                page_url = url + "?page=%s" % page_no
                # logger.debug("page_url= %s" % page_url)
                raw_page = self.get_request(page_url)
                fullhtml += self.getPageText(raw_page, url)

        #logger.debug(fullhtml)
        fullhtml = self.utf8FromSoup(url, self.make_soup(fullhtml))
        fullhtml = chapter_description + fullhtml
        fullhtml = unicode(fullhtml)
        return fullhtml
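
    ## get_urls_from_page handles an author's works/favorites/list
    ## pages.  The served HTML only carries the first page of results;
    ## the remainder is fetched from the site's JSON API, with the
    ## paging state dug out of the inlined JavaScript by the regexes
    ## below.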

    def get_urls_from_page(self,url,normalize):
        from ..geturls import get_urls_from_html

        ## hook for logins, etc.
        self.before_get_urls_from_page(url,normalize)

        # this way it uses User-Agent or other special settings.
        data = self.get_request(url,usecache=False)
        soup = self.make_soup(data)
        page_urls = get_urls_from_html(soup, url, configuration=self.configuration, normalize=normalize)

        if not self.getConfig("fetch_stories_from_api",True):
            logger.debug('fetch_stories_from_api Not enabled')
            return {'urllist': page_urls}

        user_story_list = re.search(r'literotica\.com/authors/.+?/lists\?listid=(?P<list_id>\d+)', url)
        fav_authors = re.search(r'literotica\.com/authors/.+?/favorites', url)
        written = re.search(r'literotica\.com/authors/.+?/works/', url)
        logger.debug((bool(user_story_list), bool(fav_authors), bool(written)))

        # If the url is not supported
        if not user_story_list and not fav_authors and not written:
            logger.debug('No supported link. %s', url)
            return {'urllist':page_urls}

        # Grab the main list where the chapters are contained.
        if user_story_list:
            js_story_list = re.search(r';\$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]\);\$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]=\{success:!\d,current_page:(?P<current_page>\d+?),last_page:(?P<last_page>\d+?),total:\d+?,per_page:\d+,(has_series:!\d)?data:\$R\[\d+?\]=\[\$R\[\d+?\]=(?P<data>.+)\}\]\}\);', data)
            # }] } } });  \$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]\);\$R\[\d+?]\(\$R\[\d+?\],\$R\[\d+?\]=\{sliders:
            logger.debug('user_story_list ID [%s]'%user_story_list.group('list_id'))
        else:
            js_story_list = re.search(r'\$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]={current_page:(?P<current_page>\d+?),last_page:(?P<last_page>\d+?),total:\d+?,per_page:\d+,(has_series:!\d,)?data:\$R\[\d+\]=\[\$R\[\d+\]=\{(?!aim)(?P<data>.+)\}\);_\$HY\.r\[', data)

        # In case the regex becomes outdated
        if not js_story_list:
            logger.debug('Failed to grab data from the js.')
            return {'urllist':page_urls}

        user = None
        script_tags = soup.find_all('script')
        for script in script_tags:
            if not script.string:
                continue
            # Get the author name from the js.
            user = re.search(r'_\$HY\.r\[\"AuthorQuery\[\\\"(?P<author>.+?)\\\"\]\"\]', script.string)
            if user is not None:
                logger.debug("User: [%s]"%user.group('author'))
                break
        else:
            logger.debug('Failed to get a username')
            return {'urllist': page_urls}

        # Extract the current (should be 1) and last page numbers from the js.
        logger.debug("Pages %s/%s"%(js_story_list.group('current_page'), js_story_list.group('last_page')))

        urls = []
        # Necessary to format a proper link, as there is no visible
        # data specifying what kind of link it should be.
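        # Illustrative example (not from the original source): a work
        # in category 'erotic-poetry' with url 'my-poem' becomes
        # https://www.literotica.com/p/my-poem; categories not listed
        # below default to 's' (story).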
        cat_to_link = {'adult-comics': 'i',
                       'erotic-art': 'i',
                       'illustrated-poetry': 'p',
                       'erotic-audio-poetry': 'p',
                       'erotic-poetry': 'p',
                       'non-erotic-poetry': 'p'}

        stories_found = re.findall(r"category_info:\$R\[.*?type:\".+?\",pageUrl:\"(.+?)\"}.+?,type:\"(.+?)\",url:\"(.+?)\",", js_story_list.group('data'))

        for story in stories_found:
            story_category, story_type, story_url = story
            urls.append('https://www.literotica.com/%s/%s'%(cat_to_link.get(story_category, 's'), story_url))

        # Remove duplicates while preserving order.
        seen = set()
        urls = [x for x in (page_urls + urls) if not (x in seen or seen.add(x))]
        logger.debug("Found [%s] stories so far."%len(urls))

        # Sometimes the rest of the stories are buried in the js, so
        # no further fetching is necessary.
        if js_story_list.group('last_page') == js_story_list.group('current_page'):
            return {'urllist': urls}

        user = urlparse.quote(user.group('author'))
        logger.debug("Escaped user: [%s]"%user)

        if written:
            category = re.search(r"_\$HY\.r\[\"AuthorSeriesAndWorksQuery\[\\\".+?\\\",\\\"\D+?\\\",\\\"(?P<type>\D+?)\\\"\]\"\]=\$R\[\d+?\]=\$R\[\d+?\]\(\$R\[\d+?\]=\{", data)
        elif fav_authors:
            category = re.search(r"_\$HY\.r\[\"AuthorFavoriteWorksQuery\[\\\".+?\\\",\\\"(?P<type>\D+?)\\\",\d\]\"\]=\$R\[\d+?\]=\$R\[\d+?\]\(\$R\[\d+?\]={", data)
        else:
            category = None

        if not user_story_list and not category:
            logger.debug("Type of works not found")
            return {'urllist': urls}

        last_page = int(js_story_list.group('last_page'))
        current_page = int(js_story_list.group('current_page')) + 1

        # Fetch the remaining urls from the api.  The page count from
        # the website can't be trusted; sometimes even the api returns
        # an outdated number of pages.
        while current_page <= last_page:
            i = len(urls)
            logger.debug("Pages %s/%s"%(current_page, int(last_page)))
            if fav_authors:
                jsn = self.get_request('https://literotica.com/api/3/users/{}/favorite/works?params=%7B%22page%22%3A{}%2C%22pageSize%22%3A50%2C%22type%22%3A%22{}%22%2C%22withSeriesDetails%22%3Atrue%7D'.format(user, current_page, category.group('type')))
            elif user_story_list:
                jsn = self.get_request('https://literotica.com/api/3/users/{}/list/{}?params=%7B%22page%22%3A{}%2C%22pageSize%22%3A50%2C%22withSeriesDetails%22%3Atrue%7D'.format(user, user_story_list.group('list_id'), current_page))
            else:
                jsn = self.get_request('https://literotica.com/api/3/users/{}/series_and_works?params=%7B%22page%22%3A{}%2C%22pageSize%22%3A50%2C%22sort%22%3A%22date%22%2C%22type%22%3A%22{}%22%2C%22listType%22%3A%22expanded%22%7D'.format(user, current_page, category.group('type')))
            urls_data = json.loads(jsn)
            last_page = urls_data["last_page"]
            current_page = int(urls_data["current_page"]) + 1
            for story in urls_data['data']:
                #logger.debug('parts' in story)
                if story['url'] and story.get('work_count') is None:
                    urls.append('https://www.literotica.com/%s/%s'%(cat_to_link.get(story["category_info"]["pageUrl"], 's'), str(story['url'])))
                    continue
                # Most of the time a series has no url specified and
                # contains all of the story links belonging to the series.
                urls.append('https://www.literotica.com/series/se/%s'%str(story['id']))
                for series_story in story['parts']:
                    urls.append('https://www.literotica.com/%s/%s'%(cat_to_link.get(series_story["category_info"]["pageUrl"], 's'), str(series_story['url'])))
            logger.debug("Found [%s] stories."%(len(urls) - i))

        # Again, remove duplicates while preserving order.
        seen = set()
        urls = [x for x in urls if not (x in seen or seen.add(x))]
        logger.debug("Found total of [%s] stories"%len(urls))
        return {'urllist':urls}

def getClass():
    return LiteroticaSiteAdapter