# SPDX-License-Identifier: GPL-3.0
# DLX
#
# Bulk download tool
#
# COPYRIGHT NOTICE
# Copyright (C) 2025 0x4248 and contributors
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the license is not changed.
#
# This software is free and open source. Licensed under the GNU General
# Public License version 3.0 as published by the Free Software Foundation.

import urllib.parse

import requests
from bs4 import BeautifulSoup


def fetch_files_recursive(base_url, current_path="", visited=None):
    """Walk an HTML directory index, returning (full_url, relative_path) pairs."""
    if visited is None:
        visited = set()

    url = urllib.parse.urljoin(base_url, current_path)

    # Track visited directories without their trailing slash so each index
    # is fetched only once, even if linked both with and without one.
    normalized_path = url.rstrip("/")
    if normalized_path in visited:
        return []
    visited.add(normalized_path)

    print(f"Fetching index of {url}")
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    files = []
    for item in soup.find_all("a"):
        href = item.get("href")
        # Skip anchors, sort/query links, parent-directory links,
        # and non-HTTP schemes.
        if (
            href is None
            or href.startswith(("#", "?", "../", "mailto:", "javascript:", "ftp://"))
            or href.endswith("/.")
        ):
            continue

        full_href = urllib.parse.urljoin(url, href)
        parsed_href = urllib.parse.urlparse(full_href)
        # Path of this entry relative to the root of the index being walked.
        relative_path = parsed_href.path[
            len(urllib.parse.urlparse(base_url).path):
        ].lstrip("/")

        if href.endswith("/"):
            # Directory entry: recurse into the sub-index.
            files.extend(fetch_files_recursive(base_url, relative_path, visited))
        else:
            files.append((full_href, relative_path))
    return files


class Driver:
    @staticmethod
    def fetch(base_url, output):
        """Write the discovered files to *output* as tab-separated url/path lines."""
        result = fetch_files_recursive(base_url)
        with open(output, "w") as out:
            for full_url, relative_path in result:
                out.write(f"{full_url}\t{relative_path}\n")
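

# Example usage: a minimal sketch, assuming a reachable HTML directory index.
# The URL and output filename below are hypothetical placeholders, not part
# of the tool itself.
if __name__ == "__main__":
    # Walks the index recursively and writes one "url<TAB>relative_path"
    # line per discovered file into manifest.tsv.
    Driver.fetch("https://example.com/pub/", "manifest.tsv")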