#!/usr/bin/env python3 # Coded by Andrea Telatin (Andrea.Telatin@quadram.ac.uk) # Implemented in MetaPhage by Mattia Pandolfo (mattia.pandolfo@univr.it) software = "db_manager.py" version = "0.2.1" import json, argparse, os, time, re, sys import concurrent.futures # Try importing wget withWget = False try: import wget withWget = True except ImportError: withWget = False print(f"WARNING: {software} will use `wget` from the system.", file=sys.stderr) start = time.perf_counter() progresses = {"_printed": []} def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) # Initial #"url": "https://zenodo.org/record/5879332/files/2022-01-inphared.tar.gz?download=1", config = """ { "2021.1": { "phix": { "name": "PhiX reference", "url": "https://zenodo.org/record/4608203/files/phix.tar.gz?download=1", "md5": "36897778387b36cecd6e60aef9dab17b", "file": "phix.tar.gz", "provides": ["phix"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm phix.*" }, "kraken2": { "name": "MiniKraken", "url": "https://zenodo.org/record/4608203/files/kraken2.tar.gz?download=1", "md5": "4dca0ee1acccbff860ca725c37e22f85", "file": "kraken2.tar.gz", "provides": ["kraken2"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm kraken2.*" }, "vibrant": { "name": "Vibrant", "url": "https://zenodo.org/record/4608203/files/vibrant.tar.gz?download=1", "md5": "05bad36563db58889236571af871acaa", "file": "vibrant.tar.gz", "provides": ["vibrant"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm vibrant.*" }, "virsorter": { "name": "Virsorter", "url": "https://zenodo.org/record/4608203/files/virsorter_legacy.tar.gz?download=1", "md5": "2e4308be78a15df3d89ceb3e72c9ba81", "file": "virsorter_legacy.tar.gz", "provides": ["virsorter"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm virsorter_legacy.*" }, "phigaro": { "name": "Phigaro", "url": "https://zenodo.org/record/4608203/files/phigaro.tar.gz?download=1", "md5": "f6e671d885538542755d0c52b1aa6ddb", "file": "phigaro.tar.gz", "provides": ["phigaro"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm phigaro.*" }, "vcontact2": { "name": "vConTACT2", "url": "https://warwick.s3.climb.ac.uk/ifrqmra-metaphage/v1.0/2022-01-inphared.tar.gz", "md5": "72c9a0be3b93364e44338ced659341fe", "file": "2022-01-inphared.tar.gz", "provides": ["inphared"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm *inphared*.tar.gz" } }, "2022.1": { "phix": { "name": "PhiX reference", "url": "https://zenodo.org/record/4608203/files/phix.tar.gz?download=1", "md5": "36897778387b36cecd6e60aef9dab17b", "file": "phix.tar.gz", "provides": ["phix"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm phix.*" }, "kraken2": { "name": "MiniKraken", "url": "https://zenodo.org/record/4608203/files/kraken2.tar.gz?download=1", "md5": "4dca0ee1acccbff860ca725c37e22f85", "file": "kraken2.tar.gz", "provides": ["kraken2"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm kraken2.*" }, "phigaro": { "name": "Phigaro", "url": "https://zenodo.org/record/4608203/files/phigaro.tar.gz?download=1", "md5": "f6e671d885538542755d0c52b1aa6ddb", "file": "phigaro.tar.gz", "provides": ["phigaro"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm phigaro.*" }, "vibrant": { "name": "Vibrant", "url": "https://zenodo.org/record/4608203/files/vibrant.tar.gz?download=1", "md5": "05bad36563db58889236571af871acaa", "file": "vibrant.tar.gz", "provides": ["vibrant"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm vibrant.*" }, "virsorter2": { "name": "Virsorter2", "url": "https://warwick.s3.climb.ac.uk/ifrqmra-metaphage/v2.0/virsorter2.tar.gz", "md5": "ddc75b770b96a9fabee4f76458b57e0d", "file": "virsorter2.tar.gz", "provides": ["virsorter"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm virsorter2.tar.gz" }, "checkv": { "name": "CheckV database", "url": "https://warwick.s3.climb.ac.uk/ifrqmra-metaphage/v2.0/checkv.tar.gz", "md5": "85095275aee479de87b0dbc95b3ca48e", "file": "checkv.tar.gz", "provides": ["checkv"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm checkv.tar.gz" }, "vcontact2": { "name": "vConTACT2", "url": "https://warwick.s3.climb.ac.uk/ifrqmra-metaphage/v1.0/2022-01-inphared.tar.gz", "md5": "72c9a0be3b93364e44338ced659341fe", "file": "2022-01-inphared.tar.gz", "provides": ["inphared"], "expand": ["tar xfz '{file}' --directory '{outdir}'", "rm '{file}'"], "cleanup": "rm *inphared*.tar.gz" } } } """ try: data = json.loads(config) except Exception as e: eprint(f"Invalid JSON database: {e}") quit() def fill_template(template, source): matches = re.findall(r'\{(.*?)\}', template) for m in matches: if m in source: placeholder = '{' + m + '}' template = re.sub(rf"{placeholder}", source[m], template) else: print(f"Error: unable to put {source[m]} in {m} [processing {source[url]}") quit() return template def globalbar(cur, tot, step=100): progresses[tot] = cur #print(tot, "...") gt = 0 done = 0 for p in progresses: if p == '_printed': continue gt += int(p) done += progresses[p] ratio = int(done*100/gt) if ratio > 0 and len(progresses) == len(data) and ratio % 10 == 0 and not ratio in progresses['_printed']: progresses["_printed"].append(ratio) time.sleep(0.1) eprint(f"Downloading: {ratio}% done") def downloadPackage(url, dest): if withWget: try: wget.download(url, bar=globalbar) except Exception as e: eprint(f"Error downloading {url}:\n{e}") else: try: os.system(f"wget -q -O '{dest}' '{url}'") except Exception as e: eprint(f"Error downloading {url}:\n{e}") def getDatabase(package, attempts=5): starttime = time.perf_counter() package["downloaded"] = False shell_commands = [] for cmd_template in package["expand"]: cmd = fill_template(cmd_template, package) shell_commands.append(cmd) eprint(f" 📦 Preparing to download {package['name']}") for i in range(attempts + 1): try: downloadPackage(package["url"], package["file"]) package["downloaded"] = True break except Exception as e: if 'cleanup' in package: os.system(package["cleanup"]) eprint(f"{i}/{attempts} Download of {package['name']} failed:\nURI: {package['url']}\nError: {e}") if i == attempts: return f" 🛑 {package['name']} failed: {e}" eprint(f" ✅ {package['name']} downloaded") for cmd in shell_commands: os.system(cmd) finishtime = time.perf_counter() return finishtime-starttime if __name__ == "__main__": # Parameter parser parser = argparse.ArgumentParser( description = 'This script is designed to manage all the databases used ' 'in MetaPhage. This script is placed in MetaPhage/bin.', formatter_class = argparse.RawTextHelpFormatter) options = parser.add_argument_group("Options") options.add_argument('-o', '--outdir', help="Output directory", required=True) options.add_argument('-r', '--release', help="Database bundle release [default: %(default)s]", default="2021.1") options.add_argument('-m', '--maxthreads', help="Maximum number of concurrent downloads", default=3) parameters = parser.parse_args() # output directory if not os.path.exists(parameters.outdir): os.makedirs(parameters.outdir) # To download todo_list = [] if parameters.release not in data: print("Error: `{}` is not a valid release. Try:\n * {}".format(parameters.release, "\n * ".join(data.keys()))) exit(1) data = data[parameters.release] print(" 📂 Downloading bundle {}: {} databases total".format(parameters.release, len(data))) for package_name in data: pack = data[package_name] #print("▸ ",package_name, "\t", pack["url"]) download_this = 0 # Check requried files for file in pack["provides"]: if not os.path.exists(os.path.join(parameters.outdir, file)): download_this += 1 else: eprint(f"{file} found: skipping") if download_this > 0: pack["outdir"] = parameters.outdir todo_list.append(pack) # Download results = [] with concurrent.futures.ThreadPoolExecutor(max_workers=int(parameters.maxthreads)) as executor: for record in todo_list: results.append(executor.submit(getDatabase, record)) totalTime = 0 resultsCount = 0 for f in concurrent.futures.as_completed(results): #print(str(f.result())) totalTime += f.result() resultsCount+=1 finish = time.perf_counter() print(f"{resultsCount} packages downloaded in {round(finish-start, 2)} seconds (cumulative {round(totalTime,2)} seconds)") # Pass db paths to nextflow path_phix = parameters.outdir + "/phix" path_kraken2 = parameters.outdir + "/kraken2" path_vibrant = parameters.outdir + "/vibrant" path_virsorter2 = parameters.outdir + "/virsorter" path_checkv = parameters.outdir + "/checkv" path_allvsall = parameters.outdir + "/diamond" path_vcontact2 = parameters.outdir + "/inphared" file_db_path = open("db_path.csv", "w") file_db_path.writelines("%s,%s,%s,%s,%s,%s,%s" % (path_phix, path_kraken2, path_vibrant, path_virsorter2, path_checkv, path_allvsall, path_vcontact2)) file_db_path.close()