# download_rockland_raw_bids.py
#
# Authors: Daniel Clark, John Pellman 2015/2016

'''
This script downloads data from the NKI Rockland Sample Lite releases stored
in the cloud in BIDS format. You can specify sex, age range, handedness,
session, scan type (anatomical, functional, dwi, fmap), and series to limit
your download to a subset of the sample. If no options are specified, all
available files are downloaded. Use the '-h' flag to get more information
about command-line usage.
'''

# Import packages
import os

# Constants
SESSIONS = ['NFB3', 'DS2', 'NFB2', 'NFBR2', 'CLG2', 'CLGR', 'CLG4', 'CLG2R',
            'CLG3', 'NFBR2A', 'CLG4R', 'NFB2R', 'DSA', 'CLGA', 'NFBA',
            'CLG2A', 'CLG5', 'CLG', 'NFBAR']
SCANS = ['anat', 'func', 'dwi', 'fmap']
# Mapping of colloquial series names to BIDS names.
SERIES_MAP = {
    'CHECKERBOARD1400': 'task-CHECKERBOARD_acq-1400',
    'CHECKERBOARD645': 'task-CHECKERBOARD_acq-645',
    'RESTCAP': 'task-rest_acq-CAP',
    'REST1400': 'task-rest_acq-1400',
    'BREATHHOLD1400': 'task-BREATHHOLD_acq-1400',
    'REST645': 'task-rest_acq-645',
    'RESTPCASL': 'task-rest_pcasl',
    'DMNTRACKINGTEST': 'task-DMNTRACKINGTEST',
    'DMNTRACKINGTRAIN': 'task-DMNTRACKINGTRAIN',
    'MASK': 'mask',
    'MSIT': 'task-MSIT',
    'PEER1': 'task-PEER1',
    'PEER2': 'task-PEER2',
    'MORALDILEMMA': 'task-MORALDILEMMA'
}
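
# For orientation: SERIES_MAP values appear as substrings of the BIDS file
# keys stored on S3. A functional scan key looks roughly like the example
# below (the participant ID here is made up for illustration):
#
#   data/Projects/RocklandSample/RawDataBIDS/sub-A00000001/ses-NFB3/func/
#       sub-A00000001_ses-NFB3_task-rest_acq-1400_bold.nii.gz
#
# The filters in collect_and_download() below match on the 'sub-', 'ses-',
# scan-type, and series substrings of such keys.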

# Main collect and download function
def collect_and_download(out_dir, less_than=0, greater_than=0, sex='',
                         handedness='', sessions=SESSIONS, scans=SCANS,
                         series=SERIES_MAP.keys(), derivatives=False,
                         dryrun=False):
    '''
    Function to collect and download images from the Rockland sample
    directory on FCP-INDI's S3 bucket.

    Parameters
    ----------
    out_dir : string
        filepath to a local directory to save files to
    less_than : float
        upper age (years) threshold for participants of interest
    greater_than : float
        lower age (years) threshold for participants of interest
    sex : string
        'M' or 'F' to indicate whether to download male or female data
    handedness : string
        'R' or 'L' to indicate whether to download right-handed or
        left-handed participants
    sessions : list
        the session names (e.g., 'CLG5', 'NFB3')
    scans : list
        the scan types to download; can be 'anat', 'func', 'dwi' or 'fmap'
    series : list
        the series to download (for functional scans)
    derivatives : boolean
        whether or not to download data derivatives for functional scans
    dryrun : boolean
        whether or not to perform a dry run (i.e., no actual downloads,
        just listing the files that would be downloaded)

    Returns
    -------
    boolean
        Returns True if the download was successful, False otherwise.
    '''

    # Import packages
    import pandas
    import boto3
    # For anonymous access to the bucket.
    from botocore import UNSIGNED
    from botocore.client import Config
    from botocore.handlers import disable_signing

    # Init variables
    s3_bucket_name = 'fcp-indi'
    s3_prefix = 'data/Projects/RocklandSample/RawDataBIDS'

    # Fetch bucket
    s3 = boto3.resource('s3')
    s3.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)
    s3_bucket = s3.Bucket(s3_bucket_name)

    # Remove series that aren't in the series map keys.
    series = [s for s in series if s in SERIES_MAP]

    # If the output path doesn't exist, create it
    if not os.path.exists(out_dir) and not dryrun:
        print('Could not find %s, creating now...' % out_dir)
        os.makedirs(out_dir)

    # Load the participants.tsv file from S3
    s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    participants_obj = s3_client.get_object(
        Bucket=s3_bucket_name, Key='/'.join([s3_prefix, 'participants.tsv']))
    participants_df = pandas.read_csv(participants_obj['Body'],
                                      delimiter='\t', na_values=['n/a'])

    # Gather the object keys under the BIDS prefix.
    print('Collecting images of interest...')
    s3_keys = s3_bucket.objects.filter(Prefix=s3_prefix)
    s3_keylist = [key.key for key in s3_keys]

    # Remove participants whose age, handedness, or sex do not conform to
    # the criteria.
    if less_than:
        participants_df = participants_df[participants_df['age'] < less_than]
    if greater_than:
        participants_df = participants_df[participants_df['age'] > greater_than]
    if sex == 'M':
        participants_df = participants_df[participants_df['sex'] == 'MALE']
    elif sex == 'F':
        participants_df = participants_df[participants_df['sex'] == 'FEMALE']
    if handedness == 'R':
        participants_df = participants_df[participants_df['handedness'] == 'RIGHT']
    elif handedness == 'L':
        participants_df = participants_df[participants_df['handedness'] == 'LEFT']
    if len(participants_df) == 0:
        print('No participants meet the criteria given. '
              'No download will be initiated.')
        return False

    # Generate a list of participants to filter on.
    participants_filt = ['sub-' + label + '/' for label in
                         participants_df['participant_id'].tolist()]
    # Generate a list of sessions to filter on.
    sessions_filt = ['ses-' + session + '/' for session in sessions]
    # Generate a list of series to filter on.
    series_filt = [SERIES_MAP[s] for s in series]

    # Fetch top-level JSONs first.
    json_keylist = [key for key in s3_keylist
                    if 'json' in key and 'sub' not in key
                    and any(s in key for s in series_filt)]

    # Apply the filters. Each pass keeps a key if it matches any of the
    # allowed values; the series pass only constrains functional scans.
    s3_keylist = [key for key in s3_keylist
                  if any(p in key for p in participants_filt)]
    s3_keylist = [key for key in s3_keylist
                  if any(s in key for s in sessions_filt)]
    s3_keylist = [key for key in s3_keylist
                  if any(s in key for s in scans)]
    s3_keylist = [key for key in s3_keylist
                  if 'func' not in key or any(s in key for s in series_filt)]
    # Unless derivatives were requested, exclude them from the download list
    # (assumption: derivative files carry a 'derivatives' path component;
    # adjust this to the bucket's actual layout if it differs).
    if not derivatives:
        s3_keylist = [key for key in s3_keylist if 'derivatives' not in key]
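
    # Worked example (hypothetical key): with scans=['anat'], the key
    #   .../RawDataBIDS/sub-A00000001/ses-NFB3/anat/
    #       sub-A00000001_ses-NFB3_T1w.nii.gz
    # survives the scan pass because it contains 'anat', and survives the
    # series pass because it has no 'func' component; a functional key must
    # additionally contain one of the BIDS series names in series_filt.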

    # Add back top-level files
    s3_keylist.extend(json_keylist)
    s3_keylist.append('/'.join([s3_prefix, 'CHANGES']))
    s3_keylist.append('/'.join([s3_prefix, 'README']))
    s3_keylist.append('/'.join([s3_prefix, 'dataset_description.json']))

    # And download the items
    total_num_files = len(s3_keylist)
    files_downloaded = len(s3_keylist)
    for path_idx, s3_path in enumerate(s3_keylist):
        rel_path = s3_path.replace(s3_prefix, '')
        rel_path = rel_path.lstrip('/')
        download_file = os.path.join(out_dir, rel_path)
        download_dir = os.path.dirname(download_file)
        if not os.path.exists(download_dir) and not dryrun:
            os.makedirs(download_dir)
        try:
            if not os.path.exists(download_file):
                if dryrun:
                    print('Would download to: %s' % download_file)
                else:
                    print('Downloading to: %s' % download_file)
                    with open(download_file, 'wb') as f:
                        s3_client.download_fileobj(s3_bucket_name, s3_path, f)
                    print('%.3f%% complete' %
                          (100 * (float(path_idx + 1) / total_num_files)))
            else:
                print('File %s already exists, skipping...' % download_file)
                files_downloaded -= 1
        except Exception as exc:
            print('There was a problem downloading %s.\n'
                  'Check input arguments and try again.' % s3_path)
            print(exc)

    # Print all done
    if dryrun:
        print('%d files would be downloaded for %d participant(s).'
              % (files_downloaded, len(participants_df)))
    else:
        print('%d files downloaded for %d participant(s).'
              % (files_downloaded, len(participants_df)))

    if not dryrun:
        print('Saving out revised participants.tsv and session tsv files.')
        # Save out a revised participants.tsv to the output directory; if a
        # participants.tsv already exists, merge it into the new one.
        if os.path.isfile(os.path.join(out_dir, 'participants.tsv')):
            old_participants_df = pandas.read_csv(
                os.path.join(out_dir, 'participants.tsv'),
                delimiter='\t', na_values=['n/a', 'N/A'])
            participants_df = pandas.concat(
                [participants_df, old_participants_df], ignore_index=True)
            participants_df.drop_duplicates(inplace=True)
            os.remove(os.path.join(out_dir, 'participants.tsv'))
        participants_df.to_csv(os.path.join(out_dir, 'participants.tsv'),
                               sep='\t', na_rep='n/a', index=False)

        # Separate list for sessions TSVs.
        session_keylist = [key.key for key in s3_keys
                           if 'sessions.tsv' in key.key]
        session_keylist = [key for key in session_keylist
                           if any(p in key for p in participants_filt)]

        # Save out revised session TSVs to the output directory; if one
        # already exists, merge it with the new one.
        for session_key in session_keylist:
            participant = session_key.split('/')[-2]
            sessions_obj = s3_client.get_object(Bucket=s3_bucket_name,
                                                Key=session_key)
            sessions_df = pandas.read_csv(sessions_obj['Body'],
                                          delimiter='\t', na_values=['n/a'])
            # Drop all sessions not specified. The session filters carry a
            # trailing slash (they were built for S3 key matching), so strip
            # it before comparing against the session_id column.
            sessions_df = sessions_df[sessions_df['session_id'].isin(
                [s.rstrip('/') for s in sessions_filt])]
            if os.path.isfile(os.path.join(out_dir, participant,
                                           participant + '_sessions.tsv')):
                old_sessions_df = pandas.read_csv(
                    os.path.join(out_dir, participant,
                                 participant + '_sessions.tsv'),
                    delimiter='\t', na_values=['n/a', 'N/A'])
                sessions_df = pandas.concat([sessions_df, old_sessions_df],
                                            ignore_index=True)
                sessions_df.drop_duplicates(inplace=True)
                os.remove(os.path.join(out_dir, participant,
                                       participant + '_sessions.tsv'))
            sessions_df.to_csv(os.path.join(out_dir, participant,
                                            participant + '_sessions.tsv'),
                               sep='\t', na_rep='n/a', index=False)

    print('Done!')
    return True
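
# A minimal sketch of programmatic use (the output path is hypothetical):
#
#   from download_rockland_raw_bids import collect_and_download
#   collect_and_download('/tmp/rockland', greater_than=17, less_than=31,
#                        sex='F', sessions=['NFB3'], scans=['func'],
#                        series=['REST1400'], dryrun=True)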

# Make module executable
if __name__ == '__main__':

    # Import packages
    import argparse
    import sys

    # Init the argument parser
    parser = argparse.ArgumentParser(description=__doc__)

    # Required arguments
    parser.add_argument('-o', '--out_dir', required=True, type=str,
                        help='Path to local folder to download files to')

    # Optional arguments
    parser.add_argument('-lt', '--less_than', required=False, type=float,
                        help='Upper age threshold (in years) of '
                             'participants to download (e.g. for '
                             'subjects 30 or younger, \'-lt 31\')')
    parser.add_argument('-gt', '--greater_than', required=False, type=float,
                        help='Lower age threshold (in years) of '
                             'participants to download (e.g. for '
                             'subjects 31 or older, \'-gt 30\')')
    parser.add_argument('-x', '--sex', required=False, type=str,
                        help='Participant sex of interest to download only '
                             '(e.g. \'M\' or \'F\')')
    parser.add_argument('-m', '--handedness', required=False, type=str,
                        help='Participant handedness to download only '
                             '(e.g. \'R\' or \'L\')')
    parser.add_argument('-v', '--sessions', required=False, nargs='*',
                        type=str,
                        help='A space-separated list of session (visit) '
                             'codes to download (e.g. \'NFB3\' \'CLG2\')')
    parser.add_argument('-t', '--scans', required=False, nargs='*', type=str,
                        help='A space-separated list of scan types '
                             'to download (e.g. \'anat\' \'dwi\')')
    parser.add_argument('-e', '--series', required=False, nargs='*', type=str,
                        help='A space-separated list of series codes '
                             'to download (e.g. \'DMNTRACKINGTRAIN\' '
                             '\'DMNTRACKINGTEST\')')
    parser.add_argument('-d', '--derivatives', required=False,
                        action='store_true',
                        help='Download derivatives (despiked physio, masks) '
                             'in addition to raw data?')
    parser.add_argument('-n', '--dryrun', required=False, action='store_true',
                        help='Perform a dry run to see how many files would '
                             'be downloaded.')

    # Parse and gather arguments
    args = parser.parse_args()

    # Init variables
    out_dir = os.path.abspath(args.out_dir)
    kwargs = {}
    if args.less_than:
        kwargs['less_than'] = args.less_than
        print('Using upper age threshold of %d...' % kwargs['less_than'])
    else:
        print('No upper age threshold specified')
    if args.greater_than:
        kwargs['greater_than'] = args.greater_than
        print('Using lower age threshold of %d...' % kwargs['greater_than'])
    else:
        print('No lower age threshold specified')
    if args.sex:
        kwargs['sex'] = args.sex.upper()
        if kwargs['sex'] == 'M':
            print('Downloading only male participants...')
        elif kwargs['sex'] == 'F':
            print('Downloading only female participants...')
        else:
            print('Input for sex \'%s\' was not \'M\' or \'F\'.'
                  % kwargs['sex'])
            print('Please check script syntax and try again.')
            sys.exit(1)
    else:
        print('No sex specified, using all sexes...')
    if args.handedness:
        kwargs['handedness'] = args.handedness.upper()
        if kwargs['handedness'] == 'R':
            print('Downloading only right-handed participants...')
        elif kwargs['handedness'] == 'L':
            print('Downloading only left-handed participants...')
        else:
            print('Input for handedness \'%s\' was not \'L\' or \'R\'.'
                  % kwargs['handedness'])
            print('Please check script syntax and try again.')
            sys.exit(1)
    if args.sessions:
        kwargs['sessions'] = args.sessions
        for session in kwargs['sessions']:
            if session not in SESSIONS:
                print('Session \'%s\' is not a valid session name.' % session)
                print('Please check script syntax and try again.')
                sys.exit(1)
        print('Sessions to download: ' + ' '.join(kwargs['sessions']))
    if args.scans:
        kwargs['scans'] = args.scans
        for scan in kwargs['scans']:
            if scan not in SCANS:
                print('Scan \'%s\' is not a valid scan name.' % scan)
                print('Please check script syntax and try again.')
                sys.exit(1)
        print('Scans to download: ' + ' '.join(kwargs['scans']))
    if args.series:
        kwargs['series'] = args.series
        for series in kwargs['series']:
            if series not in SERIES_MAP:
                print('Series \'%s\' is not a valid series name.' % series)
                print('Please check script syntax and try again.')
                sys.exit(1)
        print('Series to download: ' + ' '.join(kwargs['series']))
    if args.derivatives:
        kwargs['derivatives'] = args.derivatives
        print('Data derivatives will be downloaded.')
    if args.dryrun:
        kwargs['dryrun'] = args.dryrun
        print('Running download as a dry run.')

    # Call the collect and download routine
    collect_and_download(out_dir, **kwargs)
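
# Example command-line invocations (output paths are hypothetical):
#
#   # Dry run over the whole sample:
#   python download_rockland_raw_bids.py -o /tmp/rockland -n
#
#   # Resting-state functional scans for female participants aged 18-30
#   # from the NFB3 session:
#   python download_rockland_raw_bids.py -o /tmp/rockland -gt 17 -lt 31 \
#       -x F -v NFB3 -t func -e REST1400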