"""Compile the fLoc events.tsv files of one subject into a single HDF5 file.

Step 1 to running a first-level GLM contrast in nilearn on the fLoc
CNeuromod dataset.
"""

import argparse
import glob
import json
import os

import h5py
import numpy as np
import pandas as pd
from tqdm import tqdm


# Duration (in s) of one stimulus block: 4 TRs of 1.49 s each.
BLOCK_DURATION_S = 5.96

# Time (in s) covered by the first three fMRI volumes (1.49 * 3 = 4.47 s),
# which are removed; onsets are shifted back by this amount.
DROPPED_VOLUMES_S = 4.47

# There are two runs per session. Each run uses slightly different stimulus
# categories (with some overlap for faces and words). Typically, run 1 uses
# the task's default stimuli ('def'), and run 2 uses the task's alternative
# stimuli ('alt').
#
# The def run uses the "default" stimuli:
#   bodies = body (0), characters = word (1), faces = adult (2),
#   objects = car (3), places = house (4), scrambled = scrambled
# The alt run uses the "alternate" stimuli:
#   bodies = limb (5), characters = word (1), faces = adult (2),
#   objects = instrument (6), places = corridor (7), scrambled = scrambled
#
# see https://github.com/courtois-neuromod/floc.stimuli/blob/4415763fc728918c856a174be27fe4ea69abdb6c/config.json
#
# Only the keys (category names) are used by this script, to select which
# event rows are kept; the integer ids are kept for reference.
CONDITION_IDS = {
    "def": {
        'bodies': 0,
        'characters': 1,
        'faces': 2,
        'objects': 3,
        'places': 4,
        'baseline': 8,
    },
    "alt": {
        'bodies': 5,
        'characters': 1,
        'faces': 2,
        'objects': 6,
        'places': 7,
        'baseline': 8,
    },
}


def get_arguments() -> argparse.Namespace:
    """Parse the command line.

    Returns:
        Namespace with the required string attributes ``data_dir``,
        ``out_dir`` and ``sub``.
    """
    parser = argparse.ArgumentParser(
        description="Compiles all events.tsv files into a single HDF5 file for the entire fLoc dataset"
    )
    parser.add_argument(
        '--data_dir',
        required=True,
        type=str,
        help='absolute path to dataset directory that contains events.tsv files',
    )
    parser.add_argument(
        '--out_dir',
        required=True,
        type=str,
        help='absolute path to output directory',
    )
    parser.add_argument(
        '--sub',
        required=True,
        type=str,
        help='two-digit subject number',
    )

    return parser.parse_args()


def build_nilearn_matrix(df: pd.DataFrame, c_dict: dict) -> tuple:
    """Model an events file for block design analysis.

    Each block is 4 TR, or 5.96 s, with 12 stimuli shown from the same
    category. Every kept block (and every baseline row) becomes one event.

    Args:
        df: events table with at least the columns ``category``,
            ``block_trial``, ``subcategory`` and ``onset``. It is not
            modified.
        c_dict: mapping whose keys are the category names to keep.

    Returns:
        ``(onset, duration, trial_type)``, three lists of equal length:
        onsets in seconds shifted back by the dropped volumes, a constant
        block duration, and the ``subcategory`` label of each event
        (``'baseline'`` for baseline rows).
    """
    # Work on an explicit copy: assigning into a filtered slice through
    # chained indexing (``df.col.loc[mask] = value``) is not guaranteed to
    # write back and is a silent no-op under pandas Copy-on-Write.
    kept = df[df['category'].isin(list(c_dict.keys()))].copy()

    is_baseline = kept['category'] == 'baseline'
    # Baseline rows are forced to pass the block-start filter below and are
    # labelled 'baseline'.
    kept.loc[is_baseline, 'block_trial'] = 0
    kept.loc[is_baseline, 'subcategory'] = 'baseline'

    # Keep one row per block.
    # NOTE(review): assumes block_trial counts stimuli within a block from 0.
    kept = kept[kept['block_trial'] == 0]

    trial_type = kept['subcategory'].to_list()
    duration = np.repeat(BLOCK_DURATION_S, len(trial_type)).tolist()
    # onset time (in s) after removing the first three fMRI volumes
    onset = (kept['onset'] - DROPPED_VOLUMES_S).to_list()

    return onset, duration, trial_type


def compile_floc_design(data_path, out_path, sub_num):
    """Export the block events of every fLoc run of a subject to one HDF5 file.

    For each events file found under ``data_path``, the block onsets,
    durations and condition labels returned by ``build_nilearn_matrix`` are
    stored as the datasets ``onset``, ``duration`` and ``trial_type`` of the
    group ``<session>/<run>`` (each identified by the last two characters of
    the ``ses-`` and ``run-`` filename entities).

    Note that, in the CNeuromod adaptation of the fLoc paradigm, block onset
    is aligned to the TR (1.49s).

    Args:
        data_path: dataset directory containing
            ``sub-<sub_num>/ses-*/func/*events.tsv`` files.
        out_path: output directory; the file is written to
            ``<out_path>/sub-<sub_num>/glm/``, which is created if missing.
        sub_num: two-digit subject number, as a string.

    Raises:
        KeyError: if a run's ``task_version`` is neither ``'def'`` nor
            ``'alt'``.
        ValueError: if an events filename does not split into exactly five
            underscore-separated entities, or if the same session/run pair
            is encountered twice (raised by h5py).
    """
    event_files = sorted(
        glob.glob(
            f"{data_path}/sub-{sub_num}/ses-*/func/sub-0*_ses-00*_task-fLoc_run-0*events.tsv"
        )
    )

    out_dir = f"{out_path}/sub-{sub_num}/glm"
    # h5py does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)
    out_file = f"{out_dir}/sub-{sub_num}_task-floc_model-GLM_design.h5"

    # The context manager closes the file even if a run fails to export.
    with h5py.File(out_file, 'w') as subj_h5file:
        for ev_path in tqdm(event_files, desc='exporting design matrices to HDF5 file'):
            # e.g. sub-01_ses-001_task-fLoc_run-01_events.tsv
            _, ses, _, run, _ = os.path.basename(ev_path).split('_')
            ses_num = ses[-2:]
            run_num = run[-2:]

            # TODO: exclude sub-02's session 02 from the dataset...

            run_group = subj_h5file.require_group(ses_num).create_group(run_num)

            design_df = pd.read_csv(ev_path, sep='\t')
            # Positional access: the task version is read from the first row.
            task_version = design_df['task_version'].iloc[0]

            onset, duration, trial_type = build_nilearn_matrix(
                design_df,
                CONDITION_IDS[task_version],
            )

            run_group.create_dataset('onset', data=onset)
            run_group.create_dataset('duration', data=duration)
            # Explicit variable-length UTF-8 dtype: a list of str would
            # otherwise be coerced to a NumPy 'U' array, a type h5py
            # documents as unsupported.
            run_group.create_dataset(
                'trial_type',
                data=trial_type,
                dtype=h5py.string_dtype(encoding='utf-8'),
            )


if __name__ == '__main__':
    # Step 1 to running a first-level GLM contrast in nilearn on the fLoc
    # CNeuromod dataset.
    #
    # The script exports each run as a group saved into a HDF5 file (one per
    # subject). Each run (.h5 group) has a list of onsets, of durations, and
    # of conditions (the subcategory, e.g., 'faces') for each block in the
    # run (blocks are treated as trials).
    #
    # These values will be loaded in nilearn to create design matrices to
    # model each block's HRF.
    args = get_arguments()

    compile_floc_design(args.data_dir, args.out_dir, args.sub)