import os, sys, glob
from io import TextIOWrapper

import numpy as np
from numpy import nan as NaN
import pandas as pd
from pathlib import Path

import argparse
import datetime
import tqdm


def get_log_times(log_list):
    '''
    Extract the logged computer time and image name of every image
    presentation from a session's log files, grouped by task run.

    Input:
        log_list: list of paths to the session's log files. The third
            '_'-separated chunk of each file name (extension removed) is used
            as the key that identifies the log.
    Output:
        dict of the form {log key: {run counter: [(time, image), ...]}}.
        The run counter starts at 0 and is incremented each time a
        'ThingsMemory: task starting at' line is met within a log file.
        Times and image names are returned as strings, exactly as logged.
    Raises:
        KeyError if an 'image: ' line precedes the first task-start line
        of a log file (there is no run to attach the image to).
    '''
    log_dict = {}
    for log_file in log_list:
        log_num = os.path.basename(log_file).split('_')[2].split('.')[0]
        runs = {}
        log_dict[log_num] = runs

        count = -1
        with open(log_file) as f:
            # stream the file rather than loading every line in memory
            for line in f:
                if 'ThingsMemory: task starting at' in line:
                    count += 1
                    runs[count] = []
                elif 'image: ' in line:
                    fields = line.split('\t')
                    img_time = fields[0].split(' ')[0]
                    # strip only the line terminator: slicing off the last
                    # character would truncate the image name when the final
                    # line of the log has no trailing newline
                    img_name = fields[-1].split(':')[-1].rstrip('\n')
                    runs[count].append((img_time, img_name))

    return log_dict


def get_sess_times(
    time_path: str,
    sub_num: str,
) -> dict:
    """
    Create a dictionary of session dates from subject's text file with saved
    scan dates.
    Note: subjects' scan dates are identifiers.
    For confidentiality, they cannot be shared in a public repository.

    Input:
        time_path: directory that contains 'sub-{sub_num}_scandates.txt'
        sub_num: two-digit subject number
    Output:
        dict {session label: datetime.datetime(year, month, day)}.
        When a session is listed on several lines, the first line wins.

    Each non-empty line is split on single spaces and read from the end:
    last chunk = path (its second-to-last '/' component is the session),
    then year, an unused chunk, day and month abbreviation. Single-digit
    days are padded with an extra space, which yields an empty chunk between
    month and day; the month is then one chunk further left.
    """
    months = {
        'Jan': 1,
        'Feb': 2,
        'Mar': 3,
        'Apr': 4,
        'May': 5,
        'Jun': 6,
        'Jul': 7,
        'Aug': 8,
        'Sep': 9,
        'Oct': 10,
        'Nov': 11,
        'Dec': 12,
    }

    session_times = {}
    scantime_path = f'{time_path}/sub-{sub_num}_scandates.txt'
    with open(scantime_path) as f:
        for line in f:
            # tolerate blank lines (e.g. at the end of the file)
            if not line.strip():
                continue

            chunks = line.split(' ')
            y = chunks[-2]
            d = chunks[-4]
            # compare by value: identity checks against a string literal are
            # implementation-dependent and raise a SyntaxWarning
            m = chunks[-5] if chunks[-5] != '' else chunks[-6]
            _, sess = chunks[-1].split('/')[-3:-1]

            if sess not in session_times:
                session_times[sess] = datetime.datetime(int(y), months[m], int(d))

    # Manual updates for fluke session for which there are no events files.
    # (I just copied events files from another subject as approximation)
    if sub_num == '03':
        session_times['ses-013b'] = datetime.datetime(2021, 7, 28)

    return session_times


def get_ses_list(
    in_path: str,
    sub_num: str,
    session_times: dict,
) -> list:
    """
    Return the sorted list of the subject's unique session labels.

    Sessions are discovered from the '*events.tsv' files found under
    '{in_path}/sub-{sub_num}/ses*/'; the session label is the second
    '_'-separated chunk of each file name.

    Input:
        in_path: path to bids directory that contains *events.tsv files
        sub_num: two-digit subject number
        session_times: dict of known sessions (keys are session labels)
    Output:
        sorted list of unique session labels (empty if no file is found)
    Raises:
        AssertionError if a session found on disk is missing from
        session_times.
    """
    list_all_runs = sorted(
        glob.glob(os.path.join(in_path, f'sub-{sub_num}', 'ses*', '*events.tsv'))
    )

    ses_list = []
    for run_path in list_all_runs:
        # basename (rather than splitting on '/') matches how file names are
        # parsed elsewhere in this script and works with any path separator
        ses_num = os.path.basename(run_path).split('_')[1]

        # Sanity check: sessions found match those in session_times dict.
        # Raised explicitly so the check also runs under `python -O`.
        if ses_num not in session_times:
            raise AssertionError(
                f'no scan date found for session {ses_num} ({run_path})'
            )
        ses_list.append(ses_num)

    return sorted(np.unique(ses_list))


def get_timing(
    in_path: str,
    et_path: str,
    ses_list: list,
    sub_num: str,
    session_times: dict,
    out_path: str,
) -> tuple:
    """
    Concatenate all the subject's *events.tsv files (even the excluded
    & repeated sessions) in chronological order into a single large DataFrame.

    Processing session by session, access within-session trial times
    from .log files (concatenated).
    Add three identifier columns: subject, session, flag to exclude bad sessions
    Add two timing columns: session date, and within-session trial timing (from log files)

    Input:
        in_path: bids directory; '{in_path}/sub-{sub_num}/{session}/' is
            searched for '*events.tsv' and '*log' files
        et_path: directory with raw eye-tracking data, used as a fallback
            source of timestamps when a run has no usable log entries
        ses_list: session labels, processed in the given order
        sub_num: two-digit subject number
        session_times: dict {session label: session date}
        out_path: output directory (the run error report is created there)
    Output:
        (df_trials, run_error_report, cols_to_keep) where
        - df_trials is the concatenated DataFrame (None if no events file
          was found),
        - run_error_report is the report file, returned OPEN: the caller is
          responsible for closing it,
        - cols_to_keep lists the columns of the first events file read.

    Sessions whose label contains 'b' are flagged in 'not_for_memory' and get
    NaN as within-session trial time. If an exception is raised while
    processing, the report file is closed before the exception propagates.
    """
    # per-run DataFrames, concatenated once at the end (concatenating inside
    # the loop re-copies all previous rows for every run)
    run_frames = []
    cols_to_keep = []
    run_error_report = open(f'{out_path}/sub-{sub_num}_desc-run_errorReport.txt', 'w+')

    try:
        # process files per session
        for ses_num in tqdm.tqdm(ses_list, desc='concatenating event files per session'):
            ses_time = session_times[ses_num]
            flag_to_exclude = 'b' in ses_num
            run_list = sorted(
                glob.glob(f"{in_path}/sub-{sub_num}/{ses_num}/*events.tsv")
            )

            if not flag_to_exclude:
                # Extract times from session's log files, returns dictionary
                log_list = sorted(
                    glob.glob(f"{in_path}/sub-{sub_num}/{ses_num}/*log")
                )
                log_dict = get_log_times(log_list)
                # index of the current run among the runs that share a file id
                count = 0
                seen_fids = []

            for run_event in run_list:
                run_df = pd.read_csv(run_event, sep='\t')
                if not run_frames:
                    # columns of the very first events file
                    cols_to_keep = list(run_df.columns)

                # sanity check: is this the correct file?
                ids = os.path.basename(run_event).split('_')
                sub = ids[0]
                sess = ids[1]
                run_num = ids[-2].split('-')[-1]
                assert f'sub-{sub_num}' == sub
                assert sess == ses_num
                assert int(run_num) == run_df['run_id'][0]

                # insert additional columns
                # subject, session, flag to exclude, time (date), time (computer time for session's trial)
                run_df.insert(loc=0, column='subject_id', value=sub_num, allow_duplicates=True)
                run_df.insert(loc=1, column='session_id', value=ses_num, allow_duplicates=True)
                run_df.insert(loc=2, column='not_for_memory', value=flag_to_exclude, allow_duplicates=True)
                run_df.insert(loc=3, column='date_time', value=ses_time, allow_duplicates=True)

                if flag_to_exclude:
                    # within-session timing not important for discarded sessions, use placeholder
                    run_df.insert(loc=18, column='session_trial_time', value=NaN, allow_duplicates=True)
                else:
                    file_id = ids[2]
                    if file_id in seen_fids:
                        count += 1
                    else:
                        seen_fids.append(file_id)
                        count = 0

                    try:
                        # Extract image timestamps from psychopy log files
                        img_info = np.array(log_dict[file_id][count])
                        img_time = img_info[:, 0]
                        img_name = img_info[:, 1]

                        # The logged images must match the events file exactly.
                        # Checked explicitly (not with assert) because the
                        # fallback below depends on it, even under `python -O`.
                        if not np.array_equal(img_name, run_df['image_path'].to_numpy()):
                            raise ValueError('logged images do not match the events file')
                        run_df.insert(loc=18, column='session_trial_time', value=img_time, allow_duplicates=True)
                    except Exception:
                        # If no (usable) task log entries, derive image timestamps
                        # from raw eyetracking timestamps.
                        # `except Exception` keeps the best-effort fallback but lets
                        # KeyboardInterrupt / SystemExit through.
                        print(sub_num, ses_num, run_num, file_id, count)
                        # NOTE(review): only the last character of the run label is
                        # used below — presumably to drop zero-padding; confirm for
                        # run numbers above 9
                        run_error_report.write(f'empty log file for sub-{sub_num}, {ses_num}, run {run_num[-1]}\n')
                        et_file = f'{et_path}/sub-{sub_num}/{ses_num}/sub-{sub_num}_{ses_num}_{file_id}.pupil/task-thingsmemory_run-{run_num[-1]}/000/eye0_timestamps.npy'
                        if os.path.exists(et_file):
                            # first eye-tracking timestamp + trial onsets
                            r_time = np.load(et_file)[0]
                            img_time = (run_df['onset'].to_numpy() + r_time).tolist()
                            run_df.insert(loc=18, column='session_trial_time', value=img_time, allow_duplicates=True)
                        else:
                            print('no eyetracking timestamps')
                            run_df.insert(loc=18, column='session_trial_time', value=NaN, allow_duplicates=True)

                    ## Note: sub-01 ses-14 has an extra 18 trials shown ("run-0");
                    # I manually removed the other trials (19+) listed in the events file that were not shown
                    # verified: the number of trials will match those in the log file (18 trials listed)
                run_frames.append(run_df)

        # DataFrame of raw datapoints per trial concatenated across sessions and runs
        df_trials = pd.concat(run_frames, ignore_index=True) if run_frames else None
    except BaseException:
        # do not leak the report handle when processing fails
        run_error_report.close()
        raise

    return df_trials, run_error_report, cols_to_keep


def _apply_correction(df_trials, error_report, i, column, value, description, trial_ref):
    """
    Overwrite one mislabelled cell of row i, document the change in the
    error report and flag the trial as atypical.

    trial_ref is the (subject number, session, run, trial number) tuple used
    to identify the trial in the report.
    """
    sub_num, ses, run, trial_num = trial_ref
    error_report.write(
        f'{description} for sub-{sub_num}, {ses}, run {int(run)}, trial {int(trial_num)}\n'
    )
    df_trials.at[i, column] = value
    df_trials.at[i, 'atypical'] = True
    df_trials.at[i, 'atypical_log'] += f'_{column}:{value}'


def fix_entries(
    df_trials: pd.DataFrame,
    out_path: str,
    sub_num: str,
) -> pd.DataFrame:
    """
    Validate the DataFrame's following columns,
    whose value depends on previous / subsequent trials
    - condition: seen/unseen
    - subcondition: seen-within-between, etc
    - repetition : 1-3 (normally)
    - error: True/False (determined based on response_txt and condition)

    Flag trials that require updating in a text file.
    Add three columns:
    - time since previous rep (in days and in seconds),
    - number of stimuli shown since previous rep
    At the end, delete session date column (identifier/confidential)

    Input:
        df_trials: trials concatenated in chronological order, indexed
            0..n-1; modified in place
        out_path: output directory (trial error report and concatenated
            trials file are written there)
        sub_num: two-digit subject number
    Output:
        the corrected DataFrame, without the 'date_time' column

    Cells are written with `.at[row, column]`: chained indexing
    (`df[column][row] = value`) is not guaranteed to modify the frame and
    has no effect when pandas Copy-on-Write is enabled.
    """
    df_trials.insert(loc=2, column='atypical', value=False, allow_duplicates=True)
    df_trials.insert(loc=3, column='atypical_log', value='', allow_duplicates=True)
    df_trials.insert(loc=22, column='delay_days', value=NaN, allow_duplicates=True)
    df_trials.insert(loc=23, column='delay_seconds', value=NaN, allow_duplicates=True)
    df_trials.insert(loc=24, column='trials_since_lastrep', value=NaN, allow_duplicates=True)

    # number of times each image (full path) is shown within each session;
    # computed once instead of re-filtering the whole frame for every trial
    shows_per_session = (
        df_trials.groupby(['session_id', 'image_path']).size().to_dict()
    )

    # latest showing of every image met so far, keyed by image file name
    shown_images = {}

    # text file that documents trials with erroneous labels
    with open(f'{out_path}/sub-{sub_num}_desc-trial_errorReport.txt', 'w+') as error_report:
        for i in tqdm.tqdm(range(df_trials.shape[0]), desc='validating all trial entries'):
            img_path = df_trials.at[i, 'image_path']
            img_name = os.path.basename(img_path)
            ses = df_trials.at[i, 'session_id']
            trial_ref = (
                sub_num, ses, df_trials.at[i, 'run_id'], df_trials.at[i, 'order'],
            )

            if img_name not in shown_images:
                # the image is UNSEEN: add entry to dict of seen images
                shown_images[img_name] = {
                    'idx': i,
                    'rep_num': 1,
                    'previous_reps': '',  # '-between' '-within'
                }
                condition = 'unseen'
                condition_msg = 'condition changed from seen to unseen'
                # repeated later within the same session?
                repeated_in_session = shows_per_session[(ses, img_path)] > 1
                subcondition = 'unseen-within' if repeated_in_session else 'unseen-between'
                repetition = 1
            else:
                # the image is SEEN: fill up delay columns (days and seconds)
                # and trials_since_lastrep column, update entry of seen images
                previous = shown_images[img_name]
                old_i = previous['idx']
                df_trials.at[i, 'trials_since_lastrep'] = i - old_i
                if df_trials.at[old_i, 'session_id'] == ses:
                    df_trials.at[i, 'delay_days'] = 0
                    df_trials.at[i, 'delay_seconds'] = (
                        float(df_trials.at[i, 'session_trial_time'])
                        - float(df_trials.at[old_i, 'session_trial_time'])
                    )
                    previous['previous_reps'] += '-within'
                else:
                    df_trials.at[i, 'delay_days'] = (
                        df_trials.at[i, 'date_time'] - df_trials.at[old_i, 'date_time']
                    ).days
                    df_trials.at[i, 'delay_seconds'] = 0.0
                    previous['previous_reps'] += '-between'

                previous['idx'] = i
                previous['rep_num'] += 1

                condition = 'seen'
                condition_msg = 'condition changed from unseen to seen'
                subcondition = f"seen{previous['previous_reps']}"
                repetition = previous['rep_num']

            # validate condition
            if not df_trials.at[i, 'condition'] == condition:
                _apply_correction(
                    df_trials, error_report, i, 'condition', condition,
                    condition_msg, trial_ref,
                )
            # validate subcondition
            if not df_trials.at[i, 'subcondition'] == subcondition:
                _apply_correction(
                    df_trials, error_report, i, 'subcondition', subcondition,
                    f'subcondition changed to {subcondition}', trial_ref,
                )
            # validate repetition
            if not df_trials.at[i, 'repetition'] == repetition:
                _apply_correction(
                    df_trials, error_report, i, 'repetition', repetition,
                    f'repetition set to {repetition}', trial_ref,
                )
            # validate error: only when an actual response was given
            error_val = df_trials.at[i, 'error']
            resp_given = df_trials.at[i, 'response_txt']
            if resp_given in ('unseen', 'seen'):
                # a response is an error when it differs from the true condition
                expected_error = resp_given != condition
                # correct only a clear True/False mismatch
                if error_val == (not expected_error):
                    _apply_correction(
                        df_trials, error_report, i, 'error', expected_error,
                        f'error set to {expected_error}', trial_ref,
                    )

    # scanning dates should not be included in saved files as they are identifiers; comment out line below to debug
    # (inplace: DataFrame.drop returns a new frame otherwise, and the dates would be exported)
    df_trials.drop(columns=['date_time'], inplace=True)
    df_trials.to_csv(f'{out_path}/sub-{sub_num}_task-things_concatTrials.tsv', sep='\t', header=True, index=False)

    return df_trials


def export_events_files(
    df_trials: pd.DataFrame,
    cols_to_keep: list,
    ses_list: list,
    out_path: str,
    sub_num: str,
    run_error_report: TextIOWrapper,
) -> None:
    """
    Export newly updated/corrected *events.tsv files with
    additional timing columns, for valid sessions only

    Input:
        df_trials: validated trials of all sessions and runs
        cols_to_keep: columns of the original events files to export
        ses_list: session labels; those containing 'b' are skipped
        out_path: output directory; files are written to
            '{out_path}/sub-{sub_num}/{session}/'
        sub_num: two-digit subject number
        run_error_report: open report file; runs that contain atypical
            trials are listed in it. It is always closed before returning,
            including when the export fails.
    Output:
        None

    Only runs numbered 1 to 6 are exported.
    """
    id_cols = ['subject_id', 'session_id']
    added_cols = ['session_trial_time', 'atypical', 'atypical_log', 'not_for_memory',
                  'delay_days', 'delay_seconds', 'trials_since_lastrep']
    cols_to_keep = id_cols + cols_to_keep + added_cols
    # exclude run 0, run 40, etc
    valid_runs = {1, 2, 3, 4, 5, 6}

    try:
        for ses_num in tqdm.tqdm(ses_list, desc='exporting updated event files'):
            if 'b' in ses_num:
                continue

            df_ses = df_trials[df_trials['session_id'] == ses_num]
            for run in sorted(np.unique(df_ses['run_id'])):
                if int(run) not in valid_runs:
                    continue

                out_dir = os.path.join(out_path, f'sub-{sub_num}', ses_num)
                Path(out_dir).mkdir(parents=True, exist_ok=True)
                out_name = os.path.join(
                    out_dir,
                    f'sub-{sub_num}_{ses_num}_task-things_run-{int(run)}_events.tsv',
                )
                df_run = df_ses[df_ses['run_id'] == run]

                if df_run['atypical'].any():
                    run_error_report.write(
                        f'atypical entries detected in sub-{sub_num}, '
                        f'{ses_num}, run {int(run)}\n'
                    )

                df_run[cols_to_keep].to_csv(out_name, sep='\t', header=True, index=False)
    finally:
        # this function owns the report from here on: close it even on failure
        run_error_report.close()


def validate_behav_data(
    in_path: str,
    time_path: str,
    et_path: str,
    sub_num: str,
    out_path: str,
) -> None:
    '''
    Run the full validation pipeline for one subject.

    Input:
        in_path:  path to bids directory that contains *events.tsv files
            (its session directories are also searched for *log files
            with trial timestamps)
        time_path:  path to directory with temp scan dates text files
        et_path:  path to raw directory with eye-tracking data, used to
            derive trial timestamps for runs without usable log entries
        sub_num :  two-digit subject number
        out_path:  path to output directory
    Output:
        None : exports updated *events.tsv files in specified output directory
    '''
    # STEP 1: Create a dictionary of session dates
    session_times = get_sess_times(time_path, sub_num)

    # STEP 2: Concatenate subject's *events.files into one DataFrame.
    # Insert identifier and timing columns.
    ses_list = get_ses_list(in_path, sub_num, session_times)

    df_trials, run_error_report, cols_to_keep = get_timing(
        in_path,
        et_path,
        ses_list,
        sub_num,
        session_times,
        out_path,
    )

    # get_timing hands back the run error report still open: make sure it is
    # closed whatever happens in the remaining steps (closing an already
    # closed file is a no-op)
    try:
        # STEP 3: validate condition, subcondition and accuracy.
        # Compute delays between repetitions.
        # Delete session date column (identifier/confidential)
        df_trials = fix_entries(
            df_trials,
            out_path,
            sub_num,
        )

        # STEP 4: Export updated/corrected *events.tsv files
        export_events_files(
            df_trials, cols_to_keep, ses_list, out_path, sub_num, run_error_report,
        )
    finally:
        run_error_report.close()


def main():
    """
    Command-line entry point: parse the arguments, create the output
    directory and validate one subject's behavioural data.

    This script is rather rough and relies on brute force to validate each
    trial's data.

    It validates and corrects the conditions, sub-conditions and responses
    included in the raw *events.tsv files output for the THINGS dataset,
    based on the order in which sessions were administered and stimuli were
    shown to the participants.

    Specifically, the script
    - adds timing information to *events.tsv files: for repeated images
    (2nd or 3rd showing), it computes the delay (in days or seconds) since
    the last presentation, and the number of stimuli shown since the
    previous repetition (to estimate stimulus interference with recognition).
    - corrects trial labels (e.g., seen/unseen) in the few cases when
    sessions were not run in the pre-planned order.
    - corrects accuracy accordingly (e.g., if a participant answered "seen"
    to an image accidentally shown in a previous session when it was
    supposed to be novel, the "error" value is changed from True to False to
    reflect the fact that the image was correctly recognized).

    ***NOTE: a few cleaned up *events.tsv files were edited manually after
    being output by this script***
    The 'not_for_memory' column flags sessions that should be excluded from
    analyses of memory recognition (due to deviations from pre-planned
    patterns of repetition).
    Those sessions are fine to assess concept representation and perception.
    The 'not_for_memory' column was changed MANUALLY from FALSE to TRUE
    for all rows in the following files:
    - sub-03: sessions 24, 25 and 26 (all 6 runs)
    - sub-06: sessions 19, 20, 21, 22, 23, 24, 25 and 26 (all 6 runs)

    A few more sessions include "atypical trials", that is, trials whose
    pattern of repetition deviates from the pre-planned protocol,
    e.g. a run was interrupted mid-scan and then redone, or runs were
    run out of order within a session.
    Those trials are flagged with the 'atypical' (boolean flag) and
    'atypical_log' (text) columns, and they (or the runs / sessions that
    contain them) can be excluded at the experimenter's discretion.

    Additional notes about the QCing of the CNeuroMod-THINGS dataset are
    included in qc_notes.md.
    """
    # (flag, options) pairs, declared in the order shown in the help message
    cli_options = (
        ('--data_dir', {
            'required': True,
            'help': 'path to bids directory with *events.tsv files',
        }),
        ('--time_dir', {
            'default': '/home/mariestl/cneuromod/THINGS',
            'help': 'path to directory with temp scan dates text files',
        }),
        ('--log_dir', {
            'default': '/unf/eyetracker/neuromod/things/sourcedata',
            'help': 'path to directory with raw psychopy log files (trials timestamps)',
        }),
        ('--out_dir', {
            'required': True,
            'help': 'path to output directory',
        }),
        ('--sub', {
            'required': True,
            'help': 'two-digit subject number',
        }),
    )

    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, type=str, **options)
    cli = parser.parse_args()

    # make sure the output directory exists before anything is written
    out_path = cli.out_dir
    Path(out_path).mkdir(parents=True, exist_ok=True)

    validate_behav_data(
        cli.data_dir, cli.time_dir, cli.log_dir, cli.sub, out_path,
    )


# Run the command-line entry point only when the file is executed as a script.
# main() has no return statement, so sys.exit() receives None (exit status 0)
# when processing completes without raising.
if __name__ == '__main__':
    sys.exit(main())