# Source code for metabci.brainda.datasets.tsinghua

# -*- coding: utf-8 -*-
#
# Authors: Swolf <swolfforever@gmail.com>
# Date: 2021/01/07
# License: MIT License
"""
Tsinghua BCI Lab.
"""
import os
import zipfile
from typing import Union, Optional, Dict, List, cast
from pathlib import Path

import numpy as np
import py7zr
from mne import create_info
from mne.io import RawArray, Raw
from mne.channels import make_standard_montage
from .base import BaseDataset
from ..utils.download import mne_data_path
from ..utils.io import loadmat

# Download locations for the datasets defined in this module. The
# commented-out assignments are presumably alternative sources kept for
# reference -- NOTE(review): confirm which of them are still reachable.
# TSINGHUA_URL = 'http://bci.med.tsinghua.edu.cn/download.html'

# Base URL for the Wang2016 archives; "S<subject>.mat.7z" is appended to it.
# 403 error, though it still works
Wang2016_URL = "http://bci.med.tsinghua.edu.cn/upload/yijun/"
# Wang2016_URL = "ftp://sccn.ucsd.edu/pub/ssvep_benchmark_dataset/"
# Wang2016_URL = 'http://www.thubci.com/uploads/down/' # This may work
# Base URL for the BETA archives; "S<first>-S<last>.mat.zip" is appended to it.
BETA_URL = "http://bci.med.tsinghua.edu.cn/upload/liubingchuan/"  # 403 error
# BETA_URL = 'https://figshare.com/articles/The_BETA_database/12264401'


class Wang2016(BaseDataset):
    """SSVEP dataset from Yijun Wang.

    This dataset gathered SSVEP-BCI recordings of 35 healthy subjects (17
    females, aged 17-34 years, mean age: 22 years) focusing on 40 characters
    flickering at different frequencies (8-15.8 Hz with an interval of 0.2 Hz).
    For each subject, the experiment consisted of 6 blocks. Each block
    contained 40 trials corresponding to all 40 characters indicated in a
    random order. Each trial started with a visual cue (a red square)
    indicating a target stimulus. The cue appeared for 0.5 s on the screen.
    Subjects were asked to shift their gaze to the target as soon as possible
    within the cue duration. Following the cue offset, all stimuli started to
    flicker on the screen concurrently and lasted 5 s. After stimulus offset,
    the screen was blank for 0.5 s before the next trial began, which allowed
    the subjects to have short breaks between consecutive trials. Each trial
    lasted a total of 6 s. To facilitate visual fixation, a red triangle
    appeared below the flickering target during the stimulation period.
    In each block, subjects were asked to avoid eye blinks during the
    stimulation period. To avoid visual fatigue, there was a rest for several
    minutes between two consecutive blocks. EEG data were acquired using a
    Synamps2 system (Neuroscan, Inc.) with a sampling rate of 1000 Hz.
    The amplifier frequency passband ranged from 0.15 Hz to 200 Hz. Sixty-four
    channels covered the whole scalp of the subject and were aligned according
    to the international 10-20 system. The ground was placed midway between
    Fz and FPz. The reference was located on the vertex. Electrode impedances
    were kept below 10 KΩ. To remove the common power-line noise, a notch
    filter at 50 Hz was applied in data recording. Event triggers were
    generated by the computer, sent to the amplifier and recorded on an event
    channel synchronized to the EEG data.

    The continuous EEG data was segmented into 6 s epochs (500 ms pre-stimulus,
    5.5 s post-stimulus onset). The epochs were subsequently downsampled to
    250 Hz. Thus each trial consisted of 1500 time points. Finally, these data
    were stored as double-precision floating-point values in MATLAB and were
    named as subject indices (i.e., S01.mat, …, S35.mat). For each file, the
    data loaded in MATLAB generate a 4-D matrix named ‘data’ with dimensions
    of [64, 1500, 40, 6]. The four dimensions indicate ‘Electrode index’,
    ‘Time points’, ‘Target index’, and ‘Block index’. The electrode positions
    were saved in a ‘64-channels.loc’ file. Six trials were available for each
    SSVEP frequency. Frequency and phase values for the 40 target indices were
    saved in a ‘Freq_Phase.mat’ file.

    Information for all subjects was listed in a ‘Sub_info.txt’ file. For each
    subject, there are five factors including ‘Subject Index’, ‘Gender’,
    ‘Age’, ‘Handedness’, and ‘Group’. Subjects were divided into an
    ‘experienced’ group (eight subjects, S01-S08) and a ‘naive’ group
    (27 subjects, S09-S35) according to their experience in SSVEP-based BCIs.

    Frequency Table
    8    9   10   11   12   13   14   15
    8.2  9.2 10.2 11.2 12.2 13.2 14.2 15.2
    8.4  9.4 10.4 11.4 12.4 13.4 14.4 15.4
    8.6  9.6 10.6 11.6 12.6 13.6 14.6 15.6
    8.8  9.8 10.8 11.8 12.8 13.8 14.8 15.8

    Notes
    -----
    1. sub5 is not available from the download url.
    """

    # Channel names handed to BaseDataset as this dataset's channel list.
    # _get_single_subject_data adds M1, M2, CB1 and CB2 to this list to name
    # all 64 recorded rows.
    _CHANNELS = [
        "FP1", "FPZ", "FP2", "AF3", "AF4",
        "F7", "F5", "F3", "F1", "FZ", "F2", "F4", "F6", "F8",
        "FT7", "FC5", "FC3", "FC1", "FCZ", "FC2", "FC4", "FC6", "FT8",
        "T7", "C5", "C3", "C1", "CZ", "C2", "C4", "C6", "T8",
        "TP7", "CP5", "CP3", "CP1", "CPZ", "CP2", "CP4", "CP6", "TP8",
        "P7", "P5", "P3", "P1", "PZ", "P2", "P4", "P6", "P8",
        "PO7", "PO5", "PO3", "POZ", "PO4", "PO6", "PO8",
        "O1", "OZ", "O2",
    ]

    # Stimulus frequency of each target, in target-index order (one row of the
    # docstring's frequency table per line). Integers are kept as integers
    # because the event names are built with str(freq).
    _FREQS = [
        8, 9, 10, 11, 12, 13, 14, 15,
        8.2, 9.2, 10.2, 11.2, 12.2, 13.2, 14.2, 15.2,
        8.4, 9.4, 10.4, 11.4, 12.4, 13.4, 14.4, 15.4,
        8.6, 9.6, 10.6, 11.6, 12.6, 13.6, 14.6, 15.6,
        8.8, 9.8, 10.8, 11.8, 12.8, 13.8, 14.8, 15.8,
    ]

    # Initial phase of each target, aligned index-for-index with _FREQS.
    # NOTE(review): values look like multiples of pi -- confirm the unit.
    _PHASES = [
        0, 0.5, 1, 1.5, 0, 0.5, 1, 1.5,
        0.5, 1, 1.5, 0, 0.5, 1, 1.5, 0,
        1, 1.5, 0, 0.5, 1, 1.5, 0, 0.5,
        1.5, 0, 0.5, 1, 1.5, 0, 0.5, 1,
        0, 0.5, 1, 1.5, 0, 0.5, 1, 1.5,
    ]

    # Event name (the frequency as a string) -> (1-based event code, interval).
    # NOTE(review): the (0, 5) interval is presumably seconds relative to
    # stimulus onset -- confirm against BaseDataset.
    _EVENTS = {
        str(freq): (code, (0, 5)) for code, freq in enumerate(_FREQS, start=1)
    }

    def __init__(self):
        super().__init__(
            dataset_code="wang2016",
            subjects=list(range(1, 36)),
            events=self._EVENTS,
            channels=self._CHANNELS,
            srate=250,
            paradigm="ssvep",
        )

    def data_path(
        self,
        subject: Union[str, int],
        path: Optional[Union[str, Path]] = None,
        force_update: bool = False,
        update_path: Optional[bool] = None,
        proxies: Optional[Dict[str, str]] = None,
        verbose: Optional[Union[bool, str, int]] = None,
    ) -> List[List[Union[str, Path]]]:
        """Download (if needed) and unpack the data file of one subject.

        Parameters
        ----------
        subject : int
            Subject id; must be one of ``self.subjects``.
        path, force_update, update_path, proxies
            Forwarded unchanged to ``mne_data_path``.
        verbose
            Accepted for interface compatibility; not used by this method.

        Returns
        -------
        list of list
            ``[[mat_file]]``, where ``mat_file`` is the archive path without
            its ``.7z`` suffix.

        Raises
        ------
        ValueError
            If ``subject`` is not in ``self.subjects``.
        """
        if subject not in self.subjects:
            raise ValueError("Invalid subject id")

        subject = cast(int, subject)
        url = "{:s}S{:d}.mat.7z".format(Wang2016_URL, subject)
        archive_path = mne_data_path(
            url,
            "tsinghua",
            path=path,
            proxies=proxies,
            force_update=force_update,
            update_path=update_path,
        )

        # Dropping the 3-character ".7z" suffix gives the expected .mat path.
        mat_path = archive_path[:-3]
        if not os.path.exists(mat_path):
            # Unpack next to the archive; skipped when the file already exists.
            with py7zr.SevenZipFile(archive_path, "r") as archive:
                archive.extractall(path=Path(archive_path).parent)
        return [[mat_path]]

    def _get_single_subject_data(
        self, subject: Union[str, int], verbose: Optional[Union[bool, str, int]] = None
    ) -> Dict[str, Dict[str, Raw]]:
        """Load one subject and return one continuous recording per block.

        The epoched array is extended with a stimulus row that carries the
        target index at sample 125 of every trial, then the trials of each
        block are laid end to end in a ``RawArray``.

        Parameters
        ----------
        subject : int
            Subject id, passed to :meth:`data_path`.
        verbose
            Accepted for interface compatibility; not used by this method.

        Returns
        -------
        dict
            ``{"session_0": {"run_0": raw, "run_1": raw, ...}}``.
        """
        mat_file = self.data_path(subject)[0][0]
        # Per the class docstring 'data' is [64, 1500, 40, 6]
        # (electrode, time point, target, block).
        # NOTE(review): the 1e-6 factor presumably converts microvolts to
        # volts -- confirm the unit stored in the .mat files.
        epoch_data = loadmat(mat_file)["data"] * 1e-6

        # One extra row, zero everywhere except at stimulus onset.
        stim = np.zeros((1, *epoch_data.shape[1:]))
        onset_sample = 125  # 0.5 s of pre-stimulus data at 250 Hz
        n_blocks = epoch_data.shape[-1]
        # Column vector of codes 1..40 repeated for every block.
        stim[0, onset_sample] = np.tile(
            np.arange(1, 41)[:, np.newaxis], (1, n_blocks)
        )
        epoch_data = np.concatenate((epoch_data, stim), axis=0)
        # Reorder the axes to (row, block, target, time point).
        data = np.transpose(epoch_data, (0, 3, 2, 1))

        # The channel names below are upper-case, so upper-case the montage.
        montage = make_standard_montage("standard_1005")
        montage.rename_channels(
            {ch_name: ch_name.upper() for ch_name in montage.ch_names}
        )

        # Rebuild the 64-name list: M1 before TP7, M2 after TP8, CB1 before O1
        # and CB2 after O2, followed by the stimulus channel.
        ch_names = [ch_name.upper() for ch_name in self._CHANNELS]
        ch_names.insert(32, "M1")
        ch_names.insert(42, "M2")
        ch_names.insert(59, "CB1")
        ch_names = ch_names + ["CB2", "STI 014"]
        ch_types = ["eeg"] * 65
        ch_types[59] = "misc"  # CB1
        ch_types[63] = "misc"  # CB2
        ch_types[-1] = "stim"

        info = create_info(
            ch_names=ch_names, ch_types=ch_types, sfreq=self.srate
        )

        n_rows = data.shape[0]
        runs = {}
        for block_idx in range(data.shape[1]):
            # Flatten (target, time point) so the trials follow one another.
            block = np.reshape(data[:, block_idx, ...], (n_rows, -1))
            raw = RawArray(data=block, info=info)
            raw.set_montage(montage)
            runs["run_{:d}".format(block_idx)] = raw

        return {"session_0": runs}

    def get_freq(self, event: str):
        """Return the entry of ``_FREQS`` that belongs to event name ``event``."""
        event_code = self._EVENTS[event][0]
        return self._FREQS[event_code - 1]

    def get_phase(self, event: str):
        """Return the entry of ``_PHASES`` that belongs to event name ``event``."""
        event_code = self._EVENTS[event][0]
        return self._PHASES[event_code - 1]


class BETA(BaseDataset):
    """BETA SSVEP dataset [1]_.

    EEG data after preprocessing are stored as a 4-way tensor, with a
    dimension of channel x time point x block x condition. Each trial
    comprises 0.5-s data before the event onset and 0.5-s data after the time
    window of 2 s or 3 s. For S1-S15, the time window is 2 s and the trial
    length is 3 s, whereas for S16-S70 the time window is 3 s and the trial
    length is 4 s. Additional details about the channel and condition
    information can be found in the following supplementary information.

    The supplementary information comprises personal information, channel
    information, frequency and initial phase associated to each condition,
    SNR and sampling rate. The personal information contains age and gender
    of the subject. For the channel information, a location matrix (64 x 4)
    is provided, with the first column indicating channel index, the second
    column and third column indicating the degree and radius in polar
    coordinates, and the last column indicating channel name. The SNR
    information contains the mean narrow-band SNR and wide-band SNR matrix for
    each subject, calculated in (3) and (4), respectively. The initial phase
    is in radians.

    3-100Hz bandpass filtering (eegfilt), downsampled to 250 Hz

    References
    ----------
    .. [1] Liu B, Huang X, Wang Y, et al. BETA: A Large Benchmark Database
    Toward SSVEP-BCI Application[J]. Frontiers in neuroscience, 2020, 14: 627.
    """

    # Channel names handed to BaseDataset as this dataset's channel list.
    # _get_single_subject_data adds M1, M2, CB1 and CB2 to this list to name
    # all 64 recorded rows.
    _CHANNELS = [
        "FP1", "FPZ", "FP2", "AF3", "AF4",
        "F7", "F5", "F3", "F1", "FZ", "F2", "F4", "F6", "F8",
        "FT7", "FC5", "FC3", "FC1", "FCZ", "FC2", "FC4", "FC6", "FT8",
        "T7", "C5", "C3", "C1", "CZ", "C2", "C4", "C6", "T8",
        "TP7", "CP5", "CP3", "CP1", "CPZ", "CP2", "CP4", "CP6", "TP8",
        "P7", "P5", "P3", "P1", "PZ", "P2", "P4", "P6", "P8",
        "PO7", "PO5", "PO3", "POZ", "PO4", "PO6", "PO8",
        "O1", "OZ", "O2",
    ]

    # Stimulus frequency of each condition, in condition-index order. Integers
    # are kept as integers because the event names are built with str(freq).
    _FREQS = [
        8.6, 8.8,
        9, 9.2, 9.4, 9.6, 9.8,
        10, 10.2, 10.4, 10.6, 10.8,
        11, 11.2, 11.4, 11.6, 11.8,
        12, 12.2, 12.4, 12.6, 12.8,
        13, 13.2, 13.4, 13.6, 13.8,
        14, 14.2, 14.4, 14.6, 14.8,
        15, 15.2, 15.4, 15.6, 15.8,
        8, 8.2, 8.4,
    ]

    # Initial phase of each condition, aligned index-for-index with _FREQS
    # (the pattern 1.5, 0, 0.5, 1 repeated ten times).
    # NOTE(review): values look like multiples of pi -- confirm the unit.
    _PHASES = [
        1.5, 0, 0.5, 1,
        1.5, 0, 0.5, 1,
        1.5, 0, 0.5, 1,
        1.5, 0, 0.5, 1,
        1.5, 0, 0.5, 1,
        1.5, 0, 0.5, 1,
        1.5, 0, 0.5, 1,
        1.5, 0, 0.5, 1,
        1.5, 0, 0.5, 1,
        1.5, 0, 0.5, 1,
    ]

    # Event name (the frequency as a string) -> (1-based event code, interval).
    # NOTE(review): (0, 2) is used for every subject although the class
    # docstring gives S16-S70 a 3 s window -- confirm this is intended.
    _EVENTS = {
        str(freq): (code, (0, 2)) for code, freq in enumerate(_FREQS, start=1)
    }

    def __init__(self):
        super().__init__(
            dataset_code="beta",
            subjects=list(range(1, 71)),
            events=self._EVENTS,
            channels=self._CHANNELS,
            srate=250,
            paradigm="ssvep",
        )

    def data_path(
        self,
        subject: Union[str, int],
        path: Optional[Union[str, Path]] = None,
        force_update: bool = False,
        update_path: Optional[bool] = None,
        proxies: Optional[Dict[str, str]] = None,
        verbose: Optional[Union[bool, str, int]] = None,
    ) -> List[List[Union[str, Path]]]:
        """Download (if needed) and unpack the data file of one subject.

        Subjects are distributed in zip archives of ten
        (``S1-S10.mat.zip`` ... ``S61-S70.mat.zip``).

        Parameters
        ----------
        subject : int
            Subject id; must be one of ``self.subjects``.
        path, force_update, update_path, proxies
            Forwarded unchanged to ``mne_data_path``.
        verbose
            Accepted for interface compatibility; not used by this method.

        Returns
        -------
        list of list
            ``[[mat_file]]``, the path of ``S<subject>.mat`` in the directory
            that holds the downloaded archive.

        Raises
        ------
        ValueError
            If ``subject`` is not in ``self.subjects``.
        """
        if subject not in self.subjects:
            raise ValueError("Invalid subject id")

        subject = cast(int, subject)
        # First subject of the ten-subject archive holding `subject`:
        # 1..10 -> 1, 11..20 -> 11, ..., 61..70 -> 61.
        first = (subject - 1) // 10 * 10 + 1
        url = "{:s}S{:d}-S{:d}.mat.zip".format(BETA_URL, first, first + 9)

        archive_path = mne_data_path(
            url,
            "tsinghua",
            path=path,
            proxies=proxies,
            force_update=force_update,
            update_path=update_path,
        )

        parent_dir = Path(archive_path).parent
        mat_path = os.path.join(parent_dir, "S{:d}.mat".format(subject))

        if not os.path.exists(mat_path):
            # Unpack the whole archive next to it; skipped when this subject's
            # file already exists.
            with zipfile.ZipFile(archive_path, "r") as archive:
                archive.extractall(path=parent_dir)
        dests: List[List[Union[str, Path]]] = [[mat_path]]
        return dests

    def _get_single_subject_data(
        self, subject: Union[str, int], verbose: Optional[Union[bool, str, int]] = None
    ) -> Dict[str, Dict[str, Raw]]:
        """Load one subject and return one continuous recording per block.

        The epoched array is extended with a stimulus row that carries the
        condition index at sample 125 of every trial, then the trials of each
        block are laid end to end in a ``RawArray``.

        Parameters
        ----------
        subject : int
            Subject id, passed to :meth:`data_path`.
        verbose
            Accepted for interface compatibility; not used by this method.

        Returns
        -------
        dict
            ``{"session_0": {"run_0": raw, "run_1": raw, ...}}``.
        """
        mat_file = self.data_path(subject)[0][0]
        # Per the class docstring the tensor is
        # channel x time point x block x condition.
        # NOTE(review): the 1e-6 factor presumably converts microvolts to
        # volts -- confirm the unit stored in the .mat files.
        epoch_data = loadmat(mat_file)["data"]["EEG"] * 1e-6

        # One extra row, zero everywhere except at stimulus onset.
        stim = np.zeros((1, *epoch_data.shape[1:]))
        onset_sample = 125  # 0.5 s of pre-stimulus data at 250 Hz
        n_blocks = epoch_data.shape[-2]
        # Codes 1..40 along the condition axis, repeated for every block.
        stim[0, onset_sample] = np.tile(np.arange(1, 41), (n_blocks, 1))
        epoch_data = np.concatenate((epoch_data, stim), axis=0)
        # Reorder the axes to (row, condition, block, time point).
        data = np.transpose(epoch_data, (0, 3, 2, 1))

        # The channel names below are upper-case, so upper-case the montage.
        montage = make_standard_montage("standard_1005")
        montage.rename_channels(
            {ch_name: ch_name.upper() for ch_name in montage.ch_names}
        )

        # Rebuild the 64-name list: M1 before TP7, M2 after TP8, CB1 before O1
        # and CB2 after O2, followed by the stimulus channel.
        ch_names = [ch_name.upper() for ch_name in self._CHANNELS]
        ch_names.insert(32, "M1")
        ch_names.insert(42, "M2")
        ch_names.insert(59, "CB1")
        ch_names = ch_names + ["CB2", "STI 014"]
        ch_types = ["eeg"] * 65
        ch_types[59] = "misc"  # CB1
        ch_types[63] = "misc"  # CB2
        ch_types[-1] = "stim"

        info = create_info(
            ch_names=ch_names, ch_types=ch_types, sfreq=self.srate
        )

        n_rows = data.shape[0]
        runs = {}
        for block_idx in range(data.shape[-2]):
            # Flatten (condition, time point) so the trials follow one another.
            block = np.reshape(data[..., block_idx, :], (n_rows, -1))
            raw = RawArray(data=block, info=info)
            raw.set_montage(montage)
            runs["run_{:d}".format(block_idx)] = raw

        return {"session_0": runs}

    def get_freq(self, event: str):
        """Return the entry of ``_FREQS`` that belongs to event name ``event``."""
        event_code = self._EVENTS[event][0]
        return self._FREQS[event_code - 1]

    def get_phase(self, event: str):
        """Return the entry of ``_PHASES`` that belongs to event name ``event``."""
        event_code = self._EVENTS[event][0]
        return self._PHASES[event_code - 1]