Source code for pyActigraphy.io.bba.bba

import json
import os
import pandas as pd
import re

from ..base import BaseRaw
from accelerometer.utils import date_parser
from accelerometer.summarisation import imputeMissing


class RawBBA(BaseRaw):
    r"""Raw object from files produced by the
    [biobankanalysis](
        https://biobankaccanalysis.readthedocs.io/en/latest/index.html
    ) package.

    Parameters
    ----------
    input_fname: str
        Path to the .csv(.gz) file.
    name: str, optional
        Name of the recording.
        Default is None.
    uuid: str, optional
        Device UUID.
        Default is None.
    frequency: str, optional
        Sampling frequency.
        Cf. #timeseries-offset-aliases in
        <https://pandas.pydata.org/pandas-docs/stable/timeseries.html>.
        If None, the sampling frequency is inferred from the data. Otherwise,
        the data are resampled to the specified frequency.
        Default is None.
    start_time: datetime-like, optional
        Read data from this time.
        Default is None.
    period: str, optional
        Length of the read data.
        Cf. #timeseries-offset-aliases in
        <https://pandas.pydata.org/pandas-docs/stable/timeseries.html>.
        Default is None (i.e all the data).
    engine: str, optional
        Parser engine to use. Argument passed to Pandas.
        Default is 'c'.
    impute_missing: bool, optional
        If set to True, use missing data imputation from the biobankanalysis
        package.
        Default is False.
    use_metadata_json: bool, optional.
        If set to True, extract meta-data from summary json file.
        Default is True.
    metadata_fname: str, optional
        Path to the summary json file.
        If None, the path to the summary json file is inferred from the input
        file (/path/to/XXX-timeSeries.csv.gz -> /path/to/XXX-summary.json).
        Default is None.
    """

    def __init__(
        self,
        input_fname,
        name=None,
        uuid=None,
        frequency=None,
        start_time=None,
        period=None,
        engine='c',
        impute_missing=False,
        use_metadata_json=True,
        metadata_fname=None
    ):

        # get absolute file path
        input_fname = os.path.abspath(input_fname)

        # read file
        data = pd.read_csv(
            input_fname,
            engine=engine,
            index_col=['time'],
            parse_dates=['time'],
            date_parser=date_parser
        )

        # read meta-data file (if found):
        if use_metadata_json:
            meta_data = self.__read_baa_metadata_json(
                input_fname, metadata_fname
            )

            # extract UUID if not provided
            if uuid is None:
                uuid = self.__extract_baa_metadata(meta_data, 'file-deviceID')

            # extract QC calibration marker
            self.__qc_calibrated_on_own_data = bool(
                self.__extract_baa_metadata(
                    meta_data,
                    'quality-calibratedOnOwnData'
                )
            )

            # extract QC DST cross-over marker
            self.__qc_daylight_savings_crossover = bool(
                self.__extract_baa_metadata(
                    meta_data,
                    'quality-daylightSavingsCrossover'
                )
            )

        if frequency is not None:
            data = data.resample(frequency).mean()
            freq = pd.Timedelta(frequency)
        elif not data.index.inferred_freq:
            raise ValueError(
                'The sampling frequency:\n'
                '- cannot be inferred from the data\n'
                'AND\n'
                '- is NOT explicity passed to the reader function.\n'
            )
        else:
            data = data.asfreq(data.index.inferred_freq)
            freq = pd.Timedelta(data.index.freq)

        # set start and stop times
        if start_time is not None:
            start_time = pd.to_datetime(start_time)
        else:
            start_time = data.index[0]

        if period is not None:
            period = pd.Timedelta(period)
            stop_time = start_time+period
        else:
            stop_time = data.index[-1]
            period = stop_time - start_time

        data = data.loc[start_time:stop_time]

        # Impute missing data (if required)
        if impute_missing:
            data = imputeMissing(data)

        # LIGHT
        self.__white_light = self.__extract_baa_data(
            data, 'light'
        )

        # MVPA
        self.__mvpa = self.__extract_baa_data(
            data, 'moderate-vigorous'
        )

        # Sedentary
        self.__sedentary = self.__extract_baa_data(
            data, 'sedentary'
        )

        # Sleep
        self.__sleep = self.__extract_baa_data(
            data, 'sleep'
        )

        # MET
        self.__met = self.__extract_baa_data(
            data, 'MET'
        )

        # call __init__ function of the base class
        super().__init__(
            fpath=input_fname,
            name=name if name is not None else os.path.basename(input_fname),
            uuid=uuid,
            format='BAA',
            axial_mode='tri-axial',
            start_time=start_time,
            period=period,
            frequency=freq,
            data=data.loc[:, 'acc'],
            light=None
        )

    @property
    def white_light(self):
        r"""Value of the white light illuminance in lux."""
        return self.__white_light

    @property
    def mvpa(self):
        r"""Value of the moderate-vigorous physical activity binary index."""
        return self.__mvpa

    @property
    def sedentary(self):
        r"""Value of the sedentary physical activity binary index."""
        return self.__sedentary

    @property
    def sleep(self):
        r"""Value of the sleep binary index."""
        return self.__sleep

    @property
    def met(self):
        r"""Value of the MET index."""
        return self.__met

    @property
    def isCalibratedOnOwnData(self):
        r"""Boolean indicating if the data have been calibrated."""
        return self.__qc_calibrated_on_own_data

    @property
    def isDSTCrossing(self):
        r"""Boolean indicating if the data have been acquired during a DST."""
        return self.__qc_daylight_savings_crossover

    @staticmethod
    def __extract_baa_data(data, column):
        """ Data reader

        Read requested data column.

        Parameters
        ----------
        data: pd.DataFrame
            Input data frame.
        column: str
            Column name.

        Returns
        -------
        ts : pd.Series
            Data contained in the requested column.
        """

        return data.loc[:, column] if column in data.columns else None

    @staticmethod
    def __read_baa_metadata_json(input_fname, metadata_fname):
        """ Meta-data reader

        Read meta data summary json file produced when processing .CWA file
        into X-timeSeries.csv.gz file with the biobank acc. package.

        Parameters
        ----------
        input_fname: str
            Path to the .csv(.gz) file.
        metadata_fname: str
            Path to the meta-data (summary.json) file.

        Returns
        -------
        metadata : dict
            Dictionnary containing the meta-data.
        """

        # read meta-data file (if found):
        if metadata_fname is None:
            input_metadata = input_fname.replace(
                '-timeSeries.csv.gz',
                '-summary.json'
            )
        else:
            input_metadata = os.path.abspath(metadata_fname)

        # load meta data json file
        with open(input_metadata) as file:
            meta_data = json.load(file)

        # check filename consistency:
        # - META-DATA: file-name = 'basename'.cwa[.gz]
        # - INPUT DATA: input_fname = 'basename'-timeSeries.csv[.gz]

        match_basename = re.match(
            pattern=r'^(\w*)-timeSeries.csv(\.gz)?',
            string=os.path.basename(input_fname)
        )
        if match_basename:
            input_basename = match_basename.groups()[0]
        else:
            raise ValueError(
                'Could extract file basename (XXX-timeSeries.csv.gz)'
                + ' from input filename: {}'.format(input_fname)
            )

        if not re.match(
            pattern=r'{}.(cwa|CWA)(\.gz)?'.format(input_basename),
            string=meta_data['file-name']
        ):
            raise ValueError(
                'Attempting to read a metadata file referring to another '
                + 'input file.\n'
                + '- Input file: {}\n'.format(os.path.basename(input_fname))
                + '- Extracted basename: {}\n'.format(input_basename)
                + '- Metadata ref: {}\n'.format(
                    os.path.basename(meta_data['file-name'])
                )
                + '- Metadata path: {}\n'.format(input_metadata)
            )
        return meta_data

    @staticmethod
    def __extract_baa_metadata(meta_data, field):
        """ Meta-data extractor

        Extract meta data from summary json file.

        Parameters
        ----------
        meta_data : dict
            Dictionnary containing the meta-data.
        field: str
            Field (key) to extract.

        Returns
        -------
        value: str or int
            Requested field extracted from the meta-data dict.
        """

        if field in meta_data.keys():
            return meta_data[field]
        else:
            raise KeyError(
                'Information ({}) not found in meta-data file.'.format(field)
            )


[docs]def read_raw_bba(
    input_fname,
    name=None,
    uuid=None,
    frequency=None,
    start_time=None,
    period=None,
    engine='c',
    impute_missing=False,
    use_metadata_json=True,
    metadata_fname=None
):
    r"""Reader function for files produced by the biobankAccelerometerAnalysis
    package.

    Parameters
    ----------
    input_fname: str
        Path to the BAA file.
    name: str, optional
        Name of the recording.
        Default is None.
    uuid: str, optional
        Device UUID.
        Default is None.
    frequency: str, optional
        Sampling frequency.
        Cf. #timeseries-offset-aliases in
        <https://pandas.pydata.org/pandas-docs/stable/timeseries.html>.
        If None, the sampling frequency is inferred from the data. Otherwise,
        the data are resampled to the specified frequency.
        Default is None.
    start_time: datetime-like, optional
        Read data from this time.
        Default is None.
    period: str, optional
        Length of the read data.
        Cf. #timeseries-offset-aliases in
        <https://pandas.pydata.org/pandas-docs/stable/timeseries.html>.
        Default is None (i.e all the data).
    engine: str, optional
        Parser engine to use. Argument passed to Pandas.
        Default is 'c'.
    impute_missing: bool, optional
        If set to True, use missing data imputation from the biobankanalysis
        package.
        Default is False.
    use_metadata_json: bool, optional.
        If set to True, extract meta-data from summary json file.
        Default is True.
    metadata_fname: str, optional
        Path to the summary json file.
        If None, the path to the summary json file is inferred from the input
        file (/path/to/XXX-timeSeries.csv.gz -> /path/to/XXX-summary.json).
        Default is None.

    Returns
    -------
    raw : Instance of RawBBA
        An object containing preprocessed data from raw accelerometers.
    """

    return RawBBA(
        input_fname=input_fname,
        name=name,
        uuid=uuid,
        frequency=frequency,
        start_time=start_time,
        period=period,
        engine=engine,
        impute_missing=impute_missing,
        use_metadata_json=use_metadata_json,
        metadata_fname=metadata_fname
    )