Source code for pyActigraphy.light.light_metrics

#############################################################################
# Copyright (c) 2022, Daylight Academy
# Author: Grégory Hammad
# Owner: Daylight Academy (https://daylight.academy)
# Maintainer: Grégory Hammad
# Email: gregory.hammad@uliege.be
# Status: development
#############################################################################
# The development of a module for analysing light exposure
# data was led and financially supported by members of the Daylight Academy
# Project “The role of daylight for humans” (led by Mirjam Münch, Manuel
# Spitschan). The module is part of the Human Light Exposure Database. For
# more information about the project, please see
# https://daylight.academy/projects/state-of-light-in-humans/.
#
# This module is also part of the pyActigraphy software.
# pyActigraphy is a free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# pyActigraphy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
############################################################################
import pandas as pd
import re
from scipy import signal
from ..metrics.metrics import _lmx
from ..metrics.metrics import _interdaily_stability
from ..metrics.metrics import _intradaily_variability
from ..utils.utils import _average_daily_activity
from ..utils.utils import _shift_time_axis

__all__ = [
    'LightMetricsMixin',
]


class LightMetricsMixin(object):
    """ Mixin Class """

    def average_daily_profile(
        self,
        channel,
        rsfreq='5min',
        cyclic=False,
        binarize=False,
        threshold=None,
        time_origin=None
    ):
        r"""Average daily light profile

        Calculate the daily profile of light exposure. Data are averaged over
        all the days.

        Parameters
        ----------
        channel: str,
            Channel to be used (i.e. column of the input data).
        rsfreq: str, optional
            Data resampling frequency.
            Cf. #timeseries-offset-aliases in
            <https://pandas.pydata.org/pandas-docs/stable/timeseries.html>.
        cyclic: bool, optional
            If set to True, two daily profiles are concatenated to ensure
            continuity between the last point of the day and the first one.
            Default is False.
        binarize: bool, optional
            If set to True, the data are binarized.
            Default is False.
        threshold: int, optional
            If binarize is set to True, data above this threshold are set to 1
            and to 0 otherwise.
            Default is None.
        time_origin: str or pd.Timedelta, optional
            If not None, origin of the time axis for the daily profile.
            Original time bins are translated as time deltas with respect to
            this new origin.
            Default is None.
            Supported time string: 'HH:MM:SS'

        Returns
        -------
        raw : pandas.Series
            A Series containing the daily light profile with a 24h/48h index.
        """
        # Check if the requested channel is available
        if channel not in self.data.columns:
            raise ValueError(
                'The light channel you tried to access ({}) '.format(channel)
                + 'is not available.\nAvailable channels:\n-{}'.format(
                    '\n- '.join(self.data.columns)
                )
            )

        # Binarize (+resample) data, if required.
        if binarize:
            data = self.binarized_data(
                threshold, rsfreq=rsfreq, agg='mean'
            )
        elif rsfreq is not None:
            data = self.resampled_data(rsfreq=rsfreq, agg='mean')
        else:
            data = self.data

        # Select requested channel
        data = data.loc[:, channel]

        if time_origin is None:
            return _average_daily_activity(data, cyclic=cyclic)
        else:
            if cyclic is True:
                raise NotImplementedError(
                    'Setting a time origin while cyclic option is True is not '
                    'implemented yet.'
                )

            avgdaily = _average_daily_activity(data, cyclic=False)

            if isinstance(time_origin, str):
                # Regex pattern for HH:MM:SS time string
                pattern = re.compile(
                    r"^([0-1]\d|2[0-3])(?::([0-5]\d))(?::([0-5]\d))$"
                )
                if pattern.match(time_origin):
                    time_origin = pd.Timedelta(time_origin)
                else:
                    raise ValueError(
                        'Time origin format ({}) not supported.\n'.format(
                            time_origin
                        ) + 'Supported format: HH:MM:SS.'
                    )
            elif not isinstance(time_origin, pd.Timedelta):
                raise ValueError(
                    'Time origin is neither a time string with a supported '
                    'format, nor a pd.Timedelta.'
                )

            # Round time origin to the required frequency
            time_origin = time_origin.round(data.index.freq)

            shift = int((pd.Timedelta('12h')-time_origin)/data.index.freq)

            return _shift_time_axis(avgdaily, shift)

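    # Usage sketch (illustrative, not part of the original source): assuming a
    # recording object `light` that mixes in LightMetricsMixin and exposes a
    # hypothetical 'whitelight' channel, a 24h profile resampled to 10 min
    # bins and re-centred on 07:00:00 could be obtained with:
    #
    #   profile = light.average_daily_profile(
    #       channel='whitelight', rsfreq='10min', time_origin='07:00:00'
    #   )
    #
    # When time_origin is set, the profile is shifted so that the chosen
    # origin lands at the middle (12h) of the 24h axis.
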
    def average_daily_profile_auc(
        self,
        channel=None,
        start_time=None,
        stop_time=None,
        binarize=False,
        threshold=None,
        time_origin=None
    ):
        r"""AUC of the average daily light profile

        Calculate the area under the curve of the daily profile of light
        exposure. Data are averaged over all the days.

        Parameters
        ----------
        channel: str,
            Channel to be used (i.e. column of the input data).
        start_time: str, optional
            If not set to None, compute AUC from start time.
            Supported time string: 'HH:MM:SS'
            Default is None.
        stop_time: str, optional
            If not set to None, compute AUC until stop time.
            Supported time string: 'HH:MM:SS'
            Default is None.
        binarize: bool, optional
            If set to True, the data are binarized.
            Default is False.
        threshold: int, optional
            If binarize is set to True, data above this threshold are set to 1
            and to 0 otherwise.
            Default is None.
        time_origin: str or pd.Timedelta, optional
            If not None, origin of the time axis for the daily profile.
            Original time bins are translated as time deltas with respect to
            this new origin.
            Default is None.
            Supported time string: 'HH:MM:SS'

        Returns
        -------
        auc : float
            Area under the curve.
        """
        # Check if the requested channel is available
        if channel not in self.data.columns:
            raise ValueError(
                'The light channel you tried to access ({}) '.format(channel)
                + 'is not available.\nAvailable channels:\n-{}'.format(
                    '\n- '.join(self.data.columns)
                )
            )

        # Binarize (+resample) data, if required.
        if binarize:
            data = self.binarized_data(
                threshold, rsfreq=None, agg='sum'
            )
        else:
            data = self.data

        # Select requested channel
        data = data.loc[:, channel]

        # Compute average daily profile
        avgdaily = _average_daily_activity(data, cyclic=False)

        if time_origin is not None:
            if isinstance(time_origin, str):
                # Regex pattern for HH:MM:SS time string
                pattern = re.compile(
                    r"^([0-1]\d|2[0-3])(?::([0-5]\d))(?::([0-5]\d))$"
                )
                if pattern.match(time_origin):
                    time_origin = pd.Timedelta(time_origin)
                else:
                    raise ValueError(
                        'Time origin format ({}) not supported.\n'.format(
                            time_origin
                        ) + 'Supported format: HH:MM:SS.'
                    )
            elif not isinstance(time_origin, pd.Timedelta):
                raise ValueError(
                    'Time origin is neither a time string with a supported '
                    'format, nor a pd.Timedelta.'
                )

            # Round time origin to the required frequency
            time_origin = time_origin.round(data.index.freq)

            shift = int((pd.Timedelta('12h')-time_origin)/data.index.freq)

            avgdaily = _shift_time_axis(avgdaily, shift)

        # Restrict profile to start/stop times
        if start_time is not None:
            start_time = pd.Timedelta(start_time)
        if stop_time is not None:
            stop_time = pd.Timedelta(stop_time)

        return avgdaily.loc[start_time:stop_time].sum()

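    # Usage sketch (illustrative only, hypothetical channel name): the AUC is
    # the sum of the average daily profile over the requested time window,
    # e.g. between 08:00 and 20:00:
    #
    #   auc = light.average_daily_profile_auc(
    #       channel='whitelight',
    #       start_time='08:00:00',
    #       stop_time='20:00:00'
    #   )
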
    def _light_exposure(self, threshold=None, start_time=None, stop_time=None):
        r"""Light exposure

        Calculate the light exposure level and time.

        Parameters
        ----------
        threshold: float, optional
            If not set to None, discard data below threshold before computing
            exposure levels.
            Default is None.
        start_time: str, optional
            If not set to None, discard data before start time,
            on a daily basis.
            Supported time string: 'HH:MM:SS'
            Default is None.
        stop_time: str, optional
            If not set to None, discard data after stop time, on a daily basis.
            Supported time string: 'HH:MM:SS'
            Default is None.

        Returns
        -------
        masked_data : pandas.DataFrame
            A DataFrame where the original data are set to NaN if below
            threshold and/or outside the time window.
        """
        if threshold is not None:
            data_mask = self.data.mask(self.data < threshold)
        else:
            data_mask = self.data

        if start_time is stop_time is None:
            return data_mask
        elif (start_time is None) or (stop_time is None):
            raise ValueError(
                'Both start and stop times have to be specified, if any.'
            )
        else:
            return data_mask.between_time(
                start_time=start_time, end_time=stop_time, include_end=False
            )

    def light_exposure_level(
        self, threshold=None, start_time=None, stop_time=None, agg='mean'
    ):
        r"""Light exposure level

        Calculate the aggregated (mean, median, etc.) light exposure level
        per epoch.

        Parameters
        ----------
        threshold: float, optional
            If not set to None, discard data below threshold before computing
            exposure levels.
            Default is None.
        start_time: str, optional
            If not set to None, discard data before start time,
            on a daily basis.
            Supported time string: 'HH:MM:SS'
            Default is None.
        stop_time: str, optional
            If not set to None, discard data after stop time, on a daily basis.
            Supported time string: 'HH:MM:SS'
            Default is None.
        agg: str, optional
            Aggregating function used to summarize exposure levels.
            Available functions: 'mean', 'median', 'std', etc.
            Default is 'mean'.

        Returns
        -------
        levels : pd.Series
            A pandas Series with aggregated light exposure levels per channel.
        """
        light_exposure = self._light_exposure(
            threshold=threshold,
            start_time=start_time,
            stop_time=stop_time
        )

        levels = getattr(light_exposure, agg)

        return levels()

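    # Usage sketch (illustrative only): median exposure level per channel,
    # restricted to daytime samples above 10 lx, e.g.
    #
    #   levels = light.light_exposure_level(
    #       threshold=10, start_time='08:00:00', stop_time='20:00:00',
    #       agg='median'
    #   )
    #
    # `agg` can be any pandas DataFrame reduction name ('mean', 'std', ...),
    # since it is resolved with getattr on the masked data.
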
    def summary_statistics_per_time_bin(
        self,
        bins='24h',
        agg_func=['mean', 'median', 'sum', 'std', 'min', 'max']
    ):
        r"""Summary statistics.

        Calculate summary statistics (ex: mean, median, etc.) according to a
        user-defined (regular or arbitrary) binning.

        Parameters
        ----------
        bins: str or list of tuples, optional
            If set to a string, bins is used to define a regular binning where
            every bin is of length "bins". Ex: "2h".
            Otherwise, the list of 2-tuples is used to define an arbitrary
            binning. Ex: [('2000-01-01 00:00:00', '2000-01-01 11:59:00')].
            Default is '24h'.
        agg_func: list, optional
            List of aggregation functions to be used on every bin.
            Default is ['mean', 'median', 'sum', 'std', 'min', 'max'].

        Returns
        -------
        ss : pd.DataFrame
            A pandas DataFrame with summary statistics per channel.
        """
        if isinstance(bins, str):
            summary_stats = self.data.resample(bins).agg(agg_func)
        elif isinstance(bins, list):
            df_col = []
            for idx, (start, end) in enumerate(bins):
                df_bins = self.data.loc[start:end, :].apply(
                    agg_func
                ).pivot_table(columns=agg_func)
                channels = {}
                for ch in df_bins.index:
                    channels[ch] = df_bins.loc[df_bins.index == ch]
                    channels[ch] = channels[ch].rename(
                        index={ch: idx}, inplace=False
                    )
                    channels[ch] = channels[ch].loc[:, agg_func]
                df_col.append(
                    pd.concat(
                        channels,
                        axis=1
                    )
                )
            summary_stats = pd.concat(df_col)

        return summary_stats

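    # Usage sketch (illustrative only): regular 2h bins, or an arbitrary list
    # of (start, end) bins, e.g.
    #
    #   ss = light.summary_statistics_per_time_bin(bins='2h')
    #   ss = light.summary_statistics_per_time_bin(
    #       bins=[('2000-01-01 00:00:00', '2000-01-01 11:59:00')],
    #       agg_func=['mean', 'max']
    #   )
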
    def TAT(
        self, threshold=None, start_time=None, stop_time=None, oformat=None
    ):
        r"""Time above light threshold.

        Calculate the total light exposure time above the threshold.

        Parameters
        ----------
        threshold: float, optional
            If not set to None, discard data below threshold before computing
            exposure levels.
            Default is None.
        start_time: str, optional
            If not set to None, discard data before start time,
            on a daily basis.
            Supported time string: 'HH:MM:SS'
            Default is None.
        stop_time: str, optional
            If not set to None, discard data after stop time, on a daily basis.
            Supported time string: 'HH:MM:SS'
            Default is None.
        oformat: str, optional
            Output format. Available formats: 'minute' or 'timedelta'.
            If set to 'minute', the result is in number of minutes.
            If set to 'timedelta', the result is a pd.Timedelta.
            If set to None, the result is in number of epochs.
            Default is None.

        Returns
        -------
        tat : pd.Series
            A pandas Series with aggregated light exposure times per channel.
        """
        available_formats = [None, 'minute', 'timedelta']
        if oformat not in available_formats:
            raise ValueError(
                'Specified output format ({}) not supported. '.format(oformat)
                + 'Available formats are: {}'.format(str(available_formats))
            )

        light_exposure_counts = self._light_exposure(
            threshold=threshold,
            start_time=start_time,
            stop_time=stop_time
        ).count()

        if oformat == 'minute':
            tat = light_exposure_counts * \
                self.data.index.freq.delta/pd.Timedelta('1min')
        elif oformat == 'timedelta':
            tat = light_exposure_counts * self.data.index.freq.delta
        else:
            tat = light_exposure_counts

        return tat

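    # Usage sketch (illustrative only): time spent above 1000 lx, reported in
    # minutes, e.g.
    #
    #   tat = light.TAT(threshold=1000, oformat='minute')
    #
    # With oformat=None, the result is a raw count of epochs above threshold.
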
    def TATp(
        self, threshold=None, start_time=None, stop_time=None, oformat=None
    ):
        r"""Time above light threshold (per day).

        Calculate the total light exposure time above the threshold, per
        calendar day.

        Parameters
        ----------
        threshold: float, optional
            If not set to None, discard data below threshold before computing
            exposure levels.
            Default is None.
        start_time: str, optional
            If not set to None, discard data before start time,
            on a daily basis.
            Supported time string: 'HH:MM:SS'
            Default is None.
        stop_time: str, optional
            If not set to None, discard data after stop time, on a daily basis.
            Supported time string: 'HH:MM:SS'
            Default is None.
        oformat: str, optional
            Output format. Available formats: 'minute' or 'timedelta'.
            If set to 'minute', the result is in number of minutes.
            If set to 'timedelta', the result is a pd.Timedelta.
            If set to None, the result is in number of epochs.
            Default is None.

        Returns
        -------
        tatp : pd.DataFrame
            A pandas DataFrame with aggregated light exposure times per
            channel and per day.
        """
        available_formats = [None, 'minute', 'timedelta']
        if oformat not in available_formats:
            raise ValueError(
                'Specified output format ({}) not supported. '.format(oformat)
                + 'Available formats are: {}'.format(str(available_formats))
            )

        light_exposure_counts_per_day = self._light_exposure(
            threshold=threshold,
            start_time=start_time,
            stop_time=stop_time
        ).groupby(self.data.index.date).count()

        if oformat == 'minute':
            tatp = light_exposure_counts_per_day * \
                self.data.index.freq.delta/pd.Timedelta('1min')
        elif oformat == 'timedelta':
            tatp = light_exposure_counts_per_day * self.data.index.freq.delta
        else:
            tatp = light_exposure_counts_per_day

        return tatp

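    # Usage sketch (illustrative only): same as TAT but per calendar day,
    # e.g.
    #
    #   tatp = light.TATp(threshold=1000, oformat='timedelta')
    #
    # Rows are calendar dates, columns are the light channels.
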
    def VAT(self, threshold=None):
        r"""Values above light threshold.

        Returns the light exposure values above the threshold.

        Parameters
        ----------
        threshold: float, optional
            If not set to None, discard data below threshold before computing
            exposure levels.
            Default is None.

        Returns
        -------
        vat : pd.Series
            A pandas Series with light exposure levels per channel.
        """
        return self._light_exposure(
            threshold=threshold,
            start_time=None,
            stop_time=None
        )

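    # Usage sketch (illustrative only):
    #
    #   vat = light.VAT(threshold=1000)
    #
    # Returns the original data with values below the threshold masked
    # (set to NaN), channel by channel.
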
    @classmethod
    def get_time_barycentre(cls, data):
        # Normalize each epoch to midnight.
        Y_j = data.index - data.index.normalize()
        # Convert to indices.
        Y_j /= pd.Timedelta(data.index.freq)
        # Compute the barycentre.
        bc = data.multiply(Y_j, axis=0).sum() / data.sum()
        return bc

    def MLiT(self, threshold):
        r"""Mean light timing.

        Mean light timing above threshold, MLiT^C.

        Parameters
        ----------
        threshold: float
            Threshold value.

        Returns
        -------
        MLiT : pd.DataFrame
            A pandas DataFrame with MLiT^C per channel.

        Notes
        -----
        The MLiT variable is defined in ref [1]_:

        .. math::

            MLiT^C = \frac{\sum_{j}^{m}\sum_{k}^{n} j\times I^{C}_{jk}}{
            \sum_{j}^{m}\sum_{k}^{n} I^{C}_{jk}}

        where :math:`I^{C}_{jk}` is equal to 1 if the light level is higher
        than the threshold C, m is the total number of epochs per day and n is
        the number of days covered by the data.

        References
        ----------
        .. [1] Reid K.J., Santostasi G., Baron K.G., Wilson J., Kang J.,
               Zee P.C., Timing and Intensity of Light Correlate with Body
               Weight in Adults. PLoS ONE 9(4): e92251.
               https://doi.org/10.1371/journal.pone.0092251
        """
        # Binarize the data and convert to float in order to handle
        # 'DivideByZero' errors.
        I_jk = self.binarized_data(threshold=threshold).astype('float64')

        MLiT = self.get_time_barycentre(I_jk)

        # Scaling factor: MLiT is now expressed in minutes since midnight.
        MLiT /= (pd.Timedelta('1min')/I_jk.index.freq)

        return MLiT

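    # Usage sketch (illustrative only): mean light timing above 500 lx,
    # expressed in minutes since midnight, e.g.
    #
    #   mlit = light.MLiT(threshold=500)
    #   # e.g. mlit['whitelight'] == 810.0 would correspond to 13:30.
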
    def MLiTp(self, threshold):
        r"""Mean light timing per day.

        Mean light timing above threshold, MLiT^C, per calendar day.

        Parameters
        ----------
        threshold: float
            Threshold value.

        Returns
        -------
        MLiTp : pd.DataFrame
            A pandas DataFrame with MLiT^C per channel and per day.

        Notes
        -----
        The MLiT variable is defined in ref [1]_:

        .. math::

            MLiT^C = \frac{\sum_{j}^{m}\sum_{k}^{n} j\times I^{C}_{jk}}{
            \sum_{j}^{m}\sum_{k}^{n} I^{C}_{jk}}

        where :math:`I^{C}_{jk}` is equal to 1 if the light level is higher
        than the threshold C, m is the total number of epochs per day and n is
        the number of days covered by the data.

        References
        ----------
        .. [1] Reid K.J., Santostasi G., Baron K.G., Wilson J., Kang J.,
               Zee P.C., Timing and Intensity of Light Correlate with Body
               Weight in Adults. PLoS ONE 9(4): e92251.
               https://doi.org/10.1371/journal.pone.0092251
        """
        # Binarize the data and convert to float in order to handle
        # 'DivideByZero' errors.
        I_jk = self.binarized_data(threshold=threshold).astype('float64')

        # Group data per day:
        MLiTp = I_jk.groupby(I_jk.index.date).apply(self.get_time_barycentre)

        # Scaling factor: MLiT is now expressed in minutes since midnight.
        MLiTp /= (pd.Timedelta('1min')/I_jk.index.freq)

        return MLiTp

    def get_light_extremum(self, extremum):
        r"""Light extremum.

        Return the index and the value of the requested extremum (min or max).

        Parameters
        ----------
        extremum: str
            Name of the extremum.
            Available: 'min' or 'max'.

        Returns
        -------
        ext : pd.DataFrame
            A pandas DataFrame with extremum info per channel.
        """
        extremum_list = ['min', 'max']
        if extremum not in extremum_list:
            raise ValueError(
                'Requested extremum ({}) not available.'.format(extremum)
                + ' Available options are:\n- min\n- max'
            )
        extremum_att = 'idxmax' if extremum == 'max' else 'idxmin'

        extremum_per_ch = []
        for ch in self.data.columns:
            index_ext = getattr(self.data.loc[:, ch], extremum_att)()
            extremum_per_ch.append(
                pd.Series(
                    {
                        'channel': ch,
                        'index': index_ext,
                        'value': self.data.loc[index_ext, ch]
                    }
                )
            )

        return pd.concat(extremum_per_ch, axis=1).T

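    # Usage sketch (illustrative only): timestamp and value of the brightest
    # epoch per channel, e.g.
    #
    #   ext_max = light.get_light_extremum('max')
    #
    # The result has one row per channel with 'channel', 'index' (timestamp
    # of the extremum) and 'value' columns.
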
    def LMX(self, length='5h', lowest=True):
        r"""Least or Most light period of length X

        Onset and mean hourly light exposure levels during the X least or most
        bright hours of the day.

        Parameters
        ----------
        length: str, optional
            Period length.
            Default is '5h'.
        lowest: bool, optional
            If lowest is set to True, the period of least light exposure is
            considered. Otherwise, consider the period of most light exposure.
            Default is True.

        Returns
        -------
        lmx_t, lmx: (pd.Timedelta, float)
            Onset and mean hourly light exposure level.

        Notes
        -----
        The LMX variable is derived from the L5 and M10 defined in [1]_ as the
        mean hourly activity levels during the 5/10 least/most active hours.

        References
        ----------
        .. [1] Van Someren, E.J.W., Lijzenga, C., Mirmiran, M., Swaab, D.F.
               (1997). Long-Term Fitness Training Improves the Circadian
               Rest-Activity Rhythm in Healthy Elderly Males. Journal of
               Biological Rhythms, 12(2), 146–156.
               http://doi.org/10.1177/074873049701200206
        """
        epoch_per_hour = pd.Timedelta('1h')/self.data.index.freq

        lmx_per_ch = []
        for ch in self.data.columns:
            lmx_ts, lmx = _lmx(self.data.loc[:, ch], length, lowest=lowest)
            lmx_per_ch.append(
                pd.Series(
                    {
                        'channel': ch,
                        'index': lmx_ts,
                        'value': lmx*epoch_per_hour
                    }
                )
            )

        return pd.concat(lmx_per_ch, axis=1).T

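    # Usage sketch (illustrative only): onset and mean hourly level of the
    # 5 darkest (L5-like) or 10 brightest (M10-like) hours, e.g.
    #
    #   l5 = light.LMX(length='5h', lowest=True)
    #   m10 = light.LMX(length='10h', lowest=False)
    #
    # Values are multiplied by the number of epochs per hour so that they are
    # expressed per hour rather than per epoch.
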
    def _RAR(self, rar_func, rar_name, binarize=False, threshold=0):
        r"""Generic RAR function

        Apply a generic RAR function to the light data, per channel.
        """
        if binarize:
            data = self.binarized_data(threshold=threshold)
        else:
            data = self.data

        rar_per_ch = []
        for ch in self.data.columns:
            rar = rar_func(data.loc[:, ch])
            rar_per_ch.append(
                pd.Series(
                    {
                        'channel': ch,
                        rar_name: rar
                    }
                )
            )

        return pd.concat(rar_per_ch, axis=1).T

    def IS(self, binarize=False, threshold=0):
        r"""Interdaily stability

        The Interdaily Stability (IS) quantifies the repeatability of the
        daily light exposure pattern over each day contained in the activity
        recording.

        Parameters
        ----------
        binarize: bool, optional
            If set to True, the data are binarized.
            Default is False.
        threshold: int, optional
            If binarize is set to True, data above this threshold are set to 1
            and to 0 otherwise.
            Default is 0.

        Returns
        -------
        is : pd.DataFrame
            A pandas DataFrame with IS per channel.

        Notes
        -----
        This variable is derived from the original IS variable defined in
        ref [1]_ as:

        .. math::

            IS = \frac{d^{24h}}{d^{1h}}

        with:

        .. math::

            d^{1h} = \sum_{i}^{n}\frac{\left(x_{i}-\bar{x}\right)^{2}}{n}

        where :math:`x_{i}` is the number of active (counts higher than a
        predefined threshold) minutes during the :math:`i^{th}` period,
        :math:`\bar{x}` is the mean of all data and :math:`n` is the number of
        periods covered by the actigraphy data, and with:

        .. math::

            d^{24h} = \sum_{i}^{p} \frac{
                      \left( \bar{x}_{h,i} - \bar{x} \right)^{2}
                      }{p}

        where :math:`\bar{x}_{h,i}` is the average number of active minutes
        over the :math:`i^{th}` period and :math:`p` is the number of periods
        per day. The average runs over all the days.

        For the record, this is the 24h value from the chi-square periodogram
        (Sokolove and Bushel, 1978).

        References
        ----------
        .. [1] Witting W., Kwa I.H., Eikelenboom P., Mirmiran M., Swaab D.F.
               Alterations in the circadian rest-activity rhythm in aging and
               Alzheimer's disease. Biol Psychiatry. 1990;27:563-572.
        """
        return self._RAR(
            _interdaily_stability,
            'IS',
            binarize=binarize,
            threshold=threshold
        )

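    # Usage sketch (illustrative only): interdaily stability of binarized
    # light exposure (above 10 lx), e.g.
    #
    #   is_ = light.IS(binarize=True, threshold=10)
    #
    # IS ranges from 0 (noise) to 1 (perfectly repeated daily pattern).
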
    def IV(self, binarize=False, threshold=0):
        r"""Intradaily variability

        The Intradaily Variability (IV) quantifies the variability of the
        light exposure pattern.

        Parameters
        ----------
        binarize: bool, optional
            If set to True, the data are binarized.
            Default is False.
        threshold: int, optional
            If binarize is set to True, data above this threshold are set to 1
            and to 0 otherwise.
            Default is 0.

        Returns
        -------
        iv: pd.DataFrame
            A pandas DataFrame with IV per channel.

        Notes
        -----
        It is defined in ref [1]_:

        .. math::

            IV = \frac{c^{1h}}{d^{1h}}

        with:

        .. math::

            d^{1h} = \sum_{i}^{n}\frac{\left(x_{i}-\bar{x}\right)^{2}}{n}

        where :math:`x_{i}` is the number of active (counts higher than a
        predefined threshold) minutes during the :math:`i^{th}` period,
        :math:`\bar{x}` is the mean of all data and :math:`n` is the number of
        periods covered by the actigraphy data, and with:

        .. math::

            c^{1h} = \sum_{i}^{n-1} \frac{
                     \left( x_{i+1} - x_{i} \right)^{2}
                     }{n-1}

        References
        ----------
        .. [1] Witting W., Kwa I.H., Eikelenboom P., Mirmiran M., Swaab D.F.
               Alterations in the circadian rest-activity rhythm in aging and
               Alzheimer's disease. Biol Psychiatry. 1990;27:563-572.
        """
        return self._RAR(
            _intradaily_variability,
            'IV',
            binarize=binarize,
            threshold=threshold
        )

    @staticmethod
    def _filter_butterworth(data, fs, fc_low, fc_high, N):
        # Filter order N (attenuation: -20*N dB/decade).
        # See https://dsp.stackexchange.com/questions/60455/
        # how-to-choose-order-and-cut-off-frequency-for-low-pass-butterworth-filter

        # Create a Butterworth filter (order: N) whose type
        # (highpass, lowpass, bandpass) depends on the input arguments.
        if (fc_low is None) and (fc_high is not None):
            # Set a lowpass filter
            Wn = fc_high
            btype = 'lowpass'
        elif (fc_low is not None) and (fc_high is None):
            # Set a highpass filter
            Wn = fc_low
            btype = 'highpass'
        elif (fc_low is not None) and (fc_high is not None):
            # Set a bandpass filter
            Wn = [fc_low, fc_high]
            btype = 'bandpass'
        else:
            raise ValueError(
                "Both high and low critical frequencies were set to None."
            )

        sos = signal.butter(
            N//2, Wn=Wn, btype=btype, fs=fs, output='sos'
        )
        data_smooth = signal.sosfiltfilt(sos, data)

        return data_smooth

    def filter_butterworth(self, fc_low, fc_high, N, channels=None):
        r"""Butterworth filtering

        Forward-backward digital filtering using an Nth-order Butterworth
        filter.

        Parameters
        ----------
        fc_low: float
            Critical frequency (lower).
        fc_high: float
            Critical frequency (higher).
        N: int
            Order of the filter.
        channels: list of str, optional
            Channel list. If set to None, use all available channels.
            Default is None.

        Returns
        -------
        filt: pd.DataFrame
            Filtered signal, per channel.

        Notes
        -----
        This function is essentially a wrapper to the scipy.signal.butter
        function. For more information, see [1]_.

        References
        ----------
        .. [1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.butter.html

        """  # noqa
        # Select channels of interest and
        # apply filtering to all available channels
        filt = self.get_channels(channels).apply(
            self._filter_butterworth,
            axis=0,
            raw=True,
            fs=1/self.data.index.freq.delta.total_seconds(),
            fc_low=fc_low,
            fc_high=fc_high,
            N=N
        )

        return filt

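    # Usage sketch (illustrative only, hypothetical channel name): keep only
    # slow variations of the light signal with a low-pass filter (cut-off
    # frequencies are expressed in Hz), e.g.
    #
    #   filt = light.filter_butterworth(
    #       fc_low=None, fc_high=1/(2*60*60), N=8, channels=['whitelight']
    #   )
    #
    # Setting fc_low=None yields a low-pass filter, fc_high=None a high-pass,
    # and both set a band-pass. The order N is halved internally because the
    # forward-backward pass doubles the effective filter order.
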