From 795e33d139f70a46feb6a44544a6939a628ba881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Bj=C3=A4reholt?= Date: Mon, 23 Oct 2023 20:17:18 +0200 Subject: [PATCH] feat: load sleep data from fitbit, many misc fixes (incl updated export formats) --- src/quantifiedme/derived/all_df.py | 46 ++++++++++++++-------- src/quantifiedme/derived/heartrate.py | 25 ++++++++---- src/quantifiedme/derived/screentime.py | 11 +++++- src/quantifiedme/derived/sleep.py | 17 +++++--- src/quantifiedme/load/fitbit.py | 54 +++++++++++++++++++++----- src/quantifiedme/load/oura.py | 33 ++++++++++------ src/quantifiedme/main.py | 3 +- 7 files changed, 137 insertions(+), 52 deletions(-) diff --git a/src/quantifiedme/derived/all_df.py b/src/quantifiedme/derived/all_df.py index 4824dbb..fa310fb 100644 --- a/src/quantifiedme/derived/all_df.py +++ b/src/quantifiedme/derived/all_df.py @@ -1,7 +1,6 @@ -import io +import itertools import logging import os -import sys from datetime import ( date, datetime, @@ -26,7 +25,9 @@ def load_all_df( - fast=True, screentime_events: list[Event] | None = None, ignore: list[Sources] = [] + fast=True, + screentime_events: list[Event] | None = None, + ignore: list[Sources] = [], ) -> pd.DataFrame: """ Loads a bunch of data into a single dataframe with one row per day. @@ -139,25 +140,26 @@ def check_new_data_in_range(df_source: pd.DataFrame, df_target: pd.DataFrame) -> ) @click.option( "--csv", - is_flag=True, - help="Print as CSV", + help="Save to CSV", ) -def all_df(fast=False, csv=False): - """loads all data and prints a summary""" +def all_df(fast=False, csv=None): + """Loads all data and prints a summary.""" logging.basicConfig(level=logging.INFO) # capture output if csv - if csv: - sys.stdout = io.StringIO() df = load_all_df(fast=fast) - if csv: - sys.stdout = sys.__stdout__ - if csv: - print(df.to_csv()) - return + # convert duration columns that are timedelta to hours + for col in itertools.chain( + df.columns[df.dtypes == "timedelta64[ns]"], + df.columns[df.dtypes == "timedelta64[us]"], + ): + df[col] = df[col].dt.total_seconds() / 3600 + + # dont truncate output + pd.set_option("display.max_columns", None) + pd.set_option("display.max_rows", None) - print(df) print(df.describe()) # check for missing data @@ -168,13 +170,23 @@ def all_df(fast=False, csv=False): print(df_days_na) print("Total days: ", len(df)) + # drop columns with too much missing data + columns_to_drop = df.columns[df.isna().sum() > len(df) * 0.5] + df = df.dropna(axis=1, thresh=int(len(df) * 0.5)) + print("Dropped columns with too much missing data: ", columns_to_drop) + # keep days with full coverage - df = df.dropna() - print("Total days with full coverage: ", len(df)) + # df = df.dropna() + # print("Total days with full coverage: ", len(df)) print("Final dataframe:") print(df) + if csv: + print(f"Saving to {csv}") + with open(csv, "w") as f: + f.write(df.to_csv()) + if __name__ == "__main__": all_df() diff --git a/src/quantifiedme/derived/heartrate.py b/src/quantifiedme/derived/heartrate.py index 64187a3..1474be2 100644 --- a/src/quantifiedme/derived/heartrate.py +++ b/src/quantifiedme/derived/heartrate.py @@ -1,22 +1,24 @@ -from datetime import timezone - +import click import pandas as pd +from ..load import fitbit, oura, whoop + # load heartrate from multiple sources, combine into a single dataframe def load_heartrate_df() -> pd.DataFrame: - from ..load import oura, fitbit, whoop - dfs = [] + print("# Loading Oura heartrate data") oura_df = oura.load_heartrate_df() oura_df["source"] = "oura" dfs.append(oura_df) + print("# Loading Fitbit heartrate data") fitbit_df = fitbit.load_heartrate_df() fitbit_df["source"] = "fitbit" dfs.append(fitbit_df) + print("# Loading Whoop heartrate data") whoop_df = whoop.load_heartrate_df() whoop_df["source"] = "whoop" dfs.append(whoop_df) @@ -28,7 +30,7 @@ def load_heartrate_df() -> pd.DataFrame: def load_heartrate_minutes_df(): """We consider using minute-resolution a decent starting point for summary heartrate data. - + NOTE: ignores source, combines all sources into a single point per freq. """ df = load_heartrate_df().drop(columns=["source"]) @@ -54,9 +56,18 @@ def load_heartrate_summary_df( df[f"hr_duration_{zone}"] = df_zones[df_zones == zone].groupby( pd.Grouper(freq=freq) ).count() * pd.Timedelta(minutes=1) + # replace NaT with 0 + df = df.fillna(pd.Timedelta(0)) return df -if __name__ == "__main__": - df = load_heartrate_summary_df() +@click.command() +@click.option("--freq", default="D") +def heartrate(freq: str): + """Loads heartrate data.""" + df = load_heartrate_summary_df(freq=freq) print(df) + + +if __name__ == "__main__": + heartrate() diff --git a/src/quantifiedme/derived/screentime.py b/src/quantifiedme/derived/screentime.py index 4ebaac1..5744422 100644 --- a/src/quantifiedme/derived/screentime.py +++ b/src/quantifiedme/derived/screentime.py @@ -1,5 +1,6 @@ import logging import pickle +from collections import defaultdict from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Literal @@ -194,6 +195,9 @@ def classify(events: list[Event], personal: bool) -> list[Event]: def load_category_df(events: list[Event]) -> pd.DataFrame: tss = {} all_categories = list({t for e in events for t in e.data["$tags"]}) + events_by_date = defaultdict(list) + for e in events: + events_by_date[e.timestamp.date()].append(e) for cat in all_categories: try: tss[cat] = categorytime_per_day(events, cat) @@ -203,14 +207,17 @@ def load_category_df(events: list[Event]) -> pd.DataFrame: df = pd.DataFrame(tss) df = df.replace(np.nan, 0) df["All_cols"] = df.sum(axis=1) - df["All_events"] = sum([e.duration / 60 / 60 for e in events], timedelta(0)) + df["All_events"] = [ + sum((e.duration for e in events_by_date[d]), start=timedelta(0)) + for d in df.index + ] return df @click.command() @click.option("--csv", is_flag=True, help="Print as CSV") def screentime(csv: bool): - """Load all screentime and print total duration""" + """Loads screentime data, and prints total duration.""" hostnames = load_config()["data"]["activitywatch"]["hostnames"] events = load_screentime( since=datetime.now(tz=timezone.utc) - timedelta(days=90), diff --git a/src/quantifiedme/derived/sleep.py b/src/quantifiedme/derived/sleep.py index 90358d5..68d3289 100644 --- a/src/quantifiedme/derived/sleep.py +++ b/src/quantifiedme/derived/sleep.py @@ -8,6 +8,7 @@ import matplotlib.pyplot as plt import pandas as pd +from ..load.fitbit import load_sleep_df as load_fitbit_sleep_df from ..load.oura import load_sleep_df as load_oura_sleep_df from ..load.whoop import load_sleep_df as load_whoop_sleep_df @@ -30,7 +31,7 @@ def _merge_several_sleep_records_per_day(df: pd.DataFrame) -> pd.DataFrame: if duplicates: df = df.groupby(df.index).agg({"start": "min", "end": "max", "score": "max"}) # type: ignore df["duration"] = df["end"] - df["start"] - logger.warning(f"Merged {duplicates} duplicate index entries") + logger.warning(f"Merged {duplicates} duplicate sleep entries") return df @@ -41,16 +42,19 @@ def load_sleep_df(ignore: list[str] = [], aggregate=True) -> pd.DataFrame: df: pd.DataFrame = pd.DataFrame() # Fitbit - # df = join(df, load_fitbit_sleep_df(), rsuffix="_fitbit") + if "fitbit" not in ignore: + df_fitbit = load_fitbit_sleep_df() + df_fitbit = _merge_several_sleep_records_per_day(df_fitbit) + df = join(df, df_fitbit.add_suffix("_fitbit")) # Oura if "oura" not in ignore: df_oura = load_oura_sleep_df() + df_oura = _merge_several_sleep_records_per_day(df_oura) df = join(df, df_oura.add_suffix("_oura")) # Whoop if "whoop" not in ignore: - # FIXME: can return multiple sleep records per day, which we should merge df_whoop = load_whoop_sleep_df() df_whoop = _merge_several_sleep_records_per_day(df_whoop) df = join(df, df_whoop.add_suffix("_whoop")) @@ -83,13 +87,16 @@ def join(df_target, df_source, **kwargs) -> pd.DataFrame: @click.option("--plot/--no-plot", default=False) @click.option("--dropna/--no-dropna", default=True) def sleep(aggregate=False, plot=False, dropna=True): + """Loads sleep data""" df = load_sleep_df(aggregate=aggregate) if dropna: df = df.dropna() if not aggregate: - print(df[["duration_whoop", "duration_oura", "score_oura", "score_whoop"]]) + sources = ["fitbit", "oura", "whoop"] + cols = ["duration", "score"] + print(df[[f"{c}_{s}" for s in sources for c in cols]]) # compare durations to ensure they are matching - df_durations = df[["duration_oura", "duration_whoop"]].apply( + df_durations = df[[f"duration_{s}" for s in sources]].apply( lambda x: x.dt.seconds / 60 / 60 ) print(df_durations.head()) diff --git a/src/quantifiedme/load/fitbit.py b/src/quantifiedme/load/fitbit.py index cf1e400..c2d7643 100644 --- a/src/quantifiedme/load/fitbit.py +++ b/src/quantifiedme/load/fitbit.py @@ -1,13 +1,52 @@ -from ..config import load_config -from ..cache import memory - +import multiprocessing from pathlib import Path +import matplotlib.pyplot as plt import pandas as pd +from ..cache import memory +from ..config import load_config + def load_sleep_df() -> pd.DataFrame: - raise NotImplementedError + filepath = load_config()["data"]["fitbit"] + filepath = Path(filepath).expanduser() + assert filepath.exists() + + # filepath is the root folder of an unzipped Fitbit export + # sleep data is in `Global Export Data/sleep-YYYY-MM-DD.json` + # we need to combine all of these files into a single dataframe + files = filepath.glob("Global Export Data/sleep-*.json") + + # load each file into a dataframe + dfs = [] + for f in sorted(files): + dfs.append(_load_sleep_file(f)) + + # combine all the dataframes into a single dataframe + df = pd.concat(dfs) + + return df + + +def _load_sleep_file(filepath): + with open(filepath) as f: + df = pd.read_json(f) + df = df[["dateOfSleep", "startTime", "endTime", "duration", "efficiency"]] + df = df.rename( + columns={ + "dateOfSleep": "date", + "startTime": "start", + "endTime": "end", + "efficiency": "score", + } + ) + df["date"] = pd.to_datetime(df["date"], utc=True) + df["start"] = pd.to_datetime(df["start"], utc=True) + df["end"] = pd.to_datetime(df["end"], utc=True) + df["duration"] = pd.to_timedelta(df["duration"] / 1000, unit="s") + df = df.set_index("date") + return df def _load_heartrate_file(filepath): @@ -28,16 +67,14 @@ def load_heartrate_df() -> pd.DataFrame: filepath = Path(filepath).expanduser() # filepath is the root folder of an unzipped Fitbit export - # heartrate data is split into daily files in `Physical Activity/heart_rate-YYYY-MM-DD.json` + # heartrate data is split into daily files in `Global Export Data/heart_rate-YYYY-MM-DD.json` # we need to combine all of these files into a single dataframe # get all the files in the folder - files = filepath.glob("Physical Activity/heart_rate-*.json") + files = filepath.glob("Global Export Data/heart_rate-*.json") # load each file into a dataframe # parallelize to speed up the process - import multiprocessing - pool = multiprocessing.Pool(20) dfs = pool.map(_load_heartrate_file, sorted(files)) @@ -54,6 +91,5 @@ def load_heartrate_df() -> pd.DataFrame: df = load_heartrate_df() print(df) df.plot() - import matplotlib.pyplot as plt plt.show() diff --git a/src/quantifiedme/load/oura.py b/src/quantifiedme/load/oura.py index 4019c15..4c8011d 100644 --- a/src/quantifiedme/load/oura.py +++ b/src/quantifiedme/load/oura.py @@ -1,4 +1,5 @@ import json +import logging from datetime import timedelta from pathlib import Path @@ -9,8 +10,12 @@ from ..config import load_config +logger = logging.getLogger(__name__) -def load_data(): + +def load_data_old(): + """Loads the data from the legacy export json file""" + logger.warning("Using legacy data format") filepath = load_config()["data"]["oura"] filepath = Path(filepath).expanduser() with open(filepath) as f: @@ -19,31 +24,37 @@ def load_data(): def load_sleep_df() -> pd.DataFrame: - data = load_data() + # new format + path = load_config()["data"]["oura-sleep"] + path = Path(path).expanduser() + with open(path) as f: + data = json.load(f) df = pd.DataFrame(data["sleep"]) - # summary_date is the "start" date + # "day" (prev "summary_date") is the "start" date # https://cloud.ouraring.com/docs/sleep # NOTE: Not sure why I have to subtract a day here, shouldn't be necessary according to docs, # but necessary for it to be correct. - df["summary_date"] = pd.to_datetime(df["summary_date"], utc=True) - timedelta( - days=1 - ) + df["day"] = pd.to_datetime(df["day"], utc=True) - timedelta(days=1) df["bedtime_start"] = pd.to_datetime(df["bedtime_start"], utc=True) df["bedtime_end"] = pd.to_datetime(df["bedtime_end"], utc=True) df = df.rename( columns={ - "summary_date": "timestamp", + "day": "timestamp", "bedtime_start": "start", "bedtime_end": "end", } ) df = df.set_index("timestamp") df["duration"] = df["end"] - df["start"] + + # remove naps (sleep < 1h) + df = df[df["duration"] > timedelta(hours=1)] + return df[["start", "end", "duration", "score"]] # type: ignore def load_readiness_df() -> pd.DataFrame: - data = load_data() + data = load_data_old() df = pd.DataFrame(data["readiness"]) df["summary_date"] = pd.to_datetime(df["summary_date"], utc=True) df = df.set_index("summary_date") @@ -51,7 +62,7 @@ def load_readiness_df() -> pd.DataFrame: def load_activity_df() -> pd.DataFrame: - data = load_data() + data = load_data_old() df = pd.DataFrame(data["activity"]) df["summary_date"] = pd.to_datetime(df["summary_date"], utc=True) df = df.set_index("summary_date") @@ -59,7 +70,6 @@ def load_activity_df() -> pd.DataFrame: def load_heartrate_df() -> pd.DataFrame: - # new data format filepath = load_config()["data"]["oura-heartrate"] filepath = Path(filepath).expanduser() with open(filepath) as f: @@ -81,6 +91,7 @@ def load_heartrate_df() -> pd.DataFrame: entry["heart_rate"]["interval"], ) for entry in raw["sleep"] + if "heart_rate" in entry ] data_heartrate_sleep = [ (iso8601.parse_date(start) + i * timedelta(seconds=interval), bpm) @@ -105,7 +116,7 @@ def load_heartrate_df() -> pd.DataFrame: @click.command() def oura(): - """TODO, just loads all data""" + """Loads Oura data""" sleep = load_sleep_df() activity = load_activity_df() readiness = load_readiness_df() diff --git a/src/quantifiedme/main.py b/src/quantifiedme/main.py index 7b15ebc..843e8f0 100644 --- a/src/quantifiedme/main.py +++ b/src/quantifiedme/main.py @@ -3,6 +3,7 @@ from .derived.all_df import all_df from .derived.screentime import screentime from .derived.sleep import sleep +from .derived.heartrate import heartrate from .load.habitbull import habits from .load.location import locate from .load.oura import oura @@ -15,7 +16,7 @@ def main(): pass -for subcmd in [locate, habits, oura, screentime, all_df, sleep]: +for subcmd in [locate, habits, oura, screentime, all_df, sleep, heartrate]: main.add_command(subcmd) main.add_command(qslang, name="qslang")