Commit 795e33d
feat: load sleep data from fitbit, many misc fixes (incl updated export formats)
ErikBjare committed Oct 23, 2023
1 parent 2486975 commit 795e33d
Showing 7 changed files with 137 additions and 52 deletions.
46 changes: 29 additions & 17 deletions src/quantifiedme/derived/all_df.py
@@ -1,7 +1,6 @@
import io
import itertools
import logging
import os
import sys
from datetime import (
date,
datetime,
@@ -26,7 +25,9 @@


def load_all_df(
fast=True, screentime_events: list[Event] | None = None, ignore: list[Sources] = []
fast=True,
screentime_events: list[Event] | None = None,
ignore: list[Sources] = [],
) -> pd.DataFrame:
"""
Loads a bunch of data into a single dataframe with one row per day.
@@ -139,25 +140,26 @@ def check_new_data_in_range(df_source: pd.DataFrame, df_target: pd.DataFrame) ->
)
@click.option(
"--csv",
is_flag=True,
help="Print as CSV",
help="Save to CSV",
)
def all_df(fast=False, csv=False):
"""loads all data and prints a summary"""
def all_df(fast=False, csv=None):
"""Loads all data and prints a summary."""
logging.basicConfig(level=logging.INFO)

# capture output if csv
if csv:
sys.stdout = io.StringIO()
df = load_all_df(fast=fast)
if csv:
sys.stdout = sys.__stdout__

if csv:
print(df.to_csv())
return
# convert duration columns that are timedelta to hours
for col in itertools.chain(
df.columns[df.dtypes == "timedelta64[ns]"],
df.columns[df.dtypes == "timedelta64[us]"],
):
df[col] = df[col].dt.total_seconds() / 3600

# don't truncate output
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

print(df)
print(df.describe())

# check for missing data
@@ -168,13 +170,23 @@ def all_df(fast=False, csv=False):
print(df_days_na)
print("Total days: ", len(df))

# drop columns with too much missing data
columns_to_drop = df.columns[df.isna().sum() > len(df) * 0.5]
df = df.dropna(axis=1, thresh=int(len(df) * 0.5))
print("Dropped columns with too much missing data: ", columns_to_drop)

# keep days with full coverage
df = df.dropna()
print("Total days with full coverage: ", len(df))
# df = df.dropna()
# print("Total days with full coverage: ", len(df))

print("Final dataframe:")
print(df)

if csv:
print(f"Saving to {csv}")
with open(csv, "w") as f:
f.write(df.to_csv())


if __name__ == "__main__":
all_df()
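
The timedelta-to-hours conversion above is a small, reusable pattern. A minimal runnable sketch of it, using a hypothetical one-column dataframe rather than the project's real data:

    import pandas as pd

    # Hypothetical duration column; the real code iterates over every
    # timedelta64[ns] and timedelta64[us] column of the loaded dataframe.
    df = pd.DataFrame({"sleep": pd.to_timedelta([7.5, 8.25], unit="h")})
    for col in df.columns[df.dtypes == "timedelta64[ns]"]:
        df[col] = df[col].dt.total_seconds() / 3600
    print(df)  # the sleep column now holds the floats 7.5 and 8.25
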
25 changes: 18 additions & 7 deletions src/quantifiedme/derived/heartrate.py
@@ -1,22 +1,24 @@
from datetime import timezone

import click
import pandas as pd

from ..load import fitbit, oura, whoop


# load heartrate from multiple sources, combine into a single dataframe
def load_heartrate_df() -> pd.DataFrame:
from ..load import oura, fitbit, whoop

dfs = []

print("# Loading Oura heartrate data")
oura_df = oura.load_heartrate_df()
oura_df["source"] = "oura"
dfs.append(oura_df)

print("# Loading Fitbit heartrate data")
fitbit_df = fitbit.load_heartrate_df()
fitbit_df["source"] = "fitbit"
dfs.append(fitbit_df)

print("# Loading Whoop heartrate data")
whoop_df = whoop.load_heartrate_df()
whoop_df["source"] = "whoop"
dfs.append(whoop_df)
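
The loader follows a simple combine pattern: load each source, tag rows with their origin, and concatenate. A minimal sketch with hypothetical frames standing in for the real per-source loaders:

    import pandas as pd

    # Hypothetical per-source frames; the real loaders return heartrate
    # samples indexed by timestamp.
    oura = pd.DataFrame(
        {"hr": [60, 62]},
        index=pd.to_datetime(["2023-10-01 00:00", "2023-10-01 00:01"]),
    )
    oura["source"] = "oura"
    whoop = pd.DataFrame({"hr": [61]}, index=pd.to_datetime(["2023-10-01 00:00"]))
    whoop["source"] = "whoop"

    df = pd.concat([oura, whoop]).sort_index()
    print(df)  # one frame, with a source column marking each row's origin
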
@@ -28,7 +30,7 @@ def load_heartrate_df() -> pd.DataFrame:

def load_heartrate_minutes_df():
"""We consider using minute-resolution a decent starting point for summary heartrate data.
NOTE: ignores source, combines all sources into a single point per freq.
"""
df = load_heartrate_df().drop(columns=["source"])
@@ -54,9 +56,18 @@ def load_heartrate_summary_df(
df[f"hr_duration_{zone}"] = df_zones[df_zones == zone].groupby(
pd.Grouper(freq=freq)
).count() * pd.Timedelta(minutes=1)
# replace NaT with 0
df = df.fillna(pd.Timedelta(0))
return df


if __name__ == "__main__":
df = load_heartrate_summary_df()
@click.command()
@click.option("--freq", default="D")
def heartrate(freq: str):
"""Loads heartrate data."""
df = load_heartrate_summary_df(freq=freq)
print(df)


if __name__ == "__main__":
heartrate()
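
The hr_duration_{zone} computation in load_heartrate_summary_df counts minute-resolution samples per zone and turns each count into a duration. A self-contained sketch of that pattern, with hypothetical zone labels standing in for the real binned heartrate values:

    import pandas as pd

    # Hypothetical minute-resolution zone labels; the real code derives
    # these by binning heartrate readings into named zones.
    idx = pd.date_range("2023-10-01", periods=120, freq="min")
    df_zones = pd.Series(["low"] * 90 + ["mid"] * 30, index=idx)

    df = pd.DataFrame()
    for zone in ["low", "mid"]:
        df[f"hr_duration_{zone}"] = df_zones[df_zones == zone].groupby(
            pd.Grouper(freq="D")
        ).count() * pd.Timedelta(minutes=1)
    df = df.fillna(pd.Timedelta(0))  # a zone absent on a day becomes 0, not NaT
    print(df)  # hr_duration_low: 0 days 01:30:00, hr_duration_mid: 0 days 00:30:00
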
11 changes: 9 additions & 2 deletions src/quantifiedme/derived/screentime.py
@@ -1,5 +1,6 @@
import logging
import pickle
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Literal
@@ -194,6 +195,9 @@ def classify(events: list[Event], personal: bool) -> list[Event]:
def load_category_df(events: list[Event]) -> pd.DataFrame:
tss = {}
all_categories = list({t for e in events for t in e.data["$tags"]})
events_by_date = defaultdict(list)
for e in events:
events_by_date[e.timestamp.date()].append(e)
for cat in all_categories:
try:
tss[cat] = categorytime_per_day(events, cat)
@@ -203,14 +207,17 @@ def load_category_df(events: list[Event]) -> pd.DataFrame:
df = pd.DataFrame(tss)
df = df.replace(np.nan, 0)
df["All_cols"] = df.sum(axis=1)
df["All_events"] = sum([e.duration / 60 / 60 for e in events], timedelta(0))
df["All_events"] = [
sum((e.duration for e in events_by_date[d]), start=timedelta(0))
for d in df.index
]
return df


@click.command()
@click.option("--csv", is_flag=True, help="Print as CSV")
def screentime(csv: bool):
"""Load all screentime and print total duration"""
"""Loads screentime data, and prints total duration."""
hostnames = load_config()["data"]["activitywatch"]["hostnames"]
events = load_screentime(
since=datetime.now(tz=timezone.utc) - timedelta(days=90),
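
The All_events change above fixes a real bug: the old line summed scaled durations over all events into one global value (broadcast to every row), while the new code sums each day's own event durations. A minimal sketch of the per-day bucketing, with a hypothetical Event stand-in for the aw-core model:

    from collections import defaultdict
    from dataclasses import dataclass
    from datetime import datetime, timedelta

    @dataclass
    class Event:  # hypothetical stand-in for the aw-core Event model
        timestamp: datetime
        duration: timedelta

    events = [
        Event(datetime(2023, 10, 1, 9), timedelta(hours=2)),
        Event(datetime(2023, 10, 1, 13), timedelta(hours=1)),
        Event(datetime(2023, 10, 2, 10), timedelta(hours=3)),
    ]

    # Bucket events by calendar date, then sum durations within each bucket.
    events_by_date = defaultdict(list)
    for e in events:
        events_by_date[e.timestamp.date()].append(e)

    for d in sorted(events_by_date):
        total = sum((e.duration for e in events_by_date[d]), start=timedelta(0))
        print(d, total)  # 3:00:00 on both days
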
17 changes: 12 additions & 5 deletions src/quantifiedme/derived/sleep.py
@@ -8,6 +8,7 @@
import matplotlib.pyplot as plt
import pandas as pd

from ..load.fitbit import load_sleep_df as load_fitbit_sleep_df
from ..load.oura import load_sleep_df as load_oura_sleep_df
from ..load.whoop import load_sleep_df as load_whoop_sleep_df

@@ -30,7 +31,7 @@ def _merge_several_sleep_records_per_day(df: pd.DataFrame) -> pd.DataFrame:
if duplicates:
df = df.groupby(df.index).agg({"start": "min", "end": "max", "score": "max"}) # type: ignore
df["duration"] = df["end"] - df["start"]
logger.warning(f"Merged {duplicates} duplicate index entries")
logger.warning(f"Merged {duplicates} duplicate sleep entries")
return df


@@ -41,16 +42,19 @@ def load_sleep_df(ignore: list[str] = [], aggregate=True) -> pd.DataFrame:
df: pd.DataFrame = pd.DataFrame()

# Fitbit
# df = join(df, load_fitbit_sleep_df(), rsuffix="_fitbit")
if "fitbit" not in ignore:
df_fitbit = load_fitbit_sleep_df()
df_fitbit = _merge_several_sleep_records_per_day(df_fitbit)
df = join(df, df_fitbit.add_suffix("_fitbit"))

# Oura
if "oura" not in ignore:
df_oura = load_oura_sleep_df()
df_oura = _merge_several_sleep_records_per_day(df_oura)
df = join(df, df_oura.add_suffix("_oura"))

# Whoop
if "whoop" not in ignore:
# FIXME: can return multiple sleep records per day, which we should merge
df_whoop = load_whoop_sleep_df()
df_whoop = _merge_several_sleep_records_per_day(df_whoop)
df = join(df, df_whoop.add_suffix("_whoop"))
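
Each source frame gets a column suffix before being joined on the shared date index, so same-named columns stay distinguishable. A minimal sketch of that pattern using plain DataFrame.join on hypothetical frames (the project's own join helper additionally handles the initially-empty target seen in load_sleep_df):

    import pandas as pd

    dates = pd.to_datetime(["2023-10-01", "2023-10-02"])
    df_oura = pd.DataFrame({"score": [85, 90]}, index=dates)
    df_whoop = pd.DataFrame({"score": [80, 88]}, index=dates)

    df = df_oura.add_suffix("_oura").join(df_whoop.add_suffix("_whoop"))
    print(df.columns.tolist())  # ['score_oura', 'score_whoop']
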
@@ -83,13 +87,16 @@ def join(df_target, df_source, **kwargs) -> pd.DataFrame:
@click.option("--plot/--no-plot", default=False)
@click.option("--dropna/--no-dropna", default=True)
def sleep(aggregate=False, plot=False, dropna=True):
"""Loads sleep data"""
df = load_sleep_df(aggregate=aggregate)
if dropna:
df = df.dropna()
if not aggregate:
print(df[["duration_whoop", "duration_oura", "score_oura", "score_whoop"]])
sources = ["fitbit", "oura", "whoop"]
cols = ["duration", "score"]
print(df[[f"{c}_{s}" for s in sources for c in cols]])
# compare durations to ensure they are matching
df_durations = df[["duration_oura", "duration_whoop"]].apply(
df_durations = df[[f"duration_{s}" for s in sources]].apply(
lambda x: x.dt.seconds / 60 / 60
)
print(df_durations.head())
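
_merge_several_sleep_records_per_day, now applied to all three sources, collapses multiple records landing on the same date into one row. A runnable sketch of that groupby/agg merge on hypothetical records (the duplicate count here is computed naively for illustration):

    import pandas as pd

    # Two hypothetical records on the same date, e.g. a main sleep and a nap.
    df = pd.DataFrame(
        {
            "start": pd.to_datetime(["2023-10-01 23:00", "2023-10-02 14:00"]),
            "end": pd.to_datetime(["2023-10-02 07:00", "2023-10-02 15:00"]),
            "score": [88, 60],
        },
        index=pd.to_datetime(["2023-10-02", "2023-10-02"]).date,
    )

    duplicates = len(df) - len(set(df.index))
    if duplicates:
        df = df.groupby(df.index).agg({"start": "min", "end": "max", "score": "max"})
        df["duration"] = df["end"] - df["start"]
    print(df)  # one row: earliest start, latest end, best score
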
54 changes: 45 additions & 9 deletions src/quantifiedme/load/fitbit.py
@@ -1,13 +1,52 @@
from ..config import load_config
from ..cache import memory

import multiprocessing
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

from ..cache import memory
from ..config import load_config


def load_sleep_df() -> pd.DataFrame:
raise NotImplementedError
filepath = load_config()["data"]["fitbit"]
filepath = Path(filepath).expanduser()
assert filepath.exists()

# filepath is the root folder of an unzipped Fitbit export
# sleep data is in `Global Export Data/sleep-YYYY-MM-DD.json`
# we need to combine all of these files into a single dataframe
files = filepath.glob("Global Export Data/sleep-*.json")

# load each file into a dataframe
dfs = []
for f in sorted(files):
dfs.append(_load_sleep_file(f))

# combine all the dataframes into a single dataframe
df = pd.concat(dfs)

return df


def _load_sleep_file(filepath):
with open(filepath) as f:
df = pd.read_json(f)
df = df[["dateOfSleep", "startTime", "endTime", "duration", "efficiency"]]
df = df.rename(
columns={
"dateOfSleep": "date",
"startTime": "start",
"endTime": "end",
"efficiency": "score",
}
)
df["date"] = pd.to_datetime(df["date"], utc=True)
df["start"] = pd.to_datetime(df["start"], utc=True)
df["end"] = pd.to_datetime(df["end"], utc=True)
df["duration"] = pd.to_timedelta(df["duration"] / 1000, unit="s")
df = df.set_index("date")
return df


def _load_heartrate_file(filepath):
@@ -28,16 +67,14 @@ def load_heartrate_df() -> pd.DataFrame:
filepath = Path(filepath).expanduser()

# filepath is the root folder of an unzipped Fitbit export
# heartrate data is split into daily files in `Physical Activity/heart_rate-YYYY-MM-DD.json`
# heartrate data is split into daily files in `Global Export Data/heart_rate-YYYY-MM-DD.json`
# we need to combine all of these files into a single dataframe

# get all the files in the folder
files = filepath.glob("Physical Activity/heart_rate-*.json")
files = filepath.glob("Global Export Data/heart_rate-*.json")

# load each file into a dataframe
# parallelize to speed up the process
import multiprocessing

pool = multiprocessing.Pool(20)
dfs = pool.map(_load_heartrate_file, sorted(files))
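
Each daily file parses independently, so the loader maps a worker pool over the sorted file list. A generic sketch of the pattern, with a toy function in place of the real file parser:

    import multiprocessing

    def parse(x: int) -> int:
        # Stand-in for _load_heartrate_file; the mapped function must be
        # defined at module top level so worker processes can unpickle it.
        return x * x

    if __name__ == "__main__":
        with multiprocessing.Pool(4) as pool:
            results = pool.map(parse, [1, 2, 3])
        print(results)  # [1, 4, 9]
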

@@ -54,6 +91,5 @@ def load_heartrate_df() -> pd.DataFrame:
df = load_heartrate_df()
print(df)
df.plot()
import matplotlib.pyplot as plt

plt.show()
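
A hedged usage sketch of the new sleep loader, assuming the config's data.fitbit entry points at the root of an unzipped Fitbit export:

    from quantifiedme.load.fitbit import load_sleep_df

    # Assumes sleep-YYYY-MM-DD.json files exist under "Global Export Data/".
    df = load_sleep_df()
    print(df[["start", "end", "duration", "score"]].tail())
    print("mean duration:", df["duration"].mean())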