data-prep-for-pca.ipynb
How to get one large spreadsheet to run a PCA on and how to run the PCA in R.
Exporting data from Fitbit, Oura, RescueTime and Last.fm in preparation of a PCA. This notebook also includes the R code for making the visualizations at the end, but unfortunately due to some library incompatibilities the R code cannot be run on the Open Humans Notebook servers!
Give the start & end dates for how much data you want to be included in the spreadsheet. Longer periods take longer to extract.
This code contains the functions to extract the data from the different sources. It might not be ideal or great, but it works :D
Here we're actually exporting the data from the different sources
This merges the data into the all_data_df and also saves it into a file called combined_data_for_pca.csv.
Below is the code to run the PCA in R (again: this won't work on the OH Notebook server)
Exporting data from Fitbit, Oura, RescueTime and Last.fm in preparation of a PCA. This notebook also includes the R code for making the visualizations at the end, but unfortunately due to some library incompatibilities the R code cannot be run on the Open Humans Notebook servers!
### GET DATA FROM DIFFERENT APPS
from ohapi import api
import os
import requests
import tempfile
import json
import pandas as pd
from datetime import datetime, date, timedelta
from collections import defaultdict
# Look up this member's Open Humans file index via the notebook's OAuth token.
# The returned dict has a 'data' list of file records (basename, source,
# download_url) used by every extractor below.
user_details = api.exchange_oauth2_member(os.environ.get('OH_ACCESS_TOKEN'))
Give the start & end dates for how much data you want to be included in the spreadsheet. Longer periods take longer to extract.
# Inclusive date window for the export; longer windows take longer to pull.
START_DATE = date.fromisoformat('2021-09-01')
END_DATE = date.fromisoformat('2022-06-07')
This code contains the functions to extract the data from the different sources. It might not be ideal or great, but it works :D
# Download the raw Fitbit JSON export from the member's Open Humans files.
# NOTE(review): stays '' (empty string) when no matching file exists —
# get_fitbit_weight would then fail on the dict lookup; confirm every member
# running this has a Fitbit export.
fitbit = ''
for i in user_details['data']:
    if i['basename'] == 'fitbit-data.json' and i['source'] == 'direct-sharing-102':
        fitbit = json.loads(requests.get(i['download_url']).content)
def get_fitbit_weight(fitbit,start,end):
start_year = start.year
end_year = end.year
log_dates = []
weights = []
bodyfats = []
for year in range(start_year,end_year+1):
for month in range(1,13):
m = str(month)
if len(m) == 1:
m = "0"+ m
month_key = '{}-{}'.format(year,m)
if month_key in fitbit['weight-log'].keys():
monthly_values = fitbit['weight-log'][month_key]['weight']
for entry in monthly_values:
d = date.fromisoformat(entry['date'])
if (d >= start) and (d <= end):
log_dates.append(d)
weights.append(entry['weight'])
if 'fat' in entry.keys():
bodyfats.append(entry['fat'])
else:
bodyfats.append('')
df = pd.DataFrame(
data = {
'date': log_dates,
'weight_fitbit': weights,
'fat_fitbit': bodyfats
}
)
return df
def get_lastfm_playcount(user_details, start, end):
    """Count Last.fm scrobbles per day between start and end (inclusive).

    Downloads the per-year 'lastfm-data-<year>.json' files listed in the
    member's Open Humans file index (source 'direct-sharing-810') and
    tallies one play per scrobble entry.

    Returns a pandas.DataFrame with columns 'date' and 'lastfm_playcount'.
    """
    daily_playcount = defaultdict(int)
    for year in range(start.year, end.year + 1):
        wanted = 'lastfm-data-{}.json'.format(year)
        for record in user_details['data']:
            if record['basename'] == wanted and record['source'] == 'direct-sharing-810':
                scrobbles = json.loads(requests.get(record['download_url']).content)
                for entry in scrobbles:
                    # 'uts' is a unix timestamp string; collapse to a calendar day.
                    played_on = date.fromtimestamp(int(entry['date']['uts']))
                    if start <= played_on <= end:
                        daily_playcount[played_on] += 1
    return pd.DataFrame(
        data={
            'date': list(daily_playcount.keys()),
            'lastfm_playcount': list(daily_playcount.values()),
        }
    )
def get_rescuetime_aggregates(user_details, start, end):
    """Aggregate RescueTime usage into daily seconds per productivity level.

    Parameters
    ----------
    user_details : dict
        Open Humans member record; files are listed under user_details['data'].
    start, end : datetime.date
        Inclusive date range to keep.

    Returns
    -------
    pandas.DataFrame
        A 'date' column plus one 'rescuetime_productivity_<score>' column
        per productivity score, with summed seconds per day.

    Raises
    ------
    ValueError
        If the member has no RescueTime file (source 'direct-sharing-149').
    """
    rescuetime_data = None
    # If several RescueTime files exist, the last one listed wins
    # (same behaviour as before).
    for i in user_details['data']:
        if i['source'] == "direct-sharing-149":
            rescuetime_data = json.loads(requests.get(i['download_url']).content)
    if rescuetime_data is None:
        # Previously this fell through to a NameError; fail with a clear message.
        raise ValueError("no RescueTime data found (source 'direct-sharing-149')")
    timestamps = []
    time_spent_seconds = []
    productivity = []
    for element in rescuetime_data['rows']:
        timestamps.append(element[0])
        time_spent_seconds.append(element[1])
        # element[5] is RescueTime's productivity score; prefix it so the
        # pivoted column names are self-describing.
        productivity.append('rescuetime_productivity_' + str(element[5]))
    rt_date = [datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S").date() for ts in timestamps]
    rt_df = pd.DataFrame(data={
        'date': rt_date,
        'time_spent_seconds': time_spent_seconds,
        'productivity': productivity
    })
    # Sum seconds per (day, score), then widen to one column per score.
    rt_grouped_df = rt_df.groupby(['date', 'productivity']).sum().reset_index()
    rt_by_date = rt_grouped_df.pivot(
        index='date', columns='productivity', values='time_spent_seconds'
    ).reset_index()
    return rt_by_date[(rt_by_date['date'] >= start) & (rt_by_date['date'] <= end)]
def get_rescuetime_category_aggregates(user_details, start, end):
    """Aggregate RescueTime usage into daily seconds per activity category.

    Parameters
    ----------
    user_details : dict
        Open Humans member record; files are listed under user_details['data'].
    start, end : datetime.date
        Inclusive date range to keep.

    Returns
    -------
    pandas.DataFrame
        A 'date' column plus one column per RescueTime category, with
        summed seconds per day.

    Raises
    ------
    ValueError
        If the member has no RescueTime file (source 'direct-sharing-149').
    """
    rescuetime_data = None
    # If several RescueTime files exist, the last one listed wins
    # (same behaviour as before).
    for i in user_details['data']:
        if i['source'] == "direct-sharing-149":
            rescuetime_data = json.loads(requests.get(i['download_url']).content)
    if rescuetime_data is None:
        # Previously this fell through to a NameError; fail with a clear message.
        raise ValueError("no RescueTime data found (source 'direct-sharing-149')")
    timestamps = []
    time_spent_seconds = []
    category = []
    for element in rescuetime_data['rows']:
        timestamps.append(element[0])
        time_spent_seconds.append(element[1])
        # element[4] is RescueTime's activity category name.
        category.append(element[4])
    rt_date = [datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S").date() for ts in timestamps]
    rt_df = pd.DataFrame(data={
        'date': rt_date,
        'time_spent_seconds': time_spent_seconds,
        'category': category
    })
    # Sum seconds per (day, category), then widen to one column per category.
    rt_grouped_df = rt_df.groupby(['date', 'category']).sum().reset_index()
    rt_by_date = rt_grouped_df.pivot(
        index='date', columns='category', values='time_spent_seconds'
    ).reset_index()
    return rt_by_date[(rt_by_date['date'] >= start) & (rt_by_date['date'] <= end)]
def create_oura_sleep_activity(user_details, start, end):
    """Build daily Oura sleep and activity DataFrames.

    Downloads 'oura-data.json' from the member's Open Humans files and
    converts its 'sleep' and 'activity' record lists into two DataFrames,
    both restricted to start..end (inclusive).

    Sleep rows are dated summary_date + 1 day, so a night's sleep is
    attributed to the morning it ends on (matching how the other sources
    date their days); activity rows keep summary_date as-is.

    Parameters
    ----------
    user_details : dict
        Open Humans member record; files are listed under user_details['data'].
    start, end : datetime.date
        Inclusive date range to keep.

    Returns
    -------
    list of two pandas.DataFrame : [oura_sleep_df, oura_activity_df]

    Raises
    ------
    ValueError
        If no 'oura-data.json' file is found.
    """
    oura_data = None
    for i in user_details['data']:
        if i['basename'] == "oura-data.json":
            oura_data = json.loads(requests.get(i['download_url']).content)
    if oura_data is None:
        # Previously a missing file surfaced as a NameError further down.
        raise ValueError("no 'oura-data.json' found in member data")

    # Output column -> key in a raw Oura sleep entry.
    sleep_fields = {
        'respiratory_rate': 'breath_average',
        'total_in_bed_duration': 'duration',
        'total_sleep_duration': 'total',
        'awake_time': 'awake',
        'rem_sleep_time': 'rem',
        'deep_sleep_time': 'deep',
        'light_sleep_time': 'light',
        'restless_time': 'restless',
        'sleep_onset_latency': 'onset_latency',
        'resting_hr': 'hr_lowest',
        'average_sleep_hr': 'hr_average',
        'hrv': 'rmssd',
        'body_temperature_delta': 'temperature_delta',
    }
    oura_sleep_df = _oura_entries_to_df(
        oura_data['sleep'], sleep_fields, timedelta(days=1))
    oura_sleep_df = oura_sleep_df[
        (oura_sleep_df['date'] >= start) & (oura_sleep_df['date'] <= end)]

    # Output column -> key in a raw Oura activity entry.
    activity_fields = {
        'oura_calories_active': 'cal_active',
        'oura_calories_total': 'cal_total',
        'oura_steps': 'steps',
        'oura_daily_movement': 'daily_movement',
        'oura_rest_time': 'rest',
        'oura_inactive_time': 'inactive',
        'oura_low_activity_time': 'low',
        'oura_medium_activity_time': 'medium',
        'oura_high_activity_time': 'high',
        'oura_average_met': 'average_met',
        'oura_met_min_inactive': 'met_min_inactive',
        'oura_met_min_low': 'met_min_low',
        'oura_met_min_medium': 'met_min_medium',
        'oura_met_min_high': 'met_min_high',
    }
    oura_activity_df = _oura_entries_to_df(
        oura_data['activity'], activity_fields, timedelta(0))
    oura_activity_df = oura_activity_df[
        (oura_activity_df['date'] >= start) & (oura_activity_df['date'] <= end)]

    return [oura_sleep_df, oura_activity_df]


def _oura_entries_to_df(entries, fields, date_offset):
    """One DataFrame row per Oura entry.

    'date' is the entry's summary_date shifted by date_offset, plus one
    output column per (column -> source key) pair in `fields`.
    """
    data = {'date': [date.fromisoformat(e['summary_date']) + date_offset
                     for e in entries]}
    for column, key in fields.items():
        data[column] = [e[key] for e in entries]
    return pd.DataFrame(data=data)
def get_apple_metrics(user_details, start, end):
    """Load three Apple Health CSV exports and aggregate each per day.

    Expects these files in the member's Open Humans data:
      - cycling_distance.csv             -> daily sum,  column 'cycle_distance'
      - walking_running_distance.csv     -> daily sum,  column 'walking_distance'
      - environmental_audio_exposure.csv -> daily mean, column 'environmental_audio'

    Parameters
    ----------
    user_details : dict
        Open Humans member record; files are listed under user_details['data'].
    start, end : datetime.date
        Inclusive date range to keep.

    Returns
    -------
    list of three pandas.DataFrame :
        [cycledist, walking_distance, environmental_audio]

    Raises
    ------
    ValueError
        If any of the three expected CSV files is missing.
    """
    def _daily(url, value_name, aggregate):
        # Download one Health CSV, collapse timestamps to calendar dates,
        # aggregate per day ('sum' or 'mean'), and rename 'qty' to value_name.
        df = pd.read_csv(url)
        df['date'] = pd.to_datetime(df['date'], utc=True).dt.date
        df = getattr(df.groupby(['date']), aggregate)().reset_index()
        return df.rename(columns={'qty': value_name})

    cycledist = None
    walking_distance = None
    environmental_audio = None
    for i in user_details['data']:
        if i['basename'] == 'cycling_distance.csv':
            cycledist = _daily(i['download_url'], 'cycle_distance', 'sum')
        elif i['basename'] == 'walking_running_distance.csv':
            walking_distance = _daily(i['download_url'], 'walking_distance', 'sum')
        elif i['basename'] == 'environmental_audio_exposure.csv':
            environmental_audio = _daily(i['download_url'], 'environmental_audio', 'mean')

    frames = {
        'cycling_distance.csv': cycledist,
        'walking_running_distance.csv': walking_distance,
        'environmental_audio_exposure.csv': environmental_audio,
    }
    missing = [name for name, df in frames.items() if df is None]
    if missing:
        # Previously a missing file surfaced as a NameError at the filter step.
        raise ValueError('missing Apple Health file(s): {}'.format(', '.join(missing)))
    return [df[(df['date'] >= start) & (df['date'] <= end)]
            for df in (cycledist, walking_distance, environmental_audio)]
Here we're actually exporting the data from the different sources
# Pull each source's data for the chosen window (network-heavy).
# NOTE(review): the productivity-level aggregator get_rescuetime_aggregates
# is defined above but unused here — only the category aggregates go into
# the combined sheet.
oura_sleep_df, oura_activity_df = create_oura_sleep_activity(user_details,START_DATE,END_DATE)
cycledist,walking_distance,environmental_audio = get_apple_metrics(user_details,START_DATE,END_DATE)
rescuetime_df = get_rescuetime_category_aggregates(user_details,START_DATE,END_DATE)
lastfm_df = get_lastfm_playcount(user_details,START_DATE,END_DATE)
fitbit_df = get_fitbit_weight(fitbit, START_DATE, END_DATE)
This merges the data into the all_data_df and also saves it into a file called combined_data_for_pca.csv.
# Average duplicate same-day Fitbit weigh-ins down to one row per date.
fitbit_df = fitbit_df.groupby(['date']).mean().reset_index()
# Outer-join everything on 'date' so days missing from one source still keep
# the rows contributed by the others. (A duplicate merge call whose result
# was discarded has been removed.)
lastfm_fitbit_df = lastfm_df.merge(fitbit_df, how='outer', on='date', sort=True)
lastfm_fitbit_rescuetime_df = lastfm_fitbit_df.merge(rescuetime_df, how='outer', on='date', sort=True)
oura_df = oura_sleep_df.merge(oura_activity_df, how='outer', on='date', sort=True)
cycledist = cycledist.merge(walking_distance, how='outer', on='date', sort=True)
all_apple = cycledist.merge(environmental_audio, how='outer', on='date', sort=True)
all_data_df = lastfm_fitbit_rescuetime_df.merge(oura_df, how='outer', on='date', sort=True)
all_data_df = all_data_df.merge(all_apple, how='outer', on='date', sort=True)
# The index is deliberately written out too — the R code below relies on
# dropping the leading non-data columns with df_filtered[3:39].
all_data_df.to_csv('combined_data_for_pca.csv')
Below is the code to run the PCA in R (again: this won't work on the OH Notebook server)
# Install/load the PCA + visualization libraries (run once per machine).
install.packages(c("FactoMineR", "factoextra"))
library(tidyverse)
library("FactoMineR")
library("factoextra")
library(zoo)       # na.approx for linear interpolation
library(lubridate) # wday for weekday/weekend labels

# Read the combined spreadsheet produced by the Python cells above.
df <- read.csv(file='combined_data_for_pca.csv')
# Use dates as row names instead of treating them as data.
rownames(df) <- df$date
# Linear interpolation for missing weights.
df_filtered <- df %>% mutate(weight_fitbit = na.approx(df$weight_fitbit, na.rm=FALSE))
# Fill leading/trailing NA in weight (no neighbours to interpolate from).
df_filtered <- df_filtered %>% fill(weight_fitbit, .direction='updown')
# Remaining NAs mean "nothing recorded that day", not missing data -> 0.
df_filtered[is.na(df_filtered)] <- 0
# Keep only the data columns (drops the leading index/date columns).
df_filtered <- df_filtered[3:39]
# Run the actual PCA on standardized variables, keeping 5 components.
res.pca <- PCA(df_filtered, scale.unit = TRUE, ncp = 5, graph = TRUE)

##### CLUSTER THE PCA VARIABLES USING KMEANS AND MAKE PLOT
var <- get_pca_var(res.pca)
res.km <- kmeans(var$coord, centers = 3, nstart = 25)
grp <- as.factor(res.km$cluster)
fviz_pca_var(res.pca, col.var = grp,
palette = c("#0073C2FF", "#EFC000FF", "#868686FF"),
labelsize=3,
repel=TRUE,
legend.title = "Cluster")
##### CLUSTER OBSERVATIONS/DAYS USING KMEANS, YOU CAN CHANGE NUMBER OF CENTERS FOR AS MANY CLUSTERS AS YOU WANT
# FIX: df_filtered_scaled was used but never defined, which made this cell
# error out; standardize the data before clustering the days.
df_filtered_scaled <- scale(df_filtered)
res.kmi <- kmeans(df_filtered_scaled, centers = 2)
grp_ind <- as.factor(res.kmi$cluster)
##### MAKE BIPLOT WITH KMEANS LABELING
fviz_pca_biplot(res.pca,
col.ind = grp_ind, palette = "jco",
col.var='black',
mean.point=FALSE,
labelsize=3,
arrowsize=0.3,
addEllipses = TRUE,
label = "var",
repel = TRUE,
legend.title = "unsupervised label")
##### REPEAT BIPLOT, THIS TIME WITH WEEKDAY/WEEKEND LABELING
# FIX: untangled the assignment that was nested inside ifelse().
# With week_start = 1, day numbers 6 and 7 are Saturday and Sunday.
day_numbers <- as.numeric(wday(rownames(df_filtered), label=TRUE, week_start=1))
weekday_labels <- ifelse(day_numbers %in% c(6, 7), "weekend", "weekday")
# BIPLOT WITH DAY OF WEEK LABEL (components 3 and 4)
fviz_pca_biplot(res.pca, axes=c(3,4),
col.ind = weekday_labels, palette = "jco",
col.var='black',
mean.point=FALSE,
labelsize=3,
arrowsize=0.3,
addEllipses = TRUE,
label = "var",
repel = TRUE,
legend.title = "Type of day")