Github Exploration v.2.ipynb
Explore your GitHub data!
Visualize the most commonly used words in your commit messages
See the commit activity per repo per year. A trip down memory lane!
Check the busiest times and days of the week. If a repo is busiest on the weekend, it could be a side project!
A re-implementation of the official GitHub commit calendar, using the calmap library.
Approximate the duration of coding per repo, by looking at times between timestamps. This will tend to underestimate the total time, since the time between starting coding and the first commit is not captured.
Commits that are pushed within a short time of another commit are likely attempts to fix something that broke.
import sys
!{sys.executable} -m pip install wordcloud==1.5.0
!{sys.executable} -m pip install calmap
from datetime import datetime
import json
import numpy as np
import pandas as pd
import os
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle
import requests
import tempfile
import urllib.request
import ohapi
import matplotlib.pyplot as plt
%matplotlib inline
# Load the commits from the Open Humans API.
# Picks the first dataset (lowest id) tagged with both 'Github' and 'commits'.
token = os.environ.get('OH_ACCESS_TOKEN')
user = ohapi.api.exchange_oauth2_member(token)
for dset in sorted(user['data'], key=lambda x: x['id']):
    if 'Github' in dset['metadata']['tags'] and 'commits' in dset['metadata']['tags']:
        response = requests.get(dset['download_url'])
        # Fail loudly on an HTTP error instead of on a confusing JSON decode error.
        response.raise_for_status()
        commit_data = json.loads(response.content.decode("utf-8"))
        break
else:
    # Without this guard, a missing dataset would surface later as a NameError
    # when commit_data is first used.
    raise ValueError("No Open Humans dataset tagged 'Github' and 'commits' was found.")
# Flatten the nested {repo: {'commits': [...]}} structure into three
# parallel lists with one entry per commit.
messages = []
timestamps = []
repos = []
for repo_name, repo_info in commit_data['repo_data'].items():
    for entry in repo_info['commits']:
        commit_meta = entry['commit']
        messages.append(commit_meta['message'].lower())
        timestamps.append(
            datetime.strptime(commit_meta['committer']['date'], '%Y-%m-%dT%H:%M:%SZ'))
        repos.append(repo_name)
# Turn the parallel lists into a neat dataframe.
# Build the frame directly from the data: assigning full-length lists as
# columns of an empty DataFrame raises a length-mismatch ValueError on
# modern pandas. Dict order preserves the original column order.
df = pd.DataFrame({'repo': repos, 'message': messages, 'datetime': timestamps})
df.head()
Visualize the most commonly used words in your commit messages
# Download the GitHub-logo image used as the word-cloud mask.
mask_url = 'http://oh-github.herokuapp.com/static/github.png'
tmp = tempfile.NamedTemporaryFile(delete=False)
tmp.close()  # close the handle so the path can be re-opened on all platforms
try:
    urllib.request.urlretrieve(mask_url, tmp.name)
    # Force the pixel data into an array; presumably RGBA, since the
    # transform below reads channel index 3 (alpha) — TODO confirm.
    mask = np.array(Image.open(tmp.name))
finally:
    # The original `delete=False` leaked the temp file; always clean it up.
    os.remove(tmp.name)
def transform_format(val):
    """Map one RGBA pixel to a mask value: 255 if fully opaque, else 0."""
    return 255 if val[3] == 255 else 0
# Transform your mask into a new one that will work with the function:
# vectorized with numpy instead of a per-pixel Python loop — same result
# (255 where the alpha channel is fully opaque, 0 elsewhere), much faster.
transformed_mask = np.where(mask[:, :, 3] == 255, 255, 0).astype(np.int32)
# Create and generate a word cloud image from all commit messages,
# sized to match the mask image.
all_messages = ' '.join(df.message.values)
mask_height = len(mask)
mask_width = len(mask[0])
wordcloud = WordCloud(mask=transformed_mask, background_color='white',
                      width=mask_width, height=mask_height).generate(all_messages)
# Display the generated image:
plt.rcParams['figure.figsize'] = [20, 10]  # NOTE: changes the default size globally for all later figures
plt.imshow(wordcloud, interpolation='bilinear')  # bilinear smoothing avoids blocky word edges
plt.axis("off")  # hide the pixel-coordinate axes
plt.show()
See the commit activity per repo per year. A trip down memory lane!
import random
import matplotlib
from matplotlib import cm
# Derive calendar features via the vectorized .dt accessor instead of a
# Python-level .apply over every row (same values, much faster).
df['date'] = df.datetime.dt.date  # datetime.date objects, used for daily grouping below
df['year'] = df.datetime.dt.year  # integer year, used to split the plots below
MAX_NUM_REPOS = 7 # maximum number of repos to show in a plot
# One timeline plot per year: daily commit counts for up to MAX_NUM_REPOS
# of that year's repos; repos with fewer than 10 commits are skipped.
for year in sorted(df.year.unique()):
    df_curr = df[df.year == year]
    # Shuffle rows so a different subset of repos can be picked each run.
    df_curr = df_curr.sample(frac=1)
    plt.clf()
    fig, ax = plt.subplots()
    # Renamed from `repos`, which shadowed the commit-repo list built earlier.
    repos_plotted = 0
    have_data = False
    # sort=False keeps the shuffled order, enabling different repos each time.
    # Group by the column label (not a one-element list) so `key` is a scalar
    # repo name rather than a 1-tuple on pandas >= 2.
    grouped_data = df_curr.groupby('repo', sort=False)
    for key, grp in grouped_data:
        df_temp = grp.groupby("date")['message'].count().reset_index(name='num_commits')
        if df_temp.num_commits.sum() < 10:
            continue  # too little activity to be worth a legend entry
        have_data = True
        ax.plot_date(x=df_temp['date'].values, y=df_temp['num_commits'].values,
                     label=key, markersize=14)
        repos_plotted += 1
        if repos_plotted == MAX_NUM_REPOS:
            break
    if not have_data:
        continue  # nothing plotted for this year; skip the empty figure
    plt.xlabel("Time")
    plt.ylabel("Number of commits")
    plt.title("Repo activity in {}".format(year))
    plt.legend(loc='best')
    plt.show()