Details for Github Exploration.ipynb

Published by carolinux

Description

Explore your github data!

0

Tags & Data Sources

github commits programming code github

Comments

Please log in to comment.

Notebook
Last updated 1 month, 1 week ago

In [13]:
import sys
!{sys.executable} -m pip install wordcloud==1.5.0
Collecting wordcloud==1.5.0
  Downloading https://files.pythonhosted.org/packages/ae/af/849edf14d573eba9c8082db898ff0d090428d9485371cc4fe21a66717ad2/wordcloud-1.5.0-cp36-cp36m-manylinux1_x86_64.whl (361kB)
    100% |████████████████████████████████| 368kB 1.7MB/s eta 0:00:01
Requirement already satisfied: pillow in /opt/conda/lib/python3.6/site-packages (from wordcloud==1.5.0)
Requirement already satisfied: numpy>=1.6.1 in /opt/conda/lib/python3.6/site-packages (from wordcloud==1.5.0)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.5.0
You are using pip version 9.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [15]:
from datetime import datetime
import json
import numpy as np
import pandas as pd
import os
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle
import requests
import tempfile
import urllib.request

import matplotlib.pyplot as plt
%matplotlib inline

Loading data from Open Humans

In [16]:
# Load the commits from the Open Humans API.
# The access token is injected via the environment by the hosting platform.
token = os.environ.get('OH_ACCESS_TOKEN')

response = requests.get(
    "https://www.openhumans.org/api/direct-sharing/project/exchange-member/?access_token={}".format(token))
response.raise_for_status()  # fail fast on a bad token / API error

user = json.loads(response.content.decode("utf-8"))

# Find the (lowest-id) data source tagged as GitHub commit data.
commit_data = None
for dset in sorted(user['data'], key=lambda x: x['id']):
    if 'Github' in dset['metadata']['tags'] and 'commits' in dset['metadata']['tags']:
        raw_data = requests.get(dset['download_url']).content
        commit_data = json.loads(raw_data.decode("utf-8"))
        break

if commit_data is None:
    # Without this guard the next loop would raise a confusing NameError.
    raise RuntimeError("No Open Humans data source tagged 'Github' + 'commits' was found.")

# Flatten the nested {repo -> {'commits': [...]}} structure into parallel lists.
messages = []
timestamps = []
repos = []
for repo, repo_data in commit_data['repo_data'].items():
    for commit in repo_data['commits']:
        messages.append(commit['commit']['message'].lower())
        # GitHub API timestamps are UTC in ISO-8601 with a trailing 'Z'.
        timestamps.append(datetime.strptime(
            commit['commit']['committer']['date'], '%Y-%m-%dT%H:%M:%SZ'))
        repos.append(repo)

# Turn into a neat dataframe — built directly from the columns rather than
# assigning attributes on an empty frame.
df = pd.DataFrame({'repo': repos, 'message': messages, 'datetime': timestamps})
df.head()
Out[16]:
repo message datetime
0 crowdsense/bonobo_trans added readme with my recommendations 2019-05-02 15:43:51
1 crowdsense/bonobo_trans create a working example with db input, and a ... 2019-05-02 15:31:55
2 carolinux/resiroop-cms-service put status ok/failed in config and use consist... 2018-04-03 15:15:16
3 carolinux/resiroop-cms-service add category toppings to dev s3 fake 2018-04-03 11:30:03
4 carolinux/resiroop-cms-service adjust return format of import/category_toppin... 2018-04-03 11:23:53

Commit Cloud

Visualize the most commonly used words in your commit messages

In [26]:
# Download the GitHub-logo image used as the word-cloud mask.
mask_url = 'http://oh-github.herokuapp.com/static/github.png'
mask_path = tempfile.NamedTemporaryFile(delete=False).name
urllib.request.urlretrieve(mask_url, mask_path)

mask = np.array(Image.open(mask_path))


def transform_format(val):
    """Map one RGBA pixel to the mask convention WordCloud expects:
    255 (masked out) where the pixel is fully opaque, 0 (drawable) elsewhere."""
    if val[3] == 255:
        return 255
    else:
        return 0


# Transform the RGBA image into a 2-D mask.  np.where over the alpha
# channel is equivalent to mapping transform_format over every pixel, but
# runs as a handful of numpy ops instead of width*height Python calls.
transformed_mask = np.where(mask[:, :, 3] == 255, 255, 0).astype(np.int32)

# Create and generate a word cloud image from all commit messages:
text = ' '.join(df.message.values)
wordcloud = WordCloud(mask=transformed_mask, background_color='white',
                      width=mask.shape[1], height=mask.shape[0]).generate(text)

# Display the generated image:
plt.rcParams['figure.figsize'] = [20, 10]
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Github activity over the years

See the commit activity per repo per year. A trip down memory lane!

In [7]:
import random
import matplotlib
from matplotlib import cm

# Derive calendar columns once; reused by the per-year plots below.
df['date'] = df.datetime.apply(lambda x: x.date())
df['year'] = df.datetime.apply(lambda x: x.year)


for year in sorted(df.year.unique()):

    df_curr = df[df.year == year]

    # No plt.clf() here: calling it before plt.subplots() spawned an extra
    # empty figure per year (the "<Figure ... with 0 Axes>" artifacts).
    fig, ax = plt.subplots()

    # Only plot repos with >= 10 commits in the year to keep the legend readable.
    have_data = False
    # groupby('repo') (scalar key) keeps `key` a plain string; the list form
    # groupby(['repo']) yields 1-tuples on pandas >= 2, breaking the labels.
    for key, grp in df_curr.groupby('repo'):
        daily = grp.groupby("date")['message'].count().reset_index(name='num_commits')
        if daily.num_commits.sum() < 10:
            continue
        have_data = True
        ax.plot_date(x=daily['date'].values, y=daily['num_commits'].values,
                     label=key, markersize=14)

    if not have_data:
        # Close the unused figure so skipped years don't leak empty figures.
        plt.close(fig)
        continue
    ax.set_xlabel("Time")
    ax.set_ylabel("Number of commits")
    ax.set_title("Repo activity in {}".format(year))
    ax.legend(loc='best')
    plt.show()
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>

Busiest times

Checking the busiest times and days of the week. If a repo is busiest on the weekend, could it be a side project?

In [8]:
def get_part_of_day(hour):
    """Map an hour of day (0-23) to a coarse label.

    6-11 -> "morning", 12-17 -> "afternoon", 18-22 -> "evening",
    and everything else (23 and 0-5) -> "night".
    """
    if 6 <= hour <= 11:
        return "morning"
    if 12 <= hour <= 17:
        return "afternoon"
    if 18 <= hour <= 22:
        return "evening"
    return "night"


# Derive weekday / time-of-day columns from each commit timestamp.
df['day_of_week'] = df.datetime.apply(lambda x: datetime.strftime(x, '%A'))
df['hour_of_day'] = df.datetime.apply(lambda x: x.hour)
df['part_of_day'] = df.hour_of_day.apply(get_part_of_day)
df['day_and_hour'] = df.day_of_week + " " + df.part_of_day

# Report commit counts per weekday, per part of day, and per combination
# (value_counts sorts busiest-first).
for heading, column in (("Busiest days", 'day_of_week'),
                        ("Busiest times", 'part_of_day'),
                        ("Busiest days + times", 'day_and_hour')):
    print(heading)
    print(df[column].value_counts())
    print('\n')
Busiest days
Friday       357
Thursday     338
Wednesday    264
Tuesday      240
Monday       218
Saturday     134
Sunday       120
Name: day_of_week, dtype: int64


Busiest times
afternoon    885
morning      383
evening      293
night        110
Name: part_of_day, dtype: int64


Busiest days + times
Friday afternoon       182
Thursday afternoon     182
Wednesday afternoon    149
Monday afternoon       124
Tuesday afternoon      122
Friday morning          83
Thursday morning        74
Friday evening          70
Monday morning          67
Tuesday morning         65
Sunday afternoon        63
Saturday afternoon      63
Wednesday evening       54
Thursday evening        49
Wednesday morning       47
Tuesday evening         34
Thursday night          33
Saturday evening        32
Saturday morning        31
Sunday evening          30
Monday evening          24
Friday night            22
Tuesday night           19
Sunday morning          16
Wednesday night         14
Sunday night            11
Saturday night           8
Monday night             3
Name: day_and_hour, dtype: int64


In [9]:
print("Side Project detection")
# For each repo, compute (most common commit weekday, total commit count).
# value_counts() sorts descending, so index[0] is the modal weekday.
repos_with_dow = df.groupby('repo')['day_of_week'].agg(
    lambda x: (x.value_counts().index[0], len(x)))

# Series.items() replaces iteritems(), which was removed in pandas 2.0;
# items() behaves identically on older versions too.
for repo, (day, commit_count) in repos_with_dow.items():
    # Require a minimum of 5 commits so one-off weekend commits don't count.
    if commit_count >= 5 and day in ("Saturday", "Sunday"):
        print("Project {} has most commits on {}s.".format(repo, day))
Side Project detection
Project carolinux/Pattern2Scala has most commits on Saturdays.
Project carolinux/articles has most commits on Saturdays.
Project carolinux/flask_ansible has most commits on Saturdays.
Project carolinux/fractals has most commits on Sundays.
Project carolinux/grid_helper has most commits on Saturdays.
Project carolinux/londonwald has most commits on Saturdays.

Estimation of time spent coding

Approximate the time spent coding per repo by summing the gaps between consecutive commit timestamps. This will tend to underestimate the total time, since the time between starting to code and making the first commit is not captured.

In [10]:
from datetime import timedelta

# Pair every commit with the following row in (repo, time) order so we can
# measure the gap between consecutive commits.
df_sorted = df.sort_values(['repo', 'datetime'])
shifted = df_sorted.shift(-1)
df_sorted['next_datetime'] = shifted.datetime
df_sorted['next_repo'] = shifted.repo
df_sorted['next_part_of_day'] = shifted.part_of_day
# Set this to a value that makes sense with how often you commit
MAX_TIME_BETWEEN_COMMITS = timedelta(hours=8)


def determine_duration(dt1, dt2, repo1, repo2, pod1, pod2):
    """Time attributed to the commit at dt1: the gap until the next commit,
    or zero when that commit ended a coding session (next commit is in a
    different repo, the next commit jumps into a fresh morning, or the gap
    exceeds MAX_TIME_BETWEEN_COMMITS)."""
    if repo1 != repo2:
        return timedelta(0)
    if pod1 != "morning" and pod2 == "morning":
        return timedelta(0)
    if dt2 - dt1 > MAX_TIME_BETWEEN_COMMITS:
        return timedelta(0)
    return dt2 - dt1


df_sorted['duration'] = df_sorted.apply(
    lambda row: determine_duration(row['datetime'], row['next_datetime'],
                                   row['repo'], row['next_repo'],
                                   row['part_of_day'], row['next_part_of_day']),
    axis=1)

df_sorted.groupby('repo')['duration'].sum().sort_values(ascending=False).head(20)
Out[10]:
repo
carolinux/resiroop-shop               13 days 15:39:16
anitagraser/TimeManager                8 days 11:35:40
OpenHumans/oh-googlefit-integration    1 days 11:28:51
carolinux/mosaic                       1 days 08:25:24
carolinux/resiroop-cms-service         1 days 06:31:43
opengisch/qgis_excel_sync              1 days 05:12:59
carolinux/dotfiles                     1 days 03:50:08
carolinux/TimeManager                  1 days 01:29:27
carolinux/QGIS                         0 days 20:58:00
carolinux/resiroop-infra               0 days 20:45:50
carolinux/carolinux.github.com         0 days 16:35:01
OpenHumans/oh-github-source            0 days 11:29:55
carolinux/opencv_experiments           0 days 11:11:24
carolinux/Subs.py                      0 days 09:17:40
carolinux/shpsync                      0 days 08:05:37
carolinux/grid_helper                  0 days 07:58:30
carolinux/cv                           0 days 07:29:54
carolinux/londonwald                   0 days 07:08:01
carolinux/flask_ansible                0 days 06:47:44
carolinux/forestfires_dw               0 days 05:55:49
Name: duration, dtype: timedelta64[ns]
In [ ]: