numpy pandas matplotlib
https://github.com/pyHPC/ipynbhpc
Instructions:
from __future__ import division
from numpy.random import randn
import numpy as np
import pandas as pd
#from scipy import stats, integrate
import matplotlib.pyplot as plt
#import seaborn as sns
%matplotlib inline
# Print numpy floats with four digits of precision.
np.set_printoptions(precision=4)
# Seed numpy's global RNG with the sum of the character codes of a fixed
# string, so any random draws are repeatable between runs.
np.random.seed(sum(map(ord, "distributions")))
import math
# Scratch calculation: the value is only displayed, never stored.
math.sqrt(0.26)
# IPython shell escapes: show the working directory and list its contents.
!pwd
!ls
# take a look at the text data file
!head -n 10 allrounds_2013.txt
# load data into pandas dataframe
# `names` supplies the five column labels for the file.
game = pd.read_csv('allrounds_2013.txt',
names=['Round', 'team1', 'team2', 'team1score', 'team2score'])
# Display the first five rows.
game[:5]
# use seaborn
#import seaborn as sns
#sns.set()
#sns.jointplot(x="team1score", y="team2score", data=game, kind='reg')
#sns.jointplot(x="team1score", y="team2score", data=game, kind='kde')
#f, ax = plt.subplots(figsize=(6, 6))
#cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
#sns.kdeplot(game.team1score, game.team2score, cmap=cmap, n_levels=60, shade=True)
# pivot_table
# lead1 is the margin from team1's side: positive when team1 outscored team2.
game['lead1'] = game.team1score - game.team2score
# Cross-tabulate the margin with team2 as rows and team1 as columns.
team1_team2 = game.pivot_table('lead1', index='team2',
columns='team1')
team1_team2
#18 teams
team1_team2.shape
#each team plays 11 games as team1 or team2, so the total games played is 22
# Count the rows (team2 values) that have no entry in the 'Adelaide' column.
team1_team2['Adelaide'].isnull().sum()
#total score for team1 teams, and team2 teams
#don't see obvious bias if a team plays as team1 or team2
team1_score = game.pivot_table('team1score', #index='team2',
columns='team1', aggfunc=sum)
team2_score = game.pivot_table('team2score', #index='team2',
columns='team2', aggfunc=sum)
#team1_score.plot()
# Side-by-side bars of the two totals for every team.
# NOTE(review): building a DataFrame from these two pivots assumes each pivot
# comes back as a Series keyed by team name -- confirm on the installed pandas.
pd.DataFrame({'team1':team1_score, 'team2':team2_score}).plot(kind='bar')
#lead score as team1, and team2
#game['lead1'] = game.team1score - game.team2score
# Sum of margins over the games each team played as team1.
team1_perf = game.pivot_table('lead1', #index='team2',
columns='team1', aggfunc=sum)
team1_perf
# lead2 is the same margin seen from team2's side (the negation of lead1).
game['lead2'] = game.team2score - game.team1score
# Sum of margins over the games each team played as team2.
team2_perf = game.pivot_table('lead2', #index='team1',
columns='team2', aggfunc=sum)
#team2_perf
pd.DataFrame({'team1':team1_perf, 'team2':team2_perf}).plot(kind='bar')
#for each team, add its performance as team1, and team2
perf = team1_perf + team2_perf
perf
#sort performance to rank teams
# NOTE(review): this relies on perf being a Series whose .sort() reorders it
# in place (older pandas API); newer pandas releases provide sort_values()
# instead -- confirm against the installed version.
perf.sort(ascending=False)
perf
#sorted team names by performance
sorted_team = perf.index.values
sorted_team
# Build one table per side with a column per team and a row per game:
# df holds the margins of games played as team1, df2 those played as team2.
df = pd.DataFrame({})
df2 = pd.DataFrame({})
# groupby leaves out rows whose grouping key is NaN.
# group by team1: each group is a team played in team1
# Grouping by the plain column name (not a one-element list) makes every
# pandas version hand back the bare team name as the group key.
for groupid, group in game.groupby('team1'):
    if groupid == 'Adelaide':
        # A parenthesised single-argument print works on Python 2 and 3.
        print(group)
    df[groupid] = group.lead1.values
df
for groupid, group in game.groupby('team2'):
    df2[groupid] = group.lead2.values
df2
# concatenate two dataframe
# total 22 games played for each team
frames = [df, df2]
result = pd.concat(frames, ignore_index=True)
result
# sort each column separately, best margin first; columns follow sorted_team
sortedteamplay = pd.DataFrame({})
for team in sorted_team:
    teamplay = result[team].copy()
    # NOTE(review): relies on Series.sort reordering in place (older pandas
    # API); newer pandas releases provide sort_values() instead -- confirm
    # against the installed version.
    teamplay.sort(ascending=False)
    # Store the bare values so the rows line up by position, not by the
    # shuffled index left behind by the sort.
    sortedteamplay[team] = teamplay.values
sortedteamplay
%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')
import matplotlib.pyplot as plt
# Get current size
fig_size = plt.rcParams["figure.figsize"]
# %-formatting keeps the output identical on Python 2 and Python 3.
print("Current size: %s" % (fig_size,))
# Set figure width and height to 12
fig_size[0] = 12
fig_size[1] = 12
plt.rcParams["figure.figsize"] = fig_size
# team performance plot
from mpl_toolkits.axes_grid1 import ImageGrid
# Heatmap of sortedteamplay: one column per team, one row per game.
fig = plt.figure(figsize=(16, 16))
# A 1x1 image grid with a single colorbar placed on the right.
grid = ImageGrid(fig, 111, nrows_ncols=(1, 1),
direction='row', axes_pad=0.05, add_all=True,
label_mode='1', share_all=False,
cbar_location='right', cbar_mode='single',
cbar_size='5%', cbar_pad=0.05)
ax = grid[0]
ax.set_title('game lead (each team is a column)', fontsize=16)
ax.tick_params(axis='both', direction='out', labelsize=12)
#im = ax.imshow(df.values, interpolation='nearest', vmax=df.max().max(),
# vmin=df.min().min(), cmap='RdBu')
# Fixed colour limits of -120..120 are symmetric about zero, so a zero margin
# sits at the midpoint of the 'RdBu' colormap.
im = ax.imshow(sortedteamplay.values, interpolation='nearest', vmax=120, vmin=-120, cmap='RdBu')
#colorbar
ax.cax.colorbar(im)
ax.cax.tick_params(labelsize=12)
# One x tick per column, labelled with the names in sorted_team.
ax.set_xticks(np.arange(sortedteamplay.shape[1]))
ax.set_xticklabels(sorted_team, rotation='vertical', fontweight='bold')
# One y tick per row, labelled with the row index.
ax.set_yticks(np.arange(sortedteamplay.shape[0]))
ax.set_yticklabels(sortedteamplay.index)
sorted_team
#g = sns.clustermap(sortedteamplay)
# IPython shell escapes: peek at the end of the games file and the start of
# the play-by-play stats file.
!tail -n 9 allgames_2013.txt
!head -n 10 allstats_2013.txt
import re,os,math
# Module-level state for the stats parsing loop further down in this file.
# Round label of the game currently being read.
gameround = ''
# Team names and score fields copied from the current game's header line.
team1 = team2 = ''
score1 = score2 = ''
# Second offsets at which the current quarter starts and ends.
timestart = 0
timeend = 0
def splitplay(action):
    """Split a scoring description into ``[player, points]``.

    The description is searched for the keywords 'behind' (1 point) and
    'goal' (6 points). The text before the keyword, minus the single
    separator character that precedes it, is returned as the player name.

    Args:
        action: Text such as ``"Scott Pendlebury goal"``.

    Returns:
        A two-item list ``[name, points]``. ``[None, 0]`` is returned when
        neither keyword occurs in ``action``.
    """
    score = {'behind': 1, 'goal': 6}
    for act in score:
        if act in action:
            ai = action.index(act)
            # Clamp at zero: when the keyword starts the string, ``ai - 1``
            # would be -1 and slice off the last character of the whole
            # text instead of yielding an empty name.
            return [action[0:max(ai - 1, 0)], score[act]]
    return [None, 0]
def timeinsec(timetxt):
    """Convert text such as ``"2m 4s"`` into a number of seconds.

    The first run of digits is read as minutes and the second as seconds;
    any further digits are ignored.

    Args:
        timetxt: Text containing at least two integers.

    Returns:
        ``minutes * 60 + seconds`` as an int.

    Raises:
        IndexError: If fewer than two integers are found.
    """
    # Build a real list: on Python 3 ``map`` returns an iterator, which
    # cannot be indexed.
    timearr = [int(num) for num in re.findall(r'\d+', timetxt)]
    return timearr[0] * 60 + timearr[1]
# Convert the play-by-play text file into CSV rows, stopping at the first
# line that mentions 'Finals'.
gameid = 0
# Both files are opened in a `with` statement so they are closed even when a
# malformed line makes the parsing raise.
with open("allstats_ex_finals_2013.csv", "w") as f, \
        open("allstats_2013.txt", "r") as ins:
    for line in ins:
        if 'Finals' in line:
            break
        line = line.rstrip('\n')
        if line == ',':
            continue
        items = line.split(',')
        # Game header line: the first field is all digits. (No separate
        # 'Finals' test is needed here; such lines already ended the loop.)
        if items[0].isdigit():
            timestart = timeend = 0
            # Number the games 1, 2, ... within each round.
            if gameround != items[0]:
                gameround = items[0]
                gameid = 1
            else:
                gameid += 1
            team1 = items[1]
            team2 = items[2]
            score1 = items[3]
            score2 = items[4]
            continue
        # Quarter line: the new quarter starts where the previous one ended,
        # and its own duration extends the running end time.
        if 'quarter' in line:
            timestart = timeend
            sec = timeinsec(line)
            timeend = timestart + sec
            continue
        # Scoring line: fields 0/1 are team1's action and time, fields 3/4
        # are team2's time and action.
        newitems = []
        team1action = items[0]
        if team1action == '':
            newitems.extend([None, 0])
        else:
            newitems.extend(splitplay(team1action))
        # NOTE(review): a time is appended for each non-empty time field, so
        # a row keeps its nine-column layout only when exactly one of the
        # two fields is filled in -- presumably always true; confirm.
        team1time = items[1]
        if team1time != '':
            newitems.append(timeinsec(team1time) + timestart)
        team2time = items[3]
        if team2time != '':
            newitems.append(timeinsec(team2time) + timestart)
        team2action = items[4]
        if team2action == '':
            newitems.extend([None, 0])
        else:
            newitems.extend(splitplay(team2action))
        newitems.extend([team1, team2, gameround, gameid])
        f.write(','.join(str(item) for item in newitems) + '\n')
import pandas as pd
# Load the generated CSV; `names` supplies the nine column labels.
allstats = pd.read_csv("allstats_ex_finals_2013.csv",
                       names=['player1', 'score1', 'time', 'player2', 'score2',
                              'team1', 'team2', 'round', 'game'])
allstats[:5]
# 23 rounds per year
import numpy as np
# Columns are looked up with brackets: attribute access (`allstats.round`)
# resolves to the DataFrame.round method on pandas versions that define it,
# not to the 'round' column.
np.unique(allstats['round'].values).shape
# each round has 9 games (mostly) -- here we test round1
np.unique(allstats['game'][allstats['round'] == 1].values)
from matplotlib.ticker import FuncFormatter
def minsec(x, pos):
    """Render tick value ``x`` as a minutes/seconds label.

    ``x`` is divided by 60 for the minutes part and taken modulo 60 for the
    seconds part. ``pos`` (the tick position) is accepted but not used.
    """
    minutes = x / 60
    seconds = x % 60
    # %d drops any fractional part of the two values.
    return '$%2dm:$%2ds' % (minutes, seconds)
# Tick formatter that produces its labels by calling minsec.
formatter = FuncFormatter(minsec)
def plotgame(axes, team1, team2, title, group):
    """Draw the running scores of both teams for one game on ``axes``.

    Args:
        axes: Matplotlib axes to draw on.
        team1: Label for the first team's line (also used as a column name).
        team2: Label for the second team's line (also used as a column name).
        title: Title prefix; the final "s1:s2" scoreboard is appended to it.
        group: Rows of one game, providing ``score1``, ``score2`` and
            ``time`` columns.
    """
    running1 = group.score1.cumsum()
    running2 = group.score2.cumsum()
    # The last cumulative value of each side is that side's final score.
    final_scores = [running1.values[-1], running2.values[-1]]
    scoreboard = ':'.join(str(e) for e in final_scores)
    gameframe = pd.DataFrame({
        'gametime': group.time,
        'lead': running1 - running2,
        team1: running1,
        team2: running2,
    })
    plot_options = dict(
        ax=axes,
        x='gametime',
        y=[team1, team2],
        ylim=(0, 120),
        xlim=(0, 7200),
        figsize=(20, 20),
        xticks=[0, 1800, 3600, 5400, 7200],
        legend=True,
        sharex=True,
        sharey=True,
        title=title + scoreboard,
    )
    ax = gameframe.plot(**plot_options)
    # Label the x ticks through the module-level tick formatter.
    ax.get_xaxis().set_major_formatter(formatter)
    ax.set_xlabel('')
    ax.legend(loc='upper left')
import matplotlib.pyplot as plt
def plot_data(df, ncols=3, graphtogether=False):
    """Plot every game found in ``df``.

    Rows are grouped by ``(round, game)`` and each group is handed to
    ``plotgame``. The grid size and one line per game are printed as the
    plots are produced.

    Args:
        df: Frame with ``round``, ``game``, ``team1`` and ``team2`` columns
            plus whatever ``plotgame`` reads.
        ncols: Number of subplot columns in the grid.
        graphtogether: When true, draw all games on one shared axes titled
            'Score' instead of one subplot per game. Defaults to the
            per-game grid.
    """
    grouped = df.groupby(['round', 'game'])
    total = len(grouped)
    # Ceiling division: enough rows to give every game its own cell.
    nrows = -(-total // ncols)
    # %-formatting keeps the output identical on Python 2 and Python 3.
    print('nrows= %s ncols= %s' % (nrows, ncols))
    fig = plt.figure(figsize=(12, 12))
    # The shared axes is only created when it is actually drawn on, so the
    # per-game grid does not carry an extra unused axes in the figure.
    axes = fig.add_axes([0.15, 0.1, 0.7, 0.7]) if graphtogether else None
    for position, (groupid, group) in enumerate(grouped):
        team1 = group.team1.unique()[0]
        team2 = group.team2.unique()[0]
        print('%s %s %s' % (groupid, team1, team2))
        if graphtogether:
            ax = axes
            title = 'Score'
        else:
            # Fill the grid row by row.
            prow, pcol = divmod(position, ncols)
            ax = plt.subplot2grid((nrows, ncols), (prow, pcol))
            title = team1 + ' vs. ' + team2
        plotgame(ax, team1, team2, title, group)
# plot games Geelong played as team1
plotteam = 'Geelong'
thisteam1games = allstats[allstats.team1 == plotteam]
plot_data(thisteam1games)
#plot all games in specified round
plotround = 1
# The 'round' column is looked up with brackets: attribute access
# (`allstats.round`) resolves to the DataFrame.round method on pandas
# versions that define it, not to the column.
thisroundgames = allstats[allstats['round'] == plotround]
plot_data(thisroundgames)
# plot 'Hawthorn' games played as team2
plotteam = 'Hawthorn'
thisteam2games = allstats[allstats.team2 == plotteam]
plot_data(thisteam2games)
# %load formatstats.py
#
# format stats line into csv tuples and save
# example stats line: Scott Pendlebury goal,2m 4s,1.0.6 - 0.0.0,,
# example csv tuples: Scott Pendlebury,6,124,None,0,Collingwood,Fremantle,1,1
# columns=['player1','score1', 'time', 'player2', 'score2','team1','team2','round', 'game']
#
import re,os,math
def splitplay(action):
    """Split a scoring description into ``[player, points]``.

    The description is searched for the keywords 'behind' (1 point) and
    'goal' (6 points). The text before the keyword, minus the single
    separator character that precedes it, is returned as the player name.

    Args:
        action: Text such as ``"Scott Pendlebury goal"``.

    Returns:
        A two-item list ``[name, points]``. ``[None, 0]`` is returned when
        neither keyword occurs in ``action``.
    """
    score = {'behind': 1, 'goal': 6}
    for act in score:
        if act in action:
            ai = action.index(act)
            # Clamp at zero: when the keyword starts the string, ``ai - 1``
            # would be -1 and slice off the last character of the whole
            # text instead of yielding an empty name.
            return [action[0:max(ai - 1, 0)], score[act]]
    return [None, 0]
def timeinsec(timetxt):
    """Convert text such as ``"2m 4s"`` into a number of seconds.

    The first run of digits is read as minutes and the second as seconds;
    any further digits are ignored.

    Args:
        timetxt: Text containing at least two integers.

    Returns:
        ``minutes * 60 + seconds`` as an int.

    Raises:
        IndexError: If fewer than two integers are found.
    """
    # Build a real list: on Python 3 ``map`` returns an iterator, which
    # cannot be indexed.
    timearr = [int(num) for num in re.findall(r'\d+', timetxt)]
    return timearr[0] * 60 + timearr[1]
def format_stats(year):
    """Convert one season's play-by-play text file into two CSV files.

    Reads ``allstats_<year>.txt`` and writes one CSV row per scoring line to
    ``allstats_finals_<year>.csv`` when the current round label is 'Finals',
    and to ``allstats_ex_finals_<year>.csv`` otherwise. The three file names
    are printed before processing starts.

    Example input line:  ``Scott Pendlebury goal,2m 4s,1.0.6 - 0.0.0,,``
    Example output row:  ``Scott Pendlebury,6,124,None,0,Collingwood,Fremantle,1,1``
    Output columns: player1, score1, time, player2, score2, team1, team2,
    round, game.

    Args:
        year: Season as a string (it is concatenated into the file names).
    """
    filename = "allstats_" + year + ".txt"
    exfinals_filename = "allstats_ex_finals_" + year + ".csv"
    finals_filename = "allstats_finals_" + year + ".csv"
    # %-formatting keeps the output identical on Python 2 and Python 3.
    print('%s %s %s' % (filename, exfinals_filename, finals_filename))
    gameround = ''
    team1 = ''
    team2 = ''
    gameid = 0
    timestart = 0
    timeend = 0
    # All three files are managed by `with`, so they are closed even when a
    # malformed line makes the parsing raise.
    with open(exfinals_filename, "w") as exfinals_file, \
            open(finals_filename, "w") as finals_file, \
            open(filename, "r") as ins:
        for line in ins:
            line = line.rstrip('\n')
            if line == ',':
                continue
            items = line.split(',')
            # Game header line: the first field is all digits, or the line
            # mentions 'Finals'.
            if items[0].isdigit() or 'Finals' in line:
                timestart = timeend = 0
                # Number the games 1, 2, ... within each round label.
                if gameround != items[0]:
                    gameround = items[0]
                    gameid = 1
                else:
                    gameid += 1
                team1 = items[1]
                team2 = items[2]
                continue
            # Quarter line: the new quarter starts where the previous one
            # ended, and its own duration extends the running end time.
            if 'quarter' in line:
                timestart = timeend
                timeend = timestart + timeinsec(line)
                continue
            # Scoring line: fields 0/1 are team1's action and time, fields
            # 3/4 are team2's time and action.
            newitems = []
            team1action = items[0]
            if team1action == '':
                newitems.extend([None, 0])
            else:
                newitems.extend(splitplay(team1action))
            # NOTE(review): a time is appended for each non-empty time
            # field, so a row keeps its nine-column layout only when exactly
            # one of the two fields is filled in -- presumably always true;
            # confirm.
            team1time = items[1]
            if team1time != '':
                newitems.append(timeinsec(team1time) + timestart)
            team2time = items[3]
            if team2time != '':
                newitems.append(timeinsec(team2time) + timestart)
            team2action = items[4]
            if team2action == '':
                newitems.extend([None, 0])
            else:
                newitems.extend(splitplay(team2action))
            newitems.extend([team1, team2, gameround, gameid])
            newline = ','.join(str(item) for item in newitems) + '\n'
            if gameround == 'Finals':
                finals_file.write(newline)
            else:
                exfinals_file.write(newline)
# Script entry point carried over from formatstats.py: format the season
# given as the first command-line argument.
if __name__ == '__main__':
    import sys
    format_stats(sys.argv[1])
# Load the finals CSV for the chosen year and plot every game in it.
year='2013'
final_filename = "allstats_finals_"+year+".csv"
final_stats = pd.read_csv(final_filename,
names=['player1','score1', 'time', 'player2', 'score2','team1','team2','round', 'game'])
final_stats.head(5)
plot_data(final_stats)
# For each team1 value, show the shape of its array of distinct player1 values.
for g, group in allstats.groupby('team1'):
    # %-formatting keeps the output identical on Python 2 and Python 3.
    print('%s %s' % (g, group.player1.unique().shape))
# Sum of score1 for every (team1, player1) combination.
tp = allstats.pivot_table('score1', columns=['team1', 'player1'], aggfunc=sum)
#tp
# Kernel-density curves of those per-player sums for five teams, one colour each.
ax1 = tp['Western Bulldogs'].plot(kind='kde', color='m')
tp['Hawthorn'].plot(kind='kde', color='r')
tp['Geelong'].plot(kind='kde', color='g')
tp['Melbourne'].plot(kind='kde', color='k')
tp['Greater Western Sydney'].plot(kind='kde', color='b')
#lines, labels = ax1.get_legend_handles_labels()
#ax1.legend(lines[:2], labels[:2], loc='best')
# Per-player sums of the points recorded on the team2 side.
team2_players = allstats.pivot_table('score2', index='player2', aggfunc=sum)
team2_players.shape
#sns.distplot(team2_players)
# Per-player sums of the points recorded on the team1 side.
team1_players = allstats.pivot_table('score1', index='player1', aggfunc=sum)
team1_players.shape
#sns.distplot(team1_players)
# Add the two sums per player; fill_value=0 covers players present on one side only.
allscore = team1_players.add(team2_players, fill_value=0)
allscore.shape
# NOTE(review): relies on allscore being a Series whose .sort() reorders it
# in place (older pandas API); newer pandas releases provide sort_values()
# instead -- confirm against the installed version.
allscore.sort(ascending=False)
allscore[0:5]
allscore[-5:]
# NOTE(review): the slice skips the first entry and stops before position
# 590 -- presumably to drop a placeholder entry at the top; confirm.
# This also rebinds `result`, which earlier held the concatenated margin table.
result = allscore[1:590]
#result
topplayer = result.index
#topplayer
#result.to_frame()
# Two-column table of player name and total score.
t = pd.DataFrame({'player':result.index, 'totalscore':result.values})
t
def get_team(player):
    """Look up the team recorded for ``player`` in ``allstats``.

    Rows where the player appears as ``player2`` are consulted first and the
    ``team2`` of the first such row is returned. Failing that, the ``team1``
    of the first row where the player appears as ``player1`` is returned.

    Returns:
        The team name, or None when the player appears in neither column.
    """
    for player_col, team_col in (('player2', 'team2'), ('player1', 'team1')):
        teams = allstats.loc[allstats[player_col] == player, team_col]
        if not teams.empty:
            return teams.values[0]
    return None
# Spot-check the lookup for two players.
get_team('Will Langford')
get_team('Shaun Edwards')
# Attach each player's team to the player totals.
t['team'] = topplayer.map(get_team)
t
# Team table: summed margin per team plus a 1-based rank taken from row order.
nperf = pd.DataFrame({'team': perf.index, 'teamscore': perf.values})
nperf['rank'] = nperf.index.values + 1
nperf
#http://chrisalbon.com/python/pandas_join_merge_dataframe.html
# The outer join keeps players without a matching team row and teams without players.
player_team = pd.merge(t, nperf, on='team', how='outer')
player_team
#player_team[['totalscore', 'teamscore']].plot(kind='bar')
from matplotlib import cm
cmap = cm.get_cmap('Spectral') # Colour map (there are many others)
# Scatter of player total against team total; colour encodes team rank and
# marker size the player's total.
player_team.plot(kind='scatter', x='totalscore', y='teamscore',
                 c='rank', cmap=cmap,
                 s=player_team['totalscore'])
# Newer pandas exposes the plotting helpers in pandas.plotting; fall back to
# the older pandas.tools.plotting location when that module is missing.
try:
    from pandas.plotting import scatter_matrix, parallel_coordinates
except ImportError:
    from pandas.tools.plotting import scatter_matrix, parallel_coordinates
# NOTE(review): the original call had lost its opening "(<frame>" and did not
# parse; player_team is assumed to be the intended frame -- confirm.
scatter_matrix(player_team, alpha=0.5, figsize=(12, 12), diagonal='kde')
parallel_coordinates(player_team[['team', 'totalscore', 'teamscore']], 'team')