Introduction to Biostatistical Computing PHC 6937

Pandas data manipulation

Zhiguang Huo (Caleb)

Monday Nov 21st, 2022

Outlines

Hierarchical indexing
Merging, joining, and concatenating data
Stacking, unstacking, and pivoting
Groupby and data aggregation

Hierarchical Indexing

import numpy as np
import pandas as pd

data = pd.Series(np.arange(6), index = [["l1", "l1", "l1", "l2", "l2", "l2"], list("abc") * 2])
data
## l1  a    0
##     b    1
##     c    2
## l2  a    3
##     b    4
##     c    5
## dtype: int64
data.index
## MultiIndex([('l1', 'a'),
##             ('l1', 'b'),
##             ('l1', 'c'),
##             ('l2', 'a'),
##             ('l2', 'b'),
##             ('l2', 'c')],
##            )
data["l1"]
## a    0
## b    1
## c    2
## dtype: int64
data.loc["l2"]
## a    3
## b    4
## c    5
## dtype: int64
data.loc["l2", "b"]
## 4
data.loc[:,"a"]
## l1    0
## l2    3
## dtype: int64
data.unstack()
##     a  b  c
## l1  0  1  2
## l2  3  4  5
data.unstack().stack()
## l1  a    0
##     b    1
##     c    2
## l2  a    3
##     b    4
##     c    5
## dtype: int64
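
Beyond loc, the xs method selects a cross section at any level by name; a minimal sketch (not shown above):

data.xs("a", level=1)   # same result as data.loc[:, "a"]
data.xs("l2", level=0)  # same result as data.loc["l2"]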

Swap index

data = pd.Series(np.arange(6), index = [["l1", "l1", "l1", "l2", "l2", "l2"], list("abc") * 2])
data.index.names = ["Key1", "Key2"]
data.swaplevel("Key1", "Key2")
## Key2  Key1
## a     l1      0
## b     l1      1
## c     l1      2
## a     l2      3
## b     l2      4
## c     l2      5
## dtype: int64
data.swaplevel(0, 1).sort_index()
## Key2  Key1
## a     l1      0
##       l2      3
## b     l1      1
##       l2      4
## c     l1      2
##       l2      5
## dtype: int64
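
Swapping is often just a step toward re-sorting; sort_index can sort by a named level directly, giving the same row order without swapping the levels. A short sketch:

data.sort_index(level="Key2")  # rows ordered by Key2, but the level order (Key1, Key2) is kept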

Hierarchical Indexing

data2 = pd.DataFrame(np.arange(16).reshape((4, -1)),
    index=[["l1", "l1", "l2", "l2"], ["a", "b", "a", "b"]],
    columns=[["TX", "TX", "FL", "FL"],["Red", "Blue", "Red",  "Blue"]])
data2.index.names = ["Key1", "Key2"]
data2.columns.names = ["State", "Color"]
data2
## State      TX       FL     
## Color     Red Blue Red Blue
## Key1 Key2                  
## l1   a      0    1   2    3
##      b      4    5   6    7
## l2   a      8    9  10   11
##      b     12   13  14   15
data2["FL"]
## Color      Red  Blue
## Key1 Key2           
## l1   a       2     3
##      b       6     7
## l2   a      10    11
##      b      14    15
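
A single innermost column can be pulled out with a tuple, and loc can subset rows and columns at the outer levels at the same time; a minimal sketch:

data2["FL", "Red"]     # just the (FL, Red) column: 2, 6, 10, 14
data2.loc["l1", "FL"]  # the l1 rows restricted to the FL columns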

get indexes

data2.index.get_level_values(0)
## Index(['l1', 'l1', 'l2', 'l2'], dtype='object', name='Key1')
data2.index.get_level_values("Key1")
## Index(['l1', 'l1', 'l2', 'l2'], dtype='object', name='Key1')
data2.index.get_level_values(1)
## Index(['a', 'b', 'a', 'b'], dtype='object', name='Key2')
data2.columns.get_level_values(0)
## Index(['TX', 'TX', 'FL', 'FL'], dtype='object', name='State')
data2.columns.get_level_values("State")
## Index(['TX', 'TX', 'FL', 'FL'], dtype='object', name='State')
data2.columns.get_level_values(1)
## Index(['Red', 'Blue', 'Red', 'Blue'], dtype='object', name='Color')

Creating index with DataFrame’s columns

adata = pd.DataFrame({"c1": [1,1,1,2,2,2,2],
        "c2": list("abcdefg"),
        "c3": range(7,0,-1),
        "c4": list("abababa")}
)

adata.set_index("c1")
##    c2  c3 c4
## c1          
## 1   a   7  a
## 1   b   6  b
## 1   c   5  a
## 2   d   4  b
## 2   e   3  a
## 2   f   2  b
## 2   g   1  a
adata.set_index(["c1"], drop=False)
##     c1 c2  c3 c4
## c1              
## 1    1  a   7  a
## 1    1  b   6  b
## 1    1  c   5  a
## 2    2  d   4  b
## 2    2  e   3  a
## 2    2  f   2  b
## 2    2  g   1  a
bdata = adata.set_index(["c1", "c4"])
bdata
##       c2  c3
## c1 c4       
## 1  a   a   7
##    b   b   6
##    a   c   5
## 2  b   d   4
##    a   e   3
##    b   f   2
##    a   g   1
bdata.reset_index()
##    c1 c4 c2  c3
## 0   1  a  a   7
## 1   1  b  b   6
## 2   1  a  c   5
## 3   2  b  d   4
## 4   2  a  e   3
## 5   2  b  f   2
## 6   2  a  g   1
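
set_index can also append to the current index instead of replacing it; a short sketch building on bdata:

bdata.set_index("c2", append=True)  # the index becomes the three levels (c1, c4, c2)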

merge based on common column

data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abc"), "value2": range(3)})
pd.merge(data1, data2)
##   key  value1  value2
## 0   b       0       1
## 1   b       5       1
## 2   c       1       2
## 3   c       2       2
## 4   a       3       0
## 5   a       4       0
pd.merge(data1, data2, on = "key")
##   key  value1  value2
## 0   b       0       1
## 1   b       5       1
## 2   c       1       2
## 3   c       2       2
## 4   a       3       0
## 5   a       4       0

merge based on common column

data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abd"), "value2": range(3)})
pd.merge(data1, data2, how="inner") ## default
##   key  value1  value2
## 0   b       0       1
## 1   b       5       1
## 2   a       3       0
## 3   a       4       0
pd.merge(data1, data2, how="left")
##   key  value1  value2
## 0   b       0     1.0
## 1   c       1     NaN
## 2   c       2     NaN
## 3   a       3     0.0
## 4   a       4     0.0
## 5   b       5     1.0
pd.merge(data1, data2, how="right")
##   key  value1  value2
## 0   a     3.0       0
## 1   a     4.0       0
## 2   b     0.0       1
## 3   b     5.0       1
## 4   d     NaN       2
pd.merge(data1, data2, how="outer")
##   key  value1  value2
## 0   b     0.0     1.0
## 1   b     5.0     1.0
## 2   c     1.0     NaN
## 3   c     2.0     NaN
## 4   a     3.0     0.0
## 5   a     4.0     0.0
## 6   d     NaN     2.0
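
When the join types give different row counts, indicator=True is a handy audit tool: it adds a _merge column recording where each row came from. A minimal sketch:

pd.merge(data1, data2, how="outer", indicator=True)  # _merge is "left_only", "right_only", or "both"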

merge based on common column

data1 = pd.DataFrame({"key1": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key2": list("abc"), "value2": range(3)})
pd.merge(data1, data2, left_on="key1", right_on="key2") 
##   key1  value1 key2  value2
## 0    b       0    b       1
## 1    b       5    b       1
## 2    c       1    c       2
## 3    c       2    c       2
## 4    a       3    a       0
## 5    a       4    a       0

merge based on common column

data1 = pd.DataFrame({"keyA": list("bccaab"), "keyB": list("AABBCC"), "value1": range(6)})
data2 = pd.DataFrame({"keyA": list("abc"), "keyB": list("CBA"), "value2": range(3)})
pd.merge(data1, data2, on=["keyA", "keyB"]) 
#pd.merge(data1, data2, left_on=["keyA", "keyB"], right_on=["keyA", "keyB"]) 
##   keyA keyB  value1  value2
## 0    c    A       1       2
## 1    a    C       4       0
pd.merge(data1, data2, on=["keyA", "keyB"], how="outer") 
##   keyA keyB  value1  value2
## 0    b    A     0.0     NaN
## 1    c    A     1.0     2.0
## 2    c    B     2.0     NaN
## 3    a    B     3.0     NaN
## 4    a    C     4.0     0.0
## 5    b    C     5.0     NaN
## 6    b    B     NaN     1.0

merge based on common column

data1 = pd.DataFrame({"keyA": list("bccaab"), "keyB": list("AABBCC"), "value1": range(6)})
data2 = pd.DataFrame({"keyA": list("abc"), "keyB": list("CBA"), "value2": range(3)})
pd.merge(data1, data2, on=["keyA"]) 
##   keyA keyB_x  value1 keyB_y  value2
## 0    b      A       0      B       1
## 1    b      C       5      B       1
## 2    c      A       1      A       2
## 3    c      B       2      A       2
## 4    a      B       3      C       0
## 5    a      C       4      C       0
pd.merge(data1, data2, on=["keyA"], suffixes = ["_1", "_2"]) 
##   keyA keyB_1  value1 keyB_2  value2
## 0    b      A       0      B       1
## 1    b      C       5      B       1
## 2    c      A       1      A       2
## 3    c      B       2      A       2
## 4    a      B       3      C       0
## 5    a      C       4      C       0

merge based on index

data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abd"), "value2": range(3)})
data1_key = data1.set_index("key")
data2_key = data2.set_index("key")
pd.merge(data1_key,data2_key, left_index=True, right_index=True)
##      value1  value2
## key                
## a         3       0
## a         4       0
## b         0       1
## b         5       1
pd.merge(data1,data2_key, left_on="key", right_index=True)
##   key  value1  value2
## 0   b       0       1
## 5   b       5       1
## 3   a       3       0
## 4   a       4       0

merge based on hierarchical-index

df_left = pd.DataFrame({"Key1": ["l1","l2","l3"], 
        "Key2": ["a", "b", "c"], 
        "value":[11,22,33]})

df_right = pd.DataFrame(np.arange(6), index = [["l1", "l1", "l1", "l2", "l2", "l2"], list("abc") * 2])
df_right.index.names = ["Key1", "Key2"]
pd.merge(df_left,df_right, left_on=["Key1", "Key2"], right_index=True, how="outer")
##   Key1 Key2  value    0
## 0   l1    a   11.0  0.0
## 1   l2    b   22.0  4.0
## 2   l3    c   33.0  NaN
## 2   l1    b    NaN  1.0
## 2   l1    c    NaN  2.0
## 2   l2    a    NaN  3.0
## 2   l2    c    NaN  5.0

Exercise:

Based on df_left and df_right in the previous slides

df_right2 = df_right.unstack("Key2")[0]
pd.merge(df_left,df_right2, left_on=["Key1"], right_index=True)
##   Key1 Key2  value  a  b  c
## 0   l1    a     11  0  1  2
## 1   l2    b     22  3  4  5
df_right2 = df_right.unstack("Key1")[0]
pd.merge(df_left,df_right2, left_on=["Key2"], right_index=True)
##   Key1 Key2  value  l1  l2
## 0   l1    a     11   0   3
## 1   l2    b     22   1   4
## 2   l3    c     33   2   5

merge using the join method

data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abd"), "value2": range(3)})
data1_key = data1.set_index("key")
data2_key = data2.set_index("key")
data2_key.join(data1_key)
##      value2  value1
## key                
## a         0     3.0
## a         0     4.0
## b         1     0.0
## b         1     5.0
## d         2     NaN
data1_key.join(data2_key)
##      value1  value2
## key                
## a         3     0.0
## a         4     0.0
## b         0     1.0
## b         5     1.0
## c         1     NaN
## c         2     NaN
data1.join(data2_key, on="key")
##   key  value1  value2
## 0   b       0     1.0
## 1   c       1     NaN
## 2   c       2     NaN
## 3   a       3     0.0
## 4   a       4     0.0
## 5   b       5     1.0
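
join performs a left join on the index by default; other join types are available through how. A short sketch:

data1_key.join(data2_key, how="outer")  # union of keys, like pd.merge(..., how="outer")
data1_key.join(data2_key, how="inner")  # intersection of keys only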

Concatenating

series1 = pd.Series([1,2,3], index=list("abc"))
series2 = pd.Series([3,5,7], index=list("abd"))

pd.concat([series1, series2])
## a    1
## b    2
## c    3
## a    3
## b    5
## d    7
## dtype: int64
res = pd.concat([series1, series2], keys=["data1", "data2"])
res
## data1  a    1
##        b    2
##        c    3
## data2  a    3
##        b    5
##        d    7
## dtype: int64
res.unstack()
##          a    b    c    d
## data1  1.0  2.0  3.0  NaN
## data2  3.0  5.0  NaN  7.0
pd.concat([series1, series2], axis=1)
##      0    1
## a  1.0  3.0
## b  2.0  5.0
## c  3.0  NaN
## d  NaN  7.0
pd.concat([series1, series2], axis=1, join="inner")
##    0  1
## a  1  3
## b  2  5
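
keys also works together with axis=1, where the keys become column labels instead of an outer index level; a minimal sketch:

pd.concat([series1, series2], axis=1, keys=["data1", "data2"])  # columns named data1 / data2 instead of 0 / 1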

Concatenating

df1 = pd.DataFrame(np.arange(6).reshape(3,-1), index=list("ABC"), columns= ["C1", "C2"])
df2 = pd.DataFrame(np.arange(4).reshape(2,-1), index=list("AB"), columns= ["C2", "C3"])

pd.concat([df1, df2])
##     C1  C2   C3
## A  0.0   1  NaN
## B  2.0   3  NaN
## C  4.0   5  NaN
## A  NaN   0  1.0
## B  NaN   2  3.0
pd.concat([df1, df2], ignore_index=True)
##     C1  C2   C3
## 0  0.0   1  NaN
## 1  2.0   3  NaN
## 2  4.0   5  NaN
## 3  NaN   0  1.0
## 4  NaN   2  3.0
df1 = pd.DataFrame(np.arange(6).reshape(3,-1), index=list("ABC"), columns= ["C1", "C2"])
df2 = pd.DataFrame(np.arange(4).reshape(2,-1), index=list("AB"), columns= ["D1", "D2"])

pd.concat([df1, df2], axis=1)
##    C1  C2   D1   D2
## A   0   1  0.0  1.0
## B   2   3  2.0  3.0
## C   4   5  NaN  NaN
pd.concat([df1, df2], axis=1, keys=["Level1", "Level2"])
##   Level1    Level2     
##       C1 C2     D1   D2
## A      0  1    0.0  1.0
## B      2  3    2.0  3.0
## C      4  5    NaN  NaN
pd.concat({"Level1": df1, "Level2": df2}, axis=1)
##   Level1    Level2     
##       C1 C2     D1   D2
## A      0  1    0.0  1.0
## B      2  3    2.0  3.0
## C      4  5    NaN  NaN
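
The new column level created by the keys can itself be named through the names argument ("Source" below is a made-up name); a short sketch:

pd.concat({"Level1": df1, "Level2": df2}, axis=1, names=["Source"])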

Combining data with overlaps

s1 = pd.Series([np.nan,1,2,3,np.nan], index=list("abcde"))
s2 = pd.Series([1,np.nan,6,7, np.nan], index=list("bcdea"))

np.where(pd.isna(s1), s2, s1) ## purely positional: the index is ignored
## array([ 1.,  1.,  2.,  3., nan])
s1.combine_first(s2) ## aligns on the index, then fills missing values of s1 from s2
## a    NaN
## b    1.0
## c    2.0
## d    3.0
## e    7.0
## dtype: float64
df1 = pd.DataFrame({"C1": [np.nan, 1, 2], 
                    "C2": [5, np.nan, 3],
                    "C3": [np.nan, 9, np.nan]
}, index=list("ABC"))
df2 = pd.DataFrame({"C1": [1,2,], 
                    "C2": [4,5,]
}, index=list("AB"))

df1
##     C1   C2   C3
## A  NaN  5.0  NaN
## B  1.0  NaN  9.0
## C  2.0  3.0  NaN
df2
##    C1  C2
## A   1   4
## B   2   5
df1.combine_first(df2)
##     C1   C2   C3
## A  1.0  5.0  NaN
## B  1.0  5.0  9.0
## C  2.0  3.0  NaN
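
A related method is update, which works in the opposite direction: it overwrites cells in place with the non-missing values of the other frame, rather than only filling holes. A hedged sketch:

df3 = df1.copy()  # update modifies in place, so keep the original intact
df3.update(df2)   # every non-NaN cell of df2 overwrites the aligned cell of df3
df3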

Stack and unstack

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texas", "Utah"])
pd1.columns.name="Columns"
pd1.index.name="States"
pd1
## Columns  b  d  c
## States          
## Florida  0  1  2
## Texas    3  4  5
## Utah     6  7  8
pd1_stacked = pd1.stack()
pd1_stacked
## States   Columns
## Florida  b          0
##          d          1
##          c          2
## Texas    b          3
##          d          4
##          c          5
## Utah     b          6
##          d          7
##          c          8
## dtype: int64

Stack and unstack

pd1_stacked.unstack()
## Columns  b  d  c
## States          
## Florida  0  1  2
## Texas    3  4  5
## Utah     6  7  8
pd1_stacked.unstack(0)
## States   Florida  Texas  Utah
## Columns                      
## b              0      3     6
## d              1      4     7
## c              2      5     8
pd1_stacked.unstack("Columns")
## Columns  b  d  c
## States          
## Florida  0  1  2
## Texas    3  4  5
## Utah     6  7  8
pd1_stacked.unstack("States")
## States   Florida  Texas  Utah
## Columns                      
## b              0      3     6
## d              1      4     7
## c              2      5     8
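
When some (row, column) combinations are missing, unstack leaves NaN; the fill_value argument substitutes a default instead. A minimal sketch that drops one entry first:

pd1_stacked.iloc[:-1].unstack(fill_value=0)  # the missing (Utah, c) cell becomes 0 instead of NaN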

Specify the levels in both stack and unstack

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texas", "Utah"])
pd1.columns.name="Columns"
pd1.index.name="States"
# pd1
pd1.stack("Columns").unstack("States")
## States   Florida  Texas  Utah
## Columns                      
## b              0      3     6
## d              1      4     7
## c              2      5     8
pd1.T
## States   Florida  Texas  Utah
## Columns                      
## b              0      3     6
## d              1      4     7
## c              2      5     8

Pivoting between long and wide format

import io
import requests
url="https://caleb-huo.github.io/teaching/data/sleep/sleepstudy.csv"
s=requests.get(url).content
sleepdata=pd.read_csv(io.StringIO(s.decode('utf-8')))
sleepdata.head()
##    Reaction  Days  Subject
## 0  249.5600     0      308
## 1  258.7047     1      308
## 2  250.8006     2      308
## 3  321.4398     3      308
## 4  356.8519     4      308

Pivoting from long format to wide format

sleepdata_wide = sleepdata.pivot(index="Subject", columns="Days")
sleepdata_wide.head()
##          Reaction                      ...                              
## Days            0         1         2  ...         7         8         9
## Subject                                ...                              
## 308      249.5600  258.7047  250.8006  ...  290.1486  430.5853  466.3535
## 309      222.7339  205.2658  202.9778  ...  217.7272  224.2957  237.3142
## 310      199.0539  194.3322  234.3200  ...  255.7511  261.0125  247.5153
## 330      321.5426  300.4002  283.8565  ...  318.2613  305.3495  354.0487
## 331      287.6079  285.0000  301.8206  ...  334.8177  293.7469  371.5811
## 
## [5 rows x 10 columns]
sleepdata.set_index(["Subject", "Days"]).unstack("Days") ## the same wide table via unstack
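
pivot_table is a more forgiving alternative to pivot: duplicated (index, columns) pairs are aggregated (mean by default) instead of raising an error. A short sketch; since every (Subject, Days) pair is unique here, it gives the same numbers as the pivot above:

sleepdata.pivot_table(index="Subject", columns="Days", values="Reaction", aggfunc="mean")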

Pivoting from wide format to long format

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texas", "Utah"])
pd1.index.name = "State"
pd2 = pd1.reset_index()
pd2
##      State  b  d  c
## 0  Florida  0  1  2
## 1    Texas  3  4  5
## 2     Utah  6  7  8
pd2.melt(id_vars="State", value_vars=["b", "d", "c"])
##      State variable  value
## 0  Florida        b      0
## 1    Texas        b      3
## 2     Utah        b      6
## 3  Florida        d      1
## 4    Texas        d      4
## 5     Utah        d      7
## 6  Florida        c      2
## 7    Texas        c      5
## 8     Utah        c      8
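
melt can also rename the two generated columns through var_name and value_name (the names below are made up); a minimal sketch:

pd2.melt(id_vars="State", var_name="Column", value_name="Count")
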
sleepdata_wide3 = sleepdata_wide ## an alias, not a copy: the changes below also modify sleepdata_wide
sleepdata_wide3.columns = sleepdata_wide.columns.get_level_values(1)
sleepdata_wide3.reset_index(inplace=True)
sleepdata_long = pd.melt(sleepdata_wide3, id_vars = ["Subject"], value_vars = range(10))
sleepdata_long
##      Subject Days     value
## 0        308    0  249.5600
## 1        309    0  222.7339
## 2        310    0  199.0539
## 3        330    0  321.5426
## 4        331    0  287.6079
## ..       ...  ...       ...
## 175      352    9  388.5417
## 176      369    9  366.5131
## 177      370    9  372.2288
## 178      371    9  369.4692
## 179      372    9  364.1236
## 
## [180 rows x 3 columns]
sleepdata_wide.set_index("Subject").stack().reset_index()
##      Subject  Days         0
## 0        308     0  249.5600
## 1        308     1  258.7047
## 2        308     2  250.8006
## 3        308     3  321.4398
## 4        308     4  356.8519
## ..       ...   ...       ...
## 175      372     5  329.6076
## 176      372     6  334.4818
## 177      372     7  343.2199
## 178      372     8  369.1417
## 179      372     9  364.1236
## 
## [180 rows x 3 columns]

Groupby with single group key

adata = pd.DataFrame({
    "Key1": list("AAABBB"),
    "value1": np.arange(6),
})
adata
##   Key1  value1
## 0    A       0
## 1    A       1
## 2    A       2
## 3    B       3
## 4    B       4
## 5    B       5
adata.groupby("Key1").mean()
##       value1
## Key1        
## A        1.0
## B        4.0
adata["value1"].groupby(adata["Key1"]).mean() ## this may be more interpretable
## Key1
## A    1.0
## B    4.0
## Name: value1, dtype: float64

Iteration over groups

adata = pd.DataFrame({
    "Key1": list("AAABBB"),
    "value1": np.arange(6),
})
grouped_data = adata.groupby("Key1")

for name, data in grouped_data:
    print(name)
    print(data)
## A
##   Key1  value1
## 0    A       0
## 1    A       1
## 2    A       2
## B
##   Key1  value1
## 3    B       3
## 4    B       4
## 5    B       5

Groupby and dictionary

adata = pd.DataFrame({
    "Key1": list("AAABBB"),
    "value1": np.arange(6),
})
grouped_data = adata.groupby("Key1")

adict = dict(list(grouped_data))
adict.keys()
## dict_keys(['A', 'B'])
adict["A"]
##   Key1  value1
## 0    A       0
## 1    A       1
## 2    A       2

Groupby with multiple group key

bdata = pd.DataFrame({
    "Key1": list("AAABBB"),
    "Key2": ["blue", "orange", "blue", "orange", "blue", "orange"],
    "value1": np.arange(6),
    "value2": np.arange(10,16)
})
bdata
##   Key1    Key2  value1  value2
## 0    A    blue       0      10
## 1    A  orange       1      11
## 2    A    blue       2      12
## 3    B  orange       3      13
## 4    B    blue       4      14
## 5    B  orange       5      15
bdata.groupby(["Key1", "Key2"]).mean()
##              value1  value2
## Key1 Key2                  
## A    blue       1.0    11.0
##      orange     1.0    11.0
## B    blue       4.0    14.0
##      orange     4.0    14.0
bdata.groupby(["Key1", "Key2"]).size()
## Key1  Key2  
## A     blue      2
##       orange    1
## B     blue      1
##       orange    2
## dtype: int64
bdata.groupby(["Key1", "Key2"])["value1"].mean()
## Key1  Key2  
## A     blue      1.0
##       orange    1.0
## B     blue      4.0
##       orange    4.0
## Name: value1, dtype: float64
bdata.groupby(["Key1", "Key2"])["value2"].size()
## Key1  Key2  
## A     blue      2
##       orange    1
## B     blue      1
##       orange    2
## Name: value2, dtype: int64

Grouping with functions

bdata.index = ["Amy", "Beth", "Carl", "Dan", "Emily", "Frank"]
bdata
##       Key1    Key2  value1  value2
## Amy      A    blue       0      10
## Beth     A  orange       1      11
## Carl     A    blue       2      12
## Dan      B  orange       3      13
## Emily    B    blue       4      14
## Frank    B  orange       5      15
bdata.groupby(len)[["value1", "value2"]].min() ## groups rows by len() of their index labels (3, 4, 5)
##    value1  value2
## 3       0      10
## 4       1      11
## 5       4      14
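
groupby also accepts a dict or Series mapping the index labels to group names. A sketch with a made-up cohort mapping:

cohort = {"Amy": "early", "Beth": "early", "Carl": "early",
          "Dan": "late", "Emily": "late", "Frank": "late"}  # hypothetical groups
bdata.groupby(cohort)[["value1", "value2"]].mean()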

Data Aggregation

bdata.drop(columns = "Key2").groupby("Key1").agg(np.median)
##       value1  value2
## Key1                
## A        1.0    11.0
## B        4.0    14.0
def maxDiff(arr):
    return(arr.max() - arr.min())

bdata.drop(columns = "Key2").groupby("Key1").agg(maxDiff)
##       value1  value2
## Key1                
## A          2       2
## B          2       2
bdata.drop(columns = "Key2").groupby("Key1").agg(lambda x: x.max() - x.min())
##       value1  value2
## Key1                
## A          2       2
## B          2       2

Data Aggregation

bdata.drop(columns = "Key2").groupby("Key1").agg([np.median, np.mean, maxDiff])
##      value1              value2              
##      median mean maxDiff median  mean maxDiff
## Key1                                         
## A       1.0  1.0       2   11.0  11.0       2
## B       4.0  4.0       2   14.0  14.0       2
bdata.drop(columns = "Key2").groupby("Key1").agg([("mymedian",np.median), ("mymean", np.mean), ("mymaxDiff", maxDiff)])
##        value1                    value2                 
##      mymedian mymean mymaxDiff mymedian mymean mymaxDiff
## Key1                                                    
## A         1.0    1.0         2     11.0   11.0         2
## B         4.0    4.0         2     14.0   14.0         2

Data Aggregation with selected columns

bdata.drop(columns = "Key2").groupby("Key1").agg({"value1": [np.median, np.mean], "value2": [maxDiff]})
##      value1       value2
##      median mean maxDiff
## Key1                    
## A       1.0  1.0       2
## B       4.0  4.0       2
bdata.groupby(["Key1", "Key2"]).agg({"value1": [np.median, np.mean], "value2": [maxDiff]})
##             value1       value2
##             median mean maxDiff
## Key1 Key2                      
## A    blue      1.0  1.0       2
##      orange    1.0  1.0       0
## B    blue      4.0  4.0       0
##      orange    4.0  4.0       2
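
pandas (0.25+) also offers named aggregation, which sets flat output column names directly instead of the two-level columns above; a short sketch:

bdata.groupby(["Key1", "Key2"]).agg(
    value1_median=("value1", "median"),
    value1_mean=("value1", "mean"),
    value2_range=("value2", maxDiff),
)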

Data Aggregation without row index

bdata.groupby(["Key1", "Key2"], as_index=False).agg(np.mean)
##   Key1    Key2  value1  value2
## 0    A    blue     1.0    11.0
## 1    A  orange     1.0    11.0
## 2    B    blue     4.0    14.0
## 3    B  orange     4.0    14.0

Data Aggregation

bdata.groupby(["Key1", "Key2"], as_index=False).agg("mean")
##   Key1    Key2  value1  value2
## 0    A    blue     1.0    11.0
## 1    A  orange     1.0    11.0
## 2    B    blue     4.0    14.0
## 3    B  orange     4.0    14.0
bdata.groupby(["Key1", "Key2"], as_index=False).agg(np.mean)
##   Key1    Key2  value1  value2
## 0    A    blue     1.0    11.0
## 1    A  orange     1.0    11.0
## 2    B    blue     4.0    14.0
## 3    B  orange     4.0    14.0
bdata.groupby(["Key1", "Key2"], as_index=False).mean()
##   Key1    Key2  value1  value2
## 0    A    blue     1.0    11.0
## 1    A  orange     1.0    11.0
## 2    B    blue     4.0    14.0
## 3    B  orange     4.0    14.0

Data Aggregation for functions with extra argument

def myMean(arr, offset = 5):
    return(arr.mean() - offset)

bdata.groupby(["Key1", "Key2"], as_index=False).agg(myMean)
##   Key1    Key2  value1  value2
## 0    A    blue    -4.0     6.0
## 1    A  orange    -4.0     6.0
## 2    B    blue    -1.0     9.0
## 3    B  orange    -1.0     9.0
bdata.groupby(["Key1", "Key2"], as_index=False).agg(myMean, offset=4)
##   Key1    Key2  value1  value2
## 0    A    blue    -3.0     7.0
## 1    A  orange    -3.0     7.0
## 2    B    blue     0.0    10.0
## 3    B  orange     0.0    10.0

Data Aggregation by apply function

bdata.groupby(["Key1", "Key2"])["value1"].apply(np.mean)
## Key1  Key2  
## A     blue      1.0
##       orange    1.0
## B     blue      4.0
##       orange    4.0
## Name: value1, dtype: float64
bdata.groupby(["Key1", "Key2"], as_index=False)["value1"].apply(np.mean)
##   Key1    Key2  value1
## 0    A    blue     1.0
## 1    A  orange     1.0
## 2    B    blue     4.0
## 3    B  orange     4.0
bdata.groupby(["Key1", "Key2"])["value2"].apply(myMean)
## Key1  Key2  
## A     blue      6.0
##       orange    6.0
## B     blue      9.0
##       orange    9.0
## Name: value2, dtype: float64
bdata.groupby(["Key1", "Key2"])["value2"].apply(myMean, offset=3)
## Key1  Key2  
## A     blue       8.0
##       orange     8.0
## B     blue      11.0
##       orange    11.0
## Name: value2, dtype: float64
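
Where agg collapses each group to a single row, transform broadcasts the group result back to the original shape, which is convenient for within-group centering; a minimal sketch:

bdata["value1"] - bdata.groupby(["Key1", "Key2"])["value1"].transform("mean")  # each value minus its group mean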

Exercise

sleepdata.head()
##    Reaction  Days  Subject
## 0  249.5600     0      308
## 1  258.7047     1      308
## 2  250.8006     2      308
## 3  321.4398     3      308
## 4  356.8519     4      308
def maxDiff(arr):
    return(arr.max() - arr.min())


def secMax(arr):
    return(np.sort(arr)[::-1][1])


sleepdata.drop(columns="Days").groupby(["Subject"]).agg([np.mean, np.std, np.max, secMax, maxDiff])
##           Reaction                                         
##               mean        std      amax    secMax   maxDiff
## Subject                                                    
## 308      342.13383  79.821763  466.3535  430.5853  216.7935
## 309      215.23298  10.812193  237.3142  224.2957   34.3364
## 310      231.00127  21.855996  261.0125  255.7511   66.6803
## 330      303.22142  22.909199  354.0487  321.5426   73.8091
## 331      309.43605  27.242607  371.5811  334.8177   86.5811
## 332      307.30207  64.306126  454.1619  346.8311  219.3013
## 333      316.15831  30.068208  362.0428  348.8399   85.2735
## 334      295.30205  41.855607  377.2990  335.7469  133.9343
## 335      250.07004  13.833852  273.9472  270.8021   38.6362
## 337      375.72101  59.623789  458.9167  455.8643  167.3055
## 349      275.83447  42.937942  351.6451  336.2806  121.3284
## 350      313.60268  63.360558  394.4872  389.0527  151.0329
## 351      290.09775  28.978811  347.5655  321.5418   97.0390
## 352      337.42154  47.602380  388.5417  375.6406  166.8646
## 369      306.03464  37.460429  366.5131  348.1229  109.2707
## 370      291.70177  59.208212  372.2288  365.1630  146.9648
## 371      294.98404  36.505989  369.4692  350.7807  110.2034
## 372      317.88613  35.822804  369.1417  364.1236   99.7300
