Zhiguang Huo (Caleb)
Monday Nov 21st, 2022
data = pd.Series(np.arange(6), index = [["l1", "l1", "l1", "l2", "l2", "l2"], list("abc") * 2])
data## l1  a    0
##     b    1
##     c    2
## l2  a    3
##     b    4
##     c    5
## dtype: int64## MultiIndex([('l1', 'a'),
##             ('l1', 'b'),
##             ('l1', 'c'),
##             ('l2', 'a'),
##             ('l2', 'b'),
##             ('l2', 'c')],
##            )## a    0
## b    1
## c    2
## dtype: int64## a    3
## b    4
## c    5
## dtype: int64## 4## l1    0
## l2    3
## dtype: int64##     a  b  c
## l1  0  1  2
## l2  3  4  5## l1  a    0
##     b    1
##     c    2
## l2  a    3
##     b    4
##     c    5
## dtype: int64data = pd.Series(np.arange(6), index = [["l1", "l1", "l1", "l2", "l2", "l2"], list("abc") * 2])
data.index.names = ["Key1", "Key2"]
data.swaplevel("Key1", "Key2")## Key2  Key1
## a     l1      0
## b     l1      1
## c     l1      2
## a     l2      3
## b     l2      4
## c     l2      5
## dtype: int64## Key2  Key1
## a     l1      0
##       l2      3
## b     l1      1
##       l2      4
## c     l1      2
##       l2      5
## dtype: int64data2 = pd.DataFrame(np.arange(16).reshape((4, -1)),
    index=[["l1", "l1", "l2", "l2"], ["a", "b", "a", "b"]],
    columns=[["TX", "TX", "FL", "FL"],["Red", "Blue", "Red",  "Blue"]])
data2.index.names = ["Key1", "Key2"]
data2.columns.names = ["State", "Color"]
data2## State      TX       FL     
## Color     Red Blue Red Blue
## Key1 Key2                  
## l1   a      0    1   2    3
##      b      4    5   6    7
## l2   a      8    9  10   11
##      b     12   13  14   15## Color      Red  Blue
## Key1 Key2           
## l1   a       2     3
##      b       6     7
## l2   a      10    11
##      b      14    15## Index(['l1', 'l1', 'l2', 'l2'], dtype='object', name='Key1')## Index(['l1', 'l1', 'l2', 'l2'], dtype='object', name='Key1')## Index(['a', 'b', 'a', 'b'], dtype='object', name='Key2')## Index(['TX', 'TX', 'FL', 'FL'], dtype='object', name='State')## Index(['TX', 'TX', 'FL', 'FL'], dtype='object', name='State')## Index(['Red', 'Blue', 'Red', 'Blue'], dtype='object', name='Color')adata = pd.DataFrame({"c1": [1,1,1,2,2,2,2],
        "c2": list("abcdefg"),
        "c3": range(7,0,-1),
        "c4": list("abababa")}
)
adata.set_index("c1")##    c2  c3 c4
## c1          
## 1   a   7  a
## 1   b   6  b
## 1   c   5  a
## 2   d   4  b
## 2   e   3  a
## 2   f   2  b
## 2   g   1  a##     c1 c2  c3 c4
## c1              
## 1    1  a   7  a
## 1    1  b   6  b
## 1    1  c   5  a
## 2    2  d   4  b
## 2    2  e   3  a
## 2    2  f   2  b
## 2    2  g   1  a##       c2  c3
## c1 c4       
## 1  a   a   7
##    b   b   6
##    a   c   5
## 2  b   d   4
##    a   e   3
##    b   f   2
##    a   g   1##    c1 c4 c2  c3
## 0   1  a  a   7
## 1   1  b  b   6
## 2   1  a  c   5
## 3   2  b  d   4
## 4   2  a  e   3
## 5   2  b  f   2
## 6   2  a  g   1data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abc"), "value2": range(3)})
pd.merge(data1, data2)##   key  value1  value2
## 0   b       0       1
## 1   b       5       1
## 2   c       1       2
## 3   c       2       2
## 4   a       3       0
## 5   a       4       0##   key  value1  value2
## 0   b       0       1
## 1   b       5       1
## 2   c       1       2
## 3   c       2       2
## 4   a       3       0
## 5   a       4       0data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abd"), "value2": range(3)})
pd.merge(data1, data2, how="inner") ## default##   key  value1  value2
## 0   b       0       1
## 1   b       5       1
## 2   a       3       0
## 3   a       4       0##   key  value1  value2
## 0   b       0     1.0
## 1   c       1     NaN
## 2   c       2     NaN
## 3   a       3     0.0
## 4   a       4     0.0
## 5   b       5     1.0##   key  value1  value2
## 0   a     3.0       0
## 1   a     4.0       0
## 2   b     0.0       1
## 3   b     5.0       1
## 4   d     NaN       2##   key  value1  value2
## 0   b     0.0     1.0
## 1   b     5.0     1.0
## 2   c     1.0     NaN
## 3   c     2.0     NaN
## 4   a     3.0     0.0
## 5   a     4.0     0.0
## 6   d     NaN     2.0data1 = pd.DataFrame({"key1": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key2": list("abc"), "value2": range(3)})
pd.merge(data1, data2, left_on="key1", right_on="key2") ##   key1  value1 key2  value2
## 0    b       0    b       1
## 1    b       5    b       1
## 2    c       1    c       2
## 3    c       2    c       2
## 4    a       3    a       0
## 5    a       4    a       0data1 = pd.DataFrame({"keyA": list("bccaab"), "keyB": list("AABBCC"), "value1": range(6)})
data2 = pd.DataFrame({"keyA": list("abc"), "keyB": list("CBA"), "value2": range(3)})
pd.merge(data1, data2, on=["keyA", "keyB"]) 
#pd.merge(data1, data2, left_on=["keyA", "keyB"], right_on=["keyA", "keyB"]) ##   keyA keyB  value1  value2
## 0    c    A       1       2
## 1    a    C       4       0##   keyA keyB  value1  value2
## 0    b    A     0.0     NaN
## 1    c    A     1.0     2.0
## 2    c    B     2.0     NaN
## 3    a    B     3.0     NaN
## 4    a    C     4.0     0.0
## 5    b    C     5.0     NaN
## 6    b    B     NaN     1.0data1 = pd.DataFrame({"keyA": list("bccaab"), "keyB": list("AABBCC"), "value1": range(6)})
data2 = pd.DataFrame({"keyA": list("abc"), "keyB": list("CBA"), "value2": range(3)})
pd.merge(data1, data2, on=["keyA"]) ##   keyA keyB_x  value1 keyB_y  value2
## 0    b      A       0      B       1
## 1    b      C       5      B       1
## 2    c      A       1      A       2
## 3    c      B       2      A       2
## 4    a      B       3      C       0
## 5    a      C       4      C       0##   keyA keyB_1  value1 keyB_2  value2
## 0    b      A       0      B       1
## 1    b      C       5      B       1
## 2    c      A       1      A       2
## 3    c      B       2      A       2
## 4    a      B       3      C       0
## 5    a      C       4      C       0data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abd"), "value2": range(3)})
data1_key = data1.set_index("key")
data2_key = data2.set_index("key")
pd.merge(data1_key,data2_key, left_index=True, right_index=True)##      value1  value2
## key                
## a         3       0
## a         4       0
## b         0       1
## b         5       1##   key  value1  value2
## 0   b       0       1
## 5   b       5       1
## 3   a       3       0
## 4   a       4       0df_left = pd.DataFrame({"Key1": ["l1","l2","l3"], 
        "Key2": ["a", "b", "c"], 
        "value":[11,22,33]})
df_right = pd.DataFrame(np.arange(6), index = [["l1", "l1", "l1", "l2", "l2", "l2"], list("abc") * 2])
df_right.index.names = ["Key1", "Key2"]
pd.merge(df_left,df_right, left_on=["Key1", "Key2"], right_index=True, how="outer")##   Key1 Key2  value    0
## 0   l1    a   11.0  0.0
## 1   l2    b   22.0  4.0
## 2   l3    c   33.0  NaN
## 2   l1    b    NaN  1.0
## 2   l1    c    NaN  2.0
## 2   l2    a    NaN  3.0
## 2   l2    c    NaN  5.0Based on df_left and df_right in the previous slides
df_right2 = df_right.unstack("Key2")[0]
pd.merge(df_left,df_right2, left_on=["Key1"], right_index=True)##   Key1 Key2  value  a  b  c
## 0   l1    a     11  0  1  2
## 1   l2    b     22  3  4  5df_right2 = df_right.unstack("Key1")[0]
pd.merge(df_left,df_right2, left_on=["Key2"], right_index=True)##   Key1 Key2  value  l1  l2
## 0   l1    a     11   0   3
## 1   l2    b     22   1   4
## 2   l3    c     33   2   5data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abd"), "value2": range(3)})
data1_key = data1.set_index("key")
data2_key = data2.set_index("key")##      value2  value1
## key                
## a         0     3.0
## a         0     4.0
## b         1     0.0
## b         1     5.0
## d         2     NaN##      value1  value2
## key                
## a         3     0.0
## a         4     0.0
## b         0     1.0
## b         5     1.0
## c         1     NaN
## c         2     NaN##   key  value1  value2
## 0   b       0     1.0
## 1   c       1     NaN
## 2   c       2     NaN
## 3   a       3     0.0
## 4   a       4     0.0
## 5   b       5     1.0series1 = pd.Series([1,2,3], index=list("abc"))
series2 = pd.Series([3,5,7], index=list("abd"))
pd.concat([series1, series2])## a    1
## b    2
## c    3
## a    3
## b    5
## d    7
## dtype: int64## data1  a    1
##        b    2
##        c    3
## data2  a    3
##        b    5
##        d    7
## dtype: int64##          a    b    c    d
## data1  1.0  2.0  3.0  NaN
## data2  3.0  5.0  NaN  7.0##      0    1
## a  1.0  3.0
## b  2.0  5.0
## c  3.0  NaN
## d  NaN  7.0##    0  1
## a  1  3
## b  2  5df1 = pd.DataFrame(np.arange(6).reshape(3,-1), index=list("ABC"), columns= ["C1", "C2"])
df2 = pd.DataFrame(np.arange(4).reshape(2,-1), index=list("AB"), columns= ["C2", "C3"])
pd.concat([df1, df2])##     C1  C2   C3
## A  0.0   1  NaN
## B  2.0   3  NaN
## C  4.0   5  NaN
## A  NaN   0  1.0
## B  NaN   2  3.0##     C1  C2   C3
## 0  0.0   1  NaN
## 1  2.0   3  NaN
## 2  4.0   5  NaN
## 3  NaN   0  1.0
## 4  NaN   2  3.0df1 = pd.DataFrame(np.arange(6).reshape(3,-1), index=list("ABC"), columns= ["C1", "C2"])
df2 = pd.DataFrame(np.arange(4).reshape(2,-1), index=list("AB"), columns= ["D1", "D2"])
pd.concat([df1, df2], axis=1)##    C1  C2   D1   D2
## A   0   1  0.0  1.0
## B   2   3  2.0  3.0
## C   4   5  NaN  NaN##   Level1    Level2     
##       C1 C2     D1   D2
## A      0  1    0.0  1.0
## B      2  3    2.0  3.0
## C      4  5    NaN  NaN##   Level1    Level2     
##       C1 C2     D1   D2
## A      0  1    0.0  1.0
## B      2  3    2.0  3.0
## C      4  5    NaN  NaNs1 = pd.Series([np.nan,1,2,3,np.nan], index=list("abcde"))
s2 = pd.Series([1,np.nan,6,7, np.nan], index=list("bcdea"))
np.where(pd.isna(s1), s2, s1) ## this will ignore index## array([ 1.,  1.,  2.,  3., nan])## a    NaN
## b    1.0
## c    2.0
## d    3.0
## e    7.0
## dtype: float64df1 = pd.DataFrame({"C1": [np.nan, 1, 2], 
                    "C2": [5, np.nan, 3],
                    "C3": [np.nan, 9, np.nan]
}, index=list("ABC"))
df2 = pd.DataFrame({"C1": [1,2,], 
                    "C2": [4,5,]
}, index=list("AB"))
df1##     C1   C2   C3
## A  NaN  5.0  NaN
## B  1.0  NaN  9.0
## C  2.0  3.0  NaN##    C1  C2
## A   1   4
## B   2   5##     C1   C2   C3
## A  1.0  5.0  NaN
## B  1.0  5.0  9.0
## C  2.0  3.0  NaNpd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd1.columns.name="Columns"
pd1.index.name="States"
pd1## Columns  b  d  c
## States          
## Florida  0  1  2
## Texax    3  4  5
## Utah     6  7  8## States   Columns
## Florida  b          0
##          d          1
##          c          2
## Texax    b          3
##          d          4
##          c          5
## Utah     b          6
##          d          7
##          c          8
## dtype: int64## Columns  b  d  c
## States          
## Florida  0  1  2
## Texax    3  4  5
## Utah     6  7  8## States   Florida  Texax  Utah
## Columns                      
## b              0      3     6
## d              1      4     7
## c              2      5     8## Columns  b  d  c
## States          
## Florida  0  1  2
## Texax    3  4  5
## Utah     6  7  8## States   Florida  Texax  Utah
## Columns                      
## b              0      3     6
## d              1      4     7
## c              2      5     8pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd1.columns.name="Columns"
pd1.index.name="States"
# pd1
pd1.stack("Columns").unstack("States")## States   Florida  Texax  Utah
## Columns                      
## b              0      3     6
## d              1      4     7
## c              2      5     8## States   Florida  Texax  Utah
## Columns                      
## b              0      3     6
## d              1      4     7
## c              2      5     8import io
import requests
url="https://caleb-huo.github.io/teaching/data/sleep/sleepstudy.csv"
s=requests.get(url).content
sleepdata=pd.read_csv(io.StringIO(s.decode('utf-8')))
sleepdata.head()##    Reaction  Days  Subject
## 0  249.5600     0      308
## 1  258.7047     1      308
## 2  250.8006     2      308
## 3  321.4398     3      308
## 4  356.8519     4      308##          Reaction                      ...                              
## Days            0         1         2  ...         7         8         9
## Subject                                ...                              
## 308      249.5600  258.7047  250.8006  ...  290.1486  430.5853  466.3535
## 309      222.7339  205.2658  202.9778  ...  217.7272  224.2957  237.3142
## 310      199.0539  194.3322  234.3200  ...  255.7511  261.0125  247.5153
## 330      321.5426  300.4002  283.8565  ...  318.2613  305.3495  354.0487
## 331      287.6079  285.0000  301.8206  ...  334.8177  293.7469  371.5811
## 
## [5 rows x 10 columns]sleepdata.set_index(["Subject", "Days"]).unstack("Days")pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd1.index.name = "State"
pd2 = pd1.reset_index()
pd2##      State  b  d  c
## 0  Florida  0  1  2
## 1    Texax  3  4  5
## 2     Utah  6  7  8##      State variable  value
## 0  Florida        b      0
## 1    Texax        b      3
## 2     Utah        b      6
## 3  Florida        d      1
## 4    Texax        d      4
## 5     Utah        d      7
## 6  Florida        c      2
## 7    Texax        c      5
## 8     Utah        c      8sleepdata_wide3 = sleepdata_wide
sleepdata_wide3.columns = sleepdata_wide.columns.get_level_values(1)
sleepdata_wide3.reset_index(inplace=True)
sleepdata_long = pd.melt(sleepdata_wide3, id_vars = ["Subject"], value_vars = range(10))
sleepdata_long##      Subject Days     value
## 0        308    0  249.5600
## 1        309    0  222.7339
## 2        310    0  199.0539
## 3        330    0  321.5426
## 4        331    0  287.6079
## ..       ...  ...       ...
## 175      352    9  388.5417
## 176      369    9  366.5131
## 177      370    9  372.2288
## 178      371    9  369.4692
## 179      372    9  364.1236
## 
## [180 rows x 3 columns]##      Subject  Days         0
## 0        308     0  249.5600
## 1        308     1  258.7047
## 2        308     2  250.8006
## 3        308     3  321.4398
## 4        308     4  356.8519
## ..       ...   ...       ...
## 175      372     5  329.6076
## 176      372     6  334.4818
## 177      372     7  343.2199
## 178      372     8  369.1417
## 179      372     9  364.1236
## 
## [180 rows x 3 columns]##   Key1  value1
## 0    A       0
## 1    A       1
## 2    A       2
## 3    B       3
## 4    B       4
## 5    B       5##       value1
## Key1        
## A        1.0
## B        4.0## Key1
## A    1.0
## B    4.0
## Name: value1, dtype: float64adata = pd.DataFrame({
    "Key1": list("AAABBB"),
    "value1": np.arange(6),
})
grouped_data = adata.groupby("Key1")
for name, data in grouped_data:
    print(name)
    print(data)## A
##   Key1  value1
## 0    A       0
## 1    A       1
## 2    A       2
## B
##   Key1  value1
## 3    B       3
## 4    B       4
## 5    B       5adata = pd.DataFrame({
    "Key1": list("AAABBB"),
    "value1": np.arange(6),
})
grouped_data = adata.groupby("Key1")
adict = dict(list(grouped_data))
adict.keys()## dict_keys(['A', 'B'])##   Key1  value1
## 0    A       0
## 1    A       1
## 2    A       2bdata = pd.DataFrame({
    "Key1": list("AAABBB"),
    "Key2": ["blue", "orange", "blue", "orange", "blue", "orange"],
    "value1": np.arange(6),
    "value2": np.arange(10,16)
})
bdata##   Key1    Key2  value1  value2
## 0    A    blue       0      10
## 1    A  orange       1      11
## 2    A    blue       2      12
## 3    B  orange       3      13
## 4    B    blue       4      14
## 5    B  orange       5      15##              value1  value2
## Key1 Key2                  
## A    blue       1.0    11.0
##      orange     1.0    11.0
## B    blue       4.0    14.0
##      orange     4.0    14.0## Key1  Key2  
## A     blue      2
##       orange    1
## B     blue      1
##       orange    2
## dtype: int64## Key1  Key2  
## A     blue      1.0
##       orange    1.0
## B     blue      4.0
##       orange    4.0
## Name: value1, dtype: float64## Key1  Key2  
## A     blue      2
##       orange    1
## B     blue      1
##       orange    2
## Name: value2, dtype: int64##       Key1    Key2  value1  value2
## Amy      A    blue       0      10
## Beth     A  orange       1      11
## Carl     A    blue       2      12
## Dan      B  orange       3      13
## Emily    B    blue       4      14
## Frank    B  orange       5      15##    value1  value2
## 3       0      10
## 4       1      11
## 5       4      14##       value1  value2
## Key1                
## A        1.0    11.0
## B        4.0    14.0def maxDiff(arr):
    return(arr.max() - arr.min())
bdata.drop(columns = "Key2").groupby("Key1").agg(maxDiff)##       value1  value2
## Key1                
## A          2       2
## B          2       2##       value1  value2
## Key1                
## A          2       2
## B          2       2##      value1              value2              
##      median mean maxDiff median  mean maxDiff
## Key1                                         
## A       1.0  1.0       2   11.0  11.0       2
## B       4.0  4.0       2   14.0  14.0       2##        value1                    value2                 
##      mymedian mymean mymaxDiff mymedian mymean mymaxDiff
## Key1                                                    
## A         1.0    1.0         2     11.0   11.0         2
## B         4.0    4.0         2     14.0   14.0         2##      value1       value2
##      median mean maxDiff
## Key1                    
## A       1.0  1.0       2
## B       4.0  4.0       2##             value1       value2
##             median mean maxDiff
## Key1 Key2                      
## A    blue      1.0  1.0       2
##      orange    1.0  1.0       0
## B    blue      4.0  4.0       0
##      orange    4.0  4.0       2##   Key1    Key2  value1  value2
## 0    A    blue     1.0    11.0
## 1    A  orange     1.0    11.0
## 2    B    blue     4.0    14.0
## 3    B  orange     4.0    14.0##   Key1    Key2  value1  value2
## 0    A    blue     1.0    11.0
## 1    A  orange     1.0    11.0
## 2    B    blue     4.0    14.0
## 3    B  orange     4.0    14.0##   Key1    Key2  value1  value2
## 0    A    blue     1.0    11.0
## 1    A  orange     1.0    11.0
## 2    B    blue     4.0    14.0
## 3    B  orange     4.0    14.0##   Key1    Key2  value1  value2
## 0    A    blue     1.0    11.0
## 1    A  orange     1.0    11.0
## 2    B    blue     4.0    14.0
## 3    B  orange     4.0    14.0def myMean(arr, offset = 5):
    return(arr.mean() - offset)
bdata.groupby(["Key1", "Key2"], as_index=False).agg(myMean)##   Key1    Key2  value1  value2
## 0    A    blue    -4.0     6.0
## 1    A  orange    -4.0     6.0
## 2    B    blue    -1.0     9.0
## 3    B  orange    -1.0     9.0##   Key1    Key2  value1  value2
## 0    A    blue    -3.0     7.0
## 1    A  orange    -3.0     7.0
## 2    B    blue     0.0    10.0
## 3    B  orange     0.0    10.0## Key1  Key2  
## A     blue      1.0
##       orange    1.0
## B     blue      4.0
##       orange    4.0
## Name: value1, dtype: float64##   Key1    Key2  value1
## 0    A    blue     1.0
## 1    A  orange     1.0
## 2    B    blue     4.0
## 3    B  orange     4.0## Key1  Key2  
## A     blue      6.0
##       orange    6.0
## B     blue      9.0
##       orange    9.0
## Name: value2, dtype: float64## Key1  Key2  
## A     blue       8.0
##       orange     8.0
## B     blue      11.0
##       orange    11.0
## Name: value2, dtype: float64##    Reaction  Days  Subject
## 0  249.5600     0      308
## 1  258.7047     1      308
## 2  250.8006     2      308
## 3  321.4398     3      308
## 4  356.8519     4      308def maxDiff(arr):
    return(arr.max() - arr.min())
def secMax(arr):
    return(np.sort(arr)[::-1][1])
sleepdata.drop(columns="Days").groupby(["Subject"]).agg([np.mean, np.std, np.max, secMax, maxDiff])##           Reaction                                         
##               mean        std      amax    secMax   maxDiff
## Subject                                                    
## 308      342.13383  79.821763  466.3535  430.5853  216.7935
## 309      215.23298  10.812193  237.3142  224.2957   34.3364
## 310      231.00127  21.855996  261.0125  255.7511   66.6803
## 330      303.22142  22.909199  354.0487  321.5426   73.8091
## 331      309.43605  27.242607  371.5811  334.8177   86.5811
## 332      307.30207  64.306126  454.1619  346.8311  219.3013
## 333      316.15831  30.068208  362.0428  348.8399   85.2735
## 334      295.30205  41.855607  377.2990  335.7469  133.9343
## 335      250.07004  13.833852  273.9472  270.8021   38.6362
## 337      375.72101  59.623789  458.9167  455.8643  167.3055
## 349      275.83447  42.937942  351.6451  336.2806  121.3284
## 350      313.60268  63.360558  394.4872  389.0527  151.0329
## 351      290.09775  28.978811  347.5655  321.5418   97.0390
## 352      337.42154  47.602380  388.5417  375.6406  166.8646
## 369      306.03464  37.460429  366.5131  348.1229  109.2707
## 370      291.70177  59.208212  372.2288  365.1630  146.9648
## 371      294.98404  36.505989  369.4692  350.7807  110.2034
## 372      317.88613  35.822804  369.1417  364.1236   99.7300