Zhiguang Huo (Caleb)
Monday Nov 21st, 2022
data = pd.Series(np.arange(6), index = [["l1", "l1", "l1", "l2", "l2", "l2"], list("abc") * 2])
data
## l1 a 0
## b 1
## c 2
## l2 a 3
## b 4
## c 5
## dtype: int64
## MultiIndex([('l1', 'a'),
## ('l1', 'b'),
## ('l1', 'c'),
## ('l2', 'a'),
## ('l2', 'b'),
## ('l2', 'c')],
## )
## a 0
## b 1
## c 2
## dtype: int64
## a 3
## b 4
## c 5
## dtype: int64
## 4
## l1 0
## l2 3
## dtype: int64
## a b c
## l1 0 1 2
## l2 3 4 5
## l1 a 0
## b 1
## c 2
## l2 a 3
## b 4
## c 5
## dtype: int64
data = pd.Series(np.arange(6), index = [["l1", "l1", "l1", "l2", "l2", "l2"], list("abc") * 2])
data.index.names = ["Key1", "Key2"]
data.swaplevel("Key1", "Key2")
## Key2 Key1
## a l1 0
## b l1 1
## c l1 2
## a l2 3
## b l2 4
## c l2 5
## dtype: int64
## Key2 Key1
## a l1 0
## l2 3
## b l1 1
## l2 4
## c l1 2
## l2 5
## dtype: int64
data2 = pd.DataFrame(np.arange(16).reshape((4, -1)),
index=[["l1", "l1", "l2", "l2"], ["a", "b", "a", "b"]],
columns=[["TX", "TX", "FL", "FL"],["Red", "Blue", "Red", "Blue"]])
data2.index.names = ["Key1", "Key2"]
data2.columns.names = ["State", "Color"]
data2
## State TX FL
## Color Red Blue Red Blue
## Key1 Key2
## l1 a 0 1 2 3
## b 4 5 6 7
## l2 a 8 9 10 11
## b 12 13 14 15
## Color Red Blue
## Key1 Key2
## l1 a 2 3
## b 6 7
## l2 a 10 11
## b 14 15
## Index(['l1', 'l1', 'l2', 'l2'], dtype='object', name='Key1')
## Index(['l1', 'l1', 'l2', 'l2'], dtype='object', name='Key1')
## Index(['a', 'b', 'a', 'b'], dtype='object', name='Key2')
## Index(['TX', 'TX', 'FL', 'FL'], dtype='object', name='State')
## Index(['TX', 'TX', 'FL', 'FL'], dtype='object', name='State')
## Index(['Red', 'Blue', 'Red', 'Blue'], dtype='object', name='Color')
adata = pd.DataFrame({"c1": [1,1,1,2,2,2,2],
"c2": list("abcdefg"),
"c3": range(7,0,-1),
"c4": list("abababa")}
)
adata.set_index("c1")
## c2 c3 c4
## c1
## 1 a 7 a
## 1 b 6 b
## 1 c 5 a
## 2 d 4 b
## 2 e 3 a
## 2 f 2 b
## 2 g 1 a
## c1 c2 c3 c4
## c1
## 1 1 a 7 a
## 1 1 b 6 b
## 1 1 c 5 a
## 2 2 d 4 b
## 2 2 e 3 a
## 2 2 f 2 b
## 2 2 g 1 a
## c2 c3
## c1 c4
## 1 a a 7
## b b 6
## a c 5
## 2 b d 4
## a e 3
## b f 2
## a g 1
## c1 c4 c2 c3
## 0 1 a a 7
## 1 1 b b 6
## 2 1 a c 5
## 3 2 b d 4
## 4 2 a e 3
## 5 2 b f 2
## 6 2 a g 1
data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abc"), "value2": range(3)})
pd.merge(data1, data2)
## key value1 value2
## 0 b 0 1
## 1 b 5 1
## 2 c 1 2
## 3 c 2 2
## 4 a 3 0
## 5 a 4 0
## key value1 value2
## 0 b 0 1
## 1 b 5 1
## 2 c 1 2
## 3 c 2 2
## 4 a 3 0
## 5 a 4 0
data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abd"), "value2": range(3)})
pd.merge(data1, data2, how="inner") ## default
## key value1 value2
## 0 b 0 1
## 1 b 5 1
## 2 a 3 0
## 3 a 4 0
## key value1 value2
## 0 b 0 1.0
## 1 c 1 NaN
## 2 c 2 NaN
## 3 a 3 0.0
## 4 a 4 0.0
## 5 b 5 1.0
## key value1 value2
## 0 a 3.0 0
## 1 a 4.0 0
## 2 b 0.0 1
## 3 b 5.0 1
## 4 d NaN 2
## key value1 value2
## 0 b 0.0 1.0
## 1 b 5.0 1.0
## 2 c 1.0 NaN
## 3 c 2.0 NaN
## 4 a 3.0 0.0
## 5 a 4.0 0.0
## 6 d NaN 2.0
data1 = pd.DataFrame({"key1": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key2": list("abc"), "value2": range(3)})
pd.merge(data1, data2, left_on="key1", right_on="key2")
## key1 value1 key2 value2
## 0 b 0 b 1
## 1 b 5 b 1
## 2 c 1 c 2
## 3 c 2 c 2
## 4 a 3 a 0
## 5 a 4 a 0
data1 = pd.DataFrame({"keyA": list("bccaab"), "keyB": list("AABBCC"), "value1": range(6)})
data2 = pd.DataFrame({"keyA": list("abc"), "keyB": list("CBA"), "value2": range(3)})
pd.merge(data1, data2, on=["keyA", "keyB"])
#pd.merge(data1, data2, left_on=["keyA", "keyB"], right_on=["keyA", "keyB"])
## keyA keyB value1 value2
## 0 c A 1 2
## 1 a C 4 0
## keyA keyB value1 value2
## 0 b A 0.0 NaN
## 1 c A 1.0 2.0
## 2 c B 2.0 NaN
## 3 a B 3.0 NaN
## 4 a C 4.0 0.0
## 5 b C 5.0 NaN
## 6 b B NaN 1.0
data1 = pd.DataFrame({"keyA": list("bccaab"), "keyB": list("AABBCC"), "value1": range(6)})
data2 = pd.DataFrame({"keyA": list("abc"), "keyB": list("CBA"), "value2": range(3)})
pd.merge(data1, data2, on=["keyA"])
## keyA keyB_x value1 keyB_y value2
## 0 b A 0 B 1
## 1 b C 5 B 1
## 2 c A 1 A 2
## 3 c B 2 A 2
## 4 a B 3 C 0
## 5 a C 4 C 0
## keyA keyB_1 value1 keyB_2 value2
## 0 b A 0 B 1
## 1 b C 5 B 1
## 2 c A 1 A 2
## 3 c B 2 A 2
## 4 a B 3 C 0
## 5 a C 4 C 0
data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abd"), "value2": range(3)})
data1_key = data1.set_index("key")
data2_key = data2.set_index("key")
pd.merge(data1_key,data2_key, left_index=True, right_index=True)
## value1 value2
## key
## a 3 0
## a 4 0
## b 0 1
## b 5 1
## key value1 value2
## 0 b 0 1
## 5 b 5 1
## 3 a 3 0
## 4 a 4 0
df_left = pd.DataFrame({"Key1": ["l1","l2","l3"],
"Key2": ["a", "b", "c"],
"value":[11,22,33]})
df_right = pd.DataFrame(np.arange(6), index = [["l1", "l1", "l1", "l2", "l2", "l2"], list("abc") * 2])
df_right.index.names = ["Key1", "Key2"]
pd.merge(df_left,df_right, left_on=["Key1", "Key2"], right_index=True, how="outer")
## Key1 Key2 value 0
## 0 l1 a 11.0 0.0
## 1 l2 b 22.0 4.0
## 2 l3 c 33.0 NaN
## 2 l1 b NaN 1.0
## 2 l1 c NaN 2.0
## 2 l2 a NaN 3.0
## 2 l2 c NaN 5.0
Based on df_left and df_right in the previous slides
df_right2 = df_right.unstack("Key2")[0]
pd.merge(df_left,df_right2, left_on=["Key1"], right_index=True)
## Key1 Key2 value a b c
## 0 l1 a 11 0 1 2
## 1 l2 b 22 3 4 5
df_right2 = df_right.unstack("Key1")[0]
pd.merge(df_left,df_right2, left_on=["Key2"], right_index=True)
## Key1 Key2 value l1 l2
## 0 l1 a 11 0 3
## 1 l2 b 22 1 4
## 2 l3 c 33 2 5
data1 = pd.DataFrame({"key": list("bccaab"), "value1": range(6)})
data2 = pd.DataFrame({"key": list("abd"), "value2": range(3)})
data1_key = data1.set_index("key")
data2_key = data2.set_index("key")
## value2 value1
## key
## a 0 3.0
## a 0 4.0
## b 1 0.0
## b 1 5.0
## d 2 NaN
## value1 value2
## key
## a 3 0.0
## a 4 0.0
## b 0 1.0
## b 5 1.0
## c 1 NaN
## c 2 NaN
## key value1 value2
## 0 b 0 1.0
## 1 c 1 NaN
## 2 c 2 NaN
## 3 a 3 0.0
## 4 a 4 0.0
## 5 b 5 1.0
series1 = pd.Series([1,2,3], index=list("abc"))
series2 = pd.Series([3,5,7], index=list("abd"))
pd.concat([series1, series2])
## a 1
## b 2
## c 3
## a 3
## b 5
## d 7
## dtype: int64
## data1 a 1
## b 2
## c 3
## data2 a 3
## b 5
## d 7
## dtype: int64
## a b c d
## data1 1.0 2.0 3.0 NaN
## data2 3.0 5.0 NaN 7.0
## 0 1
## a 1.0 3.0
## b 2.0 5.0
## c 3.0 NaN
## d NaN 7.0
## 0 1
## a 1 3
## b 2 5
df1 = pd.DataFrame(np.arange(6).reshape(3,-1), index=list("ABC"), columns= ["C1", "C2"])
df2 = pd.DataFrame(np.arange(4).reshape(2,-1), index=list("AB"), columns= ["C2", "C3"])
pd.concat([df1, df2])
## C1 C2 C3
## A 0.0 1 NaN
## B 2.0 3 NaN
## C 4.0 5 NaN
## A NaN 0 1.0
## B NaN 2 3.0
## C1 C2 C3
## 0 0.0 1 NaN
## 1 2.0 3 NaN
## 2 4.0 5 NaN
## 3 NaN 0 1.0
## 4 NaN 2 3.0
df1 = pd.DataFrame(np.arange(6).reshape(3,-1), index=list("ABC"), columns= ["C1", "C2"])
df2 = pd.DataFrame(np.arange(4).reshape(2,-1), index=list("AB"), columns= ["D1", "D2"])
pd.concat([df1, df2], axis=1)
## C1 C2 D1 D2
## A 0 1 0.0 1.0
## B 2 3 2.0 3.0
## C 4 5 NaN NaN
## Level1 Level2
## C1 C2 D1 D2
## A 0 1 0.0 1.0
## B 2 3 2.0 3.0
## C 4 5 NaN NaN
## Level1 Level2
## C1 C2 D1 D2
## A 0 1 0.0 1.0
## B 2 3 2.0 3.0
## C 4 5 NaN NaN
s1 = pd.Series([np.nan,1,2,3,np.nan], index=list("abcde"))
s2 = pd.Series([1,np.nan,6,7, np.nan], index=list("bcdea"))
np.where(pd.isna(s1), s2, s1) ## this will ignore index
## array([ 1., 1., 2., 3., nan])
## a NaN
## b 1.0
## c 2.0
## d 3.0
## e 7.0
## dtype: float64
df1 = pd.DataFrame({"C1": [np.nan, 1, 2],
"C2": [5, np.nan, 3],
"C3": [np.nan, 9, np.nan]
}, index=list("ABC"))
df2 = pd.DataFrame({"C1": [1,2,],
"C2": [4,5,]
}, index=list("AB"))
df1
## C1 C2 C3
## A NaN 5.0 NaN
## B 1.0 NaN 9.0
## C 2.0 3.0 NaN
## C1 C2
## A 1 4
## B 2 5
## C1 C2 C3
## A 1.0 5.0 NaN
## B 1.0 5.0 9.0
## C 2.0 3.0 NaN
pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd1.columns.name="Columns"
pd1.index.name="States"
pd1
## Columns b d c
## States
## Florida 0 1 2
## Texax 3 4 5
## Utah 6 7 8
## States Columns
## Florida b 0
## d 1
## c 2
## Texax b 3
## d 4
## c 5
## Utah b 6
## d 7
## c 8
## dtype: int64
## Columns b d c
## States
## Florida 0 1 2
## Texax 3 4 5
## Utah 6 7 8
## States Florida Texax Utah
## Columns
## b 0 3 6
## d 1 4 7
## c 2 5 8
## Columns b d c
## States
## Florida 0 1 2
## Texax 3 4 5
## Utah 6 7 8
## States Florida Texax Utah
## Columns
## b 0 3 6
## d 1 4 7
## c 2 5 8
pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd1.columns.name="Columns"
pd1.index.name="States"
# pd1
pd1.stack("Columns").unstack("States")
## States Florida Texax Utah
## Columns
## b 0 3 6
## d 1 4 7
## c 2 5 8
## States Florida Texax Utah
## Columns
## b 0 3 6
## d 1 4 7
## c 2 5 8
import io
import requests
url="https://caleb-huo.github.io/teaching/data/sleep/sleepstudy.csv"
s=requests.get(url).content
sleepdata=pd.read_csv(io.StringIO(s.decode('utf-8')))
sleepdata.head()
## Reaction Days Subject
## 0 249.5600 0 308
## 1 258.7047 1 308
## 2 250.8006 2 308
## 3 321.4398 3 308
## 4 356.8519 4 308
## Reaction ...
## Days 0 1 2 ... 7 8 9
## Subject ...
## 308 249.5600 258.7047 250.8006 ... 290.1486 430.5853 466.3535
## 309 222.7339 205.2658 202.9778 ... 217.7272 224.2957 237.3142
## 310 199.0539 194.3322 234.3200 ... 255.7511 261.0125 247.5153
## 330 321.5426 300.4002 283.8565 ... 318.2613 305.3495 354.0487
## 331 287.6079 285.0000 301.8206 ... 334.8177 293.7469 371.5811
##
## [5 rows x 10 columns]
sleepdata.set_index(["Subject", "Days"]).unstack("Days")
pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd1.index.name = "State"
pd2 = pd1.reset_index()
pd2
## State b d c
## 0 Florida 0 1 2
## 1 Texax 3 4 5
## 2 Utah 6 7 8
## State variable value
## 0 Florida b 0
## 1 Texax b 3
## 2 Utah b 6
## 3 Florida d 1
## 4 Texax d 4
## 5 Utah d 7
## 6 Florida c 2
## 7 Texax c 5
## 8 Utah c 8
sleepdata_wide3 = sleepdata_wide
sleepdata_wide3.columns = sleepdata_wide.columns.get_level_values(1)
sleepdata_wide3.reset_index(inplace=True)
sleepdata_long = pd.melt(sleepdata_wide3, id_vars = ["Subject"], value_vars = range(10))
sleepdata_long
## Subject Days value
## 0 308 0 249.5600
## 1 309 0 222.7339
## 2 310 0 199.0539
## 3 330 0 321.5426
## 4 331 0 287.6079
## .. ... ... ...
## 175 352 9 388.5417
## 176 369 9 366.5131
## 177 370 9 372.2288
## 178 371 9 369.4692
## 179 372 9 364.1236
##
## [180 rows x 3 columns]
## Subject Days 0
## 0 308 0 249.5600
## 1 308 1 258.7047
## 2 308 2 250.8006
## 3 308 3 321.4398
## 4 308 4 356.8519
## .. ... ... ...
## 175 372 5 329.6076
## 176 372 6 334.4818
## 177 372 7 343.2199
## 178 372 8 369.1417
## 179 372 9 364.1236
##
## [180 rows x 3 columns]
## Key1 value1
## 0 A 0
## 1 A 1
## 2 A 2
## 3 B 3
## 4 B 4
## 5 B 5
## value1
## Key1
## A 1.0
## B 4.0
## Key1
## A 1.0
## B 4.0
## Name: value1, dtype: float64
adata = pd.DataFrame({
"Key1": list("AAABBB"),
"value1": np.arange(6),
})
grouped_data = adata.groupby("Key1")
for name, data in grouped_data:
print(name)
print(data)
## A
## Key1 value1
## 0 A 0
## 1 A 1
## 2 A 2
## B
## Key1 value1
## 3 B 3
## 4 B 4
## 5 B 5
adata = pd.DataFrame({
"Key1": list("AAABBB"),
"value1": np.arange(6),
})
grouped_data = adata.groupby("Key1")
adict = dict(list(grouped_data))
adict.keys()
## dict_keys(['A', 'B'])
## Key1 value1
## 0 A 0
## 1 A 1
## 2 A 2
bdata = pd.DataFrame({
"Key1": list("AAABBB"),
"Key2": ["blue", "orange", "blue", "orange", "blue", "orange"],
"value1": np.arange(6),
"value2": np.arange(10,16)
})
bdata
## Key1 Key2 value1 value2
## 0 A blue 0 10
## 1 A orange 1 11
## 2 A blue 2 12
## 3 B orange 3 13
## 4 B blue 4 14
## 5 B orange 5 15
## value1 value2
## Key1 Key2
## A blue 1.0 11.0
## orange 1.0 11.0
## B blue 4.0 14.0
## orange 4.0 14.0
## Key1 Key2
## A blue 2
## orange 1
## B blue 1
## orange 2
## dtype: int64
## Key1 Key2
## A blue 1.0
## orange 1.0
## B blue 4.0
## orange 4.0
## Name: value1, dtype: float64
## Key1 Key2
## A blue 2
## orange 1
## B blue 1
## orange 2
## Name: value2, dtype: int64
## Key1 Key2 value1 value2
## Amy A blue 0 10
## Beth A orange 1 11
## Carl A blue 2 12
## Dan B orange 3 13
## Emily B blue 4 14
## Frank B orange 5 15
## value1 value2
## 3 0 10
## 4 1 11
## 5 4 14
## value1 value2
## Key1
## A 1.0 11.0
## B 4.0 14.0
def maxDiff(arr):
return(arr.max() - arr.min())
bdata.drop(columns = "Key2").groupby("Key1").agg(maxDiff)
## value1 value2
## Key1
## A 2 2
## B 2 2
## value1 value2
## Key1
## A 2 2
## B 2 2
## value1 value2
## median mean maxDiff median mean maxDiff
## Key1
## A 1.0 1.0 2 11.0 11.0 2
## B 4.0 4.0 2 14.0 14.0 2
## value1 value2
## mymedian mymean mymaxDiff mymedian mymean mymaxDiff
## Key1
## A 1.0 1.0 2 11.0 11.0 2
## B 4.0 4.0 2 14.0 14.0 2
## value1 value2
## median mean maxDiff
## Key1
## A 1.0 1.0 2
## B 4.0 4.0 2
## value1 value2
## median mean maxDiff
## Key1 Key2
## A blue 1.0 1.0 2
## orange 1.0 1.0 0
## B blue 4.0 4.0 0
## orange 4.0 4.0 2
## Key1 Key2 value1 value2
## 0 A blue 1.0 11.0
## 1 A orange 1.0 11.0
## 2 B blue 4.0 14.0
## 3 B orange 4.0 14.0
## Key1 Key2 value1 value2
## 0 A blue 1.0 11.0
## 1 A orange 1.0 11.0
## 2 B blue 4.0 14.0
## 3 B orange 4.0 14.0
## Key1 Key2 value1 value2
## 0 A blue 1.0 11.0
## 1 A orange 1.0 11.0
## 2 B blue 4.0 14.0
## 3 B orange 4.0 14.0
## Key1 Key2 value1 value2
## 0 A blue 1.0 11.0
## 1 A orange 1.0 11.0
## 2 B blue 4.0 14.0
## 3 B orange 4.0 14.0
def myMean(arr, offset = 5):
return(arr.mean() - offset)
bdata.groupby(["Key1", "Key2"], as_index=False).agg(myMean)
## Key1 Key2 value1 value2
## 0 A blue -4.0 6.0
## 1 A orange -4.0 6.0
## 2 B blue -1.0 9.0
## 3 B orange -1.0 9.0
## Key1 Key2 value1 value2
## 0 A blue -3.0 7.0
## 1 A orange -3.0 7.0
## 2 B blue 0.0 10.0
## 3 B orange 0.0 10.0
## Key1 Key2
## A blue 1.0
## orange 1.0
## B blue 4.0
## orange 4.0
## Name: value1, dtype: float64
## Key1 Key2 value1
## 0 A blue 1.0
## 1 A orange 1.0
## 2 B blue 4.0
## 3 B orange 4.0
## Key1 Key2
## A blue 6.0
## orange 6.0
## B blue 9.0
## orange 9.0
## Name: value2, dtype: float64
## Key1 Key2
## A blue 8.0
## orange 8.0
## B blue 11.0
## orange 11.0
## Name: value2, dtype: float64
## Reaction Days Subject
## 0 249.5600 0 308
## 1 258.7047 1 308
## 2 250.8006 2 308
## 3 321.4398 3 308
## 4 356.8519 4 308
def maxDiff(arr):
return(arr.max() - arr.min())
def secMax(arr):
return(np.sort(arr)[::-1][1])
sleepdata.drop(columns="Days").groupby(["Subject"]).agg([np.mean, np.std, np.max, secMax, maxDiff])
## Reaction
## mean std amax secMax maxDiff
## Subject
## 308 342.13383 79.821763 466.3535 430.5853 216.7935
## 309 215.23298 10.812193 237.3142 224.2957 34.3364
## 310 231.00127 21.855996 261.0125 255.7511 66.6803
## 330 303.22142 22.909199 354.0487 321.5426 73.8091
## 331 309.43605 27.242607 371.5811 334.8177 86.5811
## 332 307.30207 64.306126 454.1619 346.8311 219.3013
## 333 316.15831 30.068208 362.0428 348.8399 85.2735
## 334 295.30205 41.855607 377.2990 335.7469 133.9343
## 335 250.07004 13.833852 273.9472 270.8021 38.6362
## 337 375.72101 59.623789 458.9167 455.8643 167.3055
## 349 275.83447 42.937942 351.6451 336.2806 121.3284
## 350 313.60268 63.360558 394.4872 389.0527 151.0329
## 351 290.09775 28.978811 347.5655 321.5418 97.0390
## 352 337.42154 47.602380 388.5417 375.6406 166.8646
## 369 306.03464 37.460429 366.5131 348.1229 109.2707
## 370 291.70177 59.208212 372.2288 365.1630 146.9648
## 371 294.98404 36.505989 369.4692 350.7807 110.2034
## 372 317.88613 35.822804 369.1417 364.1236 99.7300