Zhiguang Huo (Caleb)
Wednesday Nov 16th, 2022
## 0 False
## 1 True
## 2 True
## 3 True
## 4 False
## dtype: bool
## 0 False
## 1 True
## 2 True
## 3 True
## 4 False
## dtype: bool
## 0 0.0
## 4 1.0
## dtype: float64
## 0 0.0
## 4 1.0
## dtype: float64
## 0 0.0
## 4 1.0
## dtype: float64
## 0 1 2
## 0 1.0 2.0 3.0
## 0 1 2
## 0 1.0 2.0 3.0
## 2 4.0 NaN 6.0
## 0 1 2 4
## 0 1.0 2.0 3.0 NaN
## 1 NaN NaN NaN NaN
## 2 4.0 NaN 6.0 NaN
## 0 1 2
## 0 1.0 2.0 3.0
## 1 NaN NaN NaN
## 2 4.0 NaN 6.0
## Empty DataFrame
## Columns: []
## Index: [0, 1, 2]
## 0 1 2 4
## 0 1.0 2.0 3.0 NaN
## 1 NaN NaN NaN NaN
## 2 4.0 NaN 6.0 NaN
## 0 1 2 4
## 0 1.0 2.0 3.0 NaN
## 2 4.0 NaN 6.0 NaN
## 0 1 2 4
## 0 1.0 2.0 3.0 NaN
## 0 1 2 4
## 0 1.0 2.0 3.0 -99.0
## 1 -99.0 -99.0 -99.0 -99.0
## 2 4.0 -99.0 6.0 -99.0
## 0 1 2 4
## 0 1.0 2.0 3.0 -4.0
## 1 NaN -1.0 -2.0 -4.0
## 2 4.0 -1.0 6.0 -4.0
## 0 1 2 4
## 0 1.0 2.0 3.0 -99.0
## 1 -99.0 -99.0 -99.0 -99.0
## 2 4.0 -99.0 6.0 -99.0
# Series with missing entries, used to demonstrate directional filling.
arr = pd.Series([0, np.nan, 3, np.nan, 5, np.nan])
# apd.fillna(apd.mean())  ## fill by mean value
# Forward fill: every NaN takes the closest preceding non-missing value.
# Series.ffill() is used instead of fillna(method="ffill") because the
# `method` argument of fillna is deprecated in recent pandas releases.
arr.ffill()
## 0 0.0
## 1 0.0
## 2 3.0
## 3 3.0
## 4 5.0
## 5 5.0
## dtype: float64
## 0 0.0
## 1 3.0
## 2 3.0
## 3 5.0
## 4 5.0
## 5 NaN
## dtype: float64
## 0 False
## 1 False
## 2 True
## 3 True
## Name: c1, dtype: bool
## (array([2, 3]),)
## c1 c2
## 2 3 c
## 3 4 d
## c1 c2
## 3 4 d
## 0 False
## 1 False
## 2 False
## 3 True
## 4 False
## 5 False
## dtype: bool
## c1 c2
## 0 1 a
## 1 1 b
## 2 2 a
## 4 3 a
## 5 3 b
## c1 c2
## 0 1 a
## 2 2 a
## 4 3 a
## 0 True
## 1 False
## 2 True
## 3 False
## 4 True
## 5 False
## dtype: bool
## c1 c2
## 1 1 b
## 3 2 a
## 5 3 b
# Small city table, built row by row, plus a city -> state lookup dictionary.
population = pd.DataFrame.from_records(
    [
        ("Gainesville", 140),
        ("Orlando", 309),
        ("Tampa", 387),
        ("Pittsburgh", 300),
        ("Philadelphia", 1576),
    ],
    columns=["City", "Population"],
)
# Keyword form of the same mapping; keys stay in the same insertion order.
city_to_state = dict(
    Gainesville="FL",
    Orlando="FL",
    Tampa="FL",
    Pittsburgh="PA",
    Philadelphia="PA",
)
population
## City Population
## 0 Gainesville 140
## 1 Orlando 309
## 2 Tampa 387
## 3 Pittsburgh 300
## 4 Philadelphia 1576
## {'Gainesville': 'FL', 'Orlando': 'FL', 'Tampa': 'FL', 'Pittsburgh': 'PA', 'Philadelphia': 'PA'}
## 0 FL
## 1 FL
## 2 FL
## 3 PA
## 4 PA
## Name: City, dtype: object
## 0 FL
## 1 FL
## 2 FL
## 3 PA
## 4 PA
## Name: City, dtype: object
## 0 1 2
## 0 1 2 3
## 1 2 3 4
## 2 3 4 5
## 0 1 2
## 0 1 2.0 3.0
## 1 2 3.0 NaN
## 2 3 NaN 5.0
## 0 1 2
## 0 1 2 -3
## 1 2 -3 -4
## 2 -3 -4 5
## 0 1 2
## 0 1 2 -3
## 1 2 -3 -4
## 2 -3 -4 5
# Two numeric columns that contain both NaN entries and -99 entries.
data = pd.DataFrame(
    dict(
        l1=[1, 2, 3, 18, 13, 1, 15, 6, -99, 21, 3, np.nan],
        l2=[1, np.nan, 3, 7, np.nan, 1, 5, -99, -99, 3, 3, 9],
    )
)
data
## l1 l2
## 0 1.0 1.0
## 1 2.0 NaN
## 2 3.0 3.0
## 3 18.0 7.0
## 4 13.0 NaN
## 5 1.0 1.0
## 6 15.0 5.0
## 7 6.0 -99.0
## 8 -99.0 -99.0
## 9 21.0 3.0
## 10 3.0 3.0
## 11 NaN 9.0
Write the code in one line.
## 9 3
## 3 9
## 6 6
## 2 3
## 0 1
## Name: l1, dtype: int64
# 3x3 frame holding 0..8, with named rows and single-letter column labels.
pd1 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=["Florida", "Texax", "Utah"],
    columns=["b", "d", "c"],
)
pd1
## b d c
## Florida 0 1 2
## Texax 3 4 5
## Utah 6 7 8
## b d c
## Flo 0 1 2
## Tex 3 4 5
## Uta 6 7 8
## B D C
## FLO 0 1 2
## TEX 3 4 5
## UTA 6 7 8
## BB d c
## FL 0 1 2
## TX 3 4 5
## Uta 6 7 8
## BB d c
## FL 0 1 2
## TX 3 4 5
## Uta 6 7 8
# Ten reproducible integer ages from a seeded generator (upper bound 100 excluded).
ages = np.random.default_rng(seed=32611).integers(0, 100, size=10)
# Bin edges every 20 units from 0 to 100.
bins = list(range(0, 101, 20))
# Assign each age to its interval; the result is a Categorical.
groups = pd.cut(ages, bins=bins)
# Integer code of the bin each age fell into.
groups.codes
## array([0, 2, 2, 0, 1, 0, 0, 3, 4, 3], dtype=int8)
## IntervalIndex([(0, 20], (20, 40], (40, 60], (60, 80], (80, 100]], dtype='interval[int64, right]')
## (0, 20] 4
## (20, 40] 1
## (40, 60] 2
## (60, 80] 2
## (80, 100] 1
## dtype: int64
# Same binning as before, but with readable labels in place of interval objects.
groups2 = pd.cut(
    ages,
    bins=bins,
    labels=["0-20", "20-40", "40-60", "60-80", "80-100"],
)
# Number of ages per labelled bin.
groups2.value_counts()
## 0-20 4
## 20-40 1
## 40-60 2
## 60-80 2
## 80-100 1
## dtype: int64
## 0 1 2
## 0 0.195122 -0.563317 0.973858
## 1 -1.873995 0.394967 -0.132278
## 2 -0.643411 -1.046220 -0.278885
## 3 -0.039059 -0.575599 0.026662
## 4 0.414205 0.643462 -1.125217
## 0 1 2
## count 100.000000 100.000000 100.000000
## mean 0.027024 0.071957 0.002366
## std 0.980115 1.005985 0.947386
## min -2.403350 -2.823156 -2.256457
## 25% -0.760132 -0.606894 -0.740145
## 50% 0.155343 -0.046282 -0.078738
## 75% 0.723528 0.717592 0.559711
## max 2.170296 2.972694 3.529913
Outliers are defined as values greater than 2 or less than -2.
Outliers in a given column:
## 13 -2.090740
## 53 2.167734
## 74 2.170296
## 87 -2.403350
## Name: 0, dtype: float64
## 0 1 2
## 92 0.678229 -0.692429 3.529913
## 0 1 2
## count 100.000000 100.000000 100.000000
## mean 0.028585 0.060009 -0.015086
## std 0.961568 0.936246 0.880808
## min -2.000000 -2.000000 -2.000000
## 25% -0.760132 -0.606894 -0.740145
## 50% 0.155343 -0.046282 -0.078738
## 75% 0.723528 0.717592 0.559711
## max 2.000000 2.000000 2.000000
## 0 1 2
## 57 0.755502 1.748730 -0.549839
## 58 0.603501 0.778081 -0.741010
## 38 0.066027 -0.638866 0.302885
## 0 1 2
## 67 1.114620 -0.284325 -0.394080
## 96 0.035634 -0.493672 0.018179
## 48 0.579743 -0.404422 -0.661694
## keys values
## 0 b 0
## 1 b 1
## 2 a 2
## 3 a 3
## 4 c 4
## 5 c 5
## a b c
## 0 0 1 0
## 1 0 1 0
## 2 1 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 1
## Group_a Group_b Group_c
## 0 0 1 0
## 1 0 1 0
## 2 1 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 1
## keys values Group_a Group_b Group_c
## 0 b 0 0 1 0
## 1 b 1 0 1 0
## 2 a 2 1 0 0
## 3 a 3 1 0 0
## 4 c 4 0 0 1
## 5 c 5 0 0 1
# Name -> e-mail address Series for the vectorized string-method examples.
data = pd.Series(
    ["alex@gmail.com", "BETH@yahoo.com", "Carl@ufl.edu"],
    index=["Alex", "Beth", "Carl"],
)
# Element-wise check for the substring "ufl".
data.str.contains("ufl")
## Alex False
## Beth False
## Carl True
## dtype: bool
## Alex [alex, gmail, com]
## Beth [BETH, yahoo, com]
## Carl [Carl, ufl, edu]
## dtype: object
## Alex [(alex, gmail, com)]
## Beth [(BETH, yahoo, com)]
## Carl [(Carl, ufl, edu)]
## dtype: object
## Alex gmail
## Beth yahoo
## Carl ufl
## dtype: object