Zhiguang Huo (Caleb)
Monday Nov 9th, 2023
## 0 1
## 1 2
## 2 3
## dtype: int64
## array([1, 2, 3])
## RangeIndex(start=0, stop=3, step=1)
[i for i in aseries.index]
list(aseries.index)
## a 1.0
## b 2.2
## c 3.5
## dtype: float64
## Index(['a', 'b', 'c'], dtype='object')
## a 10.0
## b 22.0
## c 35.0
## dtype: float64
## a 0.000000
## b 0.788457
## c 1.252763
## dtype: float64
## 1.0
## 1.0
## a 1.0
## b 2.2
## dtype: float64
## True
## a 1
## b 2
## c 3
## dtype: int64
## a 1
## c 3
## b 2
## dtype: int64
## d NaN
## c 3.0
## b 2.0
## dtype: float64
## d True
## c False
## b False
## dtype: bool
## a 0
## b 1
## c 2
## dtype: int64
## Index(['a', 'b', 'c'], dtype='object')
## Index(['a', 'b'], dtype='object')
obj.index[0] = "x"
## Index(['a', 'b', 'c'], dtype='object')
## aa 0
## bb 1
## cc 2
## dtype: int64
## b 1.1
## c 2.2
## a 3.3
## dtype: float64
## a 3.3
## b 1.1
## c 2.2
## d NaN
## dtype: float64
## a 0
## b 1
## c 2
## d 3
## dtype: int64
## b 1
## c 2
## d 3
## dtype: int64
## c 2
## d 3
## dtype: int64
## b 1
## c 2
## d 3
## dtype: int64
## 2
## 2
##
## <string>:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
## b 2
## dtype: int64
## b 2
## c 3
## dtype: int64
## b 2
## d 4
## dtype: int64
## c 3
## d 4
## dtype: int64
## b 2
## c 3
## dtype: int64
## 1
## a 1
## c 3
## dtype: int64
## 2
## b 2
## c 3
## d 4
## dtype: int64
## 3
## array(['a', 'b', 'c', 'd'], dtype=object)
## a 4
## b 4
## c 4
## d 3
## Name: count, dtype: int64
## a 4
## b 4
## c 4
## d 3
## Name: count, dtype: int64
## 0 a
## 1 b
## 4 a
## 10 a
## 11 a
## 12 b
## 13 b
## 14 b
## dtype: object
## array([0, 1, 2, 3, 0, 3, 2, 2, 2, 3, 0, 0, 1, 1, 1])
a1 = pd.Series([1,2,3], index = ["a", "b", "c"])
a2 = pd.Series([2,3,4], index = ["a", "b", "c"])
a3 = pd.Series([2,3,4], index = ["b", "c", "d"])
## a 3
## b 5
## c 7
## dtype: int64
## a NaN
## b 4.0
## c 6.0
## d NaN
## dtype: float64
## {'Name': ['Amy', 'Beth', 'Carl'], 'Age': [24, 22, 19], 'Sex': ['F', 'F', 'M']}
## Name Age Sex
## 0 Amy 24 F
## 1 Beth 22 F
## 2 Carl 19 M
## Sex Age Name
## 0 F 24 Amy
## 1 F 22 Beth
## 2 M 19 Carl
## Sex Age Name Email
## 0 F 24 Amy NaN
## 1 F 22 Beth NaN
## 2 M 19 Carl NaN
## Sex Age Name
## a F 24 Amy
## b F 22 Beth
## c M 19 Carl
## Sex Age Name
## a F 24 Amy
## b F 22 Beth
## c M 19 Carl
## index Sex Age Name
## 0 a F 24 Amy
## 1 b F 22 Beth
## 2 c M 19 Carl
## Sex Age Name
## index
## a F 24 Amy
## b F 22 Beth
## c M 19 Carl
## a 24
## b 22
## c 19
## Name: Age, dtype: int64
## a 24
## b 22
## c 19
## Name: Age, dtype: int64
## Name Age
## a Amy 24
## b Beth 22
## c Carl 19
## Sex Age Name
## b F 22 Beth
## c M 19 Carl
## Sex Age Name
## a F 24 Amy
## b F 22 Beth
## Sex F
## Age 24
## Name Amy
## Name: a, dtype: object
## a Amy
## b Beth
## c Carl
## Name: Name, dtype: object
## Name Amy
## Age 24
## Name: a, dtype: object
## Sex F
## Age 24
## Name Amy
## Name: a, dtype: object
## Sex M
## Age 19
## Name Carl
## Name: c, dtype: object
## a 24
## b 22
## c 19
## Name: Age, dtype: int64
## Sex F
## Age 24
## Name: a, dtype: object
## Sex Age Name Age20
## a F 24 Amy True
## b F 22 Beth True
## c M 19 Carl False
## Sex Age Name Age20 debt
## a F 24 Amy True NaN
## b F 22 Beth True NaN
## c M 19 Carl False NaN
## Sex Age Name Age20 debt
## a F 24 Amy True 15.1
## b F 22 Beth True 15.1
## c M 19 Carl False 15.1
## Sex Age Name Age20 debt
## a F 24 Amy True 0.0
## b F 22 Beth True 1.0
## c M 19 Carl False 2.0
## Sex Age Name Age20 debt
## a F 24 Amy True 5.0
## b F 22 Beth True NaN
## c M 19 Carl False 7.7
## Index(['Sex', 'Age', 'Name', 'Age20', 'debt'], dtype='object')
## Index(['Sex', 'Age', 'Name', 'Age20'], dtype='object')
## a b c
## Sex F F M
## Age 24 22 19
## Name Amy Beth Carl
## Age20 True True False
pop = {"Florida": {2020: 10.9, 2021: 11.3, 2022: 13.4},
"Texas": {2020: 20.5, 2021: 21.1}
}
bpd = pd.DataFrame(pop)
bpd
## Florida Texas
## 2020 10.9 20.5
## 2021 11.3 21.1
## 2022 13.4 NaN
## States Florida Texas
## Year
## 2020 10.9 20.5
## 2021 11.3 21.1
## 2022 13.4 NaN
## array([[10.9, 20.5],
## [11.3, 21.1],
## [13.4, nan]])
## Index([2020, 2021, 2022], dtype='int64', name='Year')
## True
## Index(['Florida', 'Texas'], dtype='object', name='States')
## True
data = pd.DataFrame(np.arange(6).reshape(3,-1), index =['x', 'y', 'z'], columns = ["Florida", 'Texas'] )
data
## Florida Texas
## x 0 1
## y 2 3
## z 4 5
## Florida Texas
## x 0.0 1.0
## y 2.0 3.0
## z 4.0 5.0
## w NaN NaN
## Utah Florida Texas
## x NaN 0.0 1.0
## y NaN 2.0 3.0
## z NaN 4.0 5.0
## w NaN NaN NaN
## Utah Florida Texas
## x NaN 0.0 1.0
## y NaN 2.0 3.0
data0 = {"Name": ["Amy", "Beth", "Carl"],
"Age": [24, 22, 19],
"Sex": ["F", "F", "M"]
}
data = pd.DataFrame(data0, index = ["1", "2", "3"])
data.drop("1") ## drop by index
## Name Age Sex
## 2 Beth 22 F
## 3 Carl 19 M
## Age Sex
## 1 24 F
## 2 22 F
## 3 19 M
## Sex
## 1 F
## 2 F
## 3 M
pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd2 = pd.DataFrame(np.arange(12).reshape(-1,3), columns = list("bac"), index = ["Florida", "Texax", "Utah", "Ohio"])
pd1
## b d c
## Florida 0 1 2
## Texax 3 4 5
## Utah 6 7 8
## b a c
## Florida 0 1 2
## Texax 3 4 5
## Utah 6 7 8
## Ohio 9 10 11
## a b c d
## Florida NaN 0.0 4.0 NaN
## Ohio NaN NaN NaN NaN
## Texax NaN 6.0 10.0 NaN
## Utah NaN 12.0 16.0 NaN
pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd3 = pd.DataFrame(np.arange(1,10).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd1.add(pd3)
## b d c
## Florida 1 3 5
## Texax 7 9 11
## Utah 13 15 17
## b d c
## Florida -1 0 1
## Texax 2 3 4
## Utah 5 6 7
pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
series1 = pd1.iloc[0]
pd1
## b d c
## Florida 0 1 2
## Texax 3 4 5
## Utah 6 7 8
## b 0
## d 1
## c 2
## Name: Florida, dtype: int64
## b d c
## Florida 0 0 0
## Texax 3 3 3
## Utah 6 6 6
## b 1
## d 3
## a 2
## c 4
## dtype: int64
## a 2
## b 1
## c 4
## d 3
## dtype: int64
## d 3
## c 4
## b 1
## a 2
## dtype: int64
## b 1
## a 2
## d 3
## c 4
## dtype: int64
pd1 = pd.DataFrame(np.array([3,2,5,1,4,6]).reshape(3,2), index = ['c', 'a', 'b'], columns = ["x", "y"])
pd1.sort_index()
## x y
## a 5 1
## b 4 6
## c 3 2
## x y
## c 3 2
## b 4 6
## a 5 1
## x y
## a 5 1
## c 3 2
## b 4 6
pd1 = pd.DataFrame(np.arange(12).reshape(3,4), index = ['x', 'y', 'z'], columns = ["a", "b", "c", "d"])
pd1.apply(np.mean)
## a 4.0
## b 5.0
## c 6.0
## d 7.0
## dtype: float64
## x 1.5
## y 5.5
## z 9.5
## dtype: float64
## a 8
## b 8
## c 8
## d 8
## dtype: int64
## a 8
## b 8
## c 8
## d 8
## dtype: int64
## a b c d
## min 0 1 2 3
## max 8 9 10 11
## a 12
## b 15
## c 18
## d 21
## dtype: int64
## x 6
## y 22
## z 38
## dtype: int64
## a 4.0
## b 5.0
## c 6.0
## d 7.0
## dtype: float64
## a z
## b z
## c z
## d z
## dtype: object
## x a
## y a
## z a
## dtype: object
## a b c d
## x 0 1 2 3
## y 4 6 8 10
## z 12 15 18 21
## a 16.0
## b 16.0
## c 16.0
## d 16.0
## dtype: float64
## a b c d
## count 3.0 3.0 3.0 3.0
## mean 4.0 5.0 6.0 7.0
## std 4.0 4.0 4.0 4.0
## min 0.0 1.0 2.0 3.0
## 25% 2.0 3.0 4.0 5.0
## 50% 4.0 5.0 6.0 7.0
## 75% 6.0 7.0 8.0 9.0
## max 8.0 9.0 10.0 11.0
rng = np.random.default_rng(42)
pd2 = pd.DataFrame(rng.standard_normal(12).reshape(3,4), index = ['x', 'y', 'z'], columns = ["a", "b", "c", "d"])
pd2["a"].corr(pd2["b"])
## 0.8476294400143133
## 0.8476294400143133
## a b c d
## a 1.000000 0.847629 0.957308 0.999918
## b 0.847629 1.000000 0.964820 0.854339
## c 0.957308 0.964820 1.000000 0.960923
## d 0.999918 0.854339 0.960923 1.000000
## 0.2333460858435695
## a b c d
## a 1.488842 0.233346 0.469448 0.833917
## b 0.233346 0.050903 0.087484 0.131745
## c 0.469448 0.087484 0.161518 0.263957
## d 0.833917 0.131745 0.263957 0.467162
## a
## 3 2
## 1 1
## 2 1
## Name: count, dtype: int64
## a b c
## 1 1.0 2.0 NaN
## 2 1.0 1.0 2.0
## 3 2.0 NaN 2.0
## 4 NaN 1.0 NaN
## a b c
## 1 1.0 2.0 0.0
## 2 1.0 1.0 2.0
## 3 2.0 0.0 2.0
## 4 0.0 1.0 0.0
afile = "https://caleb-huo.github.io/teaching/data/Python/Student_data.csv"
bfile = "https://caleb-huo.github.io/teaching/data/Python/Student_data.xlsx"
data0 = pd.read_csv("sleepstudy.csv")
data1 = pd.read_excel("sleepstudy.xlsx")
## Name Hobby Year_in_colledge Initial_GPA Study_time
## 0 Dan Football freshman 3.1 10
## 1 Beth Music sophomore 3.2 20
## 2 Carl Basketball senior 3.6 14
## 3 Frank Cooking sophomore 3.4 16
## 4 Emily Running junior 3.3 18
## 0 1 2 3 4
## 0 Name Hobby Year_in_colledge Initial_GPA Study_time
## 1 Dan Football freshman 3.1 10
## 2 Beth Music sophomore 3.2 20
## 3 Carl Basketball senior 3.6 14
## 4 Frank Cooking sophomore 3.4 16
## 0 1 2 3 4
## 0 Dan Football freshman 3.1 10
## 1 Beth Music sophomore 3.2 20
## 2 Carl Basketball senior 3.6 14
## 3 Frank Cooking sophomore 3.4 16
## 4 Emily Running junior 3.3 18
## a b c d e
## 0 Dan Football freshman 3.1 10
## 1 Beth Music sophomore 3.2 20
## 2 Carl Basketball senior 3.6 14
## 3 Frank Cooking sophomore 3.4 16
## 4 Emily Running junior 3.3 18
## Hobby Year_in_colledge Initial_GPA Study_time
## Name
## Dan Football freshman 3.1 10
## Beth Music sophomore 3.2 20
## Carl Basketball senior 3.6 14
## Frank Cooking sophomore 3.4 16
## Emily Running junior 3.3 18
## Hobby Year_in_colledge Initial_GPA Study_time
## Name
## Amy Swimming senior 3.0 15
## Ashely Skiing senior 3.1 15
## Beth Music sophomore 3.2 20
## Carl Basketball senior 3.6 14
## Chris Singing freshman 3.6 19
data = pd.read_csv("Student_data.csv")
data.to_csv("mydata.csv") ## index=True as default
data.to_csv("mydata.csv", index=False)
data.to_csv("mydata.txt", sep="\t")
import sys
data.to_csv(sys.stdout)
data.to_csv(sys.stdout, sep="\t")
data.to_csv(sys.stdout, index=False)
data.to_csv(sys.stdout, header=False)
data.to_csv(sys.stdout, columns = ["Hobby", "Year_in_colledge"])
## datetime.datetime(2023, 11, 9, 12, 44, 11, 203884)
## (2023, 11, 9)
## (12, 44, 11)
datetime1 = datetime(2022,10,13,7,30,0)
datetime2 = datetime(2022,10,10,5,20,0)
delta = datetime1 - datetime2
delta
## datetime.timedelta(days=3, seconds=7800)
## 3
## 7800
## datetime.datetime(2022, 10, 25, 7, 30)
## datetime.datetime(2022, 10, 25, 7, 30, 2)
## '2022-10-13 00:00:00'
## '2022-10-13'
## datetime.datetime(2021, 10, 11, 0, 0)
## datetime.datetime(2021, 10, 11, 0, 0)
## DatetimeIndex(['2021-10-11 12:00:01', '2021-10-12 03:40:01'], dtype='datetime64[ns]', freq=None)
mydate = [datetime(2022,10,13), datetime(2022,10,14), datetime(2022,10,18)]
data = pd.Series(np.arange(30,33), index=mydate)
data
## 2022-10-13 30
## 2022-10-14 31
## 2022-10-18 32
## dtype: int64
## DatetimeIndex(['2022-10-13', '2022-10-14', '2022-10-18'], dtype='datetime64[ns]', freq=None)
## 30
## 31
## 32
## DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
## '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
## '2022-01-09', '2022-01-10',
## ...
## '2023-07-23', '2023-07-24', '2023-07-25', '2023-07-26',
## '2023-07-27', '2023-07-28', '2023-07-29', '2023-07-30',
## '2023-07-31', '2023-08-01'],
## dtype='datetime64[ns]', length=578, freq='D')
## DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
## '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
## '2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
## '2022-01-13', '2022-01-14', '2022-01-15', '2022-01-16',
## '2022-01-17', '2022-01-18', '2022-01-19', '2022-01-20',
## '2022-01-21', '2022-01-22', '2022-01-23', '2022-01-24',
## '2022-01-25', '2022-01-26', '2022-01-27', '2022-01-28',
## '2022-01-29', '2022-01-30', '2022-01-31', '2022-02-01',
## '2022-02-02', '2022-02-03', '2022-02-04'],
## dtype='datetime64[ns]', freq='D')
## DatetimeIndex(['2021-11-28', '2021-11-29', '2021-11-30', '2021-12-01',
## '2021-12-02', '2021-12-03', '2021-12-04', '2021-12-05',
## '2021-12-06', '2021-12-07', '2021-12-08', '2021-12-09',
## '2021-12-10', '2021-12-11', '2021-12-12', '2021-12-13',
## '2021-12-14', '2021-12-15', '2021-12-16', '2021-12-17',
## '2021-12-18', '2021-12-19', '2021-12-20', '2021-12-21',
## '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
## '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
## '2021-12-30', '2021-12-31', '2022-01-01'],
## dtype='datetime64[ns]', freq='D')
## DatetimeIndex(['2012-01-01', '2012-02-01', '2012-03-01', '2012-04-01',
## '2012-05-01', '2012-06-01', '2012-07-01', '2012-08-01',
## '2012-09-01', '2012-10-01'],
## dtype='datetime64[ns]', freq='MS')
data = pd.DataFrame(rng.standard_normal(len(ind1)*2).reshape(-1,2), index=ind1, columns=["a", "b"])
data.head()
## a b
## 2022-01-01 0.066031 1.127241
## 2022-01-02 0.467509 -0.859292
## 2022-01-03 0.368751 -0.958883
## 2022-01-04 0.878450 -0.049926
## 2022-01-05 -0.184862 -0.680930
## a b
## 2022-01-01 0.066031 1.127241
## 2022-01-02 0.467509 -0.859292
## 2022-01-03 0.368751 -0.958883
## 2022-01-04 0.878450 -0.049926
## 2022-01-05 -0.184862 -0.680930
## ... ... ...
## 2022-12-27 0.548739 -0.187671
## 2022-12-28 0.278144 0.158119
## 2022-12-29 0.777767 0.807008
## 2022-12-30 -1.619872 -2.247269
## 2022-12-31 1.001745 1.187725
##
## [365 rows x 2 columns]
## a b
## 2022-10-01 -1.244472 -0.888797
## 2022-10-02 -0.070680 0.334295
## 2022-10-03 0.051142 -0.765535
## 2022-10-04 0.900185 0.739413
## 2022-10-05 -0.159648 -0.652916
## 2022-10-06 0.548428 0.187974
## 2022-10-07 -1.448127 -0.067980
## 2022-10-08 0.262036 -0.899695
## 2022-10-09 0.189843 -1.454822
## 2022-10-10 1.336186 1.247950
## 2022-10-11 -0.252517 0.363454
## 2022-10-12 -2.409922 -1.156348
## 2022-10-13 -0.293779 -1.072133
## 2022-10-14 0.714396 1.997297
## 2022-10-15 -1.176615 -0.837463
## 2022-10-16 0.235448 1.611116
## 2022-10-17 -1.222374 0.249036
## 2022-10-18 1.821299 -1.651759
## 2022-10-19 -1.281069 -0.423607
## 2022-10-20 -0.520588 0.812601
## 2022-10-21 0.241660 -1.774962
## 2022-10-22 0.515410 -0.577539
## 2022-10-23 1.274447 -0.627588
## 2022-10-24 -0.636615 0.541132
## 2022-10-25 0.762926 0.448099
## 2022-10-26 -1.685597 0.538034
## 2022-10-27 -1.034308 0.235276
## 2022-10-28 -1.423734 0.446322
## 2022-10-29 -0.806599 -1.282635
## 2022-10-30 0.713820 0.241645
## 2022-10-31 -0.613977 1.451179
## a -2.409922
## b -1.156348
## Name: 2022-10-12 00:00:00, dtype: float64
## a b
## 2022-10-12 -2.409922 -1.156348
## 2022-10-13 -0.293779 -1.072133
## 2022-10-14 0.714396 1.997297
## 2022-10-15 -1.176615 -0.837463
## 2022-10-16 0.235448 1.611116
## ... ... ...
## 2023-07-28 -2.450872 -1.417684
## 2023-07-29 -1.187070 -0.363261
## 2023-07-30 -0.254608 -1.507326
## 2023-07-31 -0.985158 -0.860842
## 2023-08-01 2.457424 1.801742
##
## [294 rows x 2 columns]
## 0 False
## 1 True
## 2 True
## 3 True
## 4 False
## dtype: bool
## 0 False
## 1 True
## 2 True
## 3 True
## 4 False
## dtype: bool
## 0 0.0
## 4 1.0
## dtype: float64
## 0 0.0
## 4 1.0
## dtype: float64
## 0 0.0
## 4 1.0
## dtype: float64
## 0 1 2
## 0 1.0 2.0 3.0
## 0 1 2
## 0 1.0 2.0 3.0
## 2 4.0 NaN 6.0
## 0 1 2 4
## 0 1.0 2.0 3.0 NaN
## 1 NaN NaN NaN NaN
## 2 4.0 NaN 6.0 NaN
## 0 1 2
## 0 1.0 2.0 3.0
## 1 NaN NaN NaN
## 2 4.0 NaN 6.0
## Empty DataFrame
## Columns: []
## Index: [0, 1, 2]
## 0 1 2 4
## 0 1.0 2.0 3.0 -99.0
## 1 -99.0 -99.0 -99.0 -99.0
## 2 4.0 -99.0 6.0 -99.0
## 0 1 2 4
## 0 1.0 2.0 3.0 -4.0
## 1 NaN -1.0 -2.0 -4.0
## 2 4.0 -1.0 6.0 -4.0
## 0 1 2 4
## 0 1.0 2.0 3.0 -99.0
## 1 -99.0 -99.0 -99.0 -99.0
## 2 4.0 -99.0 6.0 -99.0
arr = pd.Series([0,np.nan,3,np.nan, 5, np.nan])
#apd.fillna(apd.mean()) ## fill by mean alue
arr.fillna(method="ffill")
## 0 0.0
## 1 0.0
## 2 3.0
## 3 3.0
## 4 5.0
## 5 5.0
## dtype: float64
##
## <string>:2: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
## 0 0.0
## 1 3.0
## 2 3.0
## 3 5.0
## 4 5.0
## 5 NaN
## dtype: float64
##
## <string>:1: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
## 0 False
## 1 False
## 2 True
## 3 True
## Name: c1, dtype: bool
## (array([2, 3]),)
## c1 c2
## 2 3 c
## 3 4 d
## c1 c2
## 3 4 d
## 0 False
## 1 False
## 2 False
## 3 True
## 4 False
## 5 False
## dtype: bool
## c1 c2
## 0 1 a
## 1 1 b
## 2 2 a
## 4 3 a
## 5 3 b
## c1 c2
## 0 1 a
## 2 2 a
## 4 3 a
## 0 True
## 1 False
## 2 True
## 3 False
## 4 True
## 5 False
## dtype: bool
## c1 c2
## 1 1 b
## 3 2 a
## 5 3 b
population = pd.DataFrame({"City": ["Gainesville", "Orlando", "Tampa", "Pittsburgh", "Philadelphia"],
"Population": [140,309,387,300,1576]}
)
city_to_state = {"Gainesville": "FL", "Orlando": "FL", "Tampa": "FL", "Pittsburgh": "PA", "Philadelphia":"PA"}
population
## City Population
## 0 Gainesville 140
## 1 Orlando 309
## 2 Tampa 387
## 3 Pittsburgh 300
## 4 Philadelphia 1576
## {'Gainesville': 'FL', 'Orlando': 'FL', 'Tampa': 'FL', 'Pittsburgh': 'PA', 'Philadelphia': 'PA'}
## 0 FL
## 1 FL
## 2 FL
## 3 PA
## 4 PA
## Name: City, dtype: object
## 0 FL
## 1 FL
## 2 FL
## 3 PA
## 4 PA
## Name: City, dtype: object
## 0 1 2
## 0 1 2 3
## 1 2 3 4
## 2 3 4 5
## 0 1 2
## 0 1 2.0 3.0
## 1 2 3.0 NaN
## 2 3 NaN 5.0
## 0 1 2
## 0 1 2 -3
## 1 2 -3 -4
## 2 -3 -4 5
## 0 1 2
## 0 1 2 -3
## 1 2 -3 -4
## 2 -3 -4 5
data = pd.DataFrame({"l1": [1,2,3,18,13, 1, 15,6,-99,21,3,np.nan],
"l2": [1,np.nan,3,7,np.nan,1, 5,-99,-99,3,3,9]})
data
## l1 l2
## 0 1.0 1.0
## 1 2.0 NaN
## 2 3.0 3.0
## 3 18.0 7.0
## 4 13.0 NaN
## 5 1.0 1.0
## 6 15.0 5.0
## 7 6.0 -99.0
## 8 -99.0 -99.0
## 9 21.0 3.0
## 10 3.0 3.0
## 11 NaN 9.0
pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd1
## b d c
## Florida 0 1 2
## Texax 3 4 5
## Utah 6 7 8
## b d c
## Flo 0 1 2
## Tex 3 4 5
## Uta 6 7 8
## B D C
## FLO 0 1 2
## TEX 3 4 5
## UTA 6 7 8
## BB d c
## FL 0 1 2
## TX 3 4 5
## Uta 6 7 8
## BB d c
## FL 0 1 2
## TX 3 4 5
## Uta 6 7 8
ages = np.random.default_rng(32611).integers(low=0, high=100, size=10)
bins = [0, 20, 40, 60, 80, 100]
groups = pd.cut(ages, bins)
groups.codes
## array([0, 2, 2, 0, 1, 0, 0, 3, 4, 3], dtype=int8)
## IntervalIndex([(0, 20], (20, 40], (40, 60], (60, 80], (80, 100]], dtype='interval[int64, right]')
## (0, 20] 4
## (20, 40] 1
## (40, 60] 2
## (60, 80] 2
## (80, 100] 1
## Name: count, dtype: int64
groups2 = pd.cut(ages, bins, labels = ["0-20", "20-40", "40-60", "60-80", "80-100"])
groups2.value_counts()
## 0-20 4
## 20-40 1
## 40-60 2
## 60-80 2
## 80-100 1
## Name: count, dtype: int64
## 0 1 2
## 57 0.755502 1.748730 -0.549839
## 58 0.603501 0.778081 -0.741010
## 38 0.066027 -0.638866 0.302885
## 0 1 2
## 42 0.175722 0.637392 0.456941
## 85 1.971000 1.961173 0.553331
## 50 0.902698 1.089878 -0.729095