Programming basics for Biostatistics 6099

Pandas Basics

Zhiguang Huo (Caleb)

Thursday Nov 9th, 2023

Outline

Introduction to Pandas

import numpy as np
import pandas as pd

Series

aseries = pd.Series([1,2,3])
aseries
## 0    1
## 1    2
## 2    3
## dtype: int64
aseries.values
## array([1, 2, 3])
aseries.index ## like range(3)
## RangeIndex(start=0, stop=3, step=1)
[i for i in aseries.index]
list(aseries.index)

Series

bseries = pd.Series([1.,2.2,3.5], index=['a', 'b', 'c'])
bseries
## a    1.0
## b    2.2
## c    3.5
## dtype: float64
bseries.index ## a label-based Index(['a', 'b', 'c']), not a RangeIndex
## Index(['a', 'b', 'c'], dtype='object')
## [i for i in bseries.index]
## list(bseries.index)

Series

bseries*10
## a    10.0
## b    22.0
## c    35.0
## dtype: float64
np.log(bseries)
## a    0.000000
## b    0.788457
## c    1.252763
## dtype: float64

Series

bseries['a']
## 1.0
bseries.a
## 1.0
bseries[['a','b']]
## a    1.0
## b    2.2
## dtype: float64
'a' in bseries
## True
adict = {"a":1, "b":2, "c":3}
pd.Series(adict) 
## a    1
## b    2
## c    3
## dtype: int64
cseries = pd.Series(adict, index=["a", "c", "b"]) 
cseries
## a    1
## c    3
## b    2
## dtype: int64
dseries = pd.Series(adict, index=["d", "c", "b"]) 
dseries ## nan will be created
## d    NaN
## c    3.0
## b    2.0
## dtype: float64
pd.isnull(dseries)
## d     True
## c    False
## b    False
## dtype: bool

Series meta data

bseries.name="XX"
bseries.index.name="YY"
## bseries

Series indexing

obj = pd.Series(np.arange(3), index=["a", "b", "c"])
obj
## a    0
## b    1
## c    2
## dtype: int64
obj.index ## a label-based Index(['a', 'b', 'c']), not a RangeIndex
## Index(['a', 'b', 'c'], dtype='object')
obj.index[:-1]  ## an Index supports slicing, like a list
## Index(['a', 'b'], dtype='object')
obj.index[0] = "x"  ## raises TypeError: a pandas Index is immutable and cannot be modified in place
obj.index
## Index(['a', 'b', 'c'], dtype='object')
obj.index = ["aa", "bb", "cc"]  ## replacing the whole index at once IS allowed
obj
## aa    0
## bb    1
## cc    2
## dtype: int64

Reindexing

data = pd.Series([1.1, 2.2, 3.3], index = ['b', 'c', 'a'])
data
## b    1.1
## c    2.2
## a    3.3
## dtype: float64
data2 = data.reindex(['a', 'b', 'c', 'd'])
data2
## a    3.3
## b    1.1
## c    2.2
## d    NaN
## dtype: float64

Drop entries in Series

data = pd.Series(np.arange(4), index = ['a', 'b', 'c', 'd'])
data
## a    0
## b    1
## c    2
## d    3
## dtype: int64
data.drop("a") ## not inplace operator
## b    1
## c    2
## d    3
## dtype: int64
data2 = data.drop(["a", "b"])
data2
## c    2
## d    3
## dtype: int64
data.drop(["a"], inplace=True)
data
## b    1
## c    2
## d    3
## dtype: int64

Series entry selection

data = pd.Series([1,2,3,4], index=["a", "b", "c", "d"])
data["b"]
## 2
data[1]
## 2
## 
## <string>:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
data[[1]]
## b    2
## dtype: int64
data[1:3]
## b    2
## c    3
## dtype: int64

Series entry selection (more)

data[[1,3]]
## b    2
## d    4
## dtype: int64
data[data > 2] ## by logical array
## c    3
## d    4
## dtype: int64
data["b":"c"]
## b    2
## c    3
## dtype: int64
##data["b":"c"] = 0

Series entry selection by loc/iloc

data = pd.Series([1,2,3,4], index=["a", "b", "c", "d"])
data.loc["a"]
## 1
data.loc[["a", "c"]]
## a    1
## c    3
## dtype: int64
data.iloc[1]
## 2
data.iloc[1:]
## b    2
## c    3
## d    4
## dtype: int64
data.iloc[-2]
## 3

Series membership

data = pd.Series(list("abcdadcccdaabbb"))
data.unique()
## array(['a', 'b', 'c', 'd'], dtype=object)
data.value_counts()
## a    4
## b    4
## c    4
## d    3
## Name: count, dtype: int64
pd.Series.value_counts(data, sort = False)
## a    4
## b    4
## c    4
## d    3
## Name: count, dtype: int64

Series membership

mask = data.isin(["a", "b"])
data[mask]
## 0     a
## 1     b
## 4     a
## 10    a
## 11    a
## 12    b
## 13    b
## 14    b
## dtype: object
uniq_val = data.unique()
pd.Index(uniq_val).get_indexer(data)
## array([0, 1, 2, 3, 0, 3, 2, 2, 2, 3, 0, 0, 1, 1, 1])

Arithmetic and data alignment

a1 = pd.Series([1,2,3], index = ["a", "b", "c"])
a2 = pd.Series([2,3,4], index = ["a", "b", "c"])
a3 = pd.Series([2,3,4], index = ["b", "c", "d"])
a1 + a2
## a    3
## b    5
## c    7
## dtype: int64
a1 + a3
## a    NaN
## b    4.0
## c    6.0
## d    NaN
## dtype: float64

DataFrame

data = {"Name": ["Amy", "Beth", "Carl"],
        "Age": [24, 22, 19],
        "Sex": ["F", "F", "M"]        
}
data
## {'Name': ['Amy', 'Beth', 'Carl'], 'Age': [24, 22, 19], 'Sex': ['F', 'F', 'M']}
pd.DataFrame(data)
##    Name  Age Sex
## 0   Amy   24   F
## 1  Beth   22   F
## 2  Carl   19   M
pd.DataFrame(data, columns = ["Sex", "Age", "Name"])
##   Sex  Age  Name
## 0   F   24   Amy
## 1   F   22  Beth
## 2   M   19  Carl
pd.DataFrame(data, columns = ["Sex", "Age", "Name", "Email"])
##   Sex  Age  Name Email
## 0   F   24   Amy   NaN
## 1   F   22  Beth   NaN
## 2   M   19  Carl   NaN
apd = pd.DataFrame(data, columns = ["Sex", "Age", "Name"], index = ["a", "b", "c"])
apd
##   Sex  Age  Name
## a   F   24   Amy
## b   F   22  Beth
## c   M   19  Carl

reset_index and set_index

apd
##   Sex  Age  Name
## a   F   24   Amy
## b   F   22  Beth
## c   M   19  Carl
data2 = apd.reset_index()
data2
##   index Sex  Age  Name
## 0     a   F   24   Amy
## 1     b   F   22  Beth
## 2     c   M   19  Carl
data2.set_index("index")
##       Sex  Age  Name
## index               
## a       F   24   Amy
## b       F   22  Beth
## c       M   19  Carl
#data2.set_index("Name")

DataFrame - select columns

apd["Age"]
## a    24
## b    22
## c    19
## Name: Age, dtype: int64
apd.Age
## a    24
## b    22
## c    19
## Name: Age, dtype: int64
apd[["Name", "Age"]]
##    Name  Age
## a   Amy   24
## b  Beth   22
## c  Carl   19

DataFrame - select rows

## apd[1] # this won't work
apd[1:]
##   Sex  Age  Name
## b   F   22  Beth
## c   M   19  Carl
apd[apd["Age"] > 20]
##   Sex  Age  Name
## a   F   24   Amy
## b   F   22  Beth

Selection with loc

apd.loc["a"]
## Sex       F
## Age      24
## Name    Amy
## Name: a, dtype: object
apd.loc[:,"Name"]
## a     Amy
## b    Beth
## c    Carl
## Name: Name, dtype: object
apd.loc["a", ["Name", "Age"]]
## Name    Amy
## Age      24
## Name: a, dtype: object

Selection with iloc

apd.iloc[0]
## Sex       F
## Age      24
## Name    Amy
## Name: a, dtype: object
apd.iloc[-1]
## Sex        M
## Age       19
## Name    Carl
## Name: c, dtype: object
apd.iloc[:,1]
## a    24
## b    22
## c    19
## Name: Age, dtype: int64
apd.iloc[0, [0,1]]
## Sex     F
## Age    24
## Name: a, dtype: object

create columns

apd["Age20"] = apd.Age >= 20
apd
##   Sex  Age  Name  Age20
## a   F   24   Amy   True
## b   F   22  Beth   True
## c   M   19  Carl  False
apd["debt"] = np.nan
apd
##   Sex  Age  Name  Age20  debt
## a   F   24   Amy   True   NaN
## b   F   22  Beth   True   NaN
## c   M   19  Carl  False   NaN

modify columns

apd["debt"] = 15.1
apd
##   Sex  Age  Name  Age20  debt
## a   F   24   Amy   True  15.1
## b   F   22  Beth   True  15.1
## c   M   19  Carl  False  15.1
apd["debt"] = np.arange(3.)
apd
##   Sex  Age  Name  Age20  debt
## a   F   24   Amy   True   0.0
## b   F   22  Beth   True   1.0
## c   M   19  Carl  False   2.0
val = pd.Series([5., 7.7], index=["a", "c"])
apd["debt"] = val
apd
##   Sex  Age  Name  Age20  debt
## a   F   24   Amy   True   5.0
## b   F   22  Beth   True   NaN
## c   M   19  Carl  False   7.7

more on columns for pd.DataFrame

apd.columns
## Index(['Sex', 'Age', 'Name', 'Age20', 'debt'], dtype='object')
del apd["debt"]
apd.columns
## Index(['Sex', 'Age', 'Name', 'Age20'], dtype='object')
apd.T
##           a     b      c
## Sex       F     F      M
## Age      24    22     19
## Name    Amy  Beth   Carl
## Age20  True  True  False

create DataFrame via nested dictionary

pop = {"Florida": {2020: 10.9, 2021: 11.3, 2022: 13.4},
        "Texas": {2020: 20.5, 2021: 21.1}
}
bpd = pd.DataFrame(pop)
bpd
##       Florida  Texas
## 2020     10.9   20.5
## 2021     11.3   21.1
## 2022     13.4    NaN

some extra operations on pd.DataFrame

bpd.index.name = "Year"
bpd.columns.name = "States"
bpd
## States  Florida  Texas
## Year                  
## 2020       10.9   20.5
## 2021       11.3   21.1
## 2022       13.4    NaN

some attributes of pd.DataFrame

bpd.values
## array([[10.9, 20.5],
##        [11.3, 21.1],
##        [13.4,  nan]])
bpd.index
## Index([2020, 2021, 2022], dtype='int64', name='Year')
2021 in bpd.index
## True
bpd.columns
## Index(['Florida', 'Texas'], dtype='object', name='States')
"Florida" in bpd.columns
## True

Reindexing for DataFrame

data = pd.DataFrame(np.arange(6).reshape(3,-1), index =['x', 'y', 'z'], columns = ["Florida", 'Texas'] )
data
##    Florida  Texas
## x        0      1
## y        2      3
## z        4      5
data2 = data.reindex(['x', 'y', 'z', 'w'])
data2
##    Florida  Texas
## x      0.0    1.0
## y      2.0    3.0
## z      4.0    5.0
## w      NaN    NaN
states = ['Utah', 'Florida', 'Texas']
data3 = data2.reindex(columns = states)
data3
##    Utah  Florida  Texas
## x   NaN      0.0    1.0
## y   NaN      2.0    3.0
## z   NaN      4.0    5.0
## w   NaN      NaN    NaN
data3.loc[["x", "y"], states]
##    Utah  Florida  Texas
## x   NaN      0.0    1.0
## y   NaN      2.0    3.0

Drop entries in DataFrame

data0 = {"Name": ["Amy", "Beth", "Carl"],
        "Age": [24, 22, 19],
        "Sex": ["F", "F", "M"]        
}
data = pd.DataFrame(data0, index = ["1", "2", "3"])
data.drop("1") ## drop by index
##    Name  Age Sex
## 2  Beth   22   F
## 3  Carl   19   M
data.drop("Name", axis=1)
##    Age Sex
## 1   24   F
## 2   22   F
## 3   19   M
data.drop(["Name", "Age"], axis="columns")
##   Sex
## 1   F
## 2   F
## 3   M

## data.drop("1", inplace=True); data

Data alignment for DataFrame

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd2 = pd.DataFrame(np.arange(12).reshape(-1,3), columns = list("bac"), index = ["Florida", "Texax", "Utah", "Ohio"])

pd1
##          b  d  c
## Florida  0  1  2
## Texax    3  4  5
## Utah     6  7  8
pd2
##          b   a   c
## Florida  0   1   2
## Texax    3   4   5
## Utah     6   7   8
## Ohio     9  10  11
pd1 + pd2
##           a     b     c   d
## Florida NaN   0.0   4.0 NaN
## Ohio    NaN   NaN   NaN NaN
## Texax   NaN   6.0  10.0 NaN
## Utah    NaN  12.0  16.0 NaN

Arithmetic for DataFrame

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd3 = pd.DataFrame(np.arange(1,10).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])

pd1.add(pd3)
##           b   d   c
## Florida   1   3   5
## Texax     7   9  11
## Utah     13  15  17
pd1.sub(1)
##          b  d  c
## Florida -1  0  1
## Texax    2  3  4
## Utah     5  6  7

Operation between DataFrame and Series

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])

series1 = pd1.iloc[0]

pd1
##          b  d  c
## Florida  0  1  2
## Texax    3  4  5
## Utah     6  7  8
series1
## b    0
## d    1
## c    2
## Name: Florida, dtype: int64
pd1 - series1
##          b  d  c
## Florida  0  0  0
## Texax    3  3  3
## Utah     6  6  6

Sorting and Ranking

series1 = pd.Series([1,3,2,4], index = ['b', 'd', 'a', 'c'])
series1
## b    1
## d    3
## a    2
## c    4
## dtype: int64
series1.sort_index()
## a    2
## b    1
## c    4
## d    3
## dtype: int64
series1.sort_index(ascending=False)
## d    3
## c    4
## b    1
## a    2
## dtype: int64
series1.sort_values()
## b    1
## a    2
## d    3
## c    4
## dtype: int64

Sorting and Ranking

pd1 = pd.DataFrame(np.array([3,2,5,1,4,6]).reshape(3,2), index = ['c', 'a', 'b'], columns = ["x", "y"])
pd1.sort_index()
##    x  y
## a  5  1
## b  4  6
## c  3  2
pd1.sort_values(["x"])
##    x  y
## c  3  2
## b  4  6
## a  5  1
pd1.sort_values(["y", "x"]) ## ## multiple criteria
##    x  y
## a  5  1
## c  3  2
## b  4  6

Function application

pd1 = pd.DataFrame(np.arange(12).reshape(3,4), index = ['x', 'y', 'z'], columns = ["a", "b", "c", "d"])


pd1.apply(np.mean)
## a    4.0
## b    5.0
## c    6.0
## d    7.0
## dtype: float64
pd1.apply(np.mean, axis = 1)
## x    1.5
## y    5.5
## z    9.5
## dtype: float64

Function application

f = lambda x: x.max() - x.min()
pd1.apply(f)
## a    8
## b    8
## c    8
## d    8
## dtype: int64
pd1.apply(lambda x: x.max() - x.min())
## a    8
## b    8
## c    8
## d    8
## dtype: int64
def f2(x):
    """Summarize a column by its minimum and maximum.

    Returns a two-element Series labeled "min" and "max", so that
    DataFrame.apply(f2) yields a two-row summary table.
    """
    smallest = x.min()
    largest = x.max()
    return pd.Series([smallest, largest], index=["min", "max"])

pd1.apply(f2)
##      a  b   c   d
## min  0  1   2   3
## max  8  9  10  11

Summary statistics

pd1.sum() ## default axis = 0
## a    12
## b    15
## c    18
## d    21
## dtype: int64
pd1.sum(axis = 1)
## x     6
## y    22
## z    38
## dtype: int64
pd1.median()
## a    4.0
## b    5.0
## c    6.0
## d    7.0
## dtype: float64
pd1.idxmax()
## a    z
## b    z
## c    z
## d    z
## dtype: object

Summary statistics

pd1.idxmin(axis = "columns")
## x    a
## y    a
## z    a
## dtype: object
pd1.cumsum()
##     a   b   c   d
## x   0   1   2   3
## y   4   6   8  10
## z  12  15  18  21
pd1.var() ## default axis = 0
## a    16.0
## b    16.0
## c    16.0
## d    16.0
## dtype: float64
## pd1.std(axis="columns")

Summary statistics

pd1.describe()
##          a    b     c     d
## count  3.0  3.0   3.0   3.0
## mean   4.0  5.0   6.0   7.0
## std    4.0  4.0   4.0   4.0
## min    0.0  1.0   2.0   3.0
## 25%    2.0  3.0   4.0   5.0
## 50%    4.0  5.0   6.0   7.0
## 75%    6.0  7.0   8.0   9.0
## max    8.0  9.0  10.0  11.0

Correlation

rng = np.random.default_rng(42)
pd2 = pd.DataFrame(rng.standard_normal(12).reshape(3,4), index = ['x', 'y', 'z'], columns = ["a", "b", "c", "d"])

pd2["a"].corr(pd2["b"])
## 0.8476294400143133
pd2.a.corr(pd2.b)
## 0.8476294400143133
pd2.corr()
##           a         b         c         d
## a  1.000000  0.847629  0.957308  0.999918
## b  0.847629  1.000000  0.964820  0.854339
## c  0.957308  0.964820  1.000000  0.960923
## d  0.999918  0.854339  0.960923  1.000000

Covariance

pd2["a"].cov(pd2["b"])
## 0.2333460858435695
pd2.cov()
##           a         b         c         d
## a  1.488842  0.233346  0.469448  0.833917
## b  0.233346  0.050903  0.087484  0.131745
## c  0.469448  0.087484  0.161518  0.263957
## d  0.833917  0.131745  0.263957  0.467162

Value counts for DataFrame

data = pd.DataFrame({'a':[1,2,3,3], 'b':[1,1,2,4], 'c':[2,2,3,3]})
data["a"].value_counts()
## a
## 3    2
## 1    1
## 2    1
## Name: count, dtype: int64
data.apply(pd.Series.value_counts)
##      a    b    c
## 1  1.0  2.0  NaN
## 2  1.0  1.0  2.0
## 3  2.0  NaN  2.0
## 4  NaN  1.0  NaN
data.apply(pd.Series.value_counts).fillna(0)
##      a    b    c
## 1  1.0  2.0  0.0
## 2  1.0  1.0  2.0
## 3  2.0  0.0  2.0
## 4  0.0  1.0  0.0

Read in files using Pandas

afile = "https://caleb-huo.github.io/teaching/data/Python/Student_data.csv"
bfile = "https://caleb-huo.github.io/teaching/data/Python/Student_data.xlsx"
data0 = pd.read_csv("sleepstudy.csv")  # NOTE(review): reads a local file, not afile above — confirm which dataset was intended
data1 = pd.read_excel("sleepstudy.xlsx")  # NOTE(review): .xlsx reading presumably needs the openpyxl engine installed — verify environment

Read_csv directly from a URL

url="https://caleb-huo.github.io/teaching/data/Python/Student_data.csv"
c=pd.read_csv(url)
c.head()
##     Name       Hobby Year_in_colledge  Initial_GPA  Study_time
## 0    Dan    Football         freshman          3.1          10
## 1   Beth       Music        sophomore          3.2          20
## 2   Carl  Basketball           senior          3.6          14
## 3  Frank     Cooking        sophomore          3.4          16
## 4  Emily     Running           junior          3.3          18

Read_csv more options

data = pd.read_csv("Student_data.csv", header=None)
data.head()
##        0           1                 2            3           4
## 0   Name       Hobby  Year_in_colledge  Initial_GPA  Study_time
## 1    Dan    Football          freshman          3.1          10
## 2   Beth       Music         sophomore          3.2          20
## 3   Carl  Basketball            senior          3.6          14
## 4  Frank     Cooking         sophomore          3.4          16
data = pd.read_csv("Student_data.csv", header=None, skiprows = 1)
data.head()
##        0           1          2    3   4
## 0    Dan    Football   freshman  3.1  10
## 1   Beth       Music  sophomore  3.2  20
## 2   Carl  Basketball     senior  3.6  14
## 3  Frank     Cooking  sophomore  3.4  16
## 4  Emily     Running     junior  3.3  18

Read_csv more options

data = pd.read_csv("Student_data.csv", skiprows = 1, names = ["a", "b", "c", "d", 'e'])
data.head()
##        a           b          c    d   e
## 0    Dan    Football   freshman  3.1  10
## 1   Beth       Music  sophomore  3.2  20
## 2   Carl  Basketball     senior  3.6  14
## 3  Frank     Cooking  sophomore  3.4  16
## 4  Emily     Running     junior  3.3  18
data = pd.read_csv("Student_data.csv", index_col = "Name")
data.head()
##             Hobby Year_in_colledge  Initial_GPA  Study_time
## Name                                                       
## Dan      Football         freshman          3.1          10
## Beth        Music        sophomore          3.2          20
## Carl   Basketball           senior          3.6          14
## Frank     Cooking        sophomore          3.4          16
## Emily     Running           junior          3.3          18
data.sort_index().head()
##              Hobby Year_in_colledge  Initial_GPA  Study_time
## Name                                                        
## Amy       Swimming           senior          3.0          15
## Ashely      Skiing           senior          3.1          15
## Beth         Music        sophomore          3.2          20
## Carl    Basketball           senior          3.6          14
## Chris      Singing         freshman          3.6          19

Save results to csv or other format

data = pd.read_csv("Student_data.csv")
data.to_csv("mydata.csv") ## index=True as default
data.to_csv("mydata.csv", index=False)
data.to_csv("mydata.txt", sep="\t")
import sys
data.to_csv(sys.stdout) 
data.to_csv(sys.stdout, sep="\t") 
data.to_csv(sys.stdout, index=False) 
data.to_csv(sys.stdout, header=False) 
data.to_csv(sys.stdout, columns = ["Hobby", "Year_in_colledge"]) 

Review datetime

from datetime import datetime
from datetime import timedelta

now = datetime.now()
now
## datetime.datetime(2023, 11, 9, 12, 44, 11, 203884)
now.year, now.month, now.day
## (2023, 11, 9)
now.hour, now.minute, now.second
## (12, 44, 11)

timedelta

datetime1 = datetime(2022,10,13,7,30,0)
datetime2 = datetime(2022,10,10,5,20,0)
delta = datetime1 - datetime2
delta
## datetime.timedelta(days=3, seconds=7800)
delta.days
## 3
delta.seconds
## 7800
datetime1 + timedelta(12)
## datetime.datetime(2022, 10, 25, 7, 30)
datetime1 + timedelta(12, 2)
## datetime.datetime(2022, 10, 25, 7, 30, 2)

String and Datetime Conversion

date1 = datetime(2022,10,13)
str(date1)
## '2022-10-13 00:00:00'
date1.strftime("%Y-%m-%d")
## '2022-10-13'
val1 = "2021-10-11"
datetime.strptime(val1, "%Y-%m-%d")
## datetime.datetime(2021, 10, 11, 0, 0)
val2 = "10/11/2021"
datetime.strptime(val2, "%m/%d/%Y")
## datetime.datetime(2021, 10, 11, 0, 0)
dateStr = ["2021-10-11 12:00:01", "2021-10-12 03:40:01"]
pd.to_datetime(dateStr)
## DatetimeIndex(['2021-10-11 12:00:01', '2021-10-12 03:40:01'], dtype='datetime64[ns]', freq=None)

Time Series Basics

mydate = [datetime(2022,10,13), datetime(2022,10,14), datetime(2022,10,18)]
data = pd.Series(np.arange(30,33), index=mydate)
data
## 2022-10-13    30
## 2022-10-14    31
## 2022-10-18    32
## dtype: int64
data.index
## DatetimeIndex(['2022-10-13', '2022-10-14', '2022-10-18'], dtype='datetime64[ns]', freq=None)
data["2022/10/13"]
## 30
data["20221014"]
## 31
data["2022-10-18"]
## 32

Date range

ind1 = pd.date_range("2022-01-01", "2023-08-01")
ind1
## DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
##                '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
##                '2022-01-09', '2022-01-10',
##                ...
##                '2023-07-23', '2023-07-24', '2023-07-25', '2023-07-26',
##                '2023-07-27', '2023-07-28', '2023-07-29', '2023-07-30',
##                '2023-07-31', '2023-08-01'],
##               dtype='datetime64[ns]', length=578, freq='D')
ind2 = pd.date_range(start = "2022-01-01", periods=35)
ind2
## DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
##                '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
##                '2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
##                '2022-01-13', '2022-01-14', '2022-01-15', '2022-01-16',
##                '2022-01-17', '2022-01-18', '2022-01-19', '2022-01-20',
##                '2022-01-21', '2022-01-22', '2022-01-23', '2022-01-24',
##                '2022-01-25', '2022-01-26', '2022-01-27', '2022-01-28',
##                '2022-01-29', '2022-01-30', '2022-01-31', '2022-02-01',
##                '2022-02-02', '2022-02-03', '2022-02-04'],
##               dtype='datetime64[ns]', freq='D')
ind3 = pd.date_range(end = "2022-01-01", periods=35)
ind3
## DatetimeIndex(['2021-11-28', '2021-11-29', '2021-11-30', '2021-12-01',
##                '2021-12-02', '2021-12-03', '2021-12-04', '2021-12-05',
##                '2021-12-06', '2021-12-07', '2021-12-08', '2021-12-09',
##                '2021-12-10', '2021-12-11', '2021-12-12', '2021-12-13',
##                '2021-12-14', '2021-12-15', '2021-12-16', '2021-12-17',
##                '2021-12-18', '2021-12-19', '2021-12-20', '2021-12-21',
##                '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
##                '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
##                '2021-12-30', '2021-12-31', '2022-01-01'],
##               dtype='datetime64[ns]', freq='D')
ind4 = pd.date_range(start = "2012-01-01", periods=10, freq="MS") ## MS: month start
ind4
## DatetimeIndex(['2012-01-01', '2012-02-01', '2012-03-01', '2012-04-01',
##                '2012-05-01', '2012-06-01', '2012-07-01', '2012-08-01',
##                '2012-09-01', '2012-10-01'],
##               dtype='datetime64[ns]', freq='MS')

Date selections

data = pd.DataFrame(rng.standard_normal(len(ind1)*2).reshape(-1,2), index=ind1, columns=["a", "b"])

data.head()
##                    a         b
## 2022-01-01  0.066031  1.127241
## 2022-01-02  0.467509 -0.859292
## 2022-01-03  0.368751 -0.958883
## 2022-01-04  0.878450 -0.049926
## 2022-01-05 -0.184862 -0.680930
data.loc["2022"] ## all rows in a year
##                    a         b
## 2022-01-01  0.066031  1.127241
## 2022-01-02  0.467509 -0.859292
## 2022-01-03  0.368751 -0.958883
## 2022-01-04  0.878450 -0.049926
## 2022-01-05 -0.184862 -0.680930
## ...              ...       ...
## 2022-12-27  0.548739 -0.187671
## 2022-12-28  0.278144  0.158119
## 2022-12-29  0.777767  0.807008
## 2022-12-30 -1.619872 -2.247269
## 2022-12-31  1.001745  1.187725
## 
## [365 rows x 2 columns]
data.loc["2022-10"] ## all rows in a month
##                    a         b
## 2022-10-01 -1.244472 -0.888797
## 2022-10-02 -0.070680  0.334295
## 2022-10-03  0.051142 -0.765535
## 2022-10-04  0.900185  0.739413
## 2022-10-05 -0.159648 -0.652916
## 2022-10-06  0.548428  0.187974
## 2022-10-07 -1.448127 -0.067980
## 2022-10-08  0.262036 -0.899695
## 2022-10-09  0.189843 -1.454822
## 2022-10-10  1.336186  1.247950
## 2022-10-11 -0.252517  0.363454
## 2022-10-12 -2.409922 -1.156348
## 2022-10-13 -0.293779 -1.072133
## 2022-10-14  0.714396  1.997297
## 2022-10-15 -1.176615 -0.837463
## 2022-10-16  0.235448  1.611116
## 2022-10-17 -1.222374  0.249036
## 2022-10-18  1.821299 -1.651759
## 2022-10-19 -1.281069 -0.423607
## 2022-10-20 -0.520588  0.812601
## 2022-10-21  0.241660 -1.774962
## 2022-10-22  0.515410 -0.577539
## 2022-10-23  1.274447 -0.627588
## 2022-10-24 -0.636615  0.541132
## 2022-10-25  0.762926  0.448099
## 2022-10-26 -1.685597  0.538034
## 2022-10-27 -1.034308  0.235276
## 2022-10-28 -1.423734  0.446322
## 2022-10-29 -0.806599 -1.282635
## 2022-10-30  0.713820  0.241645
## 2022-10-31 -0.613977  1.451179
data.loc["2022-10-12"]
## a   -2.409922
## b   -1.156348
## Name: 2022-10-12 00:00:00, dtype: float64
data.loc["2022-10-12":] ## starting from a date
##                    a         b
## 2022-10-12 -2.409922 -1.156348
## 2022-10-13 -0.293779 -1.072133
## 2022-10-14  0.714396  1.997297
## 2022-10-15 -1.176615 -0.837463
## 2022-10-16  0.235448  1.611116
## ...              ...       ...
## 2023-07-28 -2.450872 -1.417684
## 2023-07-29 -1.187070 -0.363261
## 2023-07-30 -0.254608 -1.507326
## 2023-07-31 -0.985158 -0.860842
## 2023-08-01  2.457424  1.801742
## 
## [294 rows x 2 columns]

Missing data – Series

## np.NaN was removed in NumPy 2.0 — use np.nan; None is also converted to NaN
vec_withna = pd.Series([0, np.nan, np.nan, None, 1])
vec_withna.isnull()
## 0    False
## 1     True
## 2     True
## 3     True
## 4    False
## dtype: bool
vec_withna.isna()  ## isna() is an alias of isnull()
## 0    False
## 1     True
## 2     True
## 3     True
## 4    False
## dtype: bool
vec_withna[vec_withna.notnull()]  ## keep only the non-missing entries
## 0    0.0
## 4    1.0
## dtype: float64
vec_withna[vec_withna.notna()]  ## notna() is an alias of notnull()
## 0    0.0
## 4    1.0
## dtype: float64
vec_withna.dropna()  ## shortcut for dropping missing entries
## 0    0.0
## 4    1.0
## dtype: float64

Missing data – DataFrame

from numpy import nan as NA

apd = pd.DataFrame([[1,2,3], [NA, NA, NA], [4, NA, 6]])
apd.dropna()
##      0    1    2
## 0  1.0  2.0  3.0
apd.dropna(how="all")
##      0    1    2
## 0  1.0  2.0  3.0
## 2  4.0  NaN  6.0
apd[4] = NA
apd
##      0    1    2   4
## 0  1.0  2.0  3.0 NaN
## 1  NaN  NaN  NaN NaN
## 2  4.0  NaN  6.0 NaN
apd.dropna(axis=1, how="all")
##      0    1    2
## 0  1.0  2.0  3.0
## 1  NaN  NaN  NaN
## 2  4.0  NaN  6.0
apd.dropna(axis=1)
## Empty DataFrame
## Columns: []
## Index: [0, 1, 2]

Fill in missing data

apd.fillna(-99)
##       0     1     2     4
## 0   1.0   2.0   3.0 -99.0
## 1 -99.0 -99.0 -99.0 -99.0
## 2   4.0 -99.0   6.0 -99.0
#apd.fillna(apd.mean()) ## fill by mean value
apd.fillna({1:-1, 2:-2, 4:-4})
##      0    1    2    4
## 0  1.0  2.0  3.0 -4.0
## 1  NaN -1.0 -2.0 -4.0
## 2  4.0 -1.0  6.0 -4.0
apd.fillna(-99, inplace=True)
apd
##       0     1     2     4
## 0   1.0   2.0   3.0 -99.0
## 1 -99.0 -99.0 -99.0 -99.0
## 2   4.0 -99.0   6.0 -99.0

Fill in missing data using forward/backward fill

arr = pd.Series([0, np.nan, 3, np.nan, 5, np.nan])
# arr.fillna(arr.mean())  ## fill by mean value
## Series.fillna(method="ffill"/"bfill") is deprecated and will raise in a
## future pandas version; use the dedicated ffill()/bfill() methods instead.
arr.ffill()  ## forward fill: propagate the last valid value downward
## 0    0.0
## 1    0.0
## 2    3.0
## 3    3.0
## 4    5.0
## 5    5.0
## dtype: float64
arr.bfill()  ## backward fill: the trailing NaN has nothing to copy from, so it stays NaN
## 0    0.0
## 1    3.0
## 2    3.0
## 3    5.0
## 4    5.0
## 5    NaN
## dtype: float64

Filtering

apd = pd.DataFrame({"c1": [1,2,3,4], "c2": ["a", "b", "c", "d"]})
apd["c1"] > 2
## 0    False
## 1    False
## 2     True
## 3     True
## Name: c1, dtype: bool
np.where(apd["c1"] > 2)
## (array([2, 3]),)
apd.iloc[np.where(apd["c1"] > 2)]
##    c1 c2
## 2   3  c
## 3   4  d
apd.iloc[np.where( (apd["c1"] > 2) & (apd["c2"] == "d"))]
##    c1 c2
## 3   4  d

Duplicates

apd = pd.DataFrame({"c1": [1,1,2,2,3,3], "c2": ["a", "b", "a", "a", "a", "b"]})
apd.duplicated()
## 0    False
## 1    False
## 2    False
## 3     True
## 4    False
## 5    False
## dtype: bool
apd.drop_duplicates()
##    c1 c2
## 0   1  a
## 1   1  b
## 2   2  a
## 4   3  a
## 5   3  b
apd.drop_duplicates("c1")
##    c1 c2
## 0   1  a
## 2   2  a
## 4   3  a
apd.duplicated(["c1"], keep="last")
## 0     True
## 1    False
## 2     True
## 3    False
## 4     True
## 5    False
## dtype: bool
apd.drop_duplicates("c1", keep="last")
##    c1 c2
## 1   1  b
## 3   2  a
## 5   3  b

Map

population = pd.DataFrame({"City": ["Gainesville", "Orlando", "Tampa", "Pittsburgh", "Philadelphia"],
    "Population": [140,309,387,300,1576]}
)
city_to_state = {"Gainesville": "FL", "Orlando": "FL", "Tampa": "FL", "Pittsburgh": "PA", "Philadelphia":"PA"}
population
##            City  Population
## 0   Gainesville         140
## 1       Orlando         309
## 2         Tampa         387
## 3    Pittsburgh         300
## 4  Philadelphia        1576
city_to_state
## {'Gainesville': 'FL', 'Orlando': 'FL', 'Tampa': 'FL', 'Pittsburgh': 'PA', 'Philadelphia': 'PA'}
population["City"].map(lambda x: city_to_state[x])
## 0    FL
## 1    FL
## 2    FL
## 3    PA
## 4    PA
## Name: City, dtype: object
population.City.map(city_to_state)
## 0    FL
## 1    FL
## 2    FL
## 3    PA
## 4    PA
## Name: City, dtype: object
population["State"] = population.City.map(city_to_state)

Replacing values

apd = pd.DataFrame([[1,2,3], [2, 3, 4], [3, 4, 5]])
apd
##    0  1  2
## 0  1  2  3
## 1  2  3  4
## 2  3  4  5
apd.replace(4, np.nan)
##    0    1    2
## 0  1  2.0  3.0
## 1  2  3.0  NaN
## 2  3  NaN  5.0
apd.replace([3,4], [-3,-4])
##    0  1  2
## 0  1  2 -3
## 1  2 -3 -4
## 2 -3 -4  5
apd.replace({3:-3, 4:-4})
##    0  1  2
## 0  1  2 -3
## 1  2 -3 -4
## 2 -3 -4  5

HW question

data = pd.DataFrame({"l1": [1,2,3,18,13, 1, 15,6,-99,21,3,np.nan], 
"l2": [1,np.nan,3,7,np.nan,1, 5,-99,-99,3,3,9]})
data
##       l1    l2
## 0    1.0   1.0
## 1    2.0   NaN
## 2    3.0   3.0
## 3   18.0   7.0
## 4   13.0   NaN
## 5    1.0   1.0
## 6   15.0   5.0
## 7    6.0 -99.0
## 8  -99.0 -99.0
## 9   21.0   3.0
## 10   3.0   3.0
## 11   NaN   9.0

Renaming Axis Indexes

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd1
##          b  d  c
## Florida  0  1  2
## Texax    3  4  5
## Utah     6  7  8
pd1.index = pd1.index.map(lambda x: x[:3])
pd1
##      b  d  c
## Flo  0  1  2
## Tex  3  4  5
## Uta  6  7  8
pd1.rename(index=str.upper, columns=str.upper)
##      B  D  C
## FLO  0  1  2
## TEX  3  4  5
## UTA  6  7  8
pd1.rename(index={"Flo": "FL", "Tex": "TX"}, columns = {"b": "BB"})
##      BB  d  c
## FL    0  1  2
## TX    3  4  5
## Uta   6  7  8
pd1.rename(index={"Flo": "FL", "Tex": "TX"}, columns = {"b": "BB"}, inplace = True)

pd1
##      BB  d  c
## FL    0  1  2
## TX    3  4  5
## Uta   6  7  8

Discretization and Binning

ages = np.random.default_rng(32611).integers(low=0, high=100, size=10)
bins = [0, 20, 40, 60, 80, 100]
groups = pd.cut(ages, bins)
groups.codes
## array([0, 2, 2, 0, 1, 0, 0, 3, 4, 3], dtype=int8)
groups.categories
## IntervalIndex([(0, 20], (20, 40], (40, 60], (60, 80], (80, 100]], dtype='interval[int64, right]')
groups.value_counts()
## (0, 20]      4
## (20, 40]     1
## (40, 60]     2
## (60, 80]     2
## (80, 100]    1
## Name: count, dtype: int64
groups2 = pd.cut(ages, bins, labels = ["0-20", "20-40", "40-60", "60-80", "80-100"])
groups2.value_counts()
## 0-20      4
## 20-40     1
## 40-60     2
## 60-80     2
## 80-100    1
## Name: count, dtype: int64

Sub-Sampling

rng = np.random.default_rng(32611)
arr = pd.DataFrame(rng.standard_normal((100,3)))
index = rng.choice(range(100),3)
arr.take(index)
##            0         1         2
## 57  0.755502  1.748730 -0.549839
## 58  0.603501  0.778081 -0.741010
## 38  0.066027 -0.638866  0.302885
arr.sample(n=3)
##            0         1         2
## 42  0.175722  0.637392  0.456941
## 85  1.971000  1.961173  0.553331
## 50  0.902698  1.089878 -0.729095

Reference