Introduction to Biostatistical Computing PHC 6937

Pandas Basics

Zhiguang Huo (Caleb)

Monday Nov 14th, 2022

Outlines

Introduction to Pandas

import pandas as pd

Series

aseries = pd.Series([1,2,3])
aseries
## 0    1
## 1    2
## 2    3
## dtype: int64
aseries.values
## array([1, 2, 3])
aseries.index ## like range(3)
## RangeIndex(start=0, stop=3, step=1)
[i for i in aseries.index]
list(aseries.index)

Series

bseries = pd.Series([1.,2.2,3.5], index=['a', 'b', 'c'])
bseries
## a    1.0
## b    2.2
## c    3.5
## dtype: float64
bseries.index ## like range(3)
## [i for i in bseries.index]
## list(bseries.index)
## Index(['a', 'b', 'c'], dtype='object')

Series

bseries*10
## a    10.0
## b    22.0
## c    35.0
## dtype: float64
np.log(bseries)
## a    0.000000
## b    0.788457
## c    1.252763
## dtype: float64

Series

bseries['a']
## 1.0
bseries.a
## 1.0
bseries[['a','b']]
## a    1.0
## b    2.2
## dtype: float64
'a' in bseries
## True
adict = {"a":1, "b":2, "c":3}
pd.Series(adict) 
## a    1
## b    2
## c    3
## dtype: int64
cseries = pd.Series(adict, index=["a", "c", "b"]) 
cseries
## a    1
## c    3
## b    2
## dtype: int64
dseries = pd.Series(adict, index=["d", "c", "b"]) 
dseries ## nan will be created
## d    NaN
## c    3.0
## b    2.0
## dtype: float64
pd.isnull(dseries)
## d     True
## c    False
## b    False
## dtype: bool

Series meta data

bseries.name="XX"
bseries.index.name="YY"
## bseries
cseries.index
## Index(['a', 'c', 'b'], dtype='object')
cseries.index = ["aa", "bb", "cc"]
cseries
## aa    1
## bb    3
## cc    2
## dtype: int64

Series indexing

obj = pd.Series(np.arange(3), index=["a", "b", "c"])
obj
## a    0
## b    1
## c    2
## dtype: int64
obj.index ## index is like range()
## Index(['a', 'b', 'c'], dtype='object')
obj.index[:-1]
## Index(['a', 'b'], dtype='object')
obj.index[0] = "x"

Reindexing

data = pd.Series([1.1, 2.2, 3.3], index = ['b', 'c', 'a'])
data
## b    1.1
## c    2.2
## a    3.3
## dtype: float64
data2 = data.reindex(['a', 'b', 'c', 'd'])
data2
## a    3.3
## b    1.1
## c    2.2
## d    NaN
## dtype: float64

Drop entries in Series

data = pd.Series(np.arange(4), index = ['a', 'b', 'c', 'd'])
data
## a    0
## b    1
## c    2
## d    3
## dtype: int64
data.drop("a") ## not inplace operator
## b    1
## c    2
## d    3
## dtype: int64
data2 = data.drop(["a", "b"])
data2
## c    2
## d    3
## dtype: int64
data.drop(["a"], inplace=True)
data
## b    1
## c    2
## d    3
## dtype: int64

Series entry selection

data = pd.Series([1,2,3,4], index=["a", "b", "c", "d"])
data["b"]
## 2
data[1]
## 2
data[1:3]
## b    2
## c    3
## dtype: int64

Series entry selection (more)

data[[1,3]]
## b    2
## d    4
## dtype: int64
data[data > 2] ## by logical array
## c    3
## d    4
## dtype: int64
data["b":"c"]
##data["b":"c"] = 0
## b    2
## c    3
## dtype: int64

Series entry selection by loc/iloc

data = pd.Series([1,2,3,4], index=["a", "b", "c", "d"])
data.loc["a"]
## 1
data.loc[["a", "c"]]
## a    1
## c    3
## dtype: int64
data.iloc[1]
## 2
data.iloc[1:]
## b    2
## c    3
## d    4
## dtype: int64
data.iloc[-2]
## 3

Series membership

data = pd.Series(list("abcdadcccdaabbb"))
data.unique()
## array(['a', 'b', 'c', 'd'], dtype=object)
data.value_counts()
## a    4
## b    4
## c    4
## d    3
## dtype: int64
pd.value_counts(data, sort = False)
## a    4
## b    4
## c    4
## d    3
## dtype: int64

Series membership

mask = data.isin(["a", "b"])
data[mask]
## 0     a
## 1     b
## 4     a
## 10    a
## 11    a
## 12    b
## 13    b
## 14    b
## dtype: object
uniq_val = data.unique()
pd.Index(uniq_val).get_indexer(data)
## array([0, 1, 2, 3, 0, 3, 2, 2, 2, 3, 0, 0, 1, 1, 1])

Arithmetic and data alignment

a1 = pd.Series([1,2,3], index = ["a", "b", "c"])
a2 = pd.Series([2,3,4], index = ["a", "b", "c"])
a3 = pd.Series([2,3,4], index = ["b", "c", "d"])
a1 + a2
## a    3
## b    5
## c    7
## dtype: int64
a1 + a3
## a    NaN
## b    4.0
## c    6.0
## d    NaN
## dtype: float64

DataFrame

data = {"Name": ["Amy", "Beth", "Carl"],
        "Age": [24, 22, 19],
        "Sex": ["F", "F", "M"]        
}
data
## {'Name': ['Amy', 'Beth', 'Carl'], 'Age': [24, 22, 19], 'Sex': ['F', 'F', 'M']}
pd.DataFrame(data)
##    Name  Age Sex
## 0   Amy   24   F
## 1  Beth   22   F
## 2  Carl   19   M
pd.DataFrame(data, columns = ["Sex", "Age", "Name"])
##   Sex  Age  Name
## 0   F   24   Amy
## 1   F   22  Beth
## 2   M   19  Carl
pd.DataFrame(data, columns = ["Sex", "Age", "Name", "Email"])
##   Sex  Age  Name Email
## 0   F   24   Amy   NaN
## 1   F   22  Beth   NaN
## 2   M   19  Carl   NaN
apd = pd.DataFrame(data, columns = ["Sex", "Age", "Name"], index = ["a", "b", "c"])
apd
##   Sex  Age  Name
## a   F   24   Amy
## b   F   22  Beth
## c   M   19  Carl

reset_index and set_index

apd
##   Sex  Age  Name
## a   F   24   Amy
## b   F   22  Beth
## c   M   19  Carl
data2 = apd.reset_index()
data2
##   index Sex  Age  Name
## 0     a   F   24   Amy
## 1     b   F   22  Beth
## 2     c   M   19  Carl
data2.set_index("index")
#data2.set_index("Name")
##       Sex  Age  Name
## index               
## a       F   24   Amy
## b       F   22  Beth
## c       M   19  Carl

DataFrame - select columns

apd["Age"]
## a    24
## b    22
## c    19
## Name: Age, dtype: int64
apd.Age
## a    24
## b    22
## c    19
## Name: Age, dtype: int64
apd[["Name", "Age"]]
##    Name  Age
## a   Amy   24
## b  Beth   22
## c  Carl   19

DataFrame - select rows

## apd[1] # this won't work
apd[1:]
##   Sex  Age  Name
## b   F   22  Beth
## c   M   19  Carl
apd[apd["Age"] > 20]
##   Sex  Age  Name
## a   F   24   Amy
## b   F   22  Beth

Selection with loc

apd.loc["a"]
## Sex       F
## Age      24
## Name    Amy
## Name: a, dtype: object
apd.loc[:,"Name"]
## a     Amy
## b    Beth
## c    Carl
## Name: Name, dtype: object
apd.loc["a", ["Name", "Age"]]
## Name    Amy
## Age      24
## Name: a, dtype: object

Selection with iloc

apd.iloc[0]
## Sex       F
## Age      24
## Name    Amy
## Name: a, dtype: object
apd.iloc[-1]
## Sex        M
## Age       19
## Name    Carl
## Name: c, dtype: object
apd.iloc[:,1]
## a    24
## b    22
## c    19
## Name: Age, dtype: int64
apd.iloc[0, [0,1]]
## Sex     F
## Age    24
## Name: a, dtype: object

create columns

apd["Age20"] = apd.Age >= 20
apd
##   Sex  Age  Name  Age20
## a   F   24   Amy   True
## b   F   22  Beth   True
## c   M   19  Carl  False
apd["debt"] = np.nan
apd
##   Sex  Age  Name  Age20  debt
## a   F   24   Amy   True   NaN
## b   F   22  Beth   True   NaN
## c   M   19  Carl  False   NaN

modify columns

apd["debt"] = 15.1
apd
##   Sex  Age  Name  Age20  debt
## a   F   24   Amy   True  15.1
## b   F   22  Beth   True  15.1
## c   M   19  Carl  False  15.1
apd["debt"] = np.arange(3.)
apd
##   Sex  Age  Name  Age20  debt
## a   F   24   Amy   True   0.0
## b   F   22  Beth   True   1.0
## c   M   19  Carl  False   2.0
val = pd.Series([5., 7.7], index=["a", "c"])
apd["debt"] = val
apd
##   Sex  Age  Name  Age20  debt
## a   F   24   Amy   True   5.0
## b   F   22  Beth   True   NaN
## c   M   19  Carl  False   7.7

more on columns for pd.DataFrame

apd.columns
## Index(['Sex', 'Age', 'Name', 'Age20', 'debt'], dtype='object')
del apd["debt"]
apd.columns
## Index(['Sex', 'Age', 'Name', 'Age20'], dtype='object')
apd.T
##           a     b      c
## Sex       F     F      M
## Age      24    22     19
## Name    Amy  Beth   Carl
## Age20  True  True  False

create DataFrame via nested dictionary

pop = {"Florida": {2020: 10.9, 2021: 11.3, 2022: 13.4},
        "Texas": {2020: 20.5, 2021: 21.1}
}
bpd = pd.DataFrame(pop)
bpd
##       Florida  Texas
## 2020     10.9   20.5
## 2021     11.3   21.1
## 2022     13.4    NaN

some extra operations on pd.DataFrame

bpd.index.name = "Year"
bpd.columns.name = "States"
bpd
## States  Florida  Texas
## Year                  
## 2020       10.9   20.5
## 2021       11.3   21.1
## 2022       13.4    NaN

some attributes of pd.DataFrame

bpd.values
## array([[10.9, 20.5],
##        [11.3, 21.1],
##        [13.4,  nan]])
bpd.index
## Int64Index([2020, 2021, 2022], dtype='int64', name='Year')
2021 in bpd.index
## True
bpd.columns
## Index(['Florida', 'Texas'], dtype='object', name='States')
"Florida" in bpd.columns
## True

Reindexing for DataFrame

data = pd.DataFrame(np.arange(6).reshape(3,-1), index =['x', 'y', 'z'], columns = ["Florida", 'Texas'] )
data
##    Florida  Texas
## x        0      1
## y        2      3
## z        4      5
data2 = data.reindex(['x', 'y', 'z', 'w'])
data2
##    Florida  Texas
## x      0.0    1.0
## y      2.0    3.0
## z      4.0    5.0
## w      NaN    NaN
states = ['Utah', 'Florida', 'Texas']
data3 = data2.reindex(columns = states)
data3
##    Utah  Florida  Texas
## x   NaN      0.0    1.0
## y   NaN      2.0    3.0
## z   NaN      4.0    5.0
## w   NaN      NaN    NaN
data3.loc[["x", "y"], states]
##    Utah  Florida  Texas
## x   NaN      0.0    1.0
## y   NaN      2.0    3.0

Drop entries in DataFrame

data0 = {"Name": ["Amy", "Beth", "Carl"],
        "Age": [24, 22, 19],
        "Sex": ["F", "F", "M"]        
}
data = pd.DataFrame(data0, index = ["1", "2", "3"])
data.drop("1") ## drop by index
##    Name  Age Sex
## 2  Beth   22   F
## 3  Carl   19   M
data.drop("Name", axis=1)
##    Age Sex
## 1   24   F
## 2   22   F
## 3   19   M
data.drop(["Name", "Age"], axis="columns")

## data.drop("1", inplace=True); data
##   Sex
## 1   F
## 2   F
## 3   M

Data alignment for DataFrame

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd2 = pd.DataFrame(np.arange(12).reshape(-1,3), columns = list("bac"), index = ["Florida", "Texax", "Utah", "Ohio"])

pd1
##          b  d  c
## Florida  0  1  2
## Texax    3  4  5
## Utah     6  7  8
pd2
##          b   a   c
## Florida  0   1   2
## Texax    3   4   5
## Utah     6   7   8
## Ohio     9  10  11
pd1 + pd2
##           a     b     c   d
## Florida NaN   0.0   4.0 NaN
## Ohio    NaN   NaN   NaN NaN
## Texax   NaN   6.0  10.0 NaN
## Utah    NaN  12.0  16.0 NaN

Arithmetic for DataFrame

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])
pd3 = pd.DataFrame(np.arange(1,10).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])

pd1.add(pd3)
##           b   d   c
## Florida   1   3   5
## Texax     7   9  11
## Utah     13  15  17
pd1.sub(1)
##          b  d  c
## Florida -1  0  1
## Texax    2  3  4
## Utah     5  6  7
pd1.rsub(1)
##          b  d  c
## Florida  1  0 -1
## Texax   -2 -3 -4
## Utah    -5 -6 -7

Operation between DataFrame and Series

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texax", "Utah"])

series1 = pd1.iloc[0]

pd1
##          b  d  c
## Florida  0  1  2
## Texax    3  4  5
## Utah     6  7  8
series1
## b    0
## d    1
## c    2
## Name: Florida, dtype: int64
pd1 - series1
##          b  d  c
## Florida  0  0  0
## Texax    3  3  3
## Utah     6  6  6

Sorting and Ranking

series1 = pd.Series([1,3,2,4], index = ['b', 'd', 'a', 'c'])
series1
## b    1
## d    3
## a    2
## c    4
## dtype: int64
series1.sort_index()
## a    2
## b    1
## c    4
## d    3
## dtype: int64
series1.sort_index(ascending=False)
## d    3
## c    4
## b    1
## a    2
## dtype: int64
series1.sort_values()
## b    1
## a    2
## d    3
## c    4
## dtype: int64

Sorting and Ranking

pd1 = pd.DataFrame(np.array([3,2,5,1,4,6]).reshape(3,2), index = ['c', 'a', 'b'], columns = ["x", "y"])
pd1.sort_index()
##    x  y
## a  5  1
## b  4  6
## c  3  2
pd1.sort_values(["x"])
##    x  y
## c  3  2
## b  4  6
## a  5  1
pd1.sort_values(["y", "x"]) ## ## multiple criteria
##    x  y
## a  5  1
## c  3  2
## b  4  6

A Stock Example Data

import pandas_datareader.data as web

stock_TSLA = web.get_data_yahoo("TSLA")
# stock_TSLA
stock_TSLA.keys()
## Index(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'], dtype='object')
stock_data = {symbol: web.get_data_yahoo(symbol) for symbol in {"AAPL", "TSLA", "GOOG", "META"}}
price = pd.DataFrame({symbol: data['Adj Close'] for symbol, data in stock_data.items()})
price.head()
##                  TSLA       AAPL       GOOG        META
## Date                                                   
## 2017-11-15  20.753332  40.194538  51.045502  177.949997
## 2017-11-16  20.833332  40.674736  51.625000  179.589996
## 2017-11-17  21.003332  40.448895  50.954498  179.000000
## 2017-11-20  20.582666  40.408478  50.918999  178.740005
## 2017-11-21  21.187332  41.159698  51.724499  181.860001
price.head(n=3)
##                  TSLA       AAPL       GOOG        META
## Date                                                   
## 2017-11-15  20.753332  40.194538  51.045502  177.949997
## 2017-11-16  20.833332  40.674736  51.625000  179.589996
## 2017-11-17  21.003332  40.448895  50.954498  179.000000
returns = price.pct_change()
returns.tail()
##                 TSLA      AAPL      GOOG      META
## Date                                              
## 2022-11-08 -0.029328  0.004175  0.002933 -0.002585
## 2022-11-09 -0.071668 -0.033190 -0.016983  0.051830
## 2022-11-10  0.073934  0.088975  0.077460  0.102493
## 2022-11-11  0.027527  0.019269  0.027185  0.010280
## 2022-11-14 -0.025463 -0.009452 -0.008839  0.013650
return1000 = returns[-1000:]
return1000.shape
## (1000, 4)

Function application

return1000.apply(np.mean)
## TSLA    0.003058
## AAPL    0.001513
## GOOG    0.000834
## META    0.000266
## dtype: float64
return1000.apply(np.mean, axis = 1)
## Date
## 2018-11-26    0.033722
## 2018-11-27   -0.005580
## 2018-11-28    0.025754
## 2018-11-29   -0.002749
## 2018-11-30    0.010359
##                 ...   
## 2022-11-08   -0.006201
## 2022-11-09   -0.017503
## 2022-11-10    0.085716
## 2022-11-11    0.021065
## 2022-11-14   -0.007526
## Length: 1000, dtype: float64

Function application

f = lambda x: x.max() - x.min()
return1000.apply(f)
## TSLA    0.409577
## AAPL    0.248455
## GOOG    0.215494
## META    0.439837
## dtype: float64
return1000.apply(lambda x: x.max() - x.min())
## TSLA    0.409577
## AAPL    0.248455
## GOOG    0.215494
## META    0.439837
## dtype: float64
def f2(x):
    return pd.Series([x.min(), x.max()], index = ["min", "max"])

return1000.apply(f2)
##          TSLA      AAPL      GOOG      META
## min -0.210628 -0.128647 -0.111008 -0.263901
## max  0.198949  0.119808  0.104485  0.175936

Summary statistics

return1000.sum() ## default axis = 0
## TSLA    3.058448
## AAPL    1.513297
## GOOG    0.833620
## META    0.265683
## dtype: float64
return1000.sum(axis = 1)
## Date
## 2018-11-26    0.134890
## 2018-11-27   -0.022321
## 2018-11-28    0.103017
## 2018-11-29   -0.010997
## 2018-11-30    0.041435
##                 ...   
## 2022-11-08   -0.024805
## 2022-11-09   -0.070011
## 2022-11-10    0.342862
## 2022-11-11    0.084261
## 2022-11-14   -0.030105
## Length: 1000, dtype: float64
return1000.median()
## TSLA    0.002342
## AAPL    0.001244
## GOOG    0.001327
## META    0.000800
## dtype: float64
return1000.idxmax()
## TSLA   2020-02-03
## AAPL   2020-03-13
## GOOG   2019-07-26
## META   2022-04-28
## dtype: datetime64[ns]

Summary statistics

return1000.idxmin(axis = "columns")
## Date
## 2018-11-26    AAPL
## 2018-11-27    META
## 2018-11-28    TSLA
## 2018-11-29    TSLA
## 2018-11-30    AAPL
##               ... 
## 2022-11-08    TSLA
## 2022-11-09    TSLA
## 2022-11-10    TSLA
## 2022-11-11    META
## 2022-11-14    TSLA
## Length: 1000, dtype: object
return1000.cumsum()
##                 TSLA      AAPL      GOOG      META
## Date                                              
## 2018-11-26  0.061903  0.013524  0.024163  0.035300
## 2018-11-27  0.055892  0.011348  0.020148  0.025181
## 2018-11-28  0.067377  0.049800  0.060190  0.038218
## 2018-11-29  0.048117  0.042118  0.062096  0.052257
## 2018-11-30  0.075406  0.036716  0.067728  0.066174
## ...              ...       ...       ...       ...
## 2022-11-08  3.054117  1.447696  0.754797  0.087431
## 2022-11-09  2.982449  1.414506  0.737814  0.139260
## 2022-11-10  3.056384  1.503480  0.815274  0.241754
## 2022-11-11  3.083911  1.522749  0.842459  0.252034
## 2022-11-14  3.058448  1.513297  0.833620  0.265683
## 
## [1000 rows x 4 columns]
return1000.var() ## default axis = 0
## return1000.std(axis="columns")
## TSLA    0.001767
## AAPL    0.000482
## GOOG    0.000412
## META    0.000786
## dtype: float64

Summary statistics

return1000.describe()
##               TSLA         AAPL         GOOG         META
## count  1000.000000  1000.000000  1000.000000  1000.000000
## mean      0.003058     0.001513     0.000834     0.000266
## std       0.042038     0.021950     0.020303     0.028036
## min      -0.210628    -0.128647    -0.111008    -0.263901
## 25%      -0.017983    -0.009105    -0.008113    -0.012055
## 50%       0.002342     0.001244     0.001327     0.000800
## 75%       0.022870     0.013333     0.010832     0.014111
## max       0.198949     0.119808     0.104485     0.175936

Correlation

return1000["AAPL"].corr(return1000["TSLA"])
## 0.49711491315099615
return1000.AAPL.corr(return1000.TSLA)
## 0.49711491315099615
return1000.corr()
##           TSLA      AAPL      GOOG      META
## TSLA  1.000000  0.497115  0.423899  0.355648
## AAPL  0.497115  1.000000  0.704687  0.610412
## GOOG  0.423899  0.704687  1.000000  0.683037
## META  0.355648  0.610412  0.683037  1.000000

Covariance

return1000["AAPL"].cov(return1000["TSLA"])
## 0.00045869598425547455
return1000.cov()
##           TSLA      AAPL      GOOG      META
## TSLA  0.001767  0.000459  0.000362  0.000419
## AAPL  0.000459  0.000482  0.000314  0.000376
## GOOG  0.000362  0.000314  0.000412  0.000389
## META  0.000419  0.000376  0.000389  0.000786

Value counts for DataFrame

data = pd.DataFrame({'a':[1,2,3,3], 'b':[1,1,2,4], 'c':[2,2,3,3]})
data["a"].value_counts()
## 3    2
## 1    1
## 2    1
## Name: a, dtype: int64
data.apply(pd.value_counts)
##      a    b    c
## 1  1.0  2.0  NaN
## 2  1.0  1.0  2.0
## 3  2.0  NaN  2.0
## 4  NaN  1.0  NaN
data.apply(pd.value_counts).fillna(0)
##      a    b    c
## 1  1.0  2.0  0.0
## 2  1.0  1.0  2.0
## 3  2.0  0.0  2.0
## 4  0.0  1.0  0.0

Read in files using Pandas

afile = "https://caleb-huo.github.io/teaching/data/Python/Student_data.csv"
bfile = "https://caleb-huo.github.io/teaching/data/Python/Student_data.xlsx"
data0 = pd.read_csv("sleepstudy.csv")
data1 = pd.read_excel("sleepstudy.xlsx")

Read_csv directly through an URL

import io
import requests
url="https://caleb-huo.github.io/teaching/data/Python/Student_data.csv"
s=requests.get(url).content
c=pd.read_csv(io.StringIO(s.decode('utf-8')))
c.head()
##     Name       Hobby Year_in_colledge  Initial_GPA  Study_time
## 0    Dan    Football         freshman          3.1          10
## 1   Beth       Music        sophomore          3.2          20
## 2   Carl  Basketball           senior          3.6          14
## 3  Frank     Cooking        sophomore          3.4          16
## 4  Emily     Running           junior          3.3          18

Read_csv more options

data = pd.read_csv("Student_data.csv", header=None)
data.head()
##        0           1                 2            3           4
## 0   Name       Hobby  Year_in_colledge  Initial_GPA  Study_time
## 1    Dan    Football          freshman          3.1          10
## 2   Beth       Music         sophomore          3.2          20
## 3   Carl  Basketball            senior          3.6          14
## 4  Frank     Cooking         sophomore          3.4          16
data = pd.read_csv("Student_data.csv", header=None, skiprows = 1)
data.head()
##        0           1          2    3   4
## 0    Dan    Football   freshman  3.1  10
## 1   Beth       Music  sophomore  3.2  20
## 2   Carl  Basketball     senior  3.6  14
## 3  Frank     Cooking  sophomore  3.4  16
## 4  Emily     Running     junior  3.3  18

Read_csv more options

data = pd.read_csv("Student_data.csv", skiprows = 1, names = ["a", "b", "c", "d", 'e'])
data.head()
##        a           b          c    d   e
## 0    Dan    Football   freshman  3.1  10
## 1   Beth       Music  sophomore  3.2  20
## 2   Carl  Basketball     senior  3.6  14
## 3  Frank     Cooking  sophomore  3.4  16
## 4  Emily     Running     junior  3.3  18
data = pd.read_csv("Student_data.csv", index_col = "Name")
data.head()
##             Hobby Year_in_colledge  Initial_GPA  Study_time
## Name                                                       
## Dan      Football         freshman          3.1          10
## Beth        Music        sophomore          3.2          20
## Carl   Basketball           senior          3.6          14
## Frank     Cooking        sophomore          3.4          16
## Emily     Running           junior          3.3          18
data.sort_index().head()
##              Hobby Year_in_colledge  Initial_GPA  Study_time
## Name                                                        
## Amy       Swimming           senior          3.0          15
## Ashely      Skiing           senior          3.1          15
## Beth         Music        sophomore          3.2          20
## Carl    Basketball           senior          3.6          14
## Chris      Singing         freshman          3.6          19

Save results to csv or other format

data = pd.read_csv("Student_data.csv")
data.to_csv("mydata.csv") ## index=True as default
data.to_csv("mydata.csv", index=False)
data.to_csv("mydata.txt", sep="\t")
import sys
data.to_csv(sys.stdout) 
data.to_csv(sys.stdout, sep="\t") 
data.to_csv(sys.stdout, index=False) 
data.to_csv(sys.stdout, header=False) 
data.to_csv(sys.stdout, columns = ["Hobby", "Year_in_colledge"]) 

Review datetime

from datetime import datetime
from datetime import timedelta

now = datetime.now()
now
## datetime.datetime(2022, 11, 14, 11, 46, 17, 986449)
now.year, now.month, now.day
## (2022, 11, 14)
now.hour, now.minute, now.second
## (11, 46, 17)

timedelta

datetime1 = datetime(2022,10,13,7,30,0)
datetime2 = datetime(2022,10,10,5,20,0)
delta = datetime1 - datetime2
delta
## datetime.timedelta(days=3, seconds=7800)
delta.days
## 3
delta.seconds
## 7800
datetime1 + timedelta(12)
## datetime.datetime(2022, 10, 25, 7, 30)
datetime1 + timedelta(12, 2)
## datetime.datetime(2022, 10, 25, 7, 30, 2)

String and Datatime Convertion

date1 = datetime(2022,10,13)
str(date1)
## '2022-10-13 00:00:00'
date1.strftime("%Y-%m-%d")
## '2022-10-13'
val1 = "2021-10-11"
datetime.strptime(val1, "%Y-%m-%d")
## datetime.datetime(2021, 10, 11, 0, 0)
val2 = "10/11/2021"
datetime.strptime(val2, "%m/%d/%Y")
## datetime.datetime(2021, 10, 11, 0, 0)
dateStr = ["2021-10-11 12:00:01", "2021-10-12 03:40:01"]
pd.to_datetime(dateStr)
## DatetimeIndex(['2021-10-11 12:00:01', '2021-10-12 03:40:01'], dtype='datetime64[ns]', freq=None)

Time Series Basics

mydate = [datetime(2022,10,13), datetime(2022,10,14), datetime(2022,10,18)]
data = pd.Series(np.arange(30,33), index=mydate)
data
## 2022-10-13    30
## 2022-10-14    31
## 2022-10-18    32
## dtype: int64
data.index
## DatetimeIndex(['2022-10-13', '2022-10-14', '2022-10-18'], dtype='datetime64[ns]', freq=None)
data["2022/10/13"]
## 30
data["20221014"]
## 31
data["2022-10-18"]
## 32

Date range

ind1 = pd.date_range("2022-01-01", "2022-08-01")
ind1
## DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
##                '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
##                '2022-01-09', '2022-01-10',
##                ...
##                '2022-07-23', '2022-07-24', '2022-07-25', '2022-07-26',
##                '2022-07-27', '2022-07-28', '2022-07-29', '2022-07-30',
##                '2022-07-31', '2022-08-01'],
##               dtype='datetime64[ns]', length=213, freq='D')
ind2 = pd.date_range(start = "2022-01-01", periods=35)
ind2
## DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
##                '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
##                '2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
##                '2022-01-13', '2022-01-14', '2022-01-15', '2022-01-16',
##                '2022-01-17', '2022-01-18', '2022-01-19', '2022-01-20',
##                '2022-01-21', '2022-01-22', '2022-01-23', '2022-01-24',
##                '2022-01-25', '2022-01-26', '2022-01-27', '2022-01-28',
##                '2022-01-29', '2022-01-30', '2022-01-31', '2022-02-01',
##                '2022-02-02', '2022-02-03', '2022-02-04'],
##               dtype='datetime64[ns]', freq='D')
ind3 = pd.date_range(end = "2022-01-01", periods=35)
ind3
## DatetimeIndex(['2021-11-28', '2021-11-29', '2021-11-30', '2021-12-01',
##                '2021-12-02', '2021-12-03', '2021-12-04', '2021-12-05',
##                '2021-12-06', '2021-12-07', '2021-12-08', '2021-12-09',
##                '2021-12-10', '2021-12-11', '2021-12-12', '2021-12-13',
##                '2021-12-14', '2021-12-15', '2021-12-16', '2021-12-17',
##                '2021-12-18', '2021-12-19', '2021-12-20', '2021-12-21',
##                '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
##                '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
##                '2021-12-30', '2021-12-31', '2022-01-01'],
##               dtype='datetime64[ns]', freq='D')
ind4 = pd.date_range(start = "2012-01-01", periods=10, freq="MS") ## MS: month start
ind4
## DatetimeIndex(['2012-01-01', '2012-02-01', '2012-03-01', '2012-04-01',
##                '2012-05-01', '2012-06-01', '2012-07-01', '2012-08-01',
##                '2012-09-01', '2012-10-01'],
##               dtype='datetime64[ns]', freq='MS')

Date selections

return1000.head()
##                 TSLA      AAPL      GOOG      META
## Date                                              
## 2018-11-26  0.061903  0.013524  0.024163  0.035300
## 2018-11-27 -0.006012 -0.002176 -0.004015 -0.010119
## 2018-11-28  0.011485  0.038453  0.040042  0.013037
## 2018-11-29 -0.019260 -0.007682  0.001906  0.014039
## 2018-11-30  0.027288 -0.005403  0.005633  0.013917
return1000.loc["2021"] ## all rows in a year
##                 TSLA      AAPL      GOOG      META
## Date                                              
## 2021-01-04  0.034152 -0.024719 -0.013494 -0.015449
## 2021-01-05  0.007317  0.012364  0.007337  0.007548
## 2021-01-06  0.028390 -0.033662 -0.003234 -0.028269
## 2021-01-07  0.079447  0.034123  0.029943  0.020622
## 2021-01-08  0.078403  0.008631  0.011168 -0.004354
## ...              ...       ...       ...       ...
## 2021-12-27  0.025248  0.022975  0.006263  0.032633
## 2021-12-28 -0.005000 -0.005767 -0.010914  0.000116
## 2021-12-29 -0.002095  0.000502  0.000386 -0.009474
## 2021-12-30 -0.014592 -0.006578 -0.003427  0.004141
## 2021-12-31 -0.012669 -0.003535 -0.009061 -0.023260
## 
## [252 rows x 4 columns]
return1000.loc["2021-10"] ## all rows in a month
##                 TSLA      AAPL      GOOG      META
## Date                                              
## 2021-10-01 -0.000335  0.008127  0.023990  0.010666
## 2021-10-04  0.008140 -0.024606 -0.019767 -0.048920
## 2021-10-05 -0.001203  0.014158  0.018032  0.020630
## 2021-10-06  0.002767  0.006307  0.008643  0.002042
## 2021-10-07  0.013874  0.009084  0.013334 -0.013248
## 2021-10-08 -0.010232 -0.002722  0.006254  0.002521
## 2021-10-11  0.008212 -0.000630 -0.008629 -0.013937
## 2021-10-12  0.017400 -0.009103 -0.015373 -0.005162
## 2021-10-13  0.006652 -0.004240  0.008682  0.002378
## 2021-10-14  0.008926  0.020226  0.025468  0.012294
## 2021-10-15  0.030196  0.007512  0.001860 -0.011475
## 2021-10-18  0.032122  0.011806  0.009074  0.032578
## 2021-10-19 -0.006712  0.015080  0.006026  0.013867
## 2021-10-20  0.001770  0.003361 -0.009783  0.002324
## 2021-10-21  0.032571  0.001474  0.002567  0.003228
## 2021-10-22  0.017539 -0.005285 -0.029104 -0.050515
## 2021-10-25  0.126616 -0.000336  0.001068  0.012569
## 2021-10-26 -0.006274  0.004575  0.006478 -0.039186
## 2021-10-27  0.019078 -0.003148  0.048367 -0.011368
## 2021-10-28  0.037751  0.024991 -0.002039  0.015054
## 2021-10-29  0.034316 -0.018155  0.014655  0.020983
return1000.loc["2021-10-12"]
## TSLA    0.017400
## AAPL   -0.009103
## GOOG   -0.015373
## META   -0.005162
## Name: 2021-10-12 00:00:00, dtype: float64
return1000.loc["2021-10-12":] ## starting from a date
##                 TSLA      AAPL      GOOG      META
## Date                                              
## 2021-10-12  0.017400 -0.009103 -0.015373 -0.005162
## 2021-10-13  0.006652 -0.004240  0.008682  0.002378
## 2021-10-14  0.008926  0.020226  0.025468  0.012294
## 2021-10-15  0.030196  0.007512  0.001860 -0.011475
## 2021-10-18  0.032122  0.011806  0.009074  0.032578
## ...              ...       ...       ...       ...
## 2022-11-08 -0.029328  0.004175  0.002933 -0.002585
## 2022-11-09 -0.071668 -0.033190 -0.016983  0.051830
## 2022-11-10  0.073934  0.088975  0.077460  0.102493
## 2022-11-11  0.027527  0.019269  0.027185  0.010280
## 2022-11-14 -0.025463 -0.009452 -0.008839  0.013650
## 
## [276 rows x 4 columns]

Reference