Introduction to Biostatistical Computing PHC 6937

Pandas Data Cleaning

Zhiguang Huo (Caleb)

Wednesday Nov 16th, 2022

Outline

Missing data
Filtering
Duplicates
Map
Replacing values
Renaming axis indexes
Discretization and binning
Detecting outliers
Sub-sampling
Dummy variables
Strings in Series

Missing data – Series

import numpy as np
import pandas as pd

vec_withna = pd.Series([0, np.nan, np.nan, None, 1])
vec_withna.isnull()
## 0    False
## 1     True
## 2     True
## 3     True
## 4    False
## dtype: bool
vec_withna.isna()
## 0    False
## 1     True
## 2     True
## 3     True
## 4    False
## dtype: bool
vec_withna[vec_withna.notnull()]
## 0    0.0
## 4    1.0
## dtype: float64
vec_withna[vec_withna.notna()]
## 0    0.0
## 4    1.0
## dtype: float64
vec_withna.dropna()
## 0    0.0
## 4    1.0
## dtype: float64
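
Since isna() returns a boolean Series, summing it counts the missing entries (True counts as 1); a minimal sketch on the same vec_withna:

vec_withna.isna().sum() ## number of missing entries
## 3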

Missing data – DataFrame

from numpy import nan as NA

apd = pd.DataFrame([[1,2,3], [NA, NA, NA], [4, NA, 6]])
apd.dropna()
##      0    1    2
## 0  1.0  2.0  3.0
apd.dropna(how="all")
##      0    1    2
## 0  1.0  2.0  3.0
## 2  4.0  NaN  6.0
apd[4] = NA
apd
##      0    1    2   4
## 0  1.0  2.0  3.0 NaN
## 1  NaN  NaN  NaN NaN
## 2  4.0  NaN  6.0 NaN
apd.dropna(axis=1, how="all")
##      0    1    2
## 0  1.0  2.0  3.0
## 1  NaN  NaN  NaN
## 2  4.0  NaN  6.0
apd.dropna(axis=1)
## Empty DataFrame
## Columns: []
## Index: [0, 1, 2]
apd
##      0    1    2   4
## 0  1.0  2.0  3.0 NaN
## 1  NaN  NaN  NaN NaN
## 2  4.0  NaN  6.0 NaN
apd.dropna(thresh = 2) ## at least two non-missing elements
##      0    1    2   4
## 0  1.0  2.0  3.0 NaN
## 2  4.0  NaN  6.0 NaN
apd.dropna(thresh = 3) ## at least three non-missing elements
##      0    1    2   4
## 0  1.0  2.0  3.0 NaN
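
dropna() can also be restricted to specific columns via the subset argument; a minimal sketch on the same apd, dropping rows only when column 0 is missing:

apd.dropna(subset=[0]) ## only column 0 decides which rows are dropped
##      0    1    2   4
## 0  1.0  2.0  3.0 NaN
## 2  4.0  NaN  6.0 NaN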

Fill in missing data

apd.fillna(-99)
#apd.fillna(apd.mean()) ## fill by column mean value
##       0     1     2     4
## 0   1.0   2.0   3.0 -99.0
## 1 -99.0 -99.0 -99.0 -99.0
## 2   4.0 -99.0   6.0 -99.0
apd.fillna({1:-1, 2:-2, 4:-4})
##      0    1    2    4
## 0  1.0  2.0  3.0 -4.0
## 1  NaN -1.0 -2.0 -4.0
## 2  4.0 -1.0  6.0 -4.0
apd.fillna(-99, inplace=True)
apd
##       0     1     2     4
## 0   1.0   2.0   3.0 -99.0
## 1 -99.0 -99.0 -99.0 -99.0
## 2   4.0 -99.0   6.0 -99.0

Fill in missing data using interpolation

arr = pd.Series([0,np.nan,3,np.nan, 5, np.nan])
arr.fillna(method="ffill")
## 0    0.0
## 1    0.0
## 2    3.0
## 3    3.0
## 4    5.0
## 5    5.0
## dtype: float64
arr.fillna(method="bfill")
## 0    0.0
## 1    3.0
## 2    3.0
## 3    5.0
## 4    5.0
## 5    NaN
## dtype: float64
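
Note that ffill/bfill simply carry an observed value forward or backward. For actual interpolation, Series.interpolate() fits values between the observed points; a minimal sketch on the same arr (by default, trailing NaNs are filled with the last observed value):

arr.interpolate() ## linear interpolation between observed values
## 0    0.0
## 1    1.5
## 2    3.0
## 3    4.0
## 4    5.0
## 5    5.0
## dtype: float64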

Filtering

apd = pd.DataFrame({"c1": [1,2,3,4], "c2": ["a", "b", "c", "d"]})
apd["c1"] > 2
## 0    False
## 1    False
## 2     True
## 3     True
## Name: c1, dtype: bool
np.where(apd["c1"] > 2)
## (array([2, 3]),)
apd.iloc[np.where(apd["c1"] > 2)]
##    c1 c2
## 2   3  c
## 3   4  d
apd.iloc[np.where( (apd["c1"] > 2) & (apd["c2"] == "d"))]
##    c1 c2
## 3   4  d
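
np.where() is not strictly required here: a boolean Series can index the DataFrame directly; a sketch equivalent to the iloc version above:

apd[(apd["c1"] > 2) & (apd["c2"] == "d")] ## boolean-mask indexing, no np.where needed
##    c1 c2
## 3   4  d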

Duplicates

apd = pd.DataFrame({"c1": [1,1,2,2,3,3], "c2": ["a", "b", "a", "a", "a", "b"]})
apd.duplicated()
## 0    False
## 1    False
## 2    False
## 3     True
## 4    False
## 5    False
## dtype: bool
apd.drop_duplicates()
##    c1 c2
## 0   1  a
## 1   1  b
## 2   2  a
## 4   3  a
## 5   3  b
apd.drop_duplicates("c1")
##    c1 c2
## 0   1  a
## 2   2  a
## 4   3  a
apd.duplicated(["c1"], keep="last")
## 0     True
## 1    False
## 2     True
## 3    False
## 4     True
## 5    False
## dtype: bool
apd.drop_duplicates("c1", keep="last")
##    c1 c2
## 1   1  b
## 3   2  a
## 5   3  b
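
keep=False marks every copy as a duplicate; since each value of c1 appears twice, dropping with keep=False removes all rows. A sketch on the same apd:

apd.drop_duplicates("c1", keep=False) ## drop every row whose c1 value is duplicated
## Empty DataFrame
## Columns: [c1, c2]
## Index: []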

Map

population = pd.DataFrame({"City": ["Gainesville", "Orlando", "Tampa", "Pittsburgh", "Philadelphia"],
    "Population": [140,309,387,300,1576]}
)
city_to_state = {"Gainesville": "FL", "Orlando": "FL", "Tampa": "FL", "Pittsburgh": "PA", "Philadelphia":"PA"}
population
##            City  Population
## 0   Gainesville         140
## 1       Orlando         309
## 2         Tampa         387
## 3    Pittsburgh         300
## 4  Philadelphia        1576
city_to_state
## {'Gainesville': 'FL', 'Orlando': 'FL', 'Tampa': 'FL', 'Pittsburgh': 'PA', 'Philadelphia': 'PA'}
population["City"].map(lambda x: city_to_state[x])
## 0    FL
## 1    FL
## 2    FL
## 3    PA
## 4    PA
## Name: City, dtype: object
population.City.map(city_to_state)
## 0    FL
## 1    FL
## 2    FL
## 3    PA
## 4    PA
## Name: City, dtype: object
population["State"] = population.City.map(city_to_state)

Replacing values

apd = pd.DataFrame([[1,2,3], [2, 3, 4], [3, 4, 5]])
apd
##    0  1  2
## 0  1  2  3
## 1  2  3  4
## 2  3  4  5
apd.replace(4, np.nan)
##    0    1    2
## 0  1  2.0  3.0
## 1  2  3.0  NaN
## 2  3  NaN  5.0
apd.replace([3,4], [-3,-4])
##    0  1  2
## 0  1  2 -3
## 1  2 -3 -4
## 2 -3 -4  5
apd.replace({3:-3, 4:-4})
##    0  1  2
## 0  1  2 -3
## 1  2 -3 -4
## 2 -3 -4  5
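
replace() also accepts a nested dict to target a single column; a minimal sketch on the same apd, replacing 4 by -4 in column 1 only:

apd.replace({1: {4: -4}}) ## outer key: column label; inner dict: value mapping
##    0  1  2
## 0  1  2  3
## 1  2  3  4
## 2  3 -4  5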

In-class exercise

data = pd.DataFrame({"l1": [1,2,3,18,13, 1, 15,6,-99,21,3,np.nan], 
"l2": [1,np.nan,3,7,np.nan,1, 5,-99,-99,3,3,9]})
data
##       l1    l2
## 0    1.0   1.0
## 1    2.0   NaN
## 2    3.0   3.0
## 3   18.0   7.0
## 4   13.0   NaN
## 5    1.0   1.0
## 6   15.0   5.0
## 7    6.0 -99.0
## 8  -99.0 -99.0
## 9   21.0   3.0
## 10   3.0   3.0
## 11   NaN   9.0

Write the code in one line: treat -99 as missing, drop rows with missing values, remove duplicated rows, sort by l1 in descending order, convert to integers, and compute the digit sum of each remaining l1 value.

data.replace({-99:np.nan}).dropna().drop_duplicates().sort_values(by="l1",ascending=False).astype(np.int32).l1.map(lambda x: np.sum([int(i) for i in str(x)]))
## 9    3
## 3    9
## 6    6
## 2    3
## 0    1
## Name: l1, dtype: int64
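
The same pipeline written step by step (a sketch; the intermediate names are only for illustration):

step1 = data.replace({-99: np.nan})  ## treat -99 as missing
step2 = step1.dropna()               ## drop rows with any NaN
step3 = step2.drop_duplicates()      ## remove duplicated rows
step4 = step3.sort_values(by="l1", ascending=False).astype(np.int32)
step4.l1.map(lambda x: np.sum([int(i) for i in str(x)]))  ## digit sum of l1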

Renaming Axis Indexes

pd1 = pd.DataFrame(np.arange(9).reshape(-1,3), columns = list("bdc"), index = ["Florida", "Texas", "Utah"])
pd1
##          b  d  c
## Florida  0  1  2
## Texas    3  4  5
## Utah     6  7  8
pd1.index = pd1.index.map(lambda x: x[:3])
pd1
##      b  d  c
## Flo  0  1  2
## Tex  3  4  5
## Uta  6  7  8
pd1.rename(index=str.upper, columns=str.upper)
##      B  D  C
## FLO  0  1  2
## TEX  3  4  5
## UTA  6  7  8
pd1.rename(index={"Flo": "FL", "Tex": "TX"}, columns = {"b": "BB"})
##      BB  d  c
## FL    0  1  2
## TX    3  4  5
## Uta   6  7  8
pd1.rename(index={"Flo": "FL", "Tex": "TX"}, columns = {"b": "BB"}, inplace = True)

pd1
##      BB  d  c
## FL    0  1  2
## TX    3  4  5
## Uta   6  7  8
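
Labels can also be modified in bulk: add_prefix() prepends a string to every column label; a minimal sketch on the renamed pd1:

pd1.add_prefix("col_") ## prepend a prefix to every column label
##      col_BB  col_d  col_c
## FL        0      1      2
## TX        3      4      5
## Uta       6      7      8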

Discretization and Binning

ages = np.random.default_rng(32611).integers(low=0, high=100, size=10)
bins = [0, 20, 40, 60, 80, 100]
groups = pd.cut(ages, bins)
groups.codes
## array([0, 2, 2, 0, 1, 0, 0, 3, 4, 3], dtype=int8)
groups.categories
## IntervalIndex([(0, 20], (20, 40], (40, 60], (60, 80], (80, 100]], dtype='interval[int64, right]')
groups.value_counts()
## (0, 20]      4
## (20, 40]     1
## (40, 60]     2
## (60, 80]     2
## (80, 100]    1
## dtype: int64
groups2 = pd.cut(ages, bins, labels = ["0-20", "20-40", "40-60", "60-80", "80-100"])
groups2.value_counts()
## 0-20      4
## 20-40     1
## 40-60     2
## 60-80     2
## 80-100    1
## dtype: int64
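
pd.qcut() is the quantile-based counterpart of pd.cut(): instead of fixed edges, it picks edges so the bins hold roughly equal numbers of observations. A minimal sketch on the same ages (the edges depend on the sampled values, so the output is not shown):

groups3 = pd.qcut(ages, 4) ## 4 quantile-based bins (quartiles)
groups3.value_counts()     ## each bin holds roughly len(ages)/4 observations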

Detecting Outliers

rng = np.random.default_rng(32611)
arr = pd.DataFrame(rng.standard_normal((100,3)))
arr.head()
##           0         1         2
## 0  0.195122 -0.563317  0.973858
## 1 -1.873995  0.394967 -0.132278
## 2 -0.643411 -1.046220 -0.278885
## 3 -0.039059 -0.575599  0.026662
## 4  0.414205  0.643462 -1.125217
arr.describe()
##                 0           1           2
## count  100.000000  100.000000  100.000000
## mean     0.027024    0.071957    0.002366
## std      0.980115    1.005985    0.947386
## min     -2.403350   -2.823156   -2.256457
## 25%     -0.760132   -0.606894   -0.740145
## 50%      0.155343   -0.046282   -0.078738
## 75%      0.723528    0.717592    0.559711
## max      2.170296    2.972694    3.529913
col0 = arr[0]
col0[np.abs(col0) > 2]
## 13   -2.090740
## 53    2.167734
## 74    2.170296
## 87   -2.403350
## Name: 0, dtype: float64
arr[(np.abs(arr)>3).any(axis="columns")]
##            0         1         2
## 92  0.678229 -0.692429  3.529913
arr[(np.abs(arr)>2)] = np.sign(arr) * 2
arr.describe()
##                 0           1           2
## count  100.000000  100.000000  100.000000
## mean     0.028585    0.060009   -0.015086
## std      0.961568    0.936246    0.880808
## min     -2.000000   -2.000000   -2.000000
## 25%     -0.760132   -0.606894   -0.740145
## 50%      0.155343   -0.046282   -0.078738
## 75%      0.723528    0.717592    0.559711
## max      2.000000    2.000000    2.000000
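
The sign-based capping above can also be written with clip(), which truncates values into a range and returns a copy; a minimal sketch:

arr_capped = arr.clip(lower=-2, upper=2) ## same result as arr[np.abs(arr) > 2] = np.sign(arr) * 2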

Sub-Sampling

rng = np.random.default_rng(32611)
arr = pd.DataFrame(rng.standard_normal((100,3)))
index = rng.choice(range(100),3)
arr.take(index)
##            0         1         2
## 57  0.755502  1.748730 -0.549839
## 58  0.603501  0.778081 -0.741010
## 38  0.066027 -0.638866  0.302885
arr.sample(n=3)
##            0         1         2
## 67  1.114620 -0.284325 -0.394080
## 96  0.035634 -0.493672  0.018179
## 48  0.579743 -0.404422 -0.661694
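
sample() also supports a sampling fraction, sampling with replacement, and a reproducible seed; a minimal sketch (output omitted since it is a random draw):

arr.sample(frac=0.05, replace=True, random_state=32611) ## 5% of rows, with replacement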

Create dummy variables

data = pd.DataFrame({"keys": ["b", "b", "a", "a", "c", "c"], "values": np.arange(6)})
data
##   keys  values
## 0    b       0
## 1    b       1
## 2    a       2
## 3    a       3
## 4    c       4
## 5    c       5
dummy = pd.get_dummies(data["keys"])
dummy
##    a  b  c
## 0  0  1  0
## 1  0  1  0
## 2  1  0  0
## 3  1  0  0
## 4  0  0  1
## 5  0  0  1
dummy.add_prefix("Group_")
##    Group_a  Group_b  Group_c
## 0        0        1        0
## 1        0        1        0
## 2        1        0        0
## 3        1        0        0
## 4        0        0        1
## 5        0        0        1
data_combine = data.join(dummy.add_prefix("Group_"))
data_combine
##   keys  values  Group_a  Group_b  Group_c
## 0    b       0        0        1        0
## 1    b       1        0        1        0
## 2    a       2        1        0        0
## 3    a       3        1        0        0
## 4    c       4        0        0        1
## 5    c       5        0        0        1
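
get_dummies() can also encode columns of a whole DataFrame, which collapses the join step; a sketch equivalent to data_combine above, except that the original keys column is dropped:

pd.get_dummies(data, columns=["keys"], prefix="Group")
##    values  Group_a  Group_b  Group_c
## 0       0        0        1        0
## 1       1        0        1        0
## 2       2        1        0        0
## 3       3        1        0        0
## 4       4        0        0        1
## 5       5        0        0        1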

String in Series

data = pd.Series({"Alex": "alex@gmail.com", "Beth": "BETH@yahoo.com", "Carl": "Carl@ufl.edu"})
data.str.contains("ufl")
## Alex    False
## Beth    False
## Carl     True
## dtype: bool
pattern = "@|\."
data.str.split(pattern)
## Alex    [alex, gmail, com]
## Beth    [BETH, yahoo, com]
## Carl      [Carl, ufl, edu]
## dtype: object
pattern = "(\w+)@(\w+)\.(\w+)"
data.str.findall(pattern)
## Alex    [(alex, gmail, com)]
## Beth    [(BETH, yahoo, com)]
## Carl      [(Carl, ufl, edu)]
## dtype: object
data.str.findall(pattern).map(lambda x: x[0][1])
## Alex    gmail
## Beth    yahoo
## Carl      ufl
## dtype: object
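
str.extract() returns the capture groups directly as DataFrame columns, which is often more convenient than findall() plus map(); a minimal sketch with the same pattern:

data.str.extract(pattern) ## one column per capture group
##          0      1    2
## Alex  alex  gmail  com
## Beth  BETH  yahoo  com
## Carl  Carl    ufl  edu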
