Programming basics for Biostatistics 6099

Numpy basics

Zhiguang Huo (Caleb)

Tuesday Nov 2nd, 2023

Outlines

Numpy

import numpy as np 
https://numpy.org/install/

get started

n = 1000000
my_arr = np.arange(n)
my_list = list(range(n))
## run %timeit in jupyter notebook

%timeit my_arr2 = my_arr * 2

%timeit my_lis2 = [i * 2 for i in my_list]

Create ndarrays with np.array() method

data1 = [1.1, 2.2, 3.3, 4.4]
data1d = np.array(data1)
data1d
## array([1.1, 2.2, 3.3, 4.4])
data2 = [[1,2,3,4], [5,6,7,8]]
data2d = np.array(data2)
data2d
## array([[1, 2, 3, 4],
##        [5, 6, 7, 8]])

Basic attributes of ndarray

data2d
## array([[1, 2, 3, 4],
##        [5, 6, 7, 8]])
data2d.shape
## (2, 4)
data2d.size
## 8
data2d.dtype
## dtype('int64')

numpy ndarray

data1d * 10
## array([11., 22., 33., 44.])
data2d + data2d
## array([[ 2,  4,  6,  8],
##        [10, 12, 14, 16]])

Create ndarrays with zeros, ones, full.

np.zeros(10)
## array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
np.zeros((2,3))
## array([[0., 0., 0.],
##        [0., 0., 0.]])
np.ones(10)
## array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
np.ones((2,3))
## array([[1., 1., 1.],
##        [1., 1., 1.]])
np.full(10, 5)
## array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5])
np.full((2,3), 5)
## array([[5, 5, 5],
##        [5, 5, 5]])
data2 = np.array([[1,2,3,4], [5,6,7,8]])
np.zeros_like(data2)
## array([[0, 0, 0, 0],
##        [0, 0, 0, 0]])
np.ones_like(data2)
## array([[1, 1, 1, 1],
##        [1, 1, 1, 1]])
np.full_like(data2, 5)
## array([[5, 5, 5, 5],
##        [5, 5, 5, 5]])

Create ndarrays with eye and identity

np.eye(3)
## array([[1., 0., 0.],
##        [0., 1., 0.],
##        [0., 0., 1.]])
np.identity(3)
## array([[1., 0., 0.],
##        [0., 1., 0.],
##        [0., 0., 1.]])

Create ndarrays with Python range function

np.arange(10)
## array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
np.arange(3,10)
## array([3, 4, 5, 6, 7, 8, 9])
np.arange(3,10, 2)
## array([3, 5, 7, 9])
np.arange(10,3, -1)
## array([10,  9,  8,  7,  6,  5,  4])

data types for ndarrays

arr1 = np.array([1,2,3], dtype=np.float64)
arr2 = np.array([1,2,3], dtype=np.int32)
arr3 = np.array([1.1,2.3,3], dtype=np.string_)
arr1.dtype; arr2.dtype; arr3.dtype; 
arr1.astype(np.int32)
## array([1, 2, 3], dtype=int32)
arr3.astype(np.float64)
## array([1.1, 2.3, 3. ])

data types conversions for ndarrays

arr = np.arange(10, dtype=np.int32)
arr
## array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)
calibers = np.array([0.22, 0.54, 0.55, 0.0], dtype=np.float64)

arr.astype(calibers.dtype)
## array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
arr.astype(np.bool_)
## array([False,  True,  True,  True,  True,  True,  True,  True,  True,
##         True])

Arithmetic with NumPy Arrays

arr = np.array([[1., 2., 3.], [4., 5., 6.]])
arr
## array([[1., 2., 3.],
##        [4., 5., 6.]])
arr * arr
## array([[ 1.,  4.,  9.],
##        [16., 25., 36.]])
arr - arr
## array([[0., 0., 0.],
##        [0., 0., 0.]])

Arithmetic with NumPy Arrays

1 / arr
## array([[1.        , 0.5       , 0.33333333],
##        [0.25      , 0.2       , 0.16666667]])
arr ** 0.5
## array([[1.        , 1.41421356, 1.73205081],
##        [2.        , 2.23606798, 2.44948974]])
arr2 = np.array([[0., 2., 11.], [3., 12., 2.]])
arr2
## array([[ 0.,  2., 11.],
##        [ 3., 12.,  2.]])
arr2 > arr
## array([[False, False,  True],
##        [False,  True, False]])

Reshape

arr = np.arange(8)
arr.reshape((2,4))
## array([[0, 1, 2, 3],
##        [4, 5, 6, 7]])
arr.reshape(2,4)
## array([[0, 1, 2, 3],
##        [4, 5, 6, 7]])
arr.reshape(4,2)
## array([[0, 1],
##        [2, 3],
##        [4, 5],
##        [6, 7]])
arr.reshape(4,2).reshape(2,4)
## array([[0, 1, 2, 3],
##        [4, 5, 6, 7]])
arr.reshape(4,-1)
## array([[0, 1],
##        [2, 3],
##        [4, 5],
##        [6, 7]])
arr.reshape(2,-1)
## array([[0, 1, 2, 3],
##        [4, 5, 6, 7]])

Pseudorandom Number Generation

rng = np.random.default_rng()
print(rng)
## Generator(PCG64)

generate random numbers with rng

rfloat = rng.random() ## also specify size inside
rfloat
## 0.21659168820945018
type(rfloat)
## <class 'float'>
rints = rng.integers(low=0, high=10, size=3)
rints 
## array([3, 1, 7])
type(rints)
## <class 'numpy.ndarray'>

set random seed

rng = np.random.default_rng(seed=42)
arr1 = rng.random((3, 3))
arr1
## array([[0.77395605, 0.43887844, 0.85859792],
##        [0.69736803, 0.09417735, 0.97562235],
##        [0.7611397 , 0.78606431, 0.12811363]])
rng = np.random.default_rng(seed=42)
arr2 = rng.random((3, 3))
arr2
## array([[0.77395605, 0.43887844, 0.85859792],
##        [0.69736803, 0.09417735, 0.97562235],
##        [0.7611397 , 0.78606431, 0.12811363]])

Other random data

arr = np.arange(6)
rng.choice(arr, size=5)
## array([5, 2, 3, 2, 1])
rng.choice(arr, size=5, replace = False) ## by default, repalce is True
## array([5, 1, 4, 2, 3])
arr = np.array([list(range(1,3)),list(range(3,5)),list(range(5,7))])
rng.choice(arr, size=2)
## array([[1, 2],
##        [3, 4]])

Permutations (1d)

rng = np.random.default_rng(seed=42)
arr1 = np.arange(6)
arr1
## array([0, 1, 2, 3, 4, 5])
rng.permutation(arr1)
## array([3, 2, 5, 4, 1, 0])
arr1 ## the original array remains the same
## array([0, 1, 2, 3, 4, 5])
rng = np.random.default_rng(seed=42)
arr1 = np.arange(6)
rng.shuffle(arr1)
arr1
## array([3, 2, 5, 4, 1, 0])

Permutations (2d) – permutation() method

rng = np.random.default_rng(seed=42)
arr1 = np.arange(12).reshape(4,3)
rng.permutation(arr1) ## default axis=0
## array([[ 9, 10, 11],
##        [ 6,  7,  8],
##        [ 3,  4,  5],
##        [ 0,  1,  2]])
rng.permutation(arr1, axis=1)
## array([[ 2,  0,  1],
##        [ 5,  3,  4],
##        [ 8,  6,  7],
##        [11,  9, 10]])

Permutations (2d) – shuffle() method

rng = np.random.default_rng(seed=5442)
arr1 = np.arange(12).reshape(4,3)
rng.shuffle(arr1) ## default axis=0
arr1
## array([[ 3,  4,  5],
##        [ 0,  1,  2],
##        [ 6,  7,  8],
##        [ 9, 10, 11]])
arr1 = np.arange(12).reshape(4,3)
rng.shuffle(arr1, axis=1)
arr1
## array([[ 0,  2,  1],
##        [ 3,  5,  4],
##        [ 6,  8,  7],
##        [ 9, 11, 10]])

Permutations (2d) – permutated() method

rng = np.random.default_rng(seed=42)
arr1 = np.arange(12).reshape(4,3); arr1
## array([[ 0,  1,  2],
##        [ 3,  4,  5],
##        [ 6,  7,  8],
##        [ 9, 10, 11]])
# rng.permuted(arr1)
# rng.permuted(arr1, axis=0)
# rng.permuted(arr1, axis=1)

Distributions

rng = np.random.default_rng()
rng.standard_normal(size=10)
## array([ 0.21409389,  1.22708461, -0.95858637,  1.1204244 , -0.57909022,
##         0.42956404, -1.50635136, -1.03336681, -0.98637857, -0.9437289 ])
rng.standard_normal(size=(2,3))
## array([[ 1.41974814, -0.37742038,  1.1079552 ],
##        [-0.11548843,  0.70612016, -0.55047195]])
rng.normal(loc=4, scale=1, size=4)
## array([3.55334725, 4.90761414, 2.83178996, 0.91243366])

Basic indexing and slicing – 1darray

arr = np.arange(10)
arr
## array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
arr[4]
## 4
arr[4:7]
## array([4, 5, 6])
arr[4:7] = -1
arr
## array([ 0,  1,  2,  3, -1, -1, -1,  7,  8,  9])
arr_slice = arr[4:7]
arr_slice 
## array([-1, -1, -1])
arr_slice[1] = 99
arr
## array([ 0,  1,  2,  3, -1, 99, -1,  7,  8,  9])
arr_slice[:] = 88
arr
## array([ 0,  1,  2,  3, 88, 88, 88,  7,  8,  9])
arr = np.arange(10)

arr_slice2 = arr[4:7].copy()
arr_slice2
## array([4, 5, 6])
arr_slice2[1] = 99
arr
## array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
arr_slice2[:] = 88
arr
## array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

Basic indexing and slicing – 2darray

arr2d = np.array([[1,2,3], [4,5,6], [7,8,9]])
arr2d
## array([[1, 2, 3],
##        [4, 5, 6],
##        [7, 8, 9]])
arr2d[1]
## array([4, 5, 6])
arr2d[1][2]
## 6
arr2d[1,2]
## 6

indexing with slices – 2darray

arr2d = np.array([[1,2,3], [4,5,6], [7,8,9]])
arr2d[:2]
## array([[1, 2, 3],
##        [4, 5, 6]])
arr2d[:2, 1:]
## array([[2, 3],
##        [5, 6]])
arr2d[1, 1:]
## array([5, 6])
## arr2d[, :1] ## this doesn't work, we have to use : to be placeholder
arr2d[:, :1]
## array([[1],
##        [4],
##        [7]])
arr2d[::-1, ::-1]
## array([[9, 8, 7],
##        [6, 5, 4],
##        [3, 2, 1]])
arr2d[:2,1:] = 0
arr2d
## array([[1, 0, 0],
##        [4, 0, 0],
##        [7, 8, 9]])

Boolean indexing

nane_list = ["Bob", "Joe", "Will", "Bob", "Will", "James"]
names = np.array(nane_list)
data = rng.standard_normal(size=(6,3))
names
## array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'James'], dtype='<U5')
data
## array([[ 1.19167583, -0.12885116,  1.03198519],
##        [ 0.96210664,  0.32416914,  0.35553721],
##        [ 1.47360777, -0.21979707,  0.55706949],
##        [ 1.03443407,  1.08359143, -0.16322612],
##        [ 0.97912621, -1.96768607,  0.32006562],
##        [-0.10479479,  3.06244143, -0.94485631]])
names == "Bob"
## array([ True, False, False,  True, False, False])
data[names == "Bob"]
## array([[ 1.19167583, -0.12885116,  1.03198519],
##        [ 1.03443407,  1.08359143, -0.16322612]])
data[names == "Bob", 1:]
## array([[-0.12885116,  1.03198519],
##        [ 1.08359143, -0.16322612]])
names != "Bob"
## array([False,  True,  True, False,  True,  True])
~(names == "Bob")
## array([False,  True,  True, False,  True,  True])
cond = names == "Bob"
data[~cond]
## array([[ 0.96210664,  0.32416914,  0.35553721],
##        [ 1.47360777, -0.21979707,  0.55706949],
##        [ 0.97912621, -1.96768607,  0.32006562],
##        [-0.10479479,  3.06244143, -0.94485631]])
mask = (names == "Bob") | (names == "Will")
mask
## array([ True, False,  True,  True,  True, False])
data[mask]
## array([[ 1.19167583, -0.12885116,  1.03198519],
##        [ 1.47360777, -0.21979707,  0.55706949],
##        [ 1.03443407,  1.08359143, -0.16322612],
##        [ 0.97912621, -1.96768607,  0.32006562]])
mask2 = (names == "Bob") & (names == "Will")
mask2
## array([False, False, False, False, False, False])
data[mask2]
## array([], shape=(0, 3), dtype=float64)
data[data<0] = 0
data
## array([[1.19167583, 0.        , 1.03198519],
##        [0.96210664, 0.32416914, 0.35553721],
##        [1.47360777, 0.        , 0.55706949],
##        [1.03443407, 1.08359143, 0.        ],
##        [0.97912621, 0.        , 0.32006562],
##        [0.        , 3.06244143, 0.        ]])
data[names != "Will"] = 99
data
## array([[99.        , 99.        , 99.        ],
##        [99.        , 99.        , 99.        ],
##        [ 1.47360777,  0.        ,  0.55706949],
##        [99.        , 99.        , 99.        ],
##        [ 0.97912621,  0.        ,  0.32006562],
##        [99.        , 99.        , 99.        ]])

Fancy indexing

data = np.arange(20).reshape(4,5)
data[[2,0]]
## array([[10, 11, 12, 13, 14],
##        [ 0,  1,  2,  3,  4]])
data[[-2,-4]]
## array([[10, 11, 12, 13, 14],
##        [ 0,  1,  2,  3,  4]])
data[[2,0], [3,1]]
## array([13,  1])
data[[2,0]][:,[3,1]]
## array([[13, 11],
##        [ 3,  1]])

Transposing Arrays

arr = np.arange(8).reshape([4,2])
arr
## array([[0, 1],
##        [2, 3],
##        [4, 5],
##        [6, 7]])
arr.T
## array([[0, 2, 4, 6],
##        [1, 3, 5, 7]])
np.dot(arr.T, arr)
## array([[56, 68],
##        [68, 84]])
arr.T @ arr
## array([[56, 68],
##        [68, 84]])
arr.T.dot(arr)
## array([[56, 68],
##        [68, 84]])

Swapping Axes

arr = np.arange(16).reshape([2,2,4])
arr
## array([[[ 0,  1,  2,  3],
##         [ 4,  5,  6,  7]],
## 
##        [[ 8,  9, 10, 11],
##         [12, 13, 14, 15]]])
arr.swapaxes(1,2)
## array([[[ 0,  4],
##         [ 1,  5],
##         [ 2,  6],
##         [ 3,  7]],
## 
##        [[ 8, 12],
##         [ 9, 13],
##         [10, 14],
##         [11, 15]]])

Universal Functions (ufuncs): Unary ufuncs

#arr = rng.standard_normal(6) ## try this one later
arr = np.arange(6)
arr
np.sqrt(arr)
np.exp(arr)
np.floor(arr); np.ceil(arr); 
np.log(arr), np.log10(arr); np.log2(arr); np.log1p(arr)
np.isnan(np.sqrt(arr))
np.sin(arr); np.cos(arr); np.tan(arr)
arr2 = np.array([np.nan, np.NaN, np.inf, np.Inf, 1.0])
arr2
np.isnan(arr2); np.isinf(arr2); np.isfinite(arr2)

Universal Functions (ufuncs): Binary universal functions

x = rng.standard_normal(4)
y = rng.standard_normal(4)
np.maximum(x,y); np.fmax(x,y) ## fmax ignores NaN
np.minimum(x,y); np.fmin(x,y) ## fmax ignores NaN
np.add(x,y); x + y
np.subtract(x,y); x - y
np.multiply(x,y); x * y
np.divide(x,y); x / y
np.power(x,2)
arr_power = np.arange(len(x))
np.power(x,arr_power)
np.greater(x,y); x > y
np.less(x,y); x < y
np.greater_equal(x,y); x >= y
np.less_equal(x,y); x <= y
np.equal(x,y); x == y
np.not_equal(x,y); x != y

Binary universal functions – logical

x = np.array([True, True, False, False], dtype=np.bool_)
y = np.array([True, False, True, False], dtype=np.bool_)
np.logical_and(x,y); x & y
np.logical_or(x,y); x | y
np.logical_xor(x,y); x ^ y

meshgrid

points = np.arange(-5,5,0.01) ## 1000 equally spaced points
xs, ys = np.meshgrid(points, points)
# xs; ys
z = np.sqrt(xs**2 + ys**2)

import matplotlib.pyplot as plt
plt.imshow(z)

Expressing conditional logic as Array Operations

xarr = np.array([1.1,1.2,1.4,1.5])
yarr = np.array([2.1,2.2,2.4,2.5])
condition = np.array([True, False, True, False])

res = [(x if c else y) for x, y, c in zip(xarr, yarr, condition)]
res
## [1.1, 2.2, 1.4, 2.5]
np.array(res)
## array([1.1, 2.2, 1.4, 2.5])
np.where(condition, xarr, yarr)
## array([1.1, 2.2, 1.4, 2.5])

Expressing conditional logic as Array Operations

rng = np.random.default_rng(32608)
arr = rng.standard_normal(size = (3,3))
arr
## array([[ 0.21984492, -1.28342596,  1.31520866],
##        [ 0.85478637, -1.12501008, -2.36333542],
##        [ 1.72194583, -1.29984418,  0.82499496]])
arr > 0
## array([[ True, False,  True],
##        [ True, False, False],
##        [ True, False,  True]])
np.where(arr>0,1,-1)
## array([[ 1, -1,  1],
##        [ 1, -1, -1],
##        [ 1, -1,  1]])
np.where(arr>0,0,arr)
## array([[ 0.        , -1.28342596,  0.        ],
##        [ 0.        , -1.12501008, -2.36333542],
##        [ 0.        , -1.29984418,  0.        ]])

Statistical Methods (1darray)

rng = np.random.default_rng(32608)
arr = rng.standard_normal(size = 6)
arr
## array([ 0.21984492, -1.28342596,  1.31520866,  0.85478637, -1.12501008,
##        -2.36333542])
# arr.mean(); np.mean(arr)
# arr.sum(); np.sum(arr)
# arr.std(); np.std(arr)
# arr.var(); np.var(arr)
# arr.min(); np.min(arr)
# arr.max(); np.max(arr)
# arr.argmin(); np.argmin(arr)
# arr.argmax(); np.argmax(arr)
# arr.cumsum(); np.cumsum(arr)
# arr.cumprod(); np.cumprod(arr)

Statistical Methods (2darray)

Statistical Methods (2darray)

rng = np.random.default_rng(32608)
arr = rng.standard_normal(size = (4,3))
arr
## array([[ 0.21984492, -1.28342596,  1.31520866],
##        [ 0.85478637, -1.12501008, -2.36333542],
##        [ 1.72194583, -1.29984418,  0.82499496],
##        [ 0.55837717, -0.19774024, -1.2478677 ]])
arr.sum()
## -2.022065687187067
arr.sum(axis=0)
## array([ 3.35495429, -3.90602046, -1.47099951])
arr.sum(axis=1)
## array([ 0.25162762, -2.63355913,  1.2470966 , -0.88723077])

Boolean arrays

rng = np.random.default_rng(32608)
arr = rng.standard_normal(100)
(arr > 0).sum()
## 44
bools = np.array([0,0,1,0], dtype=np.bool_)
bools
## array([False, False,  True, False])
bools.any()
## True
bools.all()
## False

sorting (1darray)

rng = np.random.default_rng(32608)
arr = rng.standard_normal(5)
arr
## array([ 0.21984492, -1.28342596,  1.31520866,  0.85478637, -1.12501008])
arr.argsort()
## array([1, 4, 0, 3, 2])
arr[arr.argsort()]
## array([-1.28342596, -1.12501008,  0.21984492,  0.85478637,  1.31520866])
arr.sort()
arr
## array([-1.28342596, -1.12501008,  0.21984492,  0.85478637,  1.31520866])

sorting (2darray)

arr = np.random.default_rng(32608).standard_normal((4,3))
arr
## array([[ 0.21984492, -1.28342596,  1.31520866],
##        [ 0.85478637, -1.12501008, -2.36333542],
##        [ 1.72194583, -1.29984418,  0.82499496],
##        [ 0.55837717, -0.19774024, -1.2478677 ]])
arr.sort(); arr
## array([[-1.28342596,  0.21984492,  1.31520866],
##        [-2.36333542, -1.12501008,  0.85478637],
##        [-1.29984418,  0.82499496,  1.72194583],
##        [-1.2478677 , -0.19774024,  0.55837717]])
arr = np.random.default_rng(32608).standard_normal((4,3))
arr.sort(axis=0); arr
## array([[ 0.21984492, -1.29984418, -2.36333542],
##        [ 0.55837717, -1.28342596, -1.2478677 ],
##        [ 0.85478637, -1.12501008,  0.82499496],
##        [ 1.72194583, -0.19774024,  1.31520866]])

Unique and Other set logic

x = np.array([0,1,0,2,5,6,5,0])
np.unique(x)
## array([0, 1, 2, 5, 6])
y = np.array([0,1,2,3])
np.intersect1d(x,y)
## array([0, 1, 2])
np.union1d(x,y)
## array([0, 1, 2, 3, 5, 6])
np.in1d(x,y)
## array([ True,  True,  True,  True, False, False, False,  True])
np.setdiff1d(x,y)
## array([5, 6])

matrix operators

x = np.arange(6).reshape(3,2)
y = np.arange(6).reshape(2,3)

y.dot(x) ## np.dot(y,x)
## array([[10, 13],
##        [28, 40]])
z = y.dot(y.T)
np.trace(z)
## 55
np.linalg.det(z)
## 53.99999999999996
np.linalg.norm(z)
## 54.00925846556311

matrix operators

np.linalg.inv(z)
## array([[ 0.92592593, -0.25925926],
##        [-0.25925926,  0.09259259]])
z.dot(np.linalg.inv(z))
## array([[1.00000000e+00, 8.32667268e-17],
##        [2.22044605e-16, 1.00000000e+00]])
eigen_value, eigen_vector = np.linalg.eigh(z)
eigen_vector.dot(np.diag(eigen_value)).dot(eigen_vector.T)
## array([[ 5., 14.],
##        [14., 50.]])

Reference