Up-to-date version: Python Cheat Sheet

Table of Contents

Numpy

In [2]:
import numpy as np

Create Numpy Array from Python List

In [10]:
# 1-D array built from a flat Python list
x = np.array([1, 2, 3, 4, 5])
for attribute in (x, type(x), x.dtype, x.shape, x.size):
    print(attribute)

# 2-D array built from a list of equal-length rows
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
for attribute in (x, type(x), x.dtype, x.shape, x.size):
    print(attribute)
[1 2 3 4 5]
<class 'numpy.ndarray'>
int64
(5,)
5
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
<class 'numpy.ndarray'>
int64
(4, 3)
12

Create Numpy Array from Built-in Functions

In [32]:
# 3x4 array filled with zeros; the dtype defaults to float64
x = np.zeros(shape=(3, 4))
print(x, x.dtype, sep="\n")
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
float64
In [33]:
# 3x4 array filled with ones, with an explicit integer dtype
x = np.ones(shape=(3, 4), dtype=int)
print(x, x.dtype, sep="\n")
[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]
int64
In [34]:
# 3x4 array where every element is the constant 5
x = np.full(shape=(3, 4), fill_value=5)
print(x)
[[5 5 5 5]
 [5 5 5 5]
 [5 5 5 5]]
In [35]:
# 5x5 identity matrix with integer entries
x = np.eye(N=5, dtype=int)
print(x)
[[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]
In [36]:
# Square matrix with the given values on the main diagonal and zeros elsewhere
diagonal_values = [10, 20, 30, 40]
x = np.diag(diagonal_values)
print(x)
[[10  0  0  0]
 [ 0 20  0  0]
 [ 0  0 30  0]
 [ 0  0  0 40]]
In [37]:
# Integers from 4 (inclusive) up to 10 (exclusive)
start, stop = 4, 10
x = np.arange(start, stop)
print(x)
[4 5 6 7 8 9]
In [41]:
# Integers from 1 up to (but excluding) 20, in steps of 3
start, stop, step = 1, 20, 3
x = np.arange(start, stop, step)
print(x)
[ 1  4  7 10 13 16 19]
In [42]:
# 3 evenly spaced floats between 1 and 20, both endpoints included
x = np.linspace(1, 20, num=3)
print(x)
[ 1.  10.5 20. ]
In [44]:
# Flat range of 20 integers
x = np.arange(20)
print(x)

# Reshape into 4 rows x 5 columns using the array method
x = x.reshape((4, 5))
print(x)

# Same result built in a single expression with the function form
x = np.reshape(np.arange(20), (4, 5))
print(x)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]
In [48]:
# Uniform floats drawn from the half-open interval [0, 1)
x = np.random.random(size=(3, 3))
print(x)

# Random integers from 4 (inclusive) to 10 (exclusive)
x = np.random.randint(low=4, high=10, size=(3, 3))
print(x)
[[0.22087021 0.53229498 0.58663932]
 [0.21300366 0.86993844 0.56059265]
 [0.86554777 0.38157681 0.78204005]]
[[7 5 9]
 [7 4 7]
 [9 9 7]]
In [56]:
# Gaussian samples: loc (mean) = 0, scale (std) = 0.1
x = np.random.normal(loc=0, scale=0.1, size=(5, 5))
print(x)

# Sample statistics of the drawn values, for comparison with the parameters above
print(np.mean(x))
print(np.std(x))
[[ 0.0122765  -0.12708003  0.11426993 -0.04997364 -0.02526457]
 [-0.04439879 -0.12928117 -0.07242298  0.060275    0.06836317]
 [-0.02163878  0.15118322 -0.09682757 -0.04438684  0.11186937]
 [ 0.03933767 -0.08154594  0.00507315 -0.05448884  0.06592437]
 [ 0.06140125 -0.0002377  -0.07852702 -0.02126833  0.17878217]]
0.0008565443649252224
0.08241037010812466

Accessing, Deleting and Inserting Elements into NDArrays

In [59]:
x = np.array([1, 2, 3, 4, 5])

# Non-negative indices count from the start, negative ones from the end
for position in (0, 2, -1, -3):
    print(x[position])
1
3
5
3
In [105]:
## Get diagonal of a 2d array

x = np.arange(25).reshape(5, 5)
print(x)

# k=0 is the main diagonal; k>0 selects a diagonal above it, k<0 one below it
for offset in (0, 1, -2):
    print(np.diag(x, k=offset))
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]
[ 0  6 12 18 24]
[ 1  7 13 19]
[10 16 22]
In [106]:
## Get unique elements of an array

x = np.array([1, 2, 3, 4, 2, 1, 1, 2, 5])
unique_values = np.unique(x)  # duplicates removed, result sorted ascending
print(unique_values)
[1 2 3 4 5]
In [62]:
x = np.arange(1, 10).reshape(3, 3)

print(x)
# 2-D elements are addressed as [row, column]
for row, col in ((0, 0), (1, 0), (2, 1)):
    print(x[row, col])

## Modify element
x[2, 2] = -9
print(x)
[[1 2 3]
 [4 5 6]
 [7 8 9]]
1
4
8
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8 -9]]
In [67]:
## Delete Rows by Index

x = np.arange(9).reshape(3, 3)
print(x)
rows_to_remove = [0, 2]
# np.delete returns a new array; axis=0 means the indices refer to rows
print(np.delete(x, rows_to_remove, axis=0))
[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[3 4 5]]
In [70]:
## Delete Columns by Index

x = np.arange(9).reshape(3, 3)
print(x)
columns_to_remove = [0, 2]
# axis=1 means the indices refer to columns
print(np.delete(x, columns_to_remove, axis=1))
[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[1]
 [4]
 [7]]
In [72]:
## Append Row

x = np.arange(9).reshape(3, 3)
print(x)
new_row = [[9, 10, 11]]  # 2-D (1x3) so it lines up with x along axis 0
print(np.append(x, new_row, axis=0))
[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
In [74]:
## Append Column

x = np.arange(9).reshape(3, 3)
print(x)
new_column = [[9], [10], [11]]  # 2-D (3x1) so it lines up with x along axis 1
print(np.append(x, new_column, axis=1))
[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[ 0  1  2  9]
 [ 3  4  5 10]
 [ 6  7  8 11]]
In [77]:
## Insert Elements - 1D / Rank 1 Arrays

x = np.array([1, 2, 5, 6, 7, 8, 9, 10])
print(x)
# Insert the values 3 and 4 in front of the element currently at index 2
print(np.insert(x, 2, values=[3, 4]))
[ 1  2  5  6  7  8  9 10]
[ 1  2  3  4  5  6  7  8  9 10]
In [79]:
## Insert Row at Specified Index - 2D Array

x = np.array([[1, 2, 3], [7, 8, 9]])
print(x)
# axis=0: the new row is placed before the row currently at index 1
print(np.insert(x, 1, values=[4, 5, 6], axis=0))
[[1 2 3]
 [7 8 9]]
[[1 2 3]
 [4 5 6]
 [7 8 9]]
In [83]:
## Insert Column at Specified Index - 2D Array

x = np.array([[1, 2], [4, 5]])
print(x)
# A list supplies one value per row; a scalar is repeated for every row
for column_values in ([3, 6], 9):
    print(np.insert(x, 2, column_values, axis=1))
[[1 2]
 [4 5]]
[[1 2 3]
 [4 5 6]]
[[1 2 9]
 [4 5 9]]
In [90]:
## Stack 2 Arrays - Vertically

x = np.array([1, 2])
y = np.array([[3, 4], [5, 6]])
print(f"x=\n{x}")
print(f"y=\n{y}")
# x is stacked on top of y as an extra row
stacked = np.vstack((x, y))
print(f"vstack=\n {stacked}")
x=
[1 2]
y=
[[3 4]
 [5 6]]
vstack=
 [[1 2]
 [3 4]
 [5 6]]
In [93]:
## Stack 2 Arrays - Horizontally

x = np.array([[3], [6]])
y = np.array([[1, 2], [4, 5]])
print(f"x=\n{x}")
print(f"y=\n{y}")
# y comes first, so x is appended as the right-most column
stacked = np.hstack((y, x))
print(f"hstack=\n {stacked}")
x=
[[3]
 [6]]
y=
[[1 2]
 [4 5]]
hstack=
 [[1 2 3]
 [4 5 6]]

Slicing NDArrays

Slicing only creates new "views" on the original array, not new copies of the sliced array. To create a copy, use the copy() method.

In [99]:
x = np.arange(1, 21).reshape(4, 5)
print(x)

# Top-left 2x2 corner (rows 0-1, columns 0-1)
print(x[:2, :2])

## Notice the subtle difference between the following:
## a slice (0:1) keeps the column axis -> shape (4, 1);
## a plain integer (0) drops it -> shape (4,)
first_column_2d = x[:, 0:1]
first_column_1d = x[:, 0]
print(first_column_2d)
print(first_column_1d)
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]]
[[1 2]
 [6 7]]
[[ 1]
 [ 6]
 [11]
 [16]]
[ 1  6 11 16]

Boolean Indexing

In [109]:
x = np.arange(25).reshape(5, 5)
print(x)

# Combine element-wise conditions with & (parentheses are required);
# indexing with the boolean mask returns the matching values as a 1-D array
mask = (x > 10) & (x < 17)
print(x[mask])
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]
[11 12 13 14 15 16]

Set Operations

In [110]:
x = np.array([1, 2, 3, 4, 5])
y = np.array([6, 8, 3, 2, 9])

# In order: values in both, values only in x, values in either
for set_operation in (np.intersect1d, np.setdiff1d, np.union1d):
    print(set_operation(x, y))
[2 3]
[1 4 5]
[1 2 3 4 5 6 8 9]

Sorting

In [119]:
# Ten random integers between 1 and 10 (the upper bound 11 is exclusive)
x = np.random.randint(low=1, high=11, size=10)
print(x)

## Out-of-place sorting: np.sort returns a sorted copy and leaves x untouched
sorted_copy = np.sort(x)
print(f"oop sorted= \n {sorted_copy}")
print(f"original= \n {x}")

## In-place sorting: the ndarray method reorders x itself
x.sort()
print(f"ip sorted= \n {x}")
[9 1 8 9 6 1 7 6 2 8]
oop sorted= 
 [1 1 2 6 6 7 8 8 9 9]
original= 
 [9 1 8 9 6 1 7 6 2 8]
ip sorted= 
 [1 1 2 6 6 7 8 8 9 9]

Pandas

In [121]:
import pandas as pd

Pandas Series

Create

In [128]:
### With default integer indices
groceries = pd.Series([30, 6, 'Foo', 'Bar'])
print(groceries)

### With custom indices
labels = ['egg', 'apples', 'milk', 'bread']
groceries = pd.Series([30, 6, 'Yes', 'No'], index=labels)
print(groceries)
0     30
1      6
2    Foo
3    Bar
dtype: object
egg        30
apples      6
milk      Yes
bread      No
dtype: object

Attributes

In [130]:
# Structural attributes of the Series
for attribute in (groceries.shape, groceries.ndim, groceries.size,
                  groceries.index, groceries.values):
    print(attribute)

# `in` tests membership against the index labels
for label in ('bananas', 'apples'):
    print(label in groceries)
(4,)
1
4
Index(['egg', 'apples', 'milk', 'bread'], dtype='object')
[30 6 'Yes' 'No']
False
True

Accessing Data

In [144]:
groceries = pd.Series(data=[30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
print(groceries['egg'])

print('====\n')

## By labels
print(groceries[['egg', 'apples']])
print(groceries.loc[['egg', 'apples']])
print('====\n')

## By position
# Always use .iloc (or .take) for positions. Passing integers to plain []
# on a label-indexed Series is deprecated in pandas 2.x, because the
# integers are ambiguous between labels and positions.
print(groceries.iloc[[0, -1]])
print(groceries.take([0, -1]))
30
====

egg       30
apples     6
dtype: object
egg       30
apples     6
dtype: object
====

egg      30
bread    No
dtype: object
egg      30
bread    No
dtype: object

Modify Series

In [145]:
## Change Element Values
groceries = pd.Series([30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
# Assigning through a label overwrites that element in place
groceries.loc[['egg']] = 31
print(groceries)
egg        31
apples      6
milk      Yes
bread      No
dtype: object
In [153]:
## Drop Elements - Out-of-Place
groceries = pd.Series([30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
without_apples = groceries.drop(labels=['apples'])  # returns a new Series
print(without_apples)
print(groceries)  # the original still contains 'apples'
egg       30
milk     Yes
bread     No
dtype: object
egg        30
apples      6
milk      Yes
bread      No
dtype: object
In [155]:
## Drop Elements - In-Place
groceries = pd.Series([30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
# inplace=True mutates groceries itself
groceries.drop(labels=['apples'], inplace=True)
print(groceries)
egg       30
milk     Yes
bread     No
dtype: object

Arithmetic Operations

In [157]:
# Arithmetic with a scalar is applied to every element of the Series
fruits = pd.Series([10, 6, 3], index=['apples', 'oranges', 'bananas'])
fruits + 1
Out[157]:
apples     11
oranges     7
bananas     4
dtype: int64
In [158]:
# NumPy functions apply element-wise to a Series and keep its index labels
np.sqrt(fruits)
Out[158]:
apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64
In [166]:
# Select a subset by label (in the order requested), then scale only those elements
fruits[['bananas', 'oranges']] * 10
Out[166]:
bananas    30
oranges    60
dtype: int64

Pandas DataFrames

Create

In [171]:
# Map each column name to a Series (default integer indices)
items = {
    'Bob': pd.Series([245, 25, 55]),
    'Alice': pd.Series([40, 110, 500, 45]),
}

# Confirm the container is a plain dict
print(type(items))
# Columns are aligned on the index; Bob has no entry for row 3, so it shows as NaN
shopping_carts = pd.DataFrame(items)
shopping_carts
<class 'dict'>
Out[171]:
Bob Alice
0 245.0 40
1 25.0 110
2 55.0 500
3 NaN 45
In [246]:
## Create DF from csv
# df = pd.read_csv('myfile.csv')
In [172]:
# Map each column name to a Series carrying its own item labels
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
}

# Confirm the container is a plain dict
print(type(items))
# Rows are the union of all labels; items a person did not buy become NaN
shopping_carts = pd.DataFrame(items)
shopping_carts
<class 'dict'>
Out[172]:
Bob Alice
bike 245.0 500.0
book NaN 40.0
glasses NaN 110.0
pants 25.0 45.0
watch 55.0 NaN
In [178]:
## Creating DF Using Subset of Dict

# `index` picks the rows and `columns` picks the dict keys to keep,
# so this frame holds only Alice's glasses and bike
selected_rows = ['glasses', 'bike']
selected_columns = ['Alice']
alice_sel_shopping_cart = pd.DataFrame(items, index=selected_rows, columns=selected_columns)
alice_sel_shopping_cart
Out[178]:
Alice
glasses 110
bike 500

Attributes

In [174]:
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
}
shopping_carts = pd.DataFrame(items)

# (number of rows, number of columns)
shopping_carts.shape
Out[174]:
(5, 2)
In [175]:
# Number of axes of the frame (rows and columns)
shopping_carts.ndim
Out[175]:
2
In [176]:
# Column labels, returned as a pandas Index
shopping_carts.columns
Out[176]:
Index(['Bob', 'Alice'], dtype='object')
In [177]:
# Underlying data as a NumPy array (row/column labels are dropped)
shopping_carts.values
Out[177]:
array([[245., 500.],
       [ nan,  40.],
       [ nan, 110.],
       [ 25.,  45.],
       [ 55.,  nan]])

Accessing Data

In [265]:
# Three shoppers whose item labels only partially overlap
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
    'Charlie': pd.Series([45, 90, 70, 450], index=['book', 'glasses', 'bike', 'pants']),
}

df = pd.DataFrame(items)
df
Out[265]:
Bob Alice Charlie
bike 245.0 500.0 70.0
book NaN 40.0 45.0
glasses NaN 110.0 90.0
pants 25.0 45.0 450.0
watch 55.0 NaN NaN

Access column(s) by label

In [266]:
# A list of column labels inside [] returns a DataFrame with just those columns
df[['Bob', 'Alice']]
Out[266]:
Bob Alice
bike 245.0 500.0
book NaN 40.0
glasses NaN 110.0
pants 25.0 45.0
watch 55.0 NaN
In [256]:
# Same selection via .loc[rows, columns]; ':' keeps every row
df.loc[:, ['Bob', 'Alice']]
Out[256]:
Bob Alice
bike 245.0 500.0
book NaN 40.0
glasses NaN 110.0
pants 25.0 45.0
watch 55.0 NaN

Access column(s) by index

In [262]:
# Show the full frame first for comparison
display(df)
# .iloc[rows, columns] selects by integer position: all rows, columns 0 and 2
df.iloc[:, [0, 2]]
Bob Alice Charlie
bike 245.0 500.0 70.0
book NaN 40.0 45.0
glasses NaN 110.0 90.0
pants 25.0 45.0 450.0
watch 55.0 NaN NaN
Out[262]:
Bob Charlie
bike 245.0 70.0
book NaN 45.0
glasses NaN 90.0
pants 25.0 450.0
watch 55.0 NaN

Access row(s) by label

In [285]:
# .loc with a list of index labels returns those rows, in the order requested
df.loc[['bike', 'pants']]
Out[285]:
Bob Alice Charlie
bike 245.0 500.0 70.0
pants 25.0 45.0 450.0

Access row(s) by index

In [270]:
# Show the full frame first for comparison
display(df)
# .iloc with a list of integer positions returns rows 0 and 2
df.iloc[[0, 2]]
Bob Alice Charlie
bike 245.0 500.0 70.0
book NaN 40.0 45.0
glasses NaN 110.0 90.0
pants 25.0 45.0 450.0
watch 55.0 NaN NaN
Out[270]:
Bob Alice Charlie
bike 245.0 500.0 70.0
glasses NaN 110.0 90.0

Get N rows from a DF

In [287]:
# A plain slice inside [] slices rows: this takes the first 3
df[:3]
Out[287]:
Bob Alice Charlie
bike 245.0 500.0 70.0
book NaN 40.0 45.0
glasses NaN 110.0 90.0

Get N random rows from a DF

In [295]:
# Draw 2 rows at random; the result differs on every run
df.sample(n=2)
Out[295]:
Bob Alice Charlie
watch 55.0 NaN NaN
pants 25.0 45.0 450.0

Access element by row and column label

In [188]:
# Prefer a single .loc[row_label, column_label] lookup over chained
# indexing (df['Alice']['bike']): it is one operation instead of two and
# is also safe to use for assignment.
df.loc['bike', 'Alice']
Out[188]:
500.0

Get all rows where column value satisfies condition

In [259]:
# Show the full frame first for comparison
display(df)
# Boolean mask on the 'Bob' column; rows where Bob is NaN compare as False and are dropped
df.loc[df['Bob'] > 40]
Bob Alice Charlie
bike 245.0 500.0 70.0
book NaN 40.0 45.0
glasses NaN 110.0 90.0
pants 25.0 45.0 450.0
watch 55.0 NaN NaN
Out[259]:
Bob Alice Charlie
bike 245.0 500.0 70.0
watch 55.0 NaN NaN

Modify DF

In [212]:
# Fresh copy of the three-shopper frame used by the modification examples
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
    'Charlie': pd.Series([45, 90, 70, 450], index=['book', 'glasses', 'bike', 'pants']),
}

df = pd.DataFrame(items)
df
Out[212]:
Bob Alice Charlie
bike 245.0 500.0 70.0
book NaN 40.0 45.0
glasses NaN 110.0 90.0
pants 25.0 45.0 450.0
watch 55.0 NaN NaN

Add column

In [201]:
# Assigning a list to a new label appends a column; the list supplies one value per row (5 here)
df['Dan'] = [1, 2, 3, 4, 5]
df
Out[201]:
Bob Alice Charlie Dan
bike 245.0 500.0 70.0 1
book NaN 40.0 45.0 2
glasses NaN 110.0 90.0 3
pants 25.0 45.0 450.0 4
watch 55.0 NaN NaN 5

Append columns from a DF to another DF

In [272]:
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
    'Charlie': pd.Series([45, 90, 70, 450], index=['book', 'glasses', 'bike', 'pants']),
}
df = pd.DataFrame(items)

# Second frame holding one extra column for a subset of the rows
items_new = {'Dan': pd.Series([1, 2, 3], index=['bike', 'pants', 'watch'])}
df_new = pd.DataFrame(items_new)

# join aligns on the index; rows missing from df_new get NaN in 'Dan'
df.join(df_new)
Out[272]:
Bob Alice Charlie Dan
bike 245.0 500.0 70.0 1.0
book NaN 40.0 45.0 NaN
glasses NaN 110.0 90.0 NaN
pants 25.0 45.0 450.0 2.0
watch 55.0 NaN NaN 3.0

Insert column at index

In [216]:
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
    'Charlie': pd.Series([45, 90, 70, 450], index=['book', 'glasses', 'bike', 'pants']),
}
df = pd.DataFrame(items)

# insert() mutates df in place: 'Dan' becomes the column at position 1
df.insert(loc=1, column='Dan', value=[1, 2, 3, 4, 5])
df
Out[216]:
Bob Dan Alice Charlie
bike 245.0 1 500.0 70.0
book NaN 2 40.0 45.0
glasses NaN 3 110.0 90.0
pants 25.0 4 45.0 450.0
watch 55.0 5 NaN NaN

Add column using sum of previous columns values

In [217]:
# Row-wise sum of the four columns; with `+`, a NaN in any operand makes that row's total NaN
df['Total'] = df['Bob'] + df['Alice'] + df['Charlie'] + df['Dan']
df
Out[217]:
Bob Dan Alice Charlie Total
bike 245.0 1 500.0 70.0 816.0
book NaN 2 40.0 45.0 NaN
glasses NaN 3 110.0 90.0 NaN
pants 25.0 4 45.0 450.0 524.0
watch 55.0 5 NaN NaN NaN

Add rows

In [218]:
# One-row frame for the new item; the index label becomes the row name
new_item = {'Bob': 1, 'Alice': 2, 'Charlie': 2}
new_df = pd.DataFrame(new_item, index = ['phones'])
display(new_df)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack frames row-wise. Columns absent from new_df are filled with NaN.
display(pd.concat([df, new_df]))
Bob Alice Charlie
phones 1 2 2
Bob Dan Alice Charlie Total
bike 245.0 1.0 500.0 70.0 816.0
book NaN 2.0 40.0 45.0 NaN
glasses NaN 3.0 110.0 90.0 NaN
pants 25.0 4.0 45.0 450.0 524.0
watch 55.0 5.0 NaN NaN NaN
phones 1.0 NaN 2.0 2.0 NaN

Delete column

In [226]:
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
    'Charlie': pd.Series([45, 90, 70, 450], index=['book', 'glasses', 'bike', 'pants']),
}
df = pd.DataFrame(items)
display(df)

# pop() removes the column from df in place (and returns it)
df.pop('Bob')
display(df)
Bob Alice Charlie
bike 245.0 500.0 70.0
book NaN 40.0 45.0
glasses NaN 110.0 90.0
pants 25.0 45.0 450.0
watch 55.0 NaN NaN
Alice Charlie
bike 500.0 70.0
book 40.0 45.0
glasses 110.0 90.0
pants 45.0 450.0
watch NaN NaN

Delete multiple columns

In [231]:
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
    'Charlie': pd.Series([45, 90, 70, 450], index=['book', 'glasses', 'bike', 'pants']),
}
df = pd.DataFrame(items)
display(df)

# drop() returns a new frame; columns=... is equivalent to labels with axis=1
display(df.drop(columns=['Bob', 'Alice']))
Bob Alice Charlie
bike 245.0 500.0 70.0
book NaN 40.0 45.0
glasses NaN 110.0 90.0
pants 25.0 45.0 450.0
watch 55.0 NaN NaN
Charlie
bike 70.0
book 45.0
glasses 90.0
pants 450.0
watch NaN

Delete multiple rows

In [232]:
# index=... is equivalent to labels with axis=0 (rows); df itself is unchanged
display(df.drop(index=['watch', 'book']))
Bob Alice Charlie
bike 245.0 500.0 70.0
glasses NaN 110.0 90.0
pants 25.0 45.0 450.0

Transform values of selected columns

In [281]:
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500], index=['bike', 'pants', 'watch']),
    'Charlie': pd.Series([45, 90, 70], index=['bike', 'pants', 'watch']),
}
df = pd.DataFrame(items)
display(df)

# apply() runs the function on each selected column; the result is written
# back over the same columns, leaving 'Alice' untouched
columns_to_transform = ['Bob', 'Charlie']
df[columns_to_transform] = df[columns_to_transform].apply(lambda column: column * 100)
display(df)
Bob Alice Charlie
bike 245 40 45
pants 25 110 90
watch 55 500 70
Bob Alice Charlie
bike 24500 40 4500
pants 2500 110 9000
watch 5500 500 7000
In [284]:
#### Substitute values in columns of a DF
# The two lists are paired by position: 40 -> 'Foo', 7000 -> 'Bar'.
# The result is only displayed; it is not assigned back to df.
df.replace([40, 7000], ['Foo', 'Bar'])
Out[284]:
Bob Alice Charlie
bike 24500 Foo 4500
pants 2500 110 9000
watch 5500 500 Bar

 Dealing with NaN

In [240]:
# Charlie has a value for every item, so only Bob and Alice contain NaNs
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
    'Charlie': pd.Series([45, 90, 70, 450, 1], index=['book', 'glasses', 'bike', 'pants', 'watch']),
}

df = pd.DataFrame(items)
display(df)
Bob Alice Charlie
bike 245.0 500.0 70
book NaN 40.0 45
glasses NaN 110.0 90
pants 25.0 45.0 450
watch 55.0 NaN 1
In [235]:
## Counting NaNs
# isnull() flags each cell, the first sum() counts per column, the second totals the columns
df.isnull().sum().sum()
Out[235]:
4
In [237]:
## Counting non-NaNs
# count() tallies the non-NaN cells per column; sum() adds the per-column counts
df.count().sum()
Out[237]:
11
In [238]:
## Drop rows with NaNs
# axis=0 removes every row containing at least one NaN; df itself is not modified
display(df.dropna(axis=0))
Bob Alice Charlie
bike 245.0 500.0 70
pants 25.0 45.0 450
In [241]:
## Drop columns with NaNs
# axis=1 removes every column containing at least one NaN; df itself is not modified
display(df.dropna(axis=1))
Charlie
bike 70
book 45
glasses 90
pants 450
watch 1
In [242]:
## Replace all NaNs with 0
# fillna returns a new frame; df itself is not modified
display(df.fillna(0))
Bob Alice Charlie
bike 245.0 500.0 70
book 0.0 40.0 45
glasses 0.0 110.0 90
pants 25.0 45.0 450
watch 55.0 0.0 1
In [244]:
## Forward fill NaNs (value of previous row)
display(df)
# DataFrame.ffill replaces the deprecated fillna(method='ffill').
# axis=0 propagates the last valid value down each column; axis=1 would
# propagate across each row instead.
# Related: df.bfill() fills from the next valid value, and
# df.interpolate(method='linear') interpolates between neighbours
# ('linear' is an interpolate method, not a fillna method).
display(df.ffill(axis=0))
Bob Alice Charlie
bike 245.0 500.0 70
book NaN 40.0 45
glasses NaN 110.0 90
pants 25.0 45.0 450
watch 55.0 NaN 1
Bob Alice Charlie
bike 245.0 500.0 70
book 245.0 40.0 45
glasses 245.0 110.0 90
pants 25.0 45.0 450
watch 55.0 45.0 1

Statistical Analysis

In [247]:
# Fully populated frame (no NaNs) for the summary-statistics examples
items = {
    'Bob': pd.Series([245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series([40, 110, 500], index=['bike', 'pants', 'watch']),
    'Charlie': pd.Series([45, 90, 70], index=['bike', 'pants', 'watch']),
}

df = pd.DataFrame(items)
display(df)
Bob Alice Charlie
bike 245 40 45
pants 25 110 90
watch 55 500 70
In [248]:
## Describe statistical information of DF
# One column of summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()
Out[248]:
Bob Alice Charlie
count 3.000000 3.000000 3.000000
mean 108.333333 216.666667 68.333333
std 119.303534 247.857486 22.546249
min 25.000000 40.000000 45.000000
25% 40.000000 75.000000 57.500000
50% 55.000000 110.000000 70.000000
75% 150.000000 305.000000 80.000000
max 245.000000 500.000000 90.000000
In [249]:
# Summary statistics for a single column (a Series)
df['Bob'].describe()
Out[249]:
count      3.000000
mean     108.333333
std      119.303534
min       25.000000
25%       40.000000
50%       55.000000
75%      150.000000
max      245.000000
Name: Bob, dtype: float64
In [250]:
# Mean of each column, returned as a Series indexed by column name
df.mean()
Out[250]:
Bob        108.333333
Alice      216.666667
Charlie     68.333333
dtype: float64
In [251]:
# Maximum of each column, returned as a Series indexed by column name
df.max()
Out[251]:
Bob        245
Alice      500
Charlie     90
dtype: int64

Data Visualisation

In [2]:
import pandas as pd
import seaborn as sb

# Load the Pokemon dataset; the relative path means pokemon.csv must be in the notebook's working directory
df = pd.read_csv('pokemon.csv')
# Preview the first five rows
display(df.head())
id species generation_id height weight base_experience type_1 type_2 hp attack defense speed special-attack special-defense
0 1 bulbasaur 1 0.7 6.9 64 grass poison 45 49 49 45 65 65
1 2 ivysaur 1 1.0 13.0 142 grass poison 60 62 63 60 80 80
2 3 venusaur 1 2.0 100.0 236 grass poison 80 82 83 80 100 100
3 4 charmander 1 0.6 8.5 62 fire NaN 39 52 43 65 60 50
4 5 charmeleon 1 1.1 19.0 142 fire NaN 58 64 58 80 80 65

Univariate Data

Categorical data frequency/count as bar chart

In [4]:
# One bar per distinct generation_id; bar height is the number of rows with that value
sb.countplot(data = df, x = 'generation_id')
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e8d9f10>
In [6]:
## Single color bars
# Take the first colour of the current seaborn palette and use it for every bar
base_color = sb.color_palette()[0]
sb.countplot(data = df, x = 'generation_id', color = base_color)
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e7c1e10>
In [10]:
## Sort left to right
# value_counts() orders categories by descending frequency; its index gives the bar order
gen_order = df['generation_id'].value_counts().index
sb.countplot(data = df, x = 'generation_id', order = gen_order)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e21bf10>
In [32]:
## Rotate x tick labels

## Without rotation
# Baseline plot of type_1 with default (horizontal) tick labels, for comparison with the next cell
sb.countplot(data = df, x = 'type_1')
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1f25b1d0>
In [33]:
## With rotation
import matplotlib.pyplot as plt

# Draw the plot first, then rotate its tick labels: plt.xticks acts on
# the ticks of the current axes, so calling it before the plot exists
# relies on the rotation surviving the later re-tick.
ax = sb.countplot(data = df, x = 'type_1')
plt.xticks(rotation = 90)

# Keep the axes as the cell's displayed value
ax
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1f5571d0>
In [36]:
## Plot Y Bars
# Passing the column as y instead of x draws horizontal bars
sb.countplot(data = df, y = 'type_1')
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a20035790>

Categorical data relative frequency as bar chart

In [51]:
import numpy as np
import matplotlib.pyplot as plt

# Share of the most frequent category; used as the upper bound for the proportion ticks
n_points = df.shape[0]
max_count = df['generation_id'].value_counts().max()
max_percent = max_count / n_points

# Tick positions expressed as proportions (0.00, 0.05, ...) and their two-decimal labels
tick_props = np.arange(0, max_percent,  0.05)
tick_names = ['{:0.2f}'.format(v) for v in tick_props]

# The bars are still raw counts; only the y ticks are relabelled. Each tick is placed at
# proportion * n_points (a count) but labelled with the proportion.
sb.countplot(data = df, x = 'generation_id')
plt.yticks(tick_props * n_points, tick_names)
plt.ylabel('proportion')
Out[51]:
Text(0, 0.5, 'proportion')

Using Barplot to visualise processed data (not already stored as a column value)

In [55]:
# Count missing values per column once and reuse the result
na_counts = df.isna().sum()
# Pass x/y by keyword: positional data arguments are deprecated in recent seaborn releases
sb.barplot(x = na_counts.index.values, y = na_counts)
plt.xticks(rotation = 90)
Out[55]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]),
 <a list of 14 Text xticklabel objects>)

Numerical data histograms

In [57]:
# Preview the first five rows to pick a numerical column to plot
df.head()
Out[57]:
id species generation_id height weight base_experience type_1 type_2 hp attack defense speed special-attack special-defense
0 1 bulbasaur 1 0.7 6.9 64 grass poison 45 49 49 45 65 65
1 2 ivysaur 1 1.0 13.0 142 grass poison 60 62 63 60 80 80
2 3 venusaur 1 2.0 100.0 236 grass poison 80 82 83 80 100 100
3 4 charmander 1 0.6 8.5 62 fire NaN 39 52 43 65 60 50
4 5 charmeleon 1 1.1 19.0 142 fire NaN 58 64 58 80 80 65
In [63]:
# Histogram of 'speed' with 20 equal-width bins; the trailing ';' suppresses the returned-value repr
plt.hist(data = df, x = 'speed', bins = 20);
In [64]:
# Explicit bin edges of width 5 starting at 0; the +5 extends the range so the maximum value gets a bin
bins = np.arange(0, df['speed'].max()+5, 5)
plt.hist(data = df, x = 'speed', bins = bins);
In [68]:
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of histplot/displot —
# confirm the installed version before changing this call.
sb.distplot(df['speed']);
In [67]:
# kde=False turns off the density curve, leaving only the histogram
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the installed version.
sb.distplot(df['speed'], kde=False);

Subplots (Stack Plots Horizontally)

In [74]:
import matplotlib.pyplot as plt

# One wide figure (15 x 5 inches) holding both panels
plt.figure(figsize = [15, 5])

# Each plt.subplot call makes that panel current; the plotting call after it draws into it
plt.subplot(1, 2, 1)                    # 1 row, 2 cols, subplot 1
sb.distplot(df['speed'], kde=False);

plt.subplot(1, 2, 2)                    # 1 row, 2 cols, subplot 2
sb.distplot(df['speed']);

Plot Subset of Data (Axis Range Limits)

In [79]:
# Histogram of 'height' over its full range, for comparison with the zoomed version in the next cell
plt.hist(data = df, x = 'height');
In [85]:
plt.hist(data = df, x = 'height');
# Restrict the visible x range to 0-2; the bins themselves are still computed from all the data
plt.xlim((0, 2))
Out[85]:
(0, 2)

Axis Transformations (Log Scale)

In [87]:
## Original plot (with linear scale)
# Baseline for comparison with the log-scale plots in the following cells
plt.hist(data = df, x = 'weight');
In [97]:
## Plots with log scales for x-axis
plt.figure(figsize = [15, 5])

# Left panel: seaborn histogram; xscale('log') affects only the current subplot
plt.subplot(1, 2, 1)    
sb.distplot(df['weight'], kde=False)
plt.xscale('log')

# Right panel: matplotlib histogram with the same log-scaled x axis
plt.subplot(1, 2, 2)    
plt.hist(data = df, x = 'weight');
plt.xscale('log')
In [98]:
## Changing x range, whilst in log scale to better visualise data distribution
plt.xscale('log')

# Work in log10 space so the bins have equal width on the log axis.
# These are deliberately not named `min`/`max`: that would shadow the
# builtins for every cell run afterwards in the same kernel.
log_min = np.log10(df['weight'].min())
log_max = np.log10(df['weight'].max())
# Edges every 0.1 decades; the +0.1 extends the range so the maximum weight gets a bin
bins = 10 ** np.arange(log_min, log_max + 0.1, 0.1)

plt.hist(data = df, x = 'weight', bins = bins);

Bivariate Data

Pairwise Relationship Between Numerical Columns

In [102]:
# Grid of pairwise plots between the columns of df, coloured by generation_id
sb.pairplot(df, hue='generation_id');

Categorical data grouped by another label

In [3]:
# Preview the first five rows to pick the categorical columns to group by
df.head()
Out[3]:
id species generation_id height weight base_experience type_1 type_2 hp attack defense speed special-attack special-defense
0 1 bulbasaur 1 0.7 6.9 64 grass poison 45 49 49 45 65 65
1 2 ivysaur 1 1.0 13.0 142 grass poison 60 62 63 60 80 80
2 3 venusaur 1 2.0 100.0 236 grass poison 80 82 83 80 100 100
3 4 charmander 1 0.6 8.5 62 fire NaN 39 52 43 65 60 50
4 5 charmeleon 1 1.1 19.0 142 fire NaN 58 64 58 80 80 65
In [26]:
import matplotlib.pyplot as plt

# Grouped count plot: one cluster of bars per type_1 value, one coloured bar per generation_id
chart = sb.catplot(x='type_1', kind='count', hue='generation_id', data=df, height=10, aspect=10/7.5);
# Rotate the category labels after plotting
plt.xticks(rotation = 90);

Anaconda

List envs

conda info --envs

Activate env

conda activate <env_name>

Update all packages

conda upgrade --all

Install package

conda install package_name

## specifying package version
conda install numpy=1.10

Remove package

conda remove package_name

Search package

## quote the pattern so the shell does not expand the wildcards
conda search '*search_term*'

List packages

conda list

Jupyter

Convert notebook to html

jupyter nbconvert --to html notebook.ipynb

# Other formats
# https://nbconvert.readthedocs.io/en/latest/usage.html
In [ ]: