================
by Jawad Haider
Chpt 2 - Data Manipulation with Pandas
04 - Handling Missing Data
None: Pythonic missing data
import numpy as np
import pandas as pd
# Mixing None with integers makes NumPy fall back to the generic
# Python-object dtype.
vals1 = np.array([1, None, 3, 4])
vals1
array([1, None, 3, 4], dtype=object)
for dtype in [ 'object' , 'int' ]:
print ( "dtype = " , dtype )
% timeit np . arange ( 1E6 , dtype = dtype ) . sum ()
print ()
dtype = object
57.5 ms ± 707 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
dtype = int
1.09 ms ± 35.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
# May raise an error when calling NumPy aggregate functions on an object array that contains None
vals1 . sum ()
TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
NaN: Missing numerical data
# np.nan is a floating-point value, so the array keeps a native float dtype.
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype
# The regular aggregations do not skip NaN entries.
vals2.sum(), vals2.min(), vals2.max()
NumPy does provide some special aggregations that will ignore these
missing values, such as np.nansum(), np.nanmin(), and np.nanmax()
Detecting null values
Pandas data structures have two useful methods for detecting null data:
isnull() and notnull(). Either one will return a Boolean mask over the
data.
# Sample Series mixing a number, NaN, a string and None for the null-handling examples
data = pd.Series([1, np.nan, 'hello', None])
data
NameError: name 'df' is not defined
# Seeded generator so the example values are reproducible.
rng = np.random.RandomState(42)
# Draw order matters: the Series consumes the first four random integers,
# the DataFrame the next twelve.
ser = pd.Series(rng.randint(0, 10, size=4))
df = pd.DataFrame(
    rng.randint(0, 10, size=(3, 4)),
    columns=['A', 'B', 'C', 'D'],
)
df
A
B
C
D
0
6
9
2
6
1
7
4
3
7
2
7
2
5
4
A
B
C
D
3
0
6
9
2
6
NaN
1
7
4
3
7
NaN
2
7
2
5
4
NaN
1
2
3
4
5
0
6
9
2
6
NaN
1
7
4
3
7
NaN
2
7
2
5
4
NaN
# Drop only those columns in which every value is missing.
df.dropna(axis='columns', how='all')
1
2
3
4
0
6
9
2
6
1
7
4
3
7
2
7
2
5
4
# Keep only the rows that hold at least three non-null values.
df.dropna(axis='rows', thresh=3)
1
2
3
4
5
0
6
9
2
6
NaN
1
7
4
3
7
NaN
2
7
2
5
4
NaN
Filling null values
Sometimes rather than dropping NA values, you’d rather replace them with
a valid value. This value might be a single number like zero, or it
might be some sort of imputation or interpolation from the good values.
You could do this in-place using the isnull() method as a mask, but
because it is such a common operation Pandas provides the fillna()
method, which returns a copy of the array with the null values replaced.
# Labelled Series with two missing entries (one np.nan, one None).
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data
a 1.0
b NaN
c 2.0
d NaN
e 3.0
dtype: float64
# Fill every NA entry with a single constant value (here -1).
data.fillna(-1)
a 1.0
b -1.0
c 2.0
d -1.0
e 3.0
dtype: float64
# Forward-fill: propagate the previous valid value forward.
# ffill() is the non-deprecated equivalent of fillna(method='ffill').
data.ffill()
a 1.0
b 1.0
c 2.0
d 2.0
e 3.0
dtype: float64
# Back-fill: propagate the next valid value backward.
# bfill() is the non-deprecated equivalent of fillna(method='bfill').
data.bfill()
a 1.0
b 2.0
c 2.0
d 3.0
e 3.0
dtype: float64
1
2
3
4
5
0
6
9
2
6
NaN
1
7
4
3
7
NaN
2
7
2
5
4
NaN
# Forward-fill along each row (axis=1); ffill() is the non-deprecated
# equivalent of fillna(method='ffill').
df.ffill(axis=1)
# If the previous value is not available during a forward fill, the NA value remains.
1
2
3
4
5
0
6.0
9.0
2.0
6.0
6.0
1
7.0
4.0
3.0
7.0
7.0
2
7.0
2.0
5.0
4.0
4.0