细分构建机器学习应用程序的流程-数据预处理
sklearn数据预处理官方文档地址:https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
1.1 缺失值处理
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn import datasets
%matplotlib inline
# Font handle loaded from a hard-coded macOS font path (Heiti).
# NOTE(review): presumably used to render Chinese text in plots; this path
# will not exist on non-macOS systems — confirm and adjust per platform.
font = FontProperties(fname='/Library/Fonts/Heiti.ttc')
现实生活中的数据往往是不全面的,很多样本的属性值会有缺失,例如某个人填写的个人信息不完整或者对个人隐私的保护政策导致建模时可能无法得到所需要的特征,尤其是在数据量较大时,这种缺失值的产生会对模型的性能造成很大的影响。接下来将通过鸢尾花数据讨论缺失值处理的方法。
# Missing-value handling example: build a small iris-like DataFrame in which
# some fields are empty.
from io import StringIO

# Raw CSV text. Empty fields are parsed as NaN, and the final record is
# entirely empty, so it becomes an all-NaN row.
iris_data = '''
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
,,,
'''
iris = datasets.load_iris()
# Label the columns with the iris feature names while parsing, then keep the
# first four columns.
df = pd.read_csv(
    StringIO(iris_data),
    header=None,
    names=iris.feature_names,
).iloc[:, :4]
df
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
6 |
NaN |
NaN |
NaN |
NaN |
1.1.1 删除缺失值
# axis=0 drops every row that contains a NaN; axis=1 would instead drop every
# column that contains a NaN.
df.dropna(axis=0)
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
# how='all' drops only the rows whose values are all NaN (rows are the default
# axis); combine with axis=1 to apply the same rule to columns.
df.dropna(how='all')
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
# thresh=4 keeps only the rows that have at least 4 non-NaN values; with four
# columns this means rows with any missing value are dropped.
df.dropna(thresh=4)
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
# subset restricts the NaN check to the listed columns: drop only the rows
# whose sepal length is NaN, ignoring missing values in other columns.
df.dropna(subset=['sepal length (cm)'])
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
1.1.2 填充缺失值