sklearn数据预处理官方文档地址:https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
| import numpy as np |
| import pandas as pd |
| import matplotlib.pyplot as plt |
| from matplotlib.font_manager import FontProperties |
| from sklearn import datasets |
| %matplotlib inline |
| font = FontProperties(fname='/Library/Fonts/Heiti.ttc') |
现实生活中的数据往往是不完整的,很多样本的属性值会有缺失:例如某个人填写的个人信息不完整,或者个人隐私保护政策导致建模时无法得到所需要的特征。尤其是在数据量较大时,缺失值会对模型的性能造成很大的影响。接下来将通过鸢尾花数据讨论缺失值处理的方法。
| |
from io import StringIO

# Hand-written CSV sample with four numeric fields per row. Several fields
# are intentionally left empty, and the last row is entirely empty, so the
# resulting frame contains NaN values to practise missing-value handling on.
iris_data = '''
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
,,,
'''

# The bundled iris dataset is loaded only to reuse its four feature names as
# column labels for the sample above.
iris = datasets.load_iris()

# The sample has no header row, so the column names are supplied directly.
# Empty fields are parsed as NaN; the blank line right after the opening
# quotes is skipped by read_csv's default blank-line handling.
df = pd.read_csv(StringIO(iris_data), header=None, names=iris.feature_names)

# Last expression of the notebook cell: display the frame.
df
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
6 |
NaN |
NaN |
NaN |
NaN |
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
| |
# Drop only the rows whose 'sepal length (cm)' value is missing; rows with
# NaN in other columns are kept. The result is displayed, df is not modified.
df.dropna(axis=0, subset=['sepal length (cm)'])
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |