# Data preprocessing

# Drop the column with the most missing values: isna().sum() counts the NaNs
# in each column and idxmax() returns the label of the largest count.
# NOTE(review): the label is computed from `data` but dropped from `data2`;
# this assumes data2 was defined earlier and still has that column - confirm.
data2 = data2.drop(data.isna().sum().idxmax(),axis=1)

# Delete the column with the most missing values, shown two different ways.
# NOTE: plain assignment binds a second name to the same DataFrame (no copy).
# That is fine here because nothing below modifies the frame in place.
data2 = data
print(data2)

# Boolean mask of the missing cells, and the per-column count of missing
# values derived from it. Both are computed once and reused below.
missing_mask = data2.isna()
na_counts = missing_mask.sum()

print("\ndata2.isna():")
print(type(missing_mask))
print(missing_mask)

print("\ndata2.isna().sum():")
print(type(na_counts))
print(na_counts)

# .values exposes the counts as a bare array, which can be compared
# element-wise to build a positional column mask for iloc.
print("\ndata2.isna().sum().values:")
print(type(na_counts.values))
print(na_counts.values)

# Way 1: keep every column whose missing count is strictly below the maximum.
# Unlike drop(idxmax()) below, this removes ALL columns tied for the maximum
# (and therefore every column when the frame has no missing values at all).
print("delete column by iloc:")
data2 = data2.iloc[:, na_counts.values < na_counts.max()]
print(data2)

# Way 2: look up the label of the column with the most missing values and
# drop exactly that one column by name.
print("\n\n\nanother way:")
data3 = data

print("\ndata3.isna().sum().idxmax():")
print(type(data3.isna().sum().idxmax()))
print(data3.isna().sum().idxmax())

print("\ndelete by drop:")
print(type(data))
# drop() returns a new DataFrame, so `data` itself is left untouched.
data3 = data3.drop(columns=data3.isna().sum().idxmax())
print(data3)
print(id(data3))

NumRooms Alley Price
0 NaN Pave 127500
1 2.0 NaN 106000
2 4.0 NaN 178100
3 NaN NaN 140000

data2.isna():
<class 'pandas.core.frame.DataFrame'>
NumRooms Alley Price
0 True False False
1 False True False
2 False True False
3 True True False

data2.isna().sum():
<class 'pandas.core.series.Series'>
NumRooms 2
Alley 3
Price 0
dtype: int64

data2.isna().sum().values:
<class 'numpy.ndarray'>
[2 3 0]
delete column by iloc:
NumRooms Price
0 NaN 127500
1 2.0 106000
2 4.0 178100
3 NaN 140000

another way:

data3.isna().sum().idxmax():
<class 'str'>
Alley

delete by drop:
<class 'pandas.core.frame.DataFrame'>
NumRooms Price
0 NaN 127500
1 2.0 106000
2 4.0 178100
3 NaN 140000
2383388074912

# Convert the preprocessed data to tensor format.
import tensorflow as tf
Z = tf.constant(data2.to_numpy())
Z

<tf.Tensor: shape=(4, 2), dtype=float64, numpy=
array([[ nan, 1.275e+05],
[2.000e+00, 1.060e+05],
[4.000e+00, 1.781e+05],
[ nan, 1.400e+05]])>

# Delete the row with the most missing values: count the NaNs per row
# (axis=1), take the index label of the largest count, and drop that row.
data3 = data.drop(index=data.isna().sum(axis=1).idxmax())
data3

# Split the frame by position: the first two columns become the inputs and
# the third column becomes the outputs (NumRooms/Alley and Price in the frame
# printed earlier), then fill missing input values with the column means.
# NOTE(review): inputs.mean() is taken over both input columns, including the
# string column Alley; this is the call reported to fail with
# 'can only concatenate str (not "int") to str'.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
inputs = inputs.fillna(inputs.mean())
print(inputs)
The code above raises an error: can only concatenate str (not "int") to str
This happens because inputs.mean() also tries to average the non-numeric Alley column.
So we can fix the code as follows:
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# Compute the means over the numeric columns only ('Alley' holds strings),
# then use those means to fill the missing values. fillna() matches the means
# to columns by label, so 'Alley' is left as it is.
numeric_data = inputs.drop(columns=['Alley'])
inputs = inputs.fillna(numeric_data.mean())
print(inputs)

1 Like

Thanks, I ran into this problem too. I wonder if there is a way to pick out all the numeric columns automatically.