Tensorflow2.0 数据处理

openoker 2022-01-24 11:01:22 资料仓库 Tensorflow 收藏

0 / 1593

方法1：（借用三方sklearn库）

因为sklearn的train_test_split一次只能切2份，所以我们需要切2次（注意：第二次的 test_size=0.2 是相对第一次剩下的 80% 而言的，所以最终比例约为 训练集 64% / 验证集 16% / 测试集 20%）：

from sklearn.model_selection import train_test_split

# First cut: hold out 20% of the original x, y as the test set
# (test_size would default to 0.25 if it were omitted).
first_cut = train_test_split(x, y, test_size=0.2)
x_train, x_test, y_train, y_test = first_cut   # remaining training data + test set

# Second cut: split the remaining training data again, holding out 20% of it
# (about 16% of the original data) as the validation set.
second_cut = train_test_split(x_train, y_train, test_size=0.2)
x_train, x_valid, y_train, y_valid = second_cut   # final training data + validation set

切分好的数据，一般还需要做分批（batch_size）、乱序（shuffle）等处理，这些都可以通过 tf.keras 模型的 fit() 的参数一步完成！

eg:
model.compile(
    loss=keras.losses.mean_squared_error,
    optimizer=keras.optimizers.SGD(),
    metrics=['acc'],   # this argument decides what evaluate() returns, see below
)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),   # the validation split is consumed here
    epochs=100,
    batch_size=32,       # optional: 32 is already the default
    shuffle=True,        # optional: True is already the default
    # callbacks=callbacks,
)

# evaluate() returns the evaluation metrics, and what exactly comes back
# depends on the arguments given to model.compile():
#   - with metrics=['acc'] the result is a list [loss, acc]
#   - without metrics the result is the single loss value
度量指标 = model.evaluate(x_test, y_test)

y_predict = model.predict(x_test)   # the model's predictions for the test set

方法2：（tf.split）

自己封装的代码：功能包括： 3切分，乱序数据集，分批操作一体化！！！
定义data_valid_split.py：

import tensorflow as tf

class HandlerData:
    """Split paired samples/labels into train, test and validation parts.

    The data is kept on the instance and cut along axis 0 with ``tf.split``.
    Each part can optionally be wrapped in a shuffled, batched
    ``tf.data.Dataset``.
    """

    def __init__(self, x, y):
        """Store the full sample array ``x`` and label array ``y``.

        Both are later split along their first axis using the same row
        counts, so they need the same number of rows.
        """
        self.x = x
        self.y = y

    @staticmethod
    def _split_sizes(sample_num, test_size, valid_size):
        """Return ``(train, test, valid)`` row counts that sum to ``sample_num``.

        The test and validation counts are truncated to whole rows; the
        training count is whatever remains. Truncating all three fractions
        independently can lose rows (e.g. 10 rows at 0.25/0.25 would give
        5 + 2 + 2 = 9), and ``tf.split`` requires the sizes to add up to the
        full length of the axis.

        Raises:
            ValueError: if the fractions yield a negative count for any part.
        """
        test_sample = int(sample_num * test_size)
        valid_sample = int(sample_num * valid_size)
        train_sample = sample_num - test_sample - valid_sample
        if min(train_sample, test_sample, valid_sample) < 0:
            raise ValueError(
                "test_size and valid_size must be non-negative and "
                "must not add up to more than 1"
            )
        return train_sample, test_sample, valid_sample

    def shuffle_and_batch(self, x, y, batch_size=None):
        """Wrap ``(x, y)`` in a shuffled ``tf.data.Dataset``.

        Shuffling is always applied. Batching is applied only when
        ``batch_size`` is truthy.

        Args:
            x: samples, sliced along axis 0.
            y: labels, sliced along axis 0.
            batch_size: number of rows per batch, or ``None`` for no batching.

        Returns:
            The shuffled (and possibly batched) dataset.
        """
        data = tf.data.Dataset.from_tensor_slices((x, y))

        # A buffer that covers every row gives a full shuffle. tf.data rejects
        # a zero-sized buffer, so clamp to 1 for an empty split.
        data_ = data.shuffle(buffer_size=max(x.shape[0], 1))
        if batch_size:
            data_ = data_.batch(batch_size)
        return data_

    def train_test_valid_split(
        self,
        test_size=0.2,
        valid_size=0.2,
        batch_size=32,
        is_batch_and_shuffle=True
    ):
        """Cut the stored data into train, test and validation parts.

        The parts are contiguous slices along axis 0, in the order
        train, test, valid; the rows are not shuffled before cutting.

        Args:
            test_size: fraction of the rows used for the test part.
            valid_size: fraction of the rows used for the validation part.
            batch_size: batch size used when ``is_batch_and_shuffle`` is true.
            is_batch_and_shuffle: when true, return each part as a shuffled
                and batched dataset; when false, return the raw split data.

        Returns:
            ``(train, test, valid)``. Each element is a dataset when
            ``is_batch_and_shuffle`` is true, otherwise an ``(x, y)`` pair.

        Raises:
            ValueError: if the fractions yield a negative count for any part.
        """
        sample_num = self.x.shape[0]    # total number of rows
        train_sample, test_sample, valid_sample = self._split_sizes(
            sample_num, test_size, valid_size
        )
        sizes = [train_sample, test_sample, valid_sample]

        # x and y are cut with the same sizes, so samples and labels stay
        # aligned row for row.
        x_train, x_test, x_valid = tf.split(
            self.x,
            num_or_size_splits=sizes,
            axis=0
        )
        y_train, y_test, y_valid = tf.split(
            self.y,
            num_or_size_splits=sizes,
            axis=0
        )

        if is_batch_and_shuffle:
            return (
                self.shuffle_and_batch(x_train, y_train, batch_size=batch_size),
                self.shuffle_and_batch(x_test, y_test, batch_size=batch_size),
                self.shuffle_and_batch(x_valid, y_valid, batch_size=batch_size),
            )
        # Raw split data, neither shuffled nor batched.
        return (
            (x_train, y_train),
            (x_test, y_test),
            (x_valid, y_valid)
        )

调用案例：

import tensorflow as tf

from data_valid_split import HandlerData

# Dummy data: 1000 samples with 5000 features each, and one label per sample.
x = tf.ones([1000, 5000])
y = tf.ones([1000, 1])

# x is the raw sample data, y is the raw label data.
data_obj = HandlerData(x, y)

# Mode 1: shuffled and batched datasets. No argument needs to be passed,
# because every one has a default: test_size=0.2, valid_size=0.2,
# batch_size=32, is_batch_and_shuffle=True.
train, test, valid = data_obj.train_test_valid_split()

for dataset in (train, test, valid):
    print(dataset)

# 结果
>>> <BatchDataset shapes: ((None, 5000), (None, 1)), types: (tf.float32, tf.float32)>
>>> <BatchDataset shapes: ((None, 5000), (None, 1)), types: (tf.float32, tf.float32)>
>>> <BatchDataset shapes: ((None, 5000), (None, 1)), types: (tf.float32, tf.float32)>

# The printed dataset shows None for the number of samples per batch.
# Iterating over the dataset reveals the concrete shape of every batch.
for batch in train:
    x_train, y_train = batch
    print(x_train.shape, y_train.shape)

# 结果  600 // 32 == 18 （你可以查一下正好18个）
# 结果  600 % 32 == 24 （你可以看一下最后一个就是24）
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(32, 5000) (32, 1)
(24, 5000) (24, 1)   # 32个一批，最后一个就是余数 24个了。


# 方式2：不使用乱序，不使用分批，只要原生数据，
(x_train, y_train), (x_test, y_test), (x_valid, y_valid) = data_obj.train_test_valid_split(
    # test_size=0.2,
    # valid_size=0.2,