2022-03-31 发表 · NLP / Python · 3 分钟读完（大约 375 个字）

2_数据集制作

第二步：数据集分割制作

提前做好 5 折交叉验证的准备，注意代码中 zip 的使用。

# -*- coding:utf-8 -*-
import os
import random
from collections import defaultdict


def _parse_sample_line(line):
    """Split one ``id,sentence,label`` record into ``(sentence, label)``.

    The id is everything before the first comma and the label is everything
    after the last comma, so a sentence that itself contains commas is kept
    intact instead of breaking the three-way unpack.

    Raises:
        ValueError: if the line contains fewer than two commas.
    """
    _, remainder = line.split(",", 1)
    sent, label_name = remainder.rsplit(",", 1)
    return sent, label_name


def _write_samples(samples, file_path):
    """Write ``(sentence, label)`` pairs as ``index,sentence,label`` lines."""
    # The context manager guarantees the handle is closed even if a write fails.
    with open(file_path, "w", encoding="utf-8") as f_out:
        f_out.writelines(
            "%d,%s,%s\n" % (i, sent, label_name)
            for i, (sent, label_name) in enumerate(samples)
        )


def split_train(train_file, dev_ratio=0.2, to_folder=None):
    """Split a labelled CSV file into per-label stratified train/dev files.

    The first line of ``train_file`` is treated as a header and skipped, and
    blank lines are ignored. Every remaining line is expected to look like
    ``id,sentence,label``. Sentences are grouped by label, shuffled with
    ``random.shuffle`` and, for each label, the first
    ``int(dev_ratio * n) + 1`` sentences go to dev and the rest to train.
    Consequently every label contributes at least one dev sample, and a label
    with a single sentence ends up in dev only.

    Results are written to ``train.txt`` and ``dev.txt`` inside ``to_folder``;
    each output line is ``index,sentence,label`` with a fresh 0-based index
    per file.

    Args:
        train_file: path of the UTF-8 encoded input CSV file.
        dev_ratio: fraction of each label's sentences assigned to dev
            (one extra sentence per label is always added, see above).
        to_folder: output directory, created if missing. When ``None`` the
            directory containing ``train_file`` is used.

    Raises:
        ValueError: if a non-empty data line has fewer than two commas.
    """
    if to_folder is None:
        to_folder = os.path.dirname(os.path.abspath(train_file))
    os.makedirs(to_folder, exist_ok=True)

    # Group sentences by label; the input file is closed before any output
    # file is opened.
    dict_label_name2sents = defaultdict(list)
    with open(train_file, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i == 0:  # header row
                continue

            line = line.strip()
            if not line:
                continue

            sent, label_name = _parse_sample_line(line)
            dict_label_name2sents[label_name].append(sent)

    train_samples = []
    dev_samples = []
    for label_name, sents in dict_label_name2sents.items():
        random.shuffle(sents)  # shuffle the sentences of each label in place

        # "+ 1" guarantees that every label is represented in the dev set.
        num_dev = int(dev_ratio * len(sents)) + 1

        dev_samples.extend((sent, label_name) for sent in sents[:num_dev])
        train_samples.extend((sent, label_name) for sent in sents[num_dev:])

    to_train_file = os.path.join(to_folder, "train.txt")
    to_dev_file = os.path.join(to_folder, "dev.txt")
    for samps, file_path in zip(
        [train_samples, dev_samples], [to_train_file, to_dev_file]
    ):
        _write_samples(samps, file_path)


def split_train_to_5folds(train_file, to_folder, num_folds=5, dev_ratio=0.2):
    """Create ``num_folds`` independent train/dev splits of ``train_file``.

    For each fold index ``i`` a sub-directory ``fold_<i>`` is created under
    ``to_folder`` and ``split_train`` is run into it. Because every call
    reshuffles the data on its own, the dev sets of different folds are
    independent random samples and may overlap; they are not a disjoint
    k-fold partition of the data.

    Args:
        train_file: path of the labelled CSV file to split.
        to_folder: directory that will contain the ``fold_<i>`` directories;
            created if it does not exist.
        num_folds: number of splits to generate.
        dev_ratio: dev fraction forwarded to ``split_train`` for every fold.
    """
    os.makedirs(to_folder, exist_ok=True)
    for fold_idx in range(num_folds):
        fold_dir = os.path.join(to_folder, "fold_%d" % fold_idx)
        os.makedirs(fold_dir, exist_ok=True)
        split_train(train_file, dev_ratio=dev_ratio, to_folder=fold_dir)


if __name__ == "__main__":
    # Resolve the project root as two directory levels above the directory
    # that contains this script. abspath() is applied first because __file__
    # may be relative (e.g. "script.py"), in which case its dirname is empty
    # and the root would silently collapse to the current working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    rootdir = os.path.dirname(os.path.dirname(script_dir))

    train_file = os.path.join(rootdir, "dataset/datagrand_2021_train.csv")
    # test_dir = os.path.join(rootdir, "dataset/datagrand_2021_test.csv")
    to_folder = os.path.join(rootdir, "dataset/phase_1/splits")
    split_train_to_5folds(train_file, to_folder, num_folds=5)

2_数据集制作

https://dustofstars.github.io/NLP/Python/2-数据集制作/

作者

Gavin

发布于

2022-03-31

更新于

2022-03-31

许可协议

CC BY-NC-SA 4.0

#Python NLP

2_数据集制作

第二步：数据集分割制作

作者

发布于

更新于

许可协议

目录

分类

标签

Your browser is out-of-date!