2_数据集制作

第二步:数据集分割制作

提前为5折交叉验证做好准备,注意代码中 zip 的用法。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding:utf-8 -*-
import os
import random
from collections import defaultdict


def split_train(train_file, dev_ratio=0.2, to_folder=None):
    """Split a labelled CSV file into per-label train and dev files.

    The first line of ``train_file`` is skipped (presumably the CSV header)
    and blank lines are ignored. Every other line is parsed as
    ``id,sentence,label``: the text before the first comma is the id, the
    text after the last comma is the label, and everything in between is
    the sentence, so a sentence may itself contain commas.

    Sentences are grouped by label and each group is shuffled with
    ``random.shuffle``. The first ``int(dev_ratio * n) + 1`` sentences of a
    group of size ``n`` go to dev and the rest go to train, so every label
    contributes at least one dev sample. The results are written to
    ``train.txt`` and ``dev.txt`` inside ``to_folder`` as lines of
    ``index,sentence,label``, where ``index`` restarts at 0 in each file.

    Args:
        train_file: Path of the UTF-8 CSV file to split.
        dev_ratio: Fraction of each label's sentences assigned to dev.
        to_folder: Directory that receives ``train.txt`` and ``dev.txt``.
            It is not created here.

    Raises:
        TypeError: If ``to_folder`` is None.
        ValueError: If a data line contains fewer than two commas.
    """
    if to_folder is None:
        # Fail before doing any work instead of deep inside os.path.join.
        raise TypeError("to_folder must be a directory path, not None")

    label_to_sents = defaultdict(list)
    with open(train_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f):
            if line_no == 0:
                continue

            line = line.strip()
            if not line:
                continue

            # Peel the id off the front and the label off the back so commas
            # inside the sentence do not break the three-way unpacking.
            _, rest = line.split(",", 1)
            sent, label_name = rest.rsplit(",", 1)
            label_to_sents[label_name].append(sent)

    to_train_file = os.path.join(to_folder, "train.txt")
    to_dev_file = os.path.join(to_folder, "dev.txt")

    train_samples = []
    dev_samples = []
    for label_name, sents in label_to_sents.items():
        random.shuffle(sents)  # shuffle the sentences of each label in place

        dev_count = int(dev_ratio * len(sents)) + 1
        dev_samples.extend((sent, label_name) for sent in sents[:dev_count])
        train_samples.extend((sent, label_name) for sent in sents[dev_count:])

    # zip pairs each sample list with the file it is written to.
    for samples, file_path in zip(
        [train_samples, dev_samples], [to_train_file, to_dev_file]
    ):
        with open(file_path, "w", encoding="utf-8") as f_out:
            for i, (sent, label_name) in enumerate(samples):
                f_out.write("%d,%s,%s\n" % (i, sent, label_name))


def split_train_to_5folds(train_file, to_folder, num_folds=5):
    """Write ``num_folds`` train/dev splits of ``train_file`` under ``to_folder``.

    ``to_folder`` is created if missing. For every ``k`` in
    ``range(num_folds)`` a sub-directory ``fold_k`` is created and filled by
    a separate ``split_train`` call with ``dev_ratio=0.2``.
    """
    os.makedirs(to_folder, exist_ok=True)

    fold_dirs = [
        os.path.join(to_folder, "fold_%d" % fold_idx)
        for fold_idx in range(num_folds)
    ]
    for fold_dir in fold_dirs:
        os.makedirs(fold_dir, exist_ok=True)
        split_train(train_file, dev_ratio=0.2, to_folder=fold_dir)


if __name__ == "__main__":
    # The project root is taken to be two levels above the directory that
    # contains this script: drop the last two components of that directory.
    script_dir = os.path.dirname(__file__).strip()
    path_parts = script_dir.split(os.sep)
    rootdir = os.sep.join(path_parts[:-2])

    train_dir = os.path.join(rootdir, "dataset/datagrand_2021_train.csv")
    # test_dir = os.path.join(rootdir, "dataset/datagrand_2021_test.csv")
    to_folder = os.path.join(rootdir, "dataset/phase_1/splits")

    split_train_to_5folds(train_dir, to_folder, num_folds=5)

作者

Gavin

发布于

2022-03-31

更新于

2022-03-31

许可协议

CC BY-NC-SA 4.0

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×