1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
| import os import random from collections import defaultdict
def split_train(train_file, dev_ratio=0.2, to_folder=None):
    """Split a labelled CSV file into per-label stratified train/dev files.

    The input is read as UTF-8 text with one ``id,sentence,label`` record per
    line. The first line is treated as a header and skipped, and blank lines
    are ignored. Sentences are grouped by label, shuffled with the global
    ``random`` state, and split per label. The results are written to
    ``train.txt`` and ``dev.txt`` inside ``to_folder`` as ``index,sentence,label``
    lines, where ``index`` restarts at 0 in each output file.

    Args:
        train_file: Path of the CSV file to split.
        dev_ratio: Fraction of each label's sentences sent to the dev set.
            The dev set receives ``int(dev_ratio * n) + 1`` sentences per
            label, so every label contributes at least one dev sample (a
            label with a single sentence ends up only in the dev set).
        to_folder: Directory receiving ``train.txt`` and ``dev.txt``. It is
            created if it does not exist.

    Raises:
        TypeError: If ``to_folder`` is None.
        ValueError: If a non-blank data line has fewer than three
            comma-separated fields.
    """
    if to_folder is None:
        raise TypeError("to_folder must be a directory path, not None")

    label_to_sents = defaultdict(list)
    with open(train_file, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i == 0:
                # Header row.
                continue
            line = line.strip()
            if not line:
                continue
            # The id is the first field and the label is the last one; peel
            # them off from both ends so that commas inside the sentence do
            # not break the parsing.
            try:
                _, rest = line.split(",", 1)
                sent, label_name = rest.rsplit(",", 1)
            except ValueError:
                raise ValueError(
                    "%s: line %d is not of the form 'id,sentence,label': %r"
                    % (train_file, i + 1, line)
                )
            label_to_sents[label_name].append(sent)

    train_samples = []
    dev_samples = []
    for label_name, sents in label_to_sents.items():
        random.shuffle(sents)
        # "+ 1" guarantees at least one dev sample for every label.
        num_dev = int(dev_ratio * len(sents)) + 1
        dev_samples.extend((sent, label_name) for sent in sents[:num_dev])
        train_samples.extend((sent, label_name) for sent in sents[num_dev:])

    os.makedirs(to_folder, exist_ok=True)
    outputs = (
        (train_samples, os.path.join(to_folder, "train.txt")),
        (dev_samples, os.path.join(to_folder, "dev.txt")),
    )
    for samples, file_path in outputs:
        # The context manager closes the file even if a write fails.
        with open(file_path, "w", encoding="utf-8") as f_out:
            for i, (sent, label_name) in enumerate(samples):
                f_out.write("%d,%s,%s\n" % (i, sent, label_name))
def split_train_to_5folds(train_file, to_folder, num_folds=5, dev_ratio=0.2):
    """Create ``num_folds`` independent random train/dev splits of a file.

    For each fold index ``i`` a sub-directory ``fold_<i>`` is created under
    ``to_folder`` and ``split_train`` is run on the full ``train_file`` with
    that sub-directory as its output folder.

    Note that every fold is a fresh random split of the same data: the dev
    sets of different folds are not forced to be disjoint, so this is
    repeated random sub-sampling rather than a k-fold partition.

    Args:
        train_file: Path of the CSV file to split.
        to_folder: Directory under which the ``fold_<i>`` folders are created.
        num_folds: Number of splits to generate.
        dev_ratio: Dev-set ratio forwarded to ``split_train`` for every fold.
    """
    os.makedirs(to_folder, exist_ok=True)
    for fold_idx in range(num_folds):
        fold_dir = os.path.join(to_folder, "fold_%d" % fold_idx)
        os.makedirs(fold_dir, exist_ok=True)
        split_train(train_file, dev_ratio=dev_ratio, to_folder=fold_dir)
if __name__ == "__main__":
    # Resolve the script location to an absolute path first: when __file__ is
    # relative (e.g. the script is launched from its own directory on older
    # interpreters) dirname() would be empty and the project root computed
    # below would be wrong.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # The project root is two directory levels above the script's directory.
    rootdir = os.sep.join(script_dir.strip().split(os.sep)[:-2])
    train_dir = os.path.join(rootdir, "dataset/datagrand_2021_train.csv")
    to_folder = os.path.join(rootdir, "dataset/phase_1/splits")
    split_train_to_5folds(train_dir, to_folder, num_folds=5)
|