3_label词表制作

第三步:制作label词表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding:utf-8 -*-
import json
import os

def vocab_process(data_dir, to_folder):
    """Build the two-level label vocabularies and frequency tables from a CSV.

    The label is read from the last comma-separated field of every data row;
    the first line of the file is treated as a header and skipped.  Labels
    have two levels: level 1 is the coarse class (the text before the first
    "-"), level 2 is the full label string.

    Four files are written into ``to_folder``:

    * ``label2freq_level_1.json`` / ``label2freq_level_2.json``: label -> count
    * ``labels_level_1.txt`` / ``labels_level_2.txt``: one label per line,
      most frequent first

    Args:
        data_dir: Path of the input CSV file (despite the name, it is opened
            as a file, not a directory).
        to_folder: Directory that receives the output files; it is created
            if it does not exist yet.
    """
    # Create the output folder up front so the writes below do not fail on a
    # missing directory (an empty string means "current directory").
    if to_folder:
        os.makedirs(to_folder, exist_ok=True)

    # Label vocabularies: level 1 holds the coarse classes, level 2 holds
    # every full label (the fine-grained classes).
    vocab_file_level_1 = os.path.join(to_folder, "labels_level_1.txt")
    vocab_file_level_2 = os.path.join(to_folder, "labels_level_2.txt")

    # label -> frequency tables
    label2freq_level_1_file = os.path.join(to_folder, "label2freq_level_1.json")
    label2freq_level_2_file = os.path.join(to_folder, "label2freq_level_2.json")

    vocab_level_1 = {}
    vocab_level_2 = {}
    with open(data_dir, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i == 0:
                # Header row.
                continue
            line = line.strip()
            if not line:
                # Blank lines carry no label; counting them would add an
                # empty-string entry to both vocabularies.
                continue
            label_level_2 = line.split(",")[-1]
            label_level_1 = label_level_2.strip().split("-")[0]

            # Count against the dicts themselves.  The previous version
            # tested membership in the output *path string* for level 2,
            # which reset every level-2 count to zero on each occurrence.
            vocab_level_1[label_level_1] = vocab_level_1.get(label_level_1, 0) + 1
            vocab_level_2[label_level_2] = vocab_level_2.get(label_level_2, 0) + 1

    # Use context managers so the JSON files are flushed and closed.
    with open(label2freq_level_1_file, "w", encoding="utf-8") as f_json:
        json.dump(vocab_level_1, f_json)
    with open(label2freq_level_2_file, "w", encoding="utf-8") as f_json:
        json.dump(vocab_level_2, f_json)

    # Sort the (label, count) pairs by count, largest first.  The sort is
    # stable, so labels with equal counts keep their first-seen order.
    vocab_level_1 = sorted(vocab_level_1.items(), key=lambda x: x[1], reverse=True)
    print("vocab_level_1:", vocab_level_1)
    vocab_level_1 = [w[0] for w in vocab_level_1]

    vocab_level_2 = sorted(vocab_level_2.items(), key=lambda x: x[1], reverse=True)
    print("vocab_level_2:", vocab_level_2)
    vocab_level_2 = [w[0] for w in vocab_level_2]

    with open(vocab_file_level_1, "w", encoding="utf-8") as f_out:
        for lab in vocab_level_1:
            f_out.write(lab + "\n")

    with open(vocab_file_level_2, "w", encoding="utf-8") as f_out:
        for lab in vocab_level_2:
            f_out.write(lab + "\n")


if __name__ == "__main__":
    # The project root is two directory levels above the folder that holds
    # this script: split the folder path and drop its last two components.
    script_folder = os.path.dirname(__file__).strip()
    path_parts = script_folder.split(os.sep)
    rootdir = os.sep.join(path_parts[:-2])

    # Only the training CSV is scanned here (the test CSV,
    # dataset/datagrand_2021_test.csv, is not used); outputs go to phase_1.
    phase_1_dir = os.path.join(rootdir, "dataset/phase_1")
    train_dir = os.path.join(rootdir, "dataset/datagrand_2021_train.csv")
    vocab_process(train_dir, phase_1_dir)

生成的文件及内容

作者

Gavin

发布于

2022-03-31

更新于

2022-03-31

许可协议

CC BY-NC-SA 4.0

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×