1_数据探索

第一步:简单观察训练集和测试集

使用pyplot做直方图观察句子长度分布情况

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# -*- coding:utf-8 -*-
import os
from re import S
import numpy as np
import matplotlib.pyplot as plt


def _sentence_length(line):
    """Return the number of tokens in the text field of one CSV row.

    A row looks like ``<id>,<text>[,<label>]``: the fields are separated by
    ASCII commas and the text is the second field. Inside the text, short
    clauses of space-separated tokens are separated by full-width (Chinese)
    commas, U+FF0C. Those clause separators are not counted as tokens.
    """
    text = line.strip().split(",")[1]
    # The delimiter is written as an escape so it cannot be silently turned
    # into an ASCII comma by an editor or a re-encoding step; splitting on an
    # ASCII comma here would be a no-op because the line was already split on it.
    clauses = text.split("\uff0c")
    # str.split() without arguments ignores repeated/leading/trailing
    # whitespace, so an empty clause (or an empty text field) contributes 0.
    return sum(len(clause.split()) for clause in clauses)


def length_process(data_dir):
    """Plot a histogram of sentence lengths for one dataset file.

    Args:
        data_dir: Path to a UTF-8 text file with one comma-separated row per
            line. The first line is skipped as a header; blank lines are
            ignored.

    Side effects:
        Prints the height of the tallest histogram bar and opens a
        matplotlib window via ``plt.show()``.
    """
    lengths = []
    with open(data_dir, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i == 0 or not line.strip():
                continue  # skip the header row and blank lines
            lengths.append(_sentence_length(line))

    n, _bins, _patches = plt.hist(x=lengths, bins="auto", alpha=0.7, rwidth=0.85)
    plt.grid(axis="y", alpha=0.75)
    plt.xlabel("sentence length")
    plt.ylabel("Frequency")
    plt.title("Histogram: sentence length")
    maxfreq = n.max()
    # Upper limit of the y axis: round the tallest bar up to the next multiple
    # of 10, or add 10 of headroom when it already is a multiple of 10.
    plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
    print("maxfreq = {0}".format(maxfreq))
    plt.show()


if __name__ == "__main__":
    # Resolve the script location to an absolute path first: __file__ may be a
    # relative path when the script is run directly (Python < 3.9), in which
    # case splitting its directory name would yield an empty project root.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # The dataset directory lives two levels above the script's directory.
    rootdir = os.path.dirname(os.path.dirname(script_dir))
    train_dir = os.path.join(rootdir, "dataset/datagrand_2021_train.csv")
    test_dir = os.path.join(rootdir, "dataset/datagrand_2021_test.csv")

    # One histogram per split; each call blocks until its window is closed.
    length_process(train_dir)
    length_process(test_dir)
作者

Gavin

发布于

2022-03-31

更新于

2022-03-31

许可协议

CC BY-NC-SA 4.0

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×