1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
| import os from re import S import numpy as np import matplotlib.pyplot as plt
def _sentence_lengths(path):
    """Return one token count per data row of the comma-separated file *path*.

    The first line is treated as a header and skipped. For every other line
    the second comma-separated field is taken, stripped, and split on single
    spaces; the number of resulting pieces is recorded. Because the split is
    on a literal ``" "``, an empty field still counts as 1 and consecutive
    spaces produce empty pieces that are counted too.

    Lines that have no second field (blank lines, rows without a comma) are
    skipped instead of raising ``IndexError``.
    """
    lengths = []
    with open(path, "r", encoding="utf-8") as f:
        next(f, None)  # drop the header row; harmless on an empty file
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 2:
                # No second column to measure on this row.
                continue
            lengths.append(len(fields[1].strip().split(" ")))
    return lengths


def length_process(data_dir):
    """Show a histogram of per-row token counts for one dataset file.

    Args:
        data_dir: Path of the file to read. Despite the name it is opened
            directly as a UTF-8 text file, not listed as a directory.

    Side effects:
        Draws the histogram on the current pyplot figure, prints the height
        of the tallest bar as ``maxfreq = ...``, and calls ``plt.show()``.

    Returns:
        None.
    """
    lengths = _sentence_lengths(data_dir)

    # plt.hist returns (counts, bin_edges, patches); only the counts are used.
    counts, _, _ = plt.hist(x=lengths, bins="auto", alpha=0.7, rwidth=0.85)
    plt.grid(axis="y", alpha=0.75)
    plt.xlabel("sentence length")
    plt.ylabel("Frequency")
    plt.title("Histogram: sentence length")

    maxfreq = counts.max()
    # Upper y limit: round up to the next multiple of 10, or add 10 of
    # headroom when the tallest bar is already an exact multiple of 10.
    plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
    print("maxfreq = {0}".format(maxfreq))
    plt.show()
if __name__ == "__main__":
    # Resolve the script location to an absolute path before walking up.
    # __file__ can be relative (e.g. "script.py"), in which case its dirname
    # is "" and the root would silently become the current directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # The project root is two directory levels above the script's directory.
    rootdir = os.path.dirname(os.path.dirname(script_dir))

    train_dir = os.path.join(rootdir, "dataset/datagrand_2021_train.csv")
    test_dir = os.path.join(rootdir, "dataset/datagrand_2021_test.csv")

    # One histogram window per dataset split, shown in sequence.
    length_process(train_dir)
    length_process(test_dir)
|