-
Notifications
You must be signed in to change notification settings - Fork 1
/
data.py
68 lines (56 loc) · 2.42 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os, json
# Directory holding the news corpus; every file in it is read as JSON lines.
NEWS_DATA_DIR = "./resource/sina_news_utf8/"

# Characters that are excluded from the statistics (punctuation, digits,
# the space character and ASCII letters).  The backslash is written as an
# explicit "\\" escape: the value is the same as before, but it no longer
# relies on the invalid escape sequence "\:".
NOT_COUNT_CHARS = frozenset(
    "!@#$%^&*()!……「」【】{}|\\::;\"“”()。,;,.1234567890《》<>??/、~` qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"
)


def iter_news_texts(data_path):
    """Yield the 'html' field of every JSON record found under *data_path*.

    Each file in the directory is expected to contain one JSON object per
    line.  Records are streamed one at a time so the whole corpus never has
    to be held in memory, and each file is closed as soon as it is consumed.

    Args:
        data_path: Directory whose entries are opened as UTF-8 text files.

    Yields:
        The value stored under the 'html' key of each record.

    Raises:
        OSError: If the directory or one of its entries cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no 'html' key.
    """
    for file_name in os.listdir(data_path):
        with open(os.path.join(data_path, file_name), encoding="utf-8") as news_file:
            for line in news_file:
                # A blank line (e.g. a trailing newline at the end of the
                # file) carries no record and would make json.loads fail.
                if not line.strip():
                    continue
                yield json.loads(line)["html"]


def count_ngrams(texts, skip_chars=NOT_COUNT_CHARS):
    """Count 1-gram and 2-gram character frequencies over *texts*.

    A character found in *skip_chars* is never counted, neither on its own
    nor as either half of a pair.

    Args:
        texts: Iterable of strings to scan.
        skip_chars: Container of single characters to ignore.

    Returns:
        A tuple ``(unigrams, bigrams, total)`` where ``unigrams`` maps
        ``char -> count``, ``bigrams`` maps
        ``first_char -> {second_char -> count}`` and ``total`` is the number
        of counted characters (the sum of all unigram counts).
    """
    unigrams = {}
    bigrams = {}
    total = 0
    for text in texts:
        last_index = len(text) - 1
        for index, char in enumerate(text):
            if char in skip_chars:
                continue
            # 1-gram: every countable character, including the last one of
            # the text (which has no successor and so forms no pair).
            unigrams[char] = unigrams.get(char, 0) + 1
            total += 1
            if index == last_index:
                continue
            # 2-gram: only when the following character is countable too.
            follower = text[index + 1]
            if follower in skip_chars:
                continue
            followers = bigrams.setdefault(char, {})
            followers[follower] = followers.get(follower, 0) + 1
    return unigrams, bigrams, total


def main():
    """Build the frequency tables from the news corpus and save them.

    Writes the 1-gram table to ./counter1_dict.txt and the 2-gram table to
    ./counter2_dict.txt as JSON, then prints the total number of counted
    characters.
    """
    counter1_dict, counter2_dict, counter_total = count_ngrams(
        iter_news_texts(NEWS_DATA_DIR)
    )
    # Context managers guarantee the tables are flushed and the files closed.
    with open("./counter1_dict.txt", "w", encoding="utf-8") as unigram_file:
        unigram_file.write(json.dumps(counter1_dict, ensure_ascii=False))
    with open("./counter2_dict.txt", "w", encoding="utf-8") as bigram_file:
        bigram_file.write(json.dumps(counter2_dict, ensure_ascii=False))
    print(counter_total)


if __name__ == "__main__":
    main()