"""Count single-character and adjacent-character-pair frequencies in a news corpus.

Run as a script, this reads every file in ``./resource/sina_news_utf8/``
(one JSON object per line, each with an ``'html'`` text field), counts
characters and adjacent character pairs, and writes the two tables as JSON
to ``./counter1_dict.txt`` and ``./counter2_dict.txt``.
"""
import json
import os


def load_documents(directory):
    """Return the JSON objects stored one-per-line in every file of *directory*.

    Files are visited in sorted name order so repeated runs build the
    frequency tables in the same insertion order (``os.listdir`` alone
    returns entries in arbitrary order).

    Raises:
        OSError: if the directory or one of its entries cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    documents = []
    for name in sorted(os.listdir(directory)):
        with open(os.path.join(directory, name), encoding="utf-8") as handle:
            for line in handle:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if line.strip():
                    documents.append(json.loads(line))
    return documents


def count_ngrams(texts, skip_chars):
    """Count 1-gram and 2-gram character frequencies over *texts*.

    Args:
        texts: iterable of strings to scan.
        skip_chars: iterable of single characters that are never counted
            and never take part in a pair.

    Returns:
        A tuple ``(unigrams, bigrams, total)`` where ``unigrams`` maps
        ``char -> count``, ``bigrams`` maps
        ``first_char -> {second_char: count}`` for adjacent counted
        characters, and ``total`` is the sum of all 1-gram counts.
    """
    skip = frozenset(skip_chars)  # O(1) membership instead of a string scan
    unigrams = {}
    bigrams = {}
    total = 0
    for text in texts:
        last_index = len(text) - 1
        for index, char in enumerate(text):
            if char in skip:
                continue
            unigrams[char] = unigrams.get(char, 0) + 1
            total += 1

            # The final character still gets a 1-gram count (the earlier
            # loop bound dropped it) but has no successor to pair with.
            if index == last_index:
                continue
            following = text[index + 1]
            if following in skip:
                continue
            followers = bigrams.setdefault(char, {})
            followers[following] = followers.get(following, 0) + 1
    return unigrams, bigrams, total


if __name__ == "__main__":
    # Data input.
    # NOTE(review): `strs` and `py_dict` are not used by any code visible in
    # this script; kept in case something later relies on them.
    with open("./resource/pinyin_dict/pinyin_dict_utf.txt", encoding="utf-8") as file:
        strs = file.readlines()
    py_dict = {}

    # Characters excluded from counting (punctuation, digits, ASCII letters,
    # space). The backslash is written as an explicit "\\" escape; the value
    # is the same as before, without the invalid "\:" escape sequence.
    # NOTE(review): characters absent from this list (e.g. newlines) are
    # still counted - confirm that is intended.
    not_count_char = "!@#$%^&*()!……「」【】{}|\\::;\"“”()。,;,.1234567890《》<>??/、~` qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"

    # News data input: one JSON object per line in each corpus file.
    data_path = "./resource/sina_news_utf8/"
    data_dict_list = load_documents(data_path)

    # --- Counting ---
    # counter1_dict: {char: count}
    # counter2_dict: {first_char: {second_char: count}}
    # counter_total: sum of all 1-gram counts
    counter1_dict, counter2_dict, counter_total = count_ngrams(
        (document['html'] for document in data_dict_list),
        not_count_char,
    )

    # Context managers make sure both result files are flushed and closed.
    with open("./counter1_dict.txt", "w", encoding="utf-8") as file1:
        file1.write(json.dumps(counter1_dict, ensure_ascii=False))
    with open("./counter2_dict.txt", "w", encoding="utf-8") as file2:
        file2.write(json.dumps(counter2_dict, ensure_ascii=False))

    print(counter_total)