""" -*- coding:utf-8 -*- @Time :2021/3/1 18:56 @Author : @File :cet4.py @Version:1.0 """ # # 1 数据来源 # 中国教育在线网_词汇:https://www.eol.cn/html/en/cetwords/cet4.shtml # # 2 词性(10种) # 1、名词 noun n # 2、代词 pronoun pron # 3、形容词 adjective adj # 4、副词 adverb adv # 5、verb v # 6、数词 numeral num # 7、冠词 article art # 8、介词 preposition prep # 9、连词 conjunction conj # 10、感叹词 interjection interj # # from html.parser import HTMLParser import xlwt def del_kg(str_d: str): """ 删除字符串前面的空格 :param str_d: :return: """ if str_d.startswith(' '): return False, str_d[1:] return True, str_d class My(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.str_data = [] def handle_data(self, data): if self.lasttag == 'p': if not data.startswith(' ') and not data.find('.') == -1: while True: re, me = del_kg(data) if not re: data = me else: self.str_data.append(data) break def write_excel(file, data: list): wb = xlwt.Workbook() st = wb.add_sheet('aa') rows = len(data) for i in range(len(data)): for i_i in range(len(data[i])): st.write(i, i_i, data[i][i_i]) wb.save(file) if __name__ == '__main__': file = 'C:\Users\Administrator\Desktop\cet4.txt' with open(file, 'r', encoding='utf-8') as f: da = f.read() pa = My() pa.feed(da) pa.handle_data(da) # 去重 re_l_da = list(set(pa.str_data)) re_l_da.sort(key=pa.str_data.index) pa.str_data = re_l_da a = [] b = [] for i in pa.str_data: result = i.split(' ') if len(result) == 2: a_ls = [] a_ls.append(result[0]) a_ls.append(result[1]) a.append(a_ls) else: b.append(i) # print(*pa.str_data, sep=' ') print('aa=', len(a)) # 4471 print(*a, sep=' ') print(len(b)) print(len(pa.str_data)) print(*b, sep=' ') da_da = [] write_excel('aaa.xls', a)