今天完成了提取字符串关键词的这一部分,代码如下:
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# -*- coding: gbk -*-
"""Back-fill missing keywords in ``summary_table``.

Every row whose ``key_words`` column is NULL and whose ``result_summary``
is longer than ``MIN_SUMMARY_LENGTH`` characters gets keywords extracted
from the summary with jieba (TF-IDF) and written back to the row.
"""
import jieba.analyse
import pymysql

# Part-of-speech tags kept during extraction:
# n = noun, v = verb, ns = place name, vn = verbal noun, nt = organization.
ALLOWED_POS = ("ns", "n", "vn", "v", "nt")

# Number of keywords extracted per summary.
TOP_K = 5

# A summary must be strictly longer than this to be processed.
MIN_SUMMARY_LENGTH = 10


def open_conn(dbname):
    """Open a connection to the local MySQL server.

    Args:
        dbname: Name of the database to select.

    Returns:
        An open PyMySQL connection; the caller is responsible for closing it.
    """
    # ``password``/``database`` replace the ``passwd``/``db`` aliases that
    # PyMySQL marks as deprecated.
    return pymysql.connect(
        host="localhost",
        port=3306,
        user="root",
        password="******",
        database=dbname,
        charset="utf8",
    )


def query(db):
    """Scan ``summary_table`` and fill in keywords for rows that lack them.

    For each row with a NULL ``key_words`` and a ``result_summary`` longer
    than ``MIN_SUMMARY_LENGTH`` characters, the TF-IDF keywords are stored
    via ``update`` and the ID and keywords are printed.

    Args:
        db: An open database connection.
    """
    sql = "select ID,key_words,result_summary from summary_table"
    # Read everything first so the read cursor is closed before any update.
    with db.cursor() as cursor:
        cursor.execute(sql)
        rows = cursor.fetchall()

    for ID, key_words, result_summary in rows:
        if key_words is not None:
            continue
        # A NULL summary comes back as None; len(None) would raise TypeError
        # and abort the whole run, so skip such rows explicitly.
        if result_summary is None or len(result_summary) <= MIN_SUMMARY_LENGTH:
            continue
        new_key_word = get_keyword_by_TFIDF(result_summary)
        update(db, ID, new_key_word)
        print(ID, new_key_word)


def update(db, ID, keyword):
    """Store ``keyword`` in the ``key_words`` column of one row and commit.

    Args:
        db: An open database connection.
        ID: Value of the ``ID`` column identifying the row.
        keyword: Space-separated keyword string to store.
    """
    sql = " update summary_table set key_words = %s where ID = %s"
    # The context manager closes the cursor even if execute() raises.
    with db.cursor() as cursor:
        cursor.execute(sql, (keyword, ID))
    db.commit()


def get_keyword_by_TFIDF(result_summary):
    """Extract keywords from ``result_summary`` with jieba's TF-IDF ranking.

    Args:
        result_summary: Text to analyse.

    Returns:
        Up to ``TOP_K`` keywords joined by single spaces.
    """
    tags = jieba.analyse.extract_tags(
        result_summary, topK=TOP_K, withWeight=False, allowPOS=ALLOWED_POS
    )
    return " ".join(tags)


def get_keyword_by_TextRank(result_summary):
    """Extract keywords from ``result_summary`` with jieba's TextRank ranking.

    Args:
        result_summary: Text to analyse.

    Returns:
        Up to ``TOP_K`` keywords joined by single spaces.
    """
    tags = jieba.analyse.textrank(
        result_summary, topK=TOP_K, withWeight=False, allowPOS=ALLOWED_POS
    )
    return " ".join(tags)


if __name__ == '__main__':
    conn = open_conn("datax")
    try:
        query(conn)
    finally:
        # Release the connection even when the scan fails part-way through.
        conn.close()