今天我们来完成一项个人作业:爬取计算机视觉顶级会议(ICCV 2019)的论文标题和链接,并统计标题中的热词。
代码如下:
# Scrape ICCV 2019 paper titles and links from openaccess.thecvf.com, store
# them in the MySQL table `iccv`, and count how often each word occurs in the
# collected titles.
import operator
from collections import Counter

import pymysql
from lxml import etree
from nltk.corpus import stopwords
from selenium import webdriver

# Rows to insert into the database, each one [title, url, year].
data_s = []

base_url = 'https://openaccess.thecvf.com/'

# Per-day listing pages to scrape; every page listed here is parsed.
day_urls = [
    "https://openaccess.thecvf.com/ICCV2019?day=2019-10-29",
    "https://openaccess.thecvf.com/ICCV2019?day=2019-10-30",
]

# Launch Chrome through webdriver and collect the data page by page.
driver = webdriver.Chrome()
try:
    for day_url in day_urls:
        driver.get(day_url)
        html = driver.page_source

        # Locate the paper titles and the first link of each entry via XPath.
        htmlc = etree.HTML(html)
        indexs = htmlc.xpath('//dl/dd/a[1]/@href')
        title = htmlc.xpath('//dl/dt/a/text()')
        print(len(title))

        # Titles and links are paired by position; zip stops at the shorter
        # list instead of raising IndexError when the counts differ.
        for paper_title, href in zip(title, indexs):
            data_s.append([paper_title, base_url + href, 2019])
finally:
    # Always shut the browser down, even if a page fails to load or parse.
    driver.quit()

# All titles as one text. The space separator keeps the last word of one
# title from fusing with the first word of the next when splitting below.
fen_ci = " ".join(row[0] for row in data_s)

# Connect to the database.
# NOTE(review): credentials are hard-coded; consider moving them to
# configuration or environment variables.
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="260702266", database="topclass", charset="utf8")
try:
    print(data_s)
    sql = "insert into iccv(title,links,year) values (%s,%s,%s)"
    with conn.cursor() as cursor:
        try:
            cursor.executemany(sql, data_s)
            conn.commit()
        except pymysql.MySQLError as err:
            # Best effort: undo the partial insert and report the failure,
            # then continue so the word statistics are still computed.
            conn.rollback()
            print("insert failed, rolled back:", err)
finally:
    conn.close()

# Count word frequencies over the lower-cased, whitespace-split titles.
fen_cil = fen_ci.lower().split()
dic = Counter(fen_cil)
# (word, count) pairs, most frequent first; ties keep first-seen order.
swd = sorted(dic.items(), key=operator.itemgetter(1), reverse=True)
# print(swd)
实际效果: