import pymysql
import requests
from lxml import etree


class Spider:
    """Scrape CVPR 2019 paper metadata and store it in a MySQL table.

    The listing page at ``self.url`` is fetched to collect per-paper page
    URLs; each paper page is then fetched and its title, abstract and first
    link inside the content div are inserted into the ``paper`` table.
    """

    # Prefix used to turn the site's relative links into absolute URLs.
    BASE_URL = "http://openaccess.thecvf.com/"
    # Seconds to wait on each HTTP request; without a timeout a stalled
    # server would block the crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set request parameters and open the MySQL connection and cursor."""
        self.url = "http://openaccess.thecvf.com/CVPR2019?day=2019-06-18"
        self.header = {
            "user-agent": (
                "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/80.0.3987.116 Mobile Safari/537.36"
            )
        }
        # NOTE(review): credentials are hard-coded here; consider loading
        # them from configuration or the environment instead.
        self.db = pymysql.connect(host='localhost', port=3306, user='root',
                                  passwd='yangjiang', db='words',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.html_list = []

    def getHtmlList(self):
        """Fetch the listing page and append every paper URL to ``html_list``."""
        response = requests.get(self.url, headers=self.header,
                                timeout=self.REQUEST_TIMEOUT)
        html_body = etree.HTML(response.text)
        hrefs = html_body.xpath("//dt[@class='ptitle']/a/@href")
        self.html_list.extend(self.BASE_URL + href for href in hrefs)

    def getContent(self, url):
        """Fetch one paper page and insert its metadata into ``paper``.

        Args:
            url: Absolute URL of a single paper page.

        Any failure (network error, missing page element, database error)
        is printed and swallowed so that one bad page does not stop the
        whole crawl.
        """
        try:
            response = requests.get(url, headers=self.header,
                                    timeout=self.REQUEST_TIMEOUT)
            # Surface HTTP errors directly instead of as an IndexError from
            # the XPath lookups below.
            response.raise_for_status()
            body = etree.HTML(response.text)
            title = body.xpath("//div[@id='papertitle']/text()")[0]
            abstract = body.xpath("//div[@id='abstract']/text()")[0]
            down_url = body.xpath("//div[@id='content']//a/@href")[0].replace(
                "../../", self.BASE_URL)
            year = 2019
            # Parameterized query: scraped text may contain quotes or
            # backslashes, which would corrupt (or inject into) a statement
            # built with str.format.
            sql = ("insert into paper(title,year,link,abstract,keywords) "
                   "values(%s,%s,%s,%s,%s)")
            # The title is stored in the keywords column as well.
            self.cursor.execute(sql, (title, year, down_url, abstract, title))
            self.db.commit()
            # Report success only after the row has actually been committed.
            print(title + "插入成功!")
        except Exception as e:
            print(e)

    def close(self):
        """Close the database cursor and connection."""
        self.cursor.close()
        self.db.close()

    def run(self):
        """Collect all paper URLs, then scrape and store each one in order."""
        self.getHtmlList()
        for url in self.html_list:
            self.getContent(url)


if __name__ == '__main__':
    spider = Spider()
    try:
        spider.run()
    finally:
        # Release the database connection even if the crawl aborts.
        spider.close()