zoukankan      html  css  js  c++  java
  • Python 爬虫 招聘信息并存入数据库

    新学习了 selenium,爬一下腾讯招聘
     1 from lxml import etree
     2 from selenium import webdriver
     3 import pymysql
     def Geturl(fullurl):
         """Collect the detail-page URLs listed on one search-results page.

         Loads ``fullurl`` in the module-level ``browser`` (the Selenium
         WebDriver created in the ``__main__`` section), parses the rendered
         page source with lxml and resolves every ``href`` matched inside the
         ``#position`` table against the site root.

         Args:
             fullurl: URL of the results page to load.

         Returns:
             A list of absolute URL strings in document order; empty when
             the XPath matches nothing.
         """
         # Imported locally because the file's import block does not
         # provide urllib.
         from urllib.parse import urljoin

         browser.get(fullurl)
         page = etree.HTML(browser.page_source)
         hrefs = page.xpath(
             '//*[@id="position"]/div[1]/table/tbody/tr/td/a/@href')
         # urljoin also copes with hrefs that start with "/" or are already
         # absolute, where plain string concatenation would produce a
         # malformed URL.
         return [urljoin('https://hr.tencent.com/', str(href)) for href in hrefs]
    def Getinfo(zp_url_list):
        """Scrape every posting in ``zp_url_list`` and store it in MySQL.

        Each URL is loaded in the module-level ``browser``; title, location,
        type, head-count and requirements are read from the rendered page
        with XPath, inserted as one row into ``txzp_info`` and echoed to
        stdout.

        A single database connection is used for the whole batch and is
        always closed, even when a page load or an insert raises.

        Args:
            zp_url_list: Iterable of posting URLs, as returned by ``Geturl``.

        Returns:
            None.
        """
        if not zp_url_list:
            # Nothing to scrape: do not open a database connection at all.
            return

        detail = '//*[@id="position_detail"]/div/table/tbody'
        # XPaths that must each yield at least one text node, in column order:
        # title, location, type, num.
        single_value_paths = (
            '//*[@id="sharetitle"]/text()',
            detail + '/tr[2]/td[1]/text()',
            detail + '/tr[2]/td[2]/text()',
            detail + '/tr[2]/td[3]/text()',
        )
        need_path = detail + '/tr[3]/td/ul/li/text()'
        sql = "INSERT INTO `txzp_info` (`title`, `location`,`type`,`num`,`need`) VALUES (%s,%s,%s,%s, %s)"

        # The charset is set explicitly rather than left to the driver
        # default, since the scraped text is Chinese.
        connection = pymysql.connect(host='localhost', user='root', password='1234',
                                     db='txzp', charset='utf8mb4')
        try:
            for zp_url in zp_url_list:
                browser.get(zp_url)
                zp_ele = etree.HTML(browser.page_source)
                try:
                    row = [str(zp_ele.xpath(path)[0]) for path in single_value_paths]
                except IndexError:
                    # An expected node is missing on this page; skip this
                    # posting instead of aborting the remaining ones.
                    print('skipped, expected fields not found:', zp_url)
                    continue
                # `need` keeps its original stored form: the string
                # representation of the list of <li> texts.
                row.append(str(zp_ele.xpath(need_path)))

                with connection.cursor() as cursor:
                    cursor.execute(sql, row)
                # Commit per posting so rows already scraped survive a later
                # failure.
                connection.commit()
                print(*row)
        finally:
            connection.close()
    if __name__ == '__main__':
        # Script entry point: ask how many result pages to crawl, then scrape
        # and store every posting found on each page.
        #
        # Geturl() and Getinfo() read `browser` as a module-level name, so it
        # has to be assigned here at top level.
        browser = webdriver.Chrome()
        try:
            pags = int(input('需要几页?'))
            # Hoisted out of the loop: the template never changes.
            url = 'https://hr.tencent.com/position.php?keywords=&tid=0&start={}'
            for i in range(pags):
                # `start` is advanced by 10 for each page requested
                # (presumably 10 postings per results page - confirm).
                fullurl = url.format(i * 10)
                zp_url_list = Geturl(fullurl)
                Getinfo(zp_url_list)
        finally:
            # quit() ends the whole WebDriver session, whereas close() only
            # closes the current window; doing it in `finally` releases the
            # browser even when the crawl fails midway.
            browser.quit()
  • 相关阅读:
    赵栋 201771010137 第三周学习总结
    赵栋 201771010137 《面向对象程序设计(java)》课程进度表
    赵栋 201771010137 《面向对象程序设计(java)》第二周学习总结
    赵栋 201771010137 《面向对象程序设计(java)》
    防止电源反接的方法
    dsPIC单片机的波特率的计算
    PIC单片机编译器自带的延时程序
    python3.7 64bit安装pygame1.9.3
    dsPIC单片机的CAN引脚设置
    TJA1040
  • 原文地址:https://www.cnblogs.com/pantom0122/p/9501578.html
Copyright © 2011-2022 走看看