Python crawler: scraping 5i5j rental listings and storing them in a MySQL database

The script below drives Chrome with Selenium to load each list page for the Huilongguan area of bj.5i5j.com, collects the detail-page link of every listing, parses the title, rent, layout, floor area, and extra details with lxml XPath expressions, and inserts one row per listing into a MySQL table via pymysql.

from lxml import etree
from selenium import webdriver
import pymysql


def Geturl(fullurl):  # collect the detail-page URL of every listing on one list page
    browser.get(fullurl)                      # uses the global Selenium browser created in __main__
    shouye_html_text = browser.page_source
    shouye_ele = etree.HTML(shouye_html_text)
    zf_list = shouye_ele.xpath('/html/body/div[4]/div[1]/div[2]/ul/li/div/h3/a/@href')  # relative detail URLs
    zf_url_list = []
    for href in zf_list:
        zf_url = 'https://bj.5i5j.com' + href
        zf_url_list.append(zf_url)
    return zf_url_list


def Getinfo(zp_url_list):  # open each detail page, extract the fields, and insert one row per listing
    for zp_url in zp_url_list:
        browser.get(zp_url)
        zp_info_html = browser.page_source
        zp_ele = etree.HTML(zp_info_html)
        zp_info_title = str(zp_ele.xpath('/html/body/div[3]/div[1]/div[1]/h1/text()')[0])                              # listing title
        zp_info_num = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/div[1]/div/p[1]/text()')[0]) + '元/月'  # rent (yuan per month)
        zp_info_type = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/div[2]/div/p[1]/text()')[0])           # layout (rooms)
        zp_info_zone = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/div[3]/div/p[1]/text()')[0]) + '平米'  # floor area (square meters)
        zp_info_need_1 = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/ul/li[1]/span/text()')[0])           # extra listing details
        zp_info_need_2 = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/ul/li[1]/a/text()')[0])
        zp_info_need = zp_info_need_1 + zp_info_need_2
        # charset='utf8mb4' so the Chinese text is stored without mojibake
        connection = pymysql.connect(host='localhost', user='root', password='1234', db='5i5j', charset='utf8mb4')
        try:
            with connection.cursor() as cursor:
                sql = "INSERT INTO `5i5j_info` (`title`, `num`, `type`, `zone`, `need`) VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(sql, (zp_info_title, zp_info_num, zp_info_type, zp_info_zone, zp_info_need))
            connection.commit()
        finally:
            connection.close()
        print(zp_info_title, zp_info_num, zp_info_type, zp_info_zone, zp_info_need)


if __name__ == '__main__':
    browser = webdriver.Chrome()
    pages = int(input('How many pages to crawl? '))
    for i in range(1, pages + 1):
        url = 'https://bj.5i5j.com/zufang/huilongguan/n{}/'   # list pages for the Huilongguan area, paged n1, n2, ...
        fullurl = url.format(str(i))
        zf_url_list = Geturl(fullurl)
        print(fullurl)
        # print(zf_url_list)  # uncomment to inspect the collected detail URLs
        Getinfo(zf_url_list)
    browser.quit()  # quit() also shuts down the chromedriver process, not just the window
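The INSERT above assumes the `5i5j` database and the `5i5j_info` table already exist with columns `title`, `num`, `type`, `zone`, and `need`. The original post does not show the schema, so here is a minimal one-time setup sketch using the same connection settings; the column types, lengths, and the `id` key are assumptions, not taken from the original:

import pymysql

# One-time setup sketch (hypothetical): create the database and table the crawler writes to.
# Column types/lengths and the sample values in the comments are assumptions; adjust to your data.
DDL = """
CREATE TABLE IF NOT EXISTS `5i5j_info` (
    `id`    INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    `title` VARCHAR(255),   -- listing title
    `num`   VARCHAR(64),    -- rent, e.g. '5200元/月'
    `type`  VARCHAR(64),    -- layout, e.g. '2室1厅1卫'
    `zone`  VARCHAR(64),    -- floor area, e.g. '89平米'
    `need`  VARCHAR(255)    -- extra listing details
) DEFAULT CHARSET=utf8mb4
"""

conn = pymysql.connect(host='localhost', user='root', password='1234', charset='utf8mb4')
try:
    with conn.cursor() as cur:
        cur.execute("CREATE DATABASE IF NOT EXISTS `5i5j` DEFAULT CHARACTER SET utf8mb4")
        cur.execute("USE `5i5j`")
        cur.execute(DDL)
    conn.commit()
finally:
    conn.close()

Note that the crawler opens and closes a fresh connection for every listing. That keeps each insert independent if a page fails mid-run, but reusing a single connection (or committing in batches) would reduce overhead on larger crawls.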
Original post: https://www.cnblogs.com/pantom0122/p/9508514.html