  • Python + Requests + XPath (parsing): scraping resume images from a site (Data Analysis, Part 3)

    1. Environment setup

    pip install lxml
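
    The cases below also rely on the requests library; install it too if it is missing:

    pip install requests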

    2. Parsing approach

    • Use a general-purpose crawler to fetch the page data
    • Instantiate an etree object and load the page data into it
    • Call the xpath method with XPath expressions to locate tags and extract the target data (see the sketch below)
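
    A minimal sketch of these three steps (the URL and the XPath expression are illustrative placeholders, not taken from the cases below):

    import requests
    from lxml import etree

    # 1. Fetch the page with a general-purpose GET request
    headers = {'User-Agent': 'Mozilla/5.0'}
    page_text = requests.get('https://example.com', headers=headers).text

    # 2. Instantiate an etree object and load the page data into it
    tree = etree.HTML(page_text)

    # 3. Locate tags and extract data with an XPath expression (returns a list)
    titles = tree.xpath('//h1/text()')
    print(titles)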

    3. Hands-on cases

    - Project goal: parse new-home listing data from Fangtianxia (fang.com)

    import requests
    import os
    import csv
    from lxml import etree

    if __name__ == '__main__':
        url = 'https://huizhou.newhouse.fang.com/house/s/'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
        }
        if not os.path.exists('./fangtianxiaLibs'):
            os.makedirs('./fangtianxiaLibs')
        response = requests.get(url=url, headers=headers)
        # Set the response encoding manually
        response.encoding = 'utf-8'
        page_text = response.text
        tree = etree.HTML(page_text)
        li_list = tree.xpath('//div[@id="newhouse_loupai_list"]/ul/li')
        # Collect the scraped records in a list
        datas = []
        for li in li_list:
            try:
                # Parse the link to the listing's detail page
                detail_url = 'https:' + li.xpath('.//div[@class="nlcd_name"]/a/@href')[0]
                detail_text = requests.get(url=detail_url, headers=headers).text
                # Swap the URL suffix to build the third-level "housedetail" URL
                detail_url_new = detail_url.replace('.htm', '/housedetail.htm')
                tree = etree.HTML(detail_text)
                # Parse the title and the (average) price from the second-level page
                title = tree.xpath('//div[@class="information"]//div[@class="tit"]/h1/strong/text()')[0]
                price = "".join(tree.xpath('//div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/h3/text() | //div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/span/text() | //div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/text()')).strip(' ')
                # Request the third-level detail page
                detail_text_new = requests.get(url=detail_url_new, headers=headers).text
                tree_new = etree.HTML(detail_text_new)
                # Parse the surrounding-facilities blocks on the detail page
                tree_list = tree_new.xpath('//div[@id="Configuration"]')
                for index in tree_list:
                    # Section heading, transport info, and other facilities
                    zhoubian = "".join(index.xpath('./h3/text()')).strip(' ')
                    jiaotong = "".join(index.xpath('./ul[@class="sheshi_zb"]/li/span/text()|./ul[@class="sheshi_zb"]/li[@class="jiaotong_color"]/text()')).strip(' ')
                    qita = "".join(index.xpath('./ul[@class="sheshi_zb"]/li/span/text()|./ul[@class="sheshi_zb"]/li/text()')).strip(' ')
                    desc = zhoubian + ":" + jiaotong + ':' + qita + ' '
                    datas.append({
                        'title': title,
                        'desc': desc,
                        'price': price
                    })
            except Exception as msg:
                # Skip listings whose pages do not match the expected structure
                # print('Error: {}'.format(msg))
                pass
        print(datas)
        # Write all records to one CSV file ('w' so the header is not duplicated on reruns)
        fileName = './fangtianxiaLibs/fangtianxia.csv'
        title_header = ['title', 'desc', 'price']
        with open(fileName, 'w', encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, title_header)
            writer.writeheader()
            writer.writerows(datas)
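
    The price expression above joins three XPath paths with the union operator |, which returns one combined node list in document order. A minimal, self-contained illustration (the HTML snippet is invented for the example):

    from lxml import etree

    html = '<div><h3>8500</h3><span> yuan/m2 </span>(average)</div>'
    tree = etree.HTML(html)
    # '|' unions the results of several expressions into one list, in document order
    parts = tree.xpath('//div/h3/text() | //div/span/text() | //div/text()')
    print(''.join(parts))  # -> 8500 yuan/m2 (average)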

    - Project goal: parse the image data at http://pic.netbian.com/4kmeinv/

    import requests
    from lxml import etree

    url = 'http://pic.netbian.com/4kmeinv/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    # Print the encoding requests inferred for the page
    print(response.encoding)
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_url = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0]
        # The page is GBK but was decoded as ISO-8859-1; re-encode to undo the mojibake
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        print(img_url, img_name)
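
    The encode('iso-8859-1').decode('gbk') round trip repairs strings that requests decoded with the wrong charset. A sketch of an alternative: fix the encoding once on the response, using requests' built-in body-based detection, so .text is decoded correctly up front:

    response = requests.get(url=url, headers=headers)
    # Re-decode using the charset detected from the body (GBK on this site)
    response.encoding = response.apparent_encoding
    page_text = response.text  # decoded correctly; no per-string fix needed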

    - Project goal: extract all city names from https://www.aqistudy.cn/historydata/

    import requests
    from lxml import etree

    url = 'https://www.aqistudy.cn/historydata/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    # Print the page's original encoding
    print(response.encoding)
    page_text = response.text
    tree = etree.HTML(page_text)
    # ul//li matches li elements at any depth under the ul, so one expression
    # covers both city lists (the original union of ul/li and ul//li was redundant)
    li_list = tree.xpath('//div[@class="bottom"]/ul//li')
    for li in li_list:
        city_name = li.xpath('./a/text()')[0]
        city_url = 'https://www.aqistudy.cn/historydata/' + li.xpath('./a/@href')[0]
        print(city_name, city_url)

    - Project goal: download the resume-template images from the site https://sc.chinaz.com/

    import requests
    from lxml import etree
    import os

    # Create the output folder
    if not os.path.exists('./jianliLibs'):
        os.makedirs('./jianliLibs')

    # Level 1: fetch the portal page
    url = 'https://sc.chinaz.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    response_text = requests.get(url=url, headers=headers).text
    # Parse the portal page for the template links
    tree = etree.HTML(response_text)

    # Walk the paginated template listing and download every resume image
    def page_index(latest):
        # The resume category is the fourth link under this nav item
        base_url = 'https://sc.chinaz.com' + tree.xpath('//div[@class="nav"]//li[@class="nos no3"]/a/@href')[3]
        # Scrape pages 1..latest inclusive
        for index in range(1, latest + 1):
            # Page 1 has no index_N.html suffix; later pages do
            if index == 1:
                muban_url = base_url
            else:
                muban_url = base_url + 'index_{}.html'.format(index)
            # Level 2: fetch one listing page of resume templates
            response = requests.get(muban_url, headers=headers)
            # Set the response encoding manually
            response.encoding = 'utf-8'
            muban_text = response.text
            jianli_tree = etree.HTML(muban_text)
            # Extract the URL of each resume's detail page
            jianli_url_list = jianli_tree.xpath('//div[@class="main_list jl_main"]//a/@href')
            for jianli_url in jianli_url_list:
                jianli_url = 'https:' + jianli_url
                # Level 3: fetch the resume's detail page
                jianli_detail = requests.get(jianli_url, headers=headers).text
                detail_tree = etree.HTML(jianli_detail)
                img_src_list = detail_tree.xpath('//div[@class="show_warp jl_warp clearfix"]//img/@src')
                for img_src in img_src_list:
                    img_src = 'https:' + img_src
                    # Fetch the raw image bytes
                    img_src_content = requests.get(img_src, headers=headers).content
                    # Derive the image name from the URL path
                    imgName = img_src.split('/')[-2]
                    imgPath = './jianliLibs/' + imgName + '.jpg'
                    # Persist to disk
                    with open(imgPath, 'wb') as fp:
                        fp.write(img_src_content)
                    print('Resume ' + imgName + ' downloaded!')

    if __name__ == '__main__':
        while True:
            try:
                values = int(input('Enter the number of listing pages to scrape: '))
                page_index(values)
            except Exception as msg:
                print('Invalid input: {}'.format(msg))
            finally:
                break
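
    Usage note: run the script directly and enter the page count at the prompt. The while True loop executes exactly once, because the finally: break runs after both the try and the except branches; a bad input therefore prints the error and exits rather than re-prompting.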

  • Original post: https://www.cnblogs.com/Teachertao/p/14732392.html