zoukankan      html  css  js  c++  java
  • python 爬虫启航2.0

    文章解析:

    1.正则表达式解析

    2.BeautifulSoup:BeautifulSoup 会把 HTML 文档转换成一个复杂的树形结构,它的每一个节点都是一个 Python 对象,获取网页内容就是一个提取对象内容的过程。它的提取方法可以归类为三种:1)遍历文档树 2)搜索文档树 3)CSS 选择器

    # -*- coding: utf-8 -*-
    # @Time : 2018/11/28 17:23
    # @Author : Bo
    # @Email : mat_wu@163.com
    # @File : re_spider.py
    # @Software: PyCharm
    import requests
    import re
    from bs4 import BeautifulSoup
    from lxml import etree
    def get_title_re_spider():
        """Print the post titles of the blog front page, extracted with a regex.

        Downloads http://www.santostang.com/ with a browser-like user agent
        and a 10 second timeout, then prints the list of strings captured
        between the ``<a ...>`` and ``</a>`` of every
        ``<h1 class="post-title">`` heading. Returns nothing.
        """
        # Non-greedy groups keep each match inside a single heading.
        title_pattern = re.compile(
            '<h1 class="post-title"><a href=.*?>(.*?)</a></h1>'
        )
        response = requests.get(
            "http://www.santostang.com/",
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
            },
            timeout=10,
        )
        print(title_pattern.findall(response.text))

    def beautifulsoup_spider():
        """Print the numbered post titles of the blog front page via BeautifulSoup.

        Downloads http://www.santostang.com/, parses it with the built-in
        ``html.parser`` backend, and for every ``<h1 class="post-title">``
        prints the stripped text of its first ``<a>`` child, numbered from 1.
        Returns nothing.

        Raises:
            AttributeError: if a matching heading contains no ``<a>`` tag
                (``heading.a`` is then ``None``).
        """
        url = "http://www.santostang.com/"
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
        }

        r = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(r.text, "html.parser")
        title_list = soup.find_all("h1", class_="post-title")

        # enumerate(start=1) yields the 1-based article number directly,
        # replacing the manual range(len(...)) index arithmetic.
        for number, heading in enumerate(title_list, start=1):
            title = heading.a.text.strip()
            print("第 %s篇文章的标题是:%s" % (number, title))

    def beautiful_methods():
        """Demonstrate BeautifulSoup tree navigation and CSS selectors.

        Downloads http://www.santostang.com/, parses it with ``html.parser``,
        binds a few navigation results to local names for illustration, and
        prints the results of two CSS selector queries. Returns nothing.
        """
        response = requests.get(
            "http://www.santostang.com/",
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
            },
            timeout=10,
        )
        soup = BeautifulSoup(response.text, "html.parser")
        # print(soup.prettify())  # pretty-print the parsed markup

        # --- Navigating the tree ---
        header = soup.header
        one_element = header.h1       # a specific tag node
        elements = header.contents    # the children of <header>
        # Original author's note: the tags sit at the odd indices.
        first_element = elements[1]

        # Further navigation examples, left disabled as in the tutorial:
        # print(one_element)
        # print(elements)
        # print(first_element)
        #
        # Direct children only (one level down):
        # for child in soup.body.children:
        #     print(child)
        #
        # Every descendant at any depth:
        # for child in soup.body.descendants:
        #     print(child)
        #
        # Parent of a node:
        # a_tag = soup.header.div.a
        # a_parent = a_tag.parent
        # print(a_parent)
        #
        # Searching the tree is done with find() and find_all().

        # --- CSS selectors ---
        for selector in ("header h1", "header > h1"):
            print(soup.select(selector))

        # CSS selectors can also be used to search the document.

    #使用lxml解析网页

    def lxml_spider():
        """Print the post titles of the blog front page, extracted with lxml XPath.

        Downloads http://www.santostang.com/, builds an lxml element tree from
        the response text, and prints the list returned by an absolute XPath
        query for the text of the links inside the article header ``<h1>``.
        Returns nothing.
        """
        page = requests.get(
            "http://www.santostang.com/",
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
            },
            timeout=10,
        )

        tree = etree.HTML(page.text)
        # Absolute path: it is tied to the page's exact layout.
        titles = tree.xpath(
            "/html/body/div[1]/div/div[1]/article/header/h1/a/text()"
        )
        print(titles)

    # 项目实践-爬取安居客二手房信息
    def second_house_spider():
        """Scrape one Anjuke second-hand housing result page and report each listing.

        Downloads the listing page, parses it with BeautifulSoup's ``lxml``
        backend, and for every ``<li class="list-item">`` prints, in order:
        the title, the ``price-det`` text, the ``unit-price`` text, the three
        ``details-item`` fields (bound to ``no_room``, ``area``, ``floor``),
        the address and the list of ``item-tags`` texts. Each address is also
        appended to ``b.txt`` (UTF-8), one per line. Returns nothing.

        Raises:
            AttributeError: if an expected element is missing from a listing
                (``find`` returns ``None``).
            IndexError: if ``details-item`` has fewer children than expected.
        """
        url = "https://weihai.anjuke.com/sale/gaoqu/?from=SearchBar"

        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
        }
        r = requests.get(url, headers=headers, timeout=10)

        # Parse the page with BeautifulSoup.
        soup = BeautifulSoup(r.text, 'lxml')

        house_list = soup.find_all("li", class_="list-item")
        if not house_list:
            # No listings: nothing to print, and b.txt is not created.
            return

        # Open the output file once rather than once per listing.
        with open('b.txt', "a+", encoding="utf-8") as f:
            for house in house_list:
                name = house.find("div", class_="house-title").a.text.strip()
                price = house.find("span", class_="price-det").text.strip()
                price_area = house.find("span", class_="unit-price").text.strip()

                # Look the details container up once and reuse it.
                details = house.find("div", class_="details-item")
                no_room = details.span.text.strip()
                area = details.contents[3].text
                floor = details.contents[5].text

                address = house.find("span", class_="comm-address").text.strip()
                # The old replace('xa0xa0 ', ' ') had lost its backslash
                # escapes and never matched. str.split() with no argument
                # treats non-breaking spaces and newlines as whitespace, so
                # this collapses every whitespace run into one space.
                address = " ".join(address.split())

                tag = [i.text for i in house.find_all("span", class_="item-tags")]

                # One address per line; previously they were written with no
                # separator and ran together.
                f.write(address + "\n")

                print(name)
                print(price)
                print(price_area)
                print(no_room)
                print(area)
                print(floor)
                print(address)
                print(tag)

    if __name__ == "__main__":
        # Script entry point: exactly one demo runs. The other demos are
        # left disabled; swap the call below to try one of them instead:
        #   get_title_re_spider()
        #   beautifulsoup_spider()
        #   beautiful_methods()
        lxml_spider()


    学习网址:

    https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/#id27

  • 相关阅读:
    echarts中3D地球模型
    面试题68
    Java正确创建对象数组
    8.Arrays类和比较器
    7.Base64类和UUID类
    6.大数字处理类
    3.JVM重要知识点
    2.JVM基础知识点
    1.JVM入门知识
    6.适配器模式
  • 原文地址:https://www.cnblogs.com/icat-510/p/10042488.html
Copyright © 2011-2022 走看看