zoukankan      html  css  js  c++  java
  • 爬虫(二)

      在开发工具内获取“请求头”来伪装成浏览器,以便更好地抓取数据

    !/usr/bin/env python
    -*- encoding:UTF-8 -*-
    
    import requests

    # Request header copied from a real browser so the site serves the page
    # instead of blocking the default python-requests User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    # BUG FIX: the network call is what can raise the connection error, so it
    # must sit inside the try block (the original only guarded the print).
    # Also catch requests' own ConnectionError — the builtin one is a
    # different class and would never match.
    try:
        res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the header via the get method
        print(res.text)
    except requests.exceptions.ConnectionError:
        print('拒绝连接')
    
    
    # Parsing the response with the BeautifulSoup library yields a standard,
    # structured "soup" document — nicer to work with than the raw text above.
    import requests
    from bs4 import BeautifulSoup

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    # BUG FIX: the request is the statement that can fail, so it belongs in
    # the try block, and requests raises its own ConnectionError class (the
    # builtin one would not catch it).
    try:
        res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the header via the get method
        soup = BeautifulSoup(res.text, 'html.parser')
        print(soup.prettify())
    except requests.exceptions.ConnectionError:
        print('拒绝连接')

    更新后:

    # Extract the first listing's price element with a selector copied from
    # the browser's "Copy selector" feature.
    # NOTE(review): fragment — assumes `soup` is the BeautifulSoup document
    # built in the snippet above; select() returns a list even for one match.
    price = soup.select('#page_list > ul > li:nth-child(1) > div.result_btm_con.lodgeunitname > div:nth-child(1) > '
                        'span.result_price > i')

    完整代码

    # "Complete code": imports added so this snippet actually runs on its own
    # (the original relied on imports from the earlier snippets).
    import requests
    from bs4 import BeautifulSoup

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 '
                      'Safari/537.36 '
    }
    res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the header via the get method

    soup = BeautifulSoup(res.text, 'html.parser')
    # Locate the price elements of every listing via a CSS selector and
    # extract them; dropping the :nth-child(1) on <li> matches all rows.
    prices = soup.select(
        '#page_list > ul > li > div.result_btm_con.lodgeunitname > div:nth-child(1) >  span.result_price > i')
    for price in prices:
        # get_text() strips the surrounding <i> tag; printing `price` itself
        # would include the tag markup.
        print(price.get_text())

    爬取北京地区短租房信息:

    import random
    
    import requests
    from bs4 import BeautifulSoup
    import time
    
    # Request header so the crawler looks like a real browser
    headers = {
        # User-Agent string copied from the Chrome developer tools
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    
    
    # Map the host avatar's CSS class list to a gender label.
    # NOTE(review): both branches currently return the empty string — the
    # original gender symbols were apparently lost when this post was
    # scraped; confirm against the original article before relying on it.
    def judgment_sex(class_name):
        return '' if class_name == ['member_ico1'] else ''
    
    
    # Collect the detail-page URL of every listing on one search-result page
    # and hand each one to get_info().
    def get_links(url):
        try:
            # BUG FIX: headers must be passed as a keyword argument — the
            # original `requests.get(url, headers)` sent the dict as query
            # params, so the User-Agent disguise never took effect.
            wb_date = requests.get(url, headers=headers)
        except requests.exceptions.ConnectionError:
            # BUG FIX: return here — the original fell through and crashed
            # with an UnboundLocalError on wb_date after printing the message
            # (and caught ConnectionAbortedError, which requests never raises).
            print('拒绝连接')
            return
        soup = BeautifulSoup(wb_date.text, 'lxml')
        links = soup.select('#page_list > ul > li > a')
        for link in links:
            href = link.get("href")  # typo fix: was `herf`
            get_info(href)
    
    
    # Scrape one listing's detail page and print its fields as a dict.
    def get_info(url):
        # BUG FIX: pass the headers as a keyword argument (positionally they
        # land in `params` and are sent as a query string instead).
        wb_date = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_date.text, 'lxml')
        # CSS selectors copied from the browser's "Copy selector" feature
        tittles = soup.select('div.pho_info > h4')
        addresses = soup.select('span.pr5')
        prises = soup.select('#pricePart > div.day_l > span')
        images = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
        names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
        sexs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
        for tittle, address, prise, image, name, sex in zip(tittles, addresses, prises, images, names, sexs):
            # NOTE: the dict keys keep the original (misspelled) names so any
            # downstream consumer of the printed output sees no change.
            date = {
                'tittle': tittle.get_text().strip(),
                'address': address.get_text().strip(),
                'price': prise.get_text(),
                'image': image.get("src"),
                'name': name.get_text(),
                'sex': judgment_sex(sex.get("class"))
            }
            print(date)
    
    
    if __name__ == '__main__':
        # Crawl search-result pages 1 through 13.
        for page in range(1, 14):
            get_links('http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page))
            # Pause 10-13 seconds between pages to avoid an IP ban.
            time.sleep(random.randint(10, 13))
        # Drawback: no proxy/IP-pool management — sleeping is simple but slow.

     爬取酷狗top1.0版:

    #!/usr/bin/env python
    # -*- encoding:UTF-8 -*-
    
    from bs4 import BeautifulSoup
    import requests
    import time,random
    
    # Browser User-Agent header so Kugou serves the normal chart page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
    }
    
    def get_info(url):
        """Scrape one Kugou chart page and print rank/singer/song/duration."""
        # BUG FIX: headers must go in as a keyword argument; passed
        # positionally they become query params and the User-Agent is unset.
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        ranks = soup.select('span.pc_temp_num')
        titles = soup.select('div.pc_temp_songlist>ul>li>a')
        times = soup.select('span.pc_temp_tips_r>span')
        # Loop variable renamed from `time` so it no longer shadows the
        # imported `time` module inside this function.
        for rank, title, song_time in zip(ranks, titles, times):
            data = {
                'rank': rank.get_text().strip(),
                # Entries look like "Singer - Song"; split on the first dash.
                'singer': title.get_text().split('-')[0],
                'song': title.get_text().split('-')[1],
                'time': song_time.get_text().strip()
            }
            print(data)
    
    
    
    if __name__ == '__main__':
        # Entry point: crawl all 24 pages of the TOP-500 chart.
        urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
        for url in urls:
            get_info(url)
            # BUG FIX: sleep INSIDE the loop — the original slept once after
            # all 24 requests had fired, which throttled nothing.
            time.sleep(random.randint(3, 5))

     爬取酷狗top1.1版:

    #!/usr/bin/env python
    # -*- encoding:UTF-8 -*-
    
    from bs4 import BeautifulSoup
    import requests
    import time,random
    
    # Browser User-Agent header so Kugou serves the normal chart page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
    }
    
    def get_info(url):
        """Scrape one Kugou chart page and print rank/singer/song/duration.

        v1.1: uses the tighter pc_temp_songname / pc_temp_time selectors.
        """
        # BUG FIX: headers must go in as a keyword argument; passed
        # positionally they become query params and the User-Agent is unset.
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        ranks = soup.select('span.pc_temp_num')
        titles = soup.select('a.pc_temp_songname')
        times = soup.select('span.pc_temp_time')
        # Loop variable renamed from `time` so it no longer shadows the
        # imported `time` module inside this function.
        for rank, title, song_time in zip(ranks, titles, times):
            data = {
                'rank': rank.get_text().strip(),
                # Entries look like "Singer - Song"; split on the first dash.
                'singer': title.get_text().split('-')[0],
                'song': title.get_text().split('-')[1],
                'time': song_time.get_text().strip()
            }
            print(data)
    
    
    
    if __name__ == '__main__':
        # Entry point: crawl all 24 pages of the TOP-500 chart.
        urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
        for url in urls:
            get_info(url)
            # BUG FIX: sleep INSIDE the loop — the original slept once after
            # all 24 requests had fired, which throttled nothing.
            time.sleep(random.randint(3, 5))

     爬取价格:

    import re
    import requests

    # Fetch the listing page and pull every nightly price out with a regex.
    res = requests.get('http://bj.xiaozhu.com/')
    # The page encodes ¥ as the HTML entity &#165;, so the pattern must match
    # the entity, never the literal ¥ character. Raw string per regex
    # convention; (.*?) captures the digits between the <i> tags lazily.
    prices = re.findall(r'<span class="result_price">&#165;<i>(.*?)</i>起/晚</span>', res.text)
    for price in prices:
        print(price)

    注意:

    <span class="result_price">¥<i>488</i>起/晚</span>
    ¥ 和 &#165; 等价,但编写匹配模式时必须写实体 &#165;,不能直接写 ¥ 字符,否则无法匹配到页面源码
  • 相关阅读:
    Vue小实例
    Bootstrap进度条
    Bootstrap导航栏
    Bootstrap表单
    java date类
    正则表达式(java)规则大全
    正则表达式(java)
    java Stringbuffer类
    java String类
    object类
  • 原文地址:https://www.cnblogs.com/King-boy/p/10901389.html
Copyright © 2011-2022 走看看