  • Assignment 3

    Task 1:

  • Requirement: pick a specified website and crawl all of the images on it
  • Single-threaded:
  • Code:

    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import urllib.parse
    import time
    start_time=time.perf_counter()
    start_url="http://www.weather.com.cn/weather/101280601.shtml"
    headers={
        "User-Agent":"Mozilla/5.0(Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2"
    }
    def imageSpider(start_url):
        try:
            urls=[]
            req=urllib.request.Request(start_url,headers=headers)
            data=urllib.request.urlopen(req)
            data =data.read()
            dammit=UnicodeDammit(data,["utf-8","gbk"])
            data=dammit.unicode_markup
            soup=BeautifulSoup(data,"lxml")
            images=soup.select("img")
            for image in images:
                try:
                    src=image["src"]
                    url=urllib.parse.urljoin(start_url,src)
                    if url not in urls:
                        urls.append(url)
                        print(url)
                        download(url)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)
    
    def download(url):
        global count
        try:
            count=count+1
            if(url[len(url)-4]=='.'):
                ext=url[len(url)-4:]
            else:
                ext=''
            req=urllib.request.Request(url,headers=headers)
            data=urllib.request.urlopen(req,timeout=100)
            data=data.read()
            fobj=open("images\\"+str(count)+ext,"wb")
            fobj.write(data)
            fobj.close()
            print("download"+str(count)+ext)
        except Exception as err:
            print(err)
    count=0
    imageSpider(start_url)
    end_time=time.perf_counter()
    print("Elapsed time:",end_time-start_time)
    

    Results:

  • Multi-threaded:
  • Code:

    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import urllib.parse
    import threading
    import time
    start_time=time.perf_counter()
    start_url="http://www.weather.com.cn/weather/101280601.shtml"
    headers={
        "User-Agent":"Mozilla/5.0(Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2"
    }
    def imageSpider(start_url):
        global threads
        global count
        try:
            urls=[]
            req=urllib.request.Request(start_url,headers=headers)
            data=urllib.request.urlopen(req)
            data=data.read()
            dammit=UnicodeDammit(data,["utf-8","gbk"])
            data = dammit.unicode_markup
            soup=BeautifulSoup(data,'lxml')
            images=soup.select("img")
            for image in images:
                try:
                    src=image["src"]
                    url=urllib.parse.urljoin(start_url,src)
                    if url not in urls:
                        print(url)
                        count=count+1
                        T=threading.Thread(target=download,args=(url,count))
                        T.daemon=False
                        T.start()
                        threads.append(T)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)
    def download(url,count):
        try:
            if (url[len(url) - 4] == '.'):
                ext = url[len(url) - 4:]
            else:
                ext = ''
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            fobj = open("images\\" + str(count) + ext, "wb")
            fobj.write(data)
            fobj.close()
            print("download" + str(count) + ext)
        except Exception as err:
            print(err)
    count=0
    threads=[]
    imageSpider(start_url)
    for t in threads:
        t.join()
    end_time=time.perf_counter()
    print("The End")
    print("Elapsed time:",end_time-start_time)
    

    Results:

    Takeaways:

    In comparison, multi-threaded crawling is much faster, but some of the downloaded images are duplicates, so the crawl needs some adjustment (one possible fix is sketched below).
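
    One possible adjustment, as a rough sketch of my own rather than the assignment code: in the multithreaded loop above, urls.append(url) is never called, so the "if url not in urls" check never filters anything. Recording each scheduled URL restores that check, and hashing the downloaded bytes additionally catches the same image served under different URLs. The helper names should_schedule and is_new_image are made up for this sketch.

    import hashlib
    import threading

    scheduled_urls = set()        # URLs already handed to a download thread
    seen_hashes = set()           # MD5 digests of images already written to disk
    hash_lock = threading.Lock()  # download threads share seen_hashes

    def should_schedule(url):
        """Called from the main crawl loop: accept each URL only once."""
        if url in scheduled_urls:
            return False
        scheduled_urls.add(url)
        return True

    def is_new_image(data):
        """Called from a download thread: skip bytes that were already saved."""
        digest = hashlib.md5(data).hexdigest()
        with hash_lock:
            if digest in seen_hashes:
                return False
            seen_hashes.add(digest)
            return True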

    Task 2:

  • Requirement: reproduce Task 1 using the Scrapy framework
  • WeatherSpider.py:

    import scrapy
    from scrapy.selector import Selector
    from Spider.items import ImgItem
    class WeatherSpider(scrapy.Spider):
        name = 'WeatherSpider'
    
        def start_requests(self):
            url='http://www.weather.com.cn/weather/101280601.shtml'
            yield scrapy.Request(url=url,callback=self.parse)
    
        def parse(self, response):
            print(response.url)
            data=response.body.decode()
            #print(data)
            selector = Selector(text=data)  # build a Selector object so elements can be located with XPath
            print(selector)
            s= selector.xpath("//img/@src").extract()  # extract the src attribute of every img tag
            print(s)  # s is the list of image URLs
            # print(s[0])
            for src in s:
                item=ImgItem()
                item["src"]=src
                yield item
    

    items.py:

    import scrapy
    
    class ImgItem(scrapy.Item):
        src = scrapy.Field()
        pass
    

    pipelines.py:

    import urllib.request
    
    from itemadapter import ItemAdapter
    
    
    class SpiderPipeline(object):
        count=0
        def process_item(self, item, spider):
            SpiderPipeline.count+=1
            try:
                url=item["src"]
                print(url)
                if (url[len(url) - 4] == '.'):
                    ext = url[len(url) - 4:]
                else:
                    ext = ''
                req = urllib.request.Request(url)
                data = urllib.request.urlopen(req)
                data = data.read()
                fobj = open("F:\\数据采集\\10_14\\images\\" + str(SpiderPipeline.count) + ext, "wb")
                fobj.write(data)
                fobj.close()
                print("download" + str(SpiderPipeline.count) + ext)
            except Exception as err:
                print(err)
            return item
    

    In settings.py, uncomment this block:

    ITEM_PIPELINES = {
       'Spider.pipelines.SpiderPipeline': 300,
    }
    

    run.py:

    from scrapy import cmdline
    cmdline.execute("scrapy crawl WeatherSpider -s LOG_ENABLED=False".split())
    

    Results:

    Takeaways:

    Learned to use "/@attrName" to get the attrName attribute node of a Selector element, and extract() to pull out the attribute values (a tiny example below).
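
    A tiny standalone illustration of that pattern (the HTML snippet is made up for the example):

    from scrapy.selector import Selector

    html = '<img src="a.png"/><img src="b.png" alt="logo"/>'  # made-up markup
    sel = Selector(text=html)
    print(sel.xpath("//img/@src").extract())        # ['a.png', 'b.png']
    print(sel.xpath("//img/@alt").extract_first())  # 'logo'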

    Task 3:

  • Requirement: use the Scrapy framework to crawl stock-related information
  • Code:

    shareSpider.py:

    import re
    from Spider.items import lineItem
    import scrapy
    from Spider.pipelines import SpiderPipeline
    from scrapy.selector import Selector
    class shareSpider(scrapy.Spider):
        name="shareSpider"
    
        def start_requests(self):
            url = 'http://77.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124041523442512990894_1603196582234&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18&_=1603196582235'
            # In the URL, pn is the page number and pz is how many stock records to return; f2: latest price, f3: change percent, f4: change amount, f5: volume, f6: turnover, f7: amplitude, f12: stock code, f14: stock name, f15: high, f16: low, f17: open, f18: previous close
            yield scrapy.Request(url=url,callback=self.parse)
    
        def parse(self, response):
            data=response.body.decode()
            data = re.findall(r'"diff":\[(.*?)\]',data)  # escape the brackets so the list after "diff": is captured
            datas = data[0].strip("{").strip("}").split("},{")
            #print(datas)
            for data_line in datas:  # process the data row by row
                line_item=data_line.split(',')
                item=lineItem()
                item["id"]=line_item[6].split(":")[1]
                item["name"]=line_item[7].split(":")[1]
                item["new_price"]=line_item[0].split(":")[1]
                item["up_rate"]=line_item[1].split(":")[1]
                item["down_rate"]=line_item[2].split(":")[1]
                item["pass_number"] = line_item[3].split(":")[1]
                item["pass_money"] = line_item[4].split(":")[1]
                item["rate"] = line_item[5].split(":")[1]
                item["highest"] = line_item[8].split(":")[1]
                item["lowest"] = line_item[9].split(":")[1]
                item["today"] = line_item[10].split(":")[1]
                item["yesterday"] = line_item[11].split(":")[1]
                yield item
            print(SpiderPipeline.tb)
    
    

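    As a side note, the response body here is JSONP (JSON wrapped in a jQuery callback), so an alternative to the regex-and-split parsing above is to strip the callback wrapper and hand the payload to json.loads. A rough sketch of that idea; the nesting of "diff" under a "data" key is an assumption, and only three of the field codes are shown:

    import json
    import re

    def parse_jsonp(body):
        """Strip the jQueryXXXX(...) wrapper and parse the payload as JSON."""
        payload = re.search(r'\((.*)\)', body, re.S).group(1)
        obj = json.loads(payload)
        # "diff" is assumed to sit under a "data" key; each row maps field
        # codes (f12 = stock code, f14 = stock name, f2 = latest price, ...)
        # to values, per the comment in start_requests above.
        for row in obj["data"]["diff"]:
            yield {"id": row["f12"], "name": row["f14"], "new_price": row["f2"]}
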
    pipelines.py:

    import prettytable as pt
    
    class SpiderPipeline(object):
        count = 0
        tb = pt.PrettyTable(["No.", "Stock Code", "Stock Name", "Latest Price", "Change %", "Change Amount", "Volume", "Turnover", "Amplitude", "High", "Low", "Open", "Prev Close"])
    
        def process_item(self, item, spider):
            SpiderPipeline.count+=1
            SpiderPipeline.tb.add_row(
                [SpiderPipeline.count, item["id"], item["name"], item["new_price"], item["up_rate"], item["down_rate"],
                 item["pass_number"], item["pass_money"], item["rate"], item["highest"], item["lowest"], item["today"],
                 item["yesterday"]])
            return item
    

    items.py:

    import scrapy
    
    class lineItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        id=scrapy.Field()
        name=scrapy.Field()
        new_price=scrapy.Field()
        up_rate=scrapy.Field()
        down_rate=scrapy.Field()
        pass_number=scrapy.Field()
        pass_money=scrapy.Field()
        rate=scrapy.Field()
        highest=scrapy.Field()
        lowest=scrapy.Field()
        today=scrapy.Field()
        yesterday=scrapy.Field()
        pass
    

    In settings.py, set:

    ROBOTSTXT_OBEY = False

    and uncomment:

    ITEM_PIPELINES = {
       'Spider.pipelines.SpiderPipeline': 300,
    }
    

    run.py:

    from scrapy import cmdline
    cmdline.execute("scrapy crawl shareSpider -s LOG_ENABLED=False".split())
    

    Results:

    Takeaways:

    Once the data fetching in the Spider is kept clearly separate from the data handling in the pipeline, and combined with the workflow from the previous task, the implementation is not too hard. At first, parse in shareSpider never ran; setting ROBOTSTXT_OBEY = False fixed it.
    ROBOTSTXT refers to the robots protocol (the robots exclusion protocol), which limits what a crawler is allowed to fetch; it is normally written in a robots.txt file stored on the website's server.
    A crawler is expected to check this file first when it visits a site. Set ROBOTSTXT_OBEY = False when the content you want to crawl is disallowed by that protocol but you still intend to crawl it; a quick way to inspect what a site's robots.txt actually allows is sketched below.
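
    For reference, the standard library's urllib.robotparser can check what a site's robots.txt allows before deciding to override it; a small sketch using the weather page crawled in Task 1:

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("http://www.weather.com.cn/robots.txt")
    rp.read()   # fetch and parse the site's robots.txt
    # True or False depending on what the site's rules say for this user agent
    print(rp.can_fetch("*", "http://www.weather.com.cn/weather/101280601.shtml"))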
