
    Data Collection and Fusion Technology: Assignment 3

    Assignment 1

    Requirements

    Pick a website and crawl every image on it, for example the China Weather site (http://www.weather.com.cn). Crawl it once with a single-threaded approach and once with a multi-threaded approach. Print the downloaded URLs to the console, store the downloaded images in an images subfolder, and provide screenshots.

    Approach

    • Use the China Weather homepage: http://www.weather.com.cn/
    • Implement the image crawl both single-threaded and multi-threaded
    • Wrap each step in a function so it can be called inside the threads

    code

    Single-threaded

    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import time
    from urllib.parse import urlparse
    import re
    from random import random
    
    def imageSpider(start_url):
        try:
            # nextUrl = []
            urls=[]
            req=urllib.request.Request(start_url,headers=headers)
            data=urllib.request.urlopen(req)
            data=data.read()
            dammit=UnicodeDammit(data,["utf-8","gbk"])
            data=dammit.unicode_markup
            soup=BeautifulSoup(data,"lxml")
            images=soup.select("img")
            for image in images: 
                try:
                    src=image["src"]
                    url=urllib.request.urljoin(start_url,src) 
                    if url not in urls:
                        urls.append(url) 
                        print(url)
                        download(url)
                except Exception as err:
                    print(err)
            # nextUrl = getInternalLinks(soup,nextUrl)
            # start_url = nextUrl[random(len(nextUrl))]
            return start_url
        except Exception as err:
                print(err)
    
    def download(url): 
        global count
        try:
            count=count+1
            #提取文件后缀扩展名
            if(url[len(url)-4]=="."):
                ext=url[len(url)-4:]
            else:
                ext=""
            req=urllib.request.Request(url,headers=headers)
            data=urllib.request.urlopen(req,timeout=100) 
            data=data.read() 
            fobj=open("Object3/singleThreadImages/"+str(count)+ext,"wb") 
            fobj.write(data)
            fobj.close()
            print("downloaded "+str(count)+ext)
        except Exception as err: 
            print(err)
    
    start_time = time.time()
    start_url = "http://www.weather.com.cn/"
    # start_url="http://www.weather.com.cn/weather/101280601.shtml"
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    count=0
    imageSpider(start_url)
    print(time.time()-start_time)
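
    Note that download() writes into a relative folder (Object3/singleThreadImages here; the other listings below use their own folders) that must already exist. A small guard like the following, which is not part of the original code, would create it up front:

    import os
    os.makedirs("Object3/singleThreadImages", exist_ok=True)  # create the output folder if it is missing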
    
    Single-threaded result


    Took about 20 seconds.

    Multi-threaded

    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import threading
    import time
    
    def imageSpider(start_url):
        global threads
        global count
        try:
            urls=[]
            req=urllib.request.Request(start_url,headers=headers)
            data=urllib.request.urlopen(req)
            data=data.read()
            dammit=UnicodeDammit(data,["utf-8","gbk"])
            data=dammit.unicode_markup
            soup=BeautifulSoup(data,"lxml")
            images=soup.select("img")
            for image in images:
                try:
                    src=image["src"]
                    url=urllib.request.urljoin(start_url,src) 
                    if url not in urls:
                        print(url)
                        count=count+1
                        T=threading.Thread(target=download,args=(url,count))
                        T.setDaemon(False)
                        T.start() 
                        threads.append(T)
                        urls.append(url)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)
    
    def download(url,count):
        try:
            if(url[len(url)-4]=="."):
                ext=url[len(url)-4:]
            else:
                ext=""
            req=urllib.request.Request(url,headers=headers)
            data=urllib.request.urlopen(req,timeout=100)
            data=data.read()
            fobj=open("object3/mutliThreadImages/"+str(count)+ext,"wb")
            fobj.write(data)
            fobj.close()
            print("downloaded "+str(count)+ext)
        except Exception as err:
            print(err)
    
    start_time = time.time()
    start_url = "http://www.weather.com.cn/"
    # start_url="http://www.weather.com.cn/weather/101280601.shtml"
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"}
    count=0
    threads=[]
    imageSpider(start_url)
    for t in threads:
        t.join()
    print("The End")
    print(time.time()-start_time)
    
    Multi-threaded result


    Took about 4.34 seconds.

    Reflections

    Since the textbook already provides the source code, this was not particularly difficult overall; for the multi-threaded version I added one line to fix a small bug (the urls.append(url) added inside the imageSpider function).
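
    An alternative to creating and joining Thread objects by hand is concurrent.futures.ThreadPoolExecutor. Below is a minimal sketch that reuses the download(url, count) function from the multi-threaded listing above; it is an illustration rather than part of the submitted code, and the worker count of 8 is an arbitrary choice:

    from concurrent.futures import ThreadPoolExecutor

    def download_all(urls):
        # Submit one download job per image URL to a small pool of worker threads.
        with ThreadPoolExecutor(max_workers=8) as pool:
            for count, url in enumerate(urls, start=1):
                pool.submit(download, url, count)  # download() as defined in the listing above
        # Leaving the with-block waits for every submitted download to finish.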

    Assignment 2

    Requirements

    Reproduce Assignment 1 using the Scrapy framework.

    Approach

    The approach is the same as before; Scrapy is just wrapped around it as a shell.
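
    As a point of comparison, Scrapy's own downloader is already concurrent, so instead of calling urllib inside the spider one could yield a Request per image and save it in a callback. The following is a rough sketch of that approach, not the code submitted for the assignment; the spider name and output folder are placeholders, and cb_kwargs assumes Scrapy 1.7 or newer:

    import os
    import scrapy

    class WeatherImageSketch(scrapy.Spider):
        # Illustrative spider only; name and folder are made up for this sketch.
        name = "weather_images_sketch"
        start_urls = ["http://www.weather.com.cn/"]

        def parse(self, response):
            # Yield one Request per <img> and let Scrapy schedule the downloads.
            for i, src in enumerate(response.css("img::attr(src)").getall()):
                yield scrapy.Request(response.urljoin(src),
                                     callback=self.save_image,
                                     cb_kwargs={"count": i})

        def save_image(self, response, count):
            os.makedirs("scrapyImages", exist_ok=True)
            ext = os.path.splitext(response.url)[1] or ".jpg"  # fall back to .jpg if no extension
            with open("scrapyImages/%d%s" % (count, ext), "wb") as f:
                f.write(response.body)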

    code

    Single-threaded

    import scrapy
    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import time
    from urllib.parse import urlparse
    import re
    from random import random
    
    class ArticleSpider(scrapy.Spider):
        name = 'weather'
    
        def start_requests(self):
            urls = ['http://www.weather.com.cn/']
            return [scrapy.Request(url = url,callback = self.parse) for url in urls]
    
        def parse(self,response):
            url = response.url
            start_time = time.time()
            # start_url = "http://www.weather.com.cn/"
            # start_url="http://www.weather.com.cn/weather/101280601.shtml"
            headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
            count=0
            self.imageSpider(url,headers,count)
            # while(start_url is not None):
                # start_url = imageSpider(start_url)
                # print(start_url)
            print(time.time()-start_time)
            # imageSpider(url)
            # print("end")
            # title = response.css('title::text').extract_first()
            # print("======================================")
            # print('URL is: {}'.format(url))
            # print('Title is: {}'.format(title))
            # print("======================================")
    
        def imageSpider(self,start_url,headers,count):
            try:
                # nextUrl = []
                urls=[]
                req=urllib.request.Request(start_url,headers=headers)
                data=urllib.request.urlopen(req)
                data=data.read()
                dammit=UnicodeDammit(data,["utf-8","gbk"])
                data=dammit.unicode_markup
                soup=BeautifulSoup(data,"lxml")
                images=soup.select("img")
                for image in images: 
                    try:
                        src=image["src"]
                        url=urllib.request.urljoin(start_url,src) 
                        if url not in urls:
                            urls.append(url) 
                            print(url)
                            self.download(url,headers,count)
                            count += 1
                    except Exception as err:
                        print(err)
                # nextUrl = getInternalLinks(soup,nextUrl)
                # start_url = nextUrl[random(len(nextUrl))]
                return start_url
            except Exception as err:
                    print(err)
    
        def download(self,url,headers,count):
            try:
                if(url[len(url)-4]=="."):
                    ext=url[len(url)-4:]
                else:
                    ext=""
                req=urllib.request.Request(url,headers=headers)
                data=urllib.request.urlopen(req,timeout=100)
                data=data.read()
                fobj=open("scrapyImages/"+str(count)+ext,"wb")
                fobj.write(data)
                fobj.close()
                print("downloaded "+str(count)+ext)
            except Exception as err:
                print(err)
    
    Single-threaded result


    Took about 15 seconds.

    Multi-threaded

    import scrapy
    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import threading
    import time
    
    class ArticleSpider(scrapy.Spider):
        name = 'weather'
    
        def start_requests(self):
            urls = ['http://www.weather.com.cn/']
            return [scrapy.Request(url = url,callback = self.parse) for url in urls]
    
        def parse(self,response):
            start_time = time.time()
            start_url = "http://www.weather.com.cn/"
            # start_url="http://www.weather.com.cn/weather/101280601.shtml"
            headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"}
            count=0
            self.threads=[]
            self.imageSpider(start_url,headers,count)
            for t in self.threads:
                t.join()
            print("The End")
            print(time.time()-start_time)
    
        def imageSpider(self,start_url,headers,count):
            # the shared thread list lives on the spider instance (self.threads), not in a global
            try:
                urls=[]
                req=urllib.request.Request(start_url,headers=headers)
                data=urllib.request.urlopen(req)
                data=data.read()
                dammit=UnicodeDammit(data,["utf-8","gbk"])
                data=dammit.unicode_markup
                soup=BeautifulSoup(data,"lxml")
                images=soup.select("img")
                for image in images:
                    try:
                        src=image["src"]
                        url=urllib.request.urljoin(start_url,src) 
                        if url not in urls:
                            print(url)
                            count=count+1
                            T=threading.Thread(target=self.download,args=(url,headers,count))
                            T.setDaemon(False)
                            T.start()
                            self.threads.append(T)
                            urls.append(url)
                    except Exception as err:
                        print(err)
            except Exception as err:
                print(err)
    
        def download(self,url,headers,count):
            try:
                if(url[len(url)-4]=="."):
                    ext=url[len(url)-4:]
                else:
                    ext=""
                req=urllib.request.Request(url,headers=headers)
                data=urllib.request.urlopen(req,timeout=100)
                data=data.read()
                fobj=open("scrapyImages/"+str(count)+ext,"wb")
                fobj.write(data)
                fobj.close()
                print("downloaded "+str(count)+ext)
            except Exception as err:
                print(err)
    
    Multi-threaded result


    Took about 1.37 seconds.

    Reflections

    • When fitting the multi-threaded code into Scrapy, pay attention to how parameters are passed, and remember that methods defined inside the class need self both in their definitions and at their call sites (see the sketch below).
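
    One way to avoid module-level globals entirely is to keep the shared state on the spider instance, which is what the self.threads change above does. A bare-bones sketch of that pattern, with illustrative names rather than the submitted code:

    import scrapy

    class InstanceStateSketch(scrapy.Spider):
        # Sketch only: shared state lives on the instance instead of module globals.
        name = "instance_state_sketch"

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.threads = []   # worker threads started while parsing
            self.count = 0      # running image counter

        def parse(self, response):
            # ... crawl as in the listing above, appending to self.threads and
            # incrementing self.count instead of touching globals ...
            pass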

    Assignment 3

    Requirements

    Use the Scrapy framework to crawl stock information. Candidate sites: Eastmoney (东方财富网) and Sina Finance stocks (新浪股票).

    Approach

    Similar to the earlier exercise that fetched the data through a JSON request, just wrapped in Scrapy.
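
    Incidentally, the JSONP payload could also be decoded with the json module instead of regular expressions. A rough sketch, under the assumption that the stock records sit under data["data"]["diff"] and use the f-field keys requested in the URL's fields parameter (this is not the code used below):

    import json
    import re
    import requests

    def parse_jsonp(url):
        # The API wraps its JSON payload in a jQuery callback: jQueryXXX_({...});
        text = requests.get(url).text
        payload = re.search(r"\((\{.*\})\)", text, re.S).group(1)
        data = json.loads(payload)
        # One dict per stock; f12 = code, f14 = name, f2 = latest price
        # (assumed from the fields parameter in the request URL).
        for stock in data["data"]["diff"]:
            print(stock["f12"], stock["f14"], stock["f2"])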

    code

    ## scrapy runspider ./getStocks.py -s LOG_FILE=all.log
    import scrapy
    import re
    import requests
    
    class ArticleSpider(scrapy.Spider):
        name = 'stocks'
    
        def start_requests(self):
            url_head = 'http://97.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112406971740416068926_1601446076156&pn='
            url_tail = '&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1601446076157'
            urls = []
            for i in range(1,6):
                urls.append(url_head+str(i)+url_tail)
            # global count
            # count = 1
            # global count
            print('%-6s %-8s %10s %10s %12s %10s %10s %12s'%('代码','名称','最新价','涨跌幅(%)','跌涨额(¥)','成交量(手)','成交额(¥)','涨幅(%)'))
            return [scrapy.Request(url = url,callback = self.parse) for url in urls]
    
        def parse(self,response):
            # global count
            url = response.url
            # print("=====================")
            
            # count = 1
            # global count
            # for i in range(1,6):
            self.get_stock(url)
            # self.get_stock(self,url,count)
    
        def get_stock(self,url):
            # global count
            json_page = requests.get(url).content.decode(encoding='utf-8')
            # json_page = json_page.read()
            pat = ""diff":[{.*}]"
            table = re.compile(pat,re.S).findall(json_page)
            pat = "},{"
            stocks = re.split(pat,table[0])
            # count = 1
            for stock in stocks:
                # print(stock)
                pat = ","
                infs = re.split(pat,stock)
                # print(infs[13])
                pat = ":"
                name = re.split(pat,infs[13])
                money = re.split(pat,infs[1])
                num = re.split(pat,infs[11])
                Quote_change = re.split(pat,infs[2])  # change percentage (涨跌幅)
                Ups_and_downs = re.split(pat,infs[3])  # change amount (涨跌额)
                Volume = re.split(pat,infs[4])  # trading volume (成交量)
                Turnover = re.split(pat,infs[5])  # turnover (成交额)
                Increase = re.split(pat,infs[6])  # gain (涨幅)
                # print(count,num[1],name[1],money[1],Quote_change[1]+"%",Ups_and_downs[1]+"¥",str(Volume[1])+"手",Turnover[1]+"¥",Increase[1]+"%")
                print('%-10s %-10s %10s %10s %15s %15s %18s %12s'%(num[1],name[1],money[1],Quote_change[1],Ups_and_downs[1],Volume[1],Turnover[1],Increase[1]))
                # count += 1
            # return count
    

    Result

    Reflections

    • Fitting an existing program into Scrapy takes some care; pay attention to how parameters are passed, or turn the shared ones into global variables.
    • To keep the console from being flooded with debug output, append -s LOG_FILE=all.log to the run command.