  • Python crawler notes, including the Scrapy framework

    1. Crawl Kugou's "new releases" section for each new song's name, play time, and link

    from bs4 import BeautifulSoup as BS
    import requests
    import re
    import json
    class StockCrawler():
        
        def __init__(self):
            pass
        
        def get_stockinfo(self,url):
            res=requests.get(url)
            res.encoding='utf-8'
            soup=BS(res.text,'html.parser')
            
            stock_info=[]
            div=soup.find_all('div',id='SongtabContent')  # locate the outermost tag of the new-song area
            li=div[0].find_all('li')
            
            for i in li:  # loop over each song's tag, extracting the name, play time, and link
                print(i)
                
                link='http:'+i.find_all('a')[0]['href']
                print('link:',link)
                
                songname=i.a.find_all('span',class_='songName')[0].text
                songtime=i.a.find_all('span',class_='songTime')[0].text
                print('songname:',songname)
                print('songtime:',songtime)
                
                stock_info.append((songname,songtime,link))
                
            return stock_info
    
        def write_file(self,data,file_name):
            with open(file_name,'w',encoding='utf-8') as fp:
                for i in data:
                    fp.write(i[0]+','+i[1]+','+i[2]+'\n')   # write one comma-separated line per song
    
    if __name__=='__main__':
        
        url='http://www.kugou.com/'
        stockcrawler=StockCrawler()
        data=stockcrawler.get_stockinfo(url)
        stockcrawler.write_file(data, r'f:\test\pppp.txt')  # raw string so \t in the path is not read as a tab
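
    Song names occasionally contain commas, which would corrupt the comma-separated lines written above. Below is a minimal sketch of the same output via the standard csv module, which quotes such fields; it assumes the same (songname, songtime, link) tuples returned by get_stockinfo, and write_csv is a hypothetical helper name:

    import csv

    def write_csv(data, file_name):
        with open(file_name, 'w', encoding='utf-8', newline='') as fp:
            csv.writer(fp).writerows(data)   # each tuple becomes one properly quoted CSV row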

    2. Crawl the Sohu homepage for links whose pages contain the word "体育" (sports)

    import requests
    from bs4 import BeautifulSoup

    r = requests.get("http://www.sohu.com")
    r.encoding = "utf-8"
    html = r.text
    soup = BeautifulSoup(html, "html.parser")  # the first argument can be any HTML content
    
    links = []
    for i in list(soup.find_all(["a"])):
        try:
            if i["href"].strip().startswith(r"//"):   # protocol-relative link: prepend a scheme
                i["href"] = "http:" + i["href"]
            if i["href"].find("javascript") >= 0:     # skip javascript: pseudo-links
                continue
            if i["href"].find("mailto") >= 0:         # skip mailto: links
                continue
            if len(i["href"].strip()) == 1:           # skip placeholder hrefs such as "#"
                continue
            links.append(i["href"].strip())
        except Exception as e:
            print(e)   # <a> tags without an href raise KeyError
    
    for link in links:
        print(link)
    
    x=1
    for link in links:
        try:
            r = requests.get(link, timeout=10)
        except requests.exceptions.RequestException as e:
            print(e)   # skip links that fail to load instead of crashing the loop
            continue
        r.encoding = "utf-8"
        if "体育" in r.text:
            with open("e:\\pic\\" + str(x) + ".txt", "w", encoding="utf-8") as fp:
                fp.write(r.text)
                x += 1
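
    The manual "//", "javascript" and "mailto" checks above can also be folded into a single scheme check by normalizing every href with urljoin from the standard library; a sketch assuming the same soup object as above (clean_links is a hypothetical name):

    from urllib.parse import urljoin

    clean_links = []
    for a in soup.find_all("a", href=True):
        href = urljoin("http://www.sohu.com", a["href"].strip())  # resolves //host, /path and relative links
        if href.startswith(("http://", "https://")):              # drops javascript: and mailto: pseudo-links
            clean_links.append(href)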

    3. Send a request through a proxy server

    import requests

    proxy='168.0.86.146:8080'

    # If the proxy requires authentication, just prepend username:password, e.g.:
    # proxy='username:password@124.243.226.18:8888'
    proxies={
        'http':'http://'+proxy,
        'https':'https://'+proxy,
    }
    try:
        response=requests.get('http://httpbin.org/get',proxies=proxies)
        print(response.text)
    except requests.exceptions.ConnectionError as e:
        print("Error",e.args)

    4. The Scrapy crawler framework

    # The Scrapy crawler framework
    '''
    scrapy startproject testman

    1) items.py: declares the fields for the data you want to crawl;
    an item behaves much like a dictionary.

    2) pipelines.py (persists the crawled data): takes the data extracted from
    the pages and writes it somewhere: a json file, a txt file, Excel, a database.

    3) settings.py: wires the pipeline into the spider; it is auto-generated,
    you only need to uncomment the relevant lines, e.g.:
    ITEM_PIPELINES = {'gr.pipelines.GrPipeline': 300,}

    gr: the name of the scraper project

    4) Under the spiders directory, write the parsing logic (pull the data you
    want out of the page and store it in the fields declared in items.py).


    Steps for using the framework (the generated project layout is sketched below):
    1 Create a scrapy project; in any directory, run in cmd: scrapy startproject groad
    2 Generate the spider skeleton; in the project root, run in cmd:
      scrapy genspider newsong www.kugou.com
    3 Write the code
    4 In the project root, run the crawl from cmd: scrapy crawl newsong
    '''
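
    After the two commands above, the project tree looks roughly like this (the standard layout Scrapy generates; the file contents for this example follow in section 5):

    groad/
        scrapy.cfg            # deploy configuration
        groad/
            __init__.py
            items.py          # field declarations
            pipelines.py      # persistence of scraped items
            settings.py       # project settings
            spiders/
                __init__.py
                newsong.py    # created by "scrapy genspider newsong www.kugou.com"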



    5. A Scrapy example: crawl Kugou's newly released songs for the song name, play time, and link

    # contents of items.py
    import scrapy
    
    class GroadItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        # declare the item's fields
        songname=scrapy.Field()   # song name
        songtime=scrapy.Field()   # song play time
        href_song=scrapy.Field()  # song link
    
    
    if __name__=='__main__':
        g=GroadItem()
        g['songname']='test'   # a field must be assigned before it is read;
        print(g['songname'])   # reading an unset field raises KeyError
    
    
    # contents of pipelines.py
    
    
    import json
    class GroadPipeline(object):
        def __init__(self):
            
            # raw string so \t and \n in the path are not treated as escape sequences
            self.filename=open(r'f:\test\newsong.txt','w',encoding='utf-8')
    
        def process_item(self, item, spider):
            text=json.dumps(dict(item),ensure_ascii=False)+'\n'   # one JSON object per line
            self.filename.write(text)
            return item
        
        def close_spider(self,spider):
            self.filename.close()
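
    Reading the output back is symmetric: each line holds one JSON object, so json.loads per line rebuilds the items. A sketch, assuming the same output path as above:

    import json

    with open(r'f:\test\newsong.txt', encoding='utf-8') as fp:
        songs = [json.loads(line) for line in fp if line.strip()]
    print(songs[0]['songname'])  # first scraped song's name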
    
    
    # settings.py: uncomment the following
    
    ITEM_PIPELINES = {
        'groad.pipelines.GroadPipeline': 300,
    }
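
    One assumption worth flagging: newer Scrapy versions generate ROBOTSTXT_OBEY = True in settings.py and silently skip any URL the site's robots.txt disallows. If the spider below yields nothing, this setting is worth checking (disable it only for local experiments, at your own risk):

    ROBOTSTXT_OBEY = False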
    
    
    # contents of newsong.py
    
    
    import scrapy
    from groad.items import GroadItem
    
    class NewsongSpider(scrapy.Spider):
        name = 'newsong'
        allowed_domains = ['www.kugou.com']
        start_urls = ['http://www.kugou.com/']
    
        def parse(self, response):
            item=GroadItem()
            # walk every <ul> tab inside #SongtabContent, then every <li> (one song) inside it
            for i in range(1,len(response.xpath('//*[@id="SongtabContent"]/ul'))+1):
                for j in range(1,len(response.xpath('//*[@id="SongtabContent"]/ul[%s]/li' % i))+1):
                    item['songname']=response.xpath('//*[@id="SongtabContent"]/ul[%s]/li[%s]/a/span[1]/text()' % (i,j)).extract()[0]
                    item['songtime'] =response.xpath('//*[@id="SongtabContent"]/ul[%s]/li[%s]/a/span[@class="songTime"]/text()' % (i, j)).extract()[0]
                    item['href_song'] = response.xpath('//*[@id="SongtabContent"]/ul[%s]/li[%s]/a/@href' % (i, j)).extract()[0]
    
                    yield item
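
    The indexed-XPath loops above work, but iterating the selectors directly is shorter and avoids rebuilding XPath strings on every pass; a sketch of an equivalent parse, assuming the same #SongtabContent markup:

    def parse(self, response):
        for li in response.xpath('//*[@id="SongtabContent"]/ul/li'):
            item = GroadItem()
            item['songname'] = li.xpath('./a/span[1]/text()').extract_first()
            item['songtime'] = li.xpath('./a/span[@class="songTime"]/text()').extract_first()
            item['href_song'] = li.xpath('./a/@href').extract_first()
            yield item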
        