  • Scrapy crawler framework

    Scraping Baidu Tieba posts with the Scrapy crawler framework and saving them to a database

    Reference: http://cuiqingcai.com/3472.html/3

    scrapySpiders.mysqlpipelines.pipelines.ScrapyPipelines
    # -*- coding: utf-8 -*-
    __author__ = 'mmc'
    
    from sql import Sql
    from scrapySpiders.items import ScrapyspidersItem
    
    
    class ScrapyPipelines(object):
        def process_item(self, item, spider):
            # called once for every item the spider yields
            if isinstance(item, ScrapyspidersItem):
                print "start save data in database"
                tb_name = item['name']
                tb_content = item['content']
                Sql.insert_content(tb_name, tb_content)
            return item
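
    The pipeline and the spider both import ScrapyspidersItem. The original items.py is not shown in the post; a minimal sketch consistent with the two fields the code uses would be:

    # items.py -- a sketch, assuming only the two fields used above
    import scrapy


    class ScrapyspidersItem(scrapy.Item):
        name = scrapy.Field()
        content = scrapy.Field()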

    Enable the pipeline in settings.py and the scraped data is stored automatically.
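
    A minimal sketch of that setting, assuming the dotted path matches the project layout shown above:

    # settings.py
    ITEM_PIPELINES = {
        'scrapySpiders.mysqlpipelines.pipelines.ScrapyPipelines': 300,
    }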

    spiders

    # -*- coding: utf-8 -*-
    __author__ = 'mmc'
    
    import scrapy
    from bs4 import BeautifulSoup
    from scrapy.http import Request
    from scrapySpiders.items import ScrapyspidersItem
    
    
    class Myspiders(scrapy.Spider):
    
        name = 'TieBa'
        base_url = 'https://tieba.baidu.com/p/'
    
        def start_requests(self):
            url = self.base_url + '4420237089' + '?see_lz=1'  # see_lz=1 keeps only the original poster's posts
            yield Request(url, callback=self.parse)
    
        def parse(self, response):
            # the last <span> inside li.l_reply_num holds the thread's total page count
            max_num = BeautifulSoup(response.text, 'lxml').find('li', class_='l_reply_num').find_all('span')[-1].get_text()
            print max_num
            baseurl = str(response.url)
            for num in range(1, int(max_num) + 1):
                url = baseurl + '&pn=' + str(num)
                yield Request(url, callback=self.get_name)
    
        def get_name(self, response):
            soup = BeautifulSoup(response.text, 'lxml')
            # post bodies, and the "tail-info" spans (author / floor / time);
            # every third tail-info span carries the field paired with a post
            tmp = soup.find_all("div", {"class": "d_post_content j_d_post_content "})
            vmp = soup.find_all("span", {"class": "tail-info"})
            for val, f in zip(tmp, vmp[1:-1:3]):
                item = ScrapyspidersItem()
                item['name'] = val.get_text().strip()
                item['content'] = f.get_text().strip()
                print item
                # yield each item; the pipeline enabled in settings.py stores it,
                # so there is no need to call ScrapyPipelines by hand
                yield item
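
    With the pipeline enabled, run the spider from the project root with the standard Scrapy CLI:

    scrapy crawl TieBa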

    sql.py

    # -*- coding: utf-8 -*-
    __author__ = 'mmc'
    
    import MySQLdb
    
    
    db_para = {'host': '127.0.0.1',
               'port': 3306,
               'user': 'mmc',
               'passwd': 'mmc123456',
               'db': 'test'}
    dbcon = MySQLdb.connect(**db_para)
    cur = dbcon.cursor()
    # If you hit "UnicodeEncodeError: 'latin-1' codec can't encode character",
    # force the connection to UTF-8:
    dbcon.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')

    class Sql:
        @classmethod
        def insert_content(cls, tb_name, tb_content):
            # parameter order in VALUES must match the column list
            sql = """INSERT INTO `test`.`tb_name` (`tb_name`, `tb_content`)
                     VALUES (%(tb_name)s, %(tb_content)s)"""
            value = {'tb_name': tb_name, 'tb_content': tb_content}
            cur.execute(sql, value)
            dbcon.commit()
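
    The INSERT assumes a table literally named tb_name already exists in the test database. The original schema is not shown; a one-time setup sketch consistent with the statement above (column types are assumptions):

    # run once to create the target table -- a sketch, not the original schema
    cur.execute("""
        CREATE TABLE IF NOT EXISTS `test`.`tb_name` (
            `id` INT AUTO_INCREMENT PRIMARY KEY,
            `tb_name` VARCHAR(255),
            `tb_content` TEXT
        ) DEFAULT CHARSET=utf8
    """)
    dbcon.commit()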

    Scrapy query syntax:

    When crawling a large number of pages, writing regular expressions by hand is tedious and time-consuming. Fortunately, Scrapy has built-in support for a simpler query syntax that pulls the tags, tag contents, and tag attributes we need out of the HTML. The forms are introduced one by one below, with a minimal sketch after the list:

    • Select a tag anywhere among the descendants (taking div as the example): //div
    • Select a tag among the direct children: /div
    • Select tags carrying a given class attribute: //div[@class='c1'], i.e. any descendant div whose class is 'c1'
    • Select tags with class='c1' and a custom attribute name='alex': //div[@class='c1'][@name='alex']
    • Select a tag's text content: //div/span/text(), i.e. the text inside span tags under descendant divs
    • Select an attribute's value (e.g. the href of a tags): //a/@href
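
    A quick illustration of these forms, as a minimal sketch (it assumes a Scrapy version that provides scrapy.selector.Selector(text=...)):

    from scrapy.selector import Selector

    html = '<div class="c1" name="alex"><span>hi</span><a href="http://example.com">link</a></div>'
    sel = Selector(text=html)
    print sel.xpath('//div[@class="c1"]/span/text()').extract()    # ['hi']
    print sel.xpath('//div[@class="c1"][@name="alex"]').extract()  # the matching div
    print sel.xpath('//a/@href').extract()                         # ['http://example.com']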

    A fuller example:

    
    

    import os
    import re
    import urllib

    # HtmlXPathSelector is the legacy pre-1.0 Scrapy API used in this example;
    # in modern Scrapy you would call response.xpath(...) directly
    from scrapy.selector import HtmlXPathSelector

    def parse(self, response):
        # Analyze the page: find the content that matches our rules
        # (the photos) and save it, then follow the page's <a> tags,
        # working down level by level.

        hxs = HtmlXPathSelector(response)  # create the query object

        # Only handle on-site list pages such as http://www.xiaohuar.com/list-1-3.html
        if re.match(r'http://www\.xiaohuar\.com/list-1-\d+\.html', response.url):
            # write the target in the Scrapy query syntax described above
            items = hxs.select('//div[@class="item_list infinite_scroll"]/div')
            for i in range(1, len(items) + 1):  # XPath positions are 1-based
                # src attribute of the img tag, i.e. the photo address
                src = hxs.select('//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/a/img/@src' % i).extract()
                # text content of the span, i.e. the name
                name = hxs.select('//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/span/text()' % i).extract()
                # the school
                school = hxs.select('//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/div[@class="btns"]/a/text()' % i).extract()
                if src:
                    ab_src = "http://www.xiaohuar.com" + src[0]  # join the relative path
                    # Python 2 strings here are unicode, so encode file names as utf-8
                    file_name = "%s_%s.jpg" % (school[0].encode('utf-8'), name[0].encode('utf-8'))
                    file_path = os.path.join("/Users/wupeiqi/PycharmProjects/beauty/pic", file_name)
                    urllib.urlretrieve(ab_src, file_path)
  • Original post: https://www.cnblogs.com/cmm2016/p/6964559.html