  • Simple usage of Python Scrapy

    A recent project required a crawler to scrape a question bank. Until now I had written my crawlers in Node or PHP, but I kept hearing that Python is well suited to crawling, so I picked up Scrapy, Python's crawler framework.

    Below is a brief walkthrough of Scrapy's project layout and usage.

    First, install the Scrapy framework:

    pip install scrapy
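
    If the install succeeded, the scrapy command-line tool is available and can report its version (the exact output depends on your environment):

    scrapy version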

    Next, use the scrapy command to create a crawler project:

    scrapy startproject questions
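
    For reference, startproject generates a skeleton roughly like the one below (the exact file set may differ slightly between Scrapy versions); xueersi.py is the spider file we add ourselves later:

    questions/
        scrapy.cfg
        questions/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                xueersi.py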

    A quick tour of the relevant files:

    • scrapy.cfg: the project's configuration file
    • questions/: the project's Python module; this is where your code goes.
    • questions/items.py: the project's item definitions.
    • questions/pipelines.py: the project's item pipelines.
    • questions/settings.py: the project's settings.
    • questions/spiders/: the directory that holds the spider code.
    • questions/spiders/xueersi.py: the main spider implementation.

    xueersi.py: the main spider code

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    from questions.items import QuestionsItem
    
    
    class xueersiSpider(scrapy.Spider):
        name = "xueersi"  # spider name
        allowed_domains = ["tiku.xueersi.com"]  # domains the spider is allowed to crawl
        # URLs the crawl starts from
        start_urls = [
            "http://tiku.xueersi.com/shiti/list_1_1_0_0_4_0_1",
            "http://tiku.xueersi.com/shiti/list_1_2_0_0_4_0_1",
            "http://tiku.xueersi.com/shiti/list_1_3_0_0_4_0_1",
        ]
        levels = ['偏易', '中档', '偏难']
        subjects = ['英语', '语文', '数学']
    
        # Called automatically when the crawl starts; if it is not defined,
        # parse() is used as the default callback instead.
        # def start_requests(self):
        #     yield scrapy.Request('http://tiku.xueersi.com/shiti/list_1_2_0_0_4_0_39', callback=self.getquestion)
    
        # parse() is called automatically when start_requests() is not defined
        def parse(self, response):
            # XPath selector syntax is not covered here; see the official docs
            arr = response.xpath("//ul[@class='pagination']/li/a/text()").extract()
            total_page = arr[3]
            # walk every page of the listing
            for index in range(int(total_page)):
                # issue a new request for each listing page to collect its questions
                yield scrapy.Request(response.url.replace('_0_0_4_0_1', "_0_0_4_0_" + str(index)),
                                     callback=self.getquestion)
    
        # extract the questions from a listing page
        def getquestion(self, response):
            for res in response.xpath('//div[@class="main-wrap"]/ul[@class="items"]/li'):
                item = QuestionsItem()  # instantiate the Item class
                # extract the question body
                questions = res.xpath('./div[@class="content-area"]').re(
                    r'<div class="content-area">?([\s\S]+?)<(table|/td|div|br)')
                if len(questions):
                    question = questions[0].strip()
                    item['source'] = question
                    dr = re.compile(r'<[^>]+>', re.S)
                    question = dr.sub('', question)
                    content = res.extract()
                    item['content'] = question
                    # extract the subject
                    subject = re.findall(r'http://tiku.xueersi.com/shiti/list_1_(\d+)', response.url)
                    item['subject'] = self.subjects[int(subject[0]) - 1]
                    # extract the difficulty level
                    levels = res.xpath('//div[@class="info"]').re(r'难度:([\s\S]+?)<')
                    item['level'] = self.levels.index(levels[0]) + 1
                    # extract the answer options
                    options = re.findall(r'[A-D][..]([\s\S]+?)<(/td|/p|br)', content)
                    item['options'] = options
                    if len(options):
                        url = res.xpath('./div[@class="info"]/a/@href').extract()[0]
                        request = scrapy.Request(url, callback=self.getanswer)
                        request.meta['item'] = item  # stash the item so the answer callback can keep filling it
                        yield request
    
        # extract the answer
        def getanswer(self, response):
            res = response.xpath('//div[@class="part"]').re(r'<td>([\s\S]+?)</td>')
            con = re.findall(r'([\s\S]+?)<br>[\s\S]+?([A-D])', res[0])  # answers that come with an analysis
            if con:
                answer = con[0][1]
                analysis = con[0][0]  # the analysis text
            else:
                answer = res[0]
                analysis = ''
            if answer:
                item = response.meta['item']  # retrieve the item stashed by getquestion
                item['answer'] = answer.strip()
                item['analysis'] = analysis.strip()
                item['answer_url'] = response.url
                yield item  # hand the item to the pipeline (pipelines.py)
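
    When tweaking the XPath expressions above, Scrapy's interactive shell is handy for trying selectors against a live page before baking them into the spider (the URL below is simply one of the start URLs; the site's markup may have changed since this was written):

    scrapy shell "http://tiku.xueersi.com/shiti/list_1_1_0_0_4_0_1"
    >>> response.xpath("//ul[@class='pagination']/li/a/text()").extract()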
    
    

    items.py: the item (data structure) definitions:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class QuestionsItem(scrapy.Item):
        content = scrapy.Field()
        subject = scrapy.Field()
        level = scrapy.Field()
        answer = scrapy.Field()
        options = scrapy.Field()
        analysis = scrapy.Field()
        source = scrapy.Field()
        answer_url = scrapy.Field()
        pass

    pipelines.py: the item pipeline (in this example the scraped data is written to a local MySQL database):

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import pymysql
    import hashlib
    
    
    class QuestionsPipeline(object):
        def __init__(self):
            # open the database connection
            self.connect = pymysql.connect('localhost', 'root', '', 'question', use_unicode=True, charset='utf8')
            # get a cursor
            self.cursor = self.connect.cursor()
            print("connecting mysql success!")
            self.answer = ['A', 'B', 'C', 'D']
    
        def process_item(self, item, spider):
            content = pymysql.escape_string(item['content'])
            # hash the question body; the hash column is used to filter out duplicate questions
            m1 = hashlib.md5()
            m1.update(content.encode('utf-8'))
            hash = m1.hexdigest()
            selectstr = "select id from question where hash='%s'" % (hash)
            self.cursor.execute(selectstr)
            res = self.cursor.fetchone()
            # skip questions that are already stored
            if not res:
                # insert the question
                sqlstr = ("insert into question(content,source,subject,level,answer,analysis,hash,answer_url) "
                          "VALUES('%s','%s','%s','%s','%s','%s','%s','%s')"
                          % (content, pymysql.escape_string(item['source']), item['subject'], item['level'],
                             item['answer'], pymysql.escape_string(item['analysis']), hash, item['answer_url']))
                self.cursor.execute(sqlstr)
                qid = self.cursor.lastrowid
                # insert the options
                for index in range(len(item['options'])):
                    option = item['options'][index]
                    answer = self.answer.index(item['answer'])
                    if answer == index:
                        ans = '2'
                    else:
                        ans = '1'
                    sqlstr = ("insert into options(content,qid,answer) VALUES('%s','%s','%s')"
                              % (pymysql.escape_string(option[0]), qid, ans))
                    self.cursor.execute(sqlstr)
                self.connect.commit()
                # self.connect.close()
            return item
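
    The template comment above is worth heeding: the pipeline only runs if it is registered in settings.py. A minimal registration looks like this (300 is just a priority between 0 and 1000 that orders pipelines when there are several):

    # settings.py
    ITEM_PIPELINES = {
        'questions.pipelines.QuestionsPipeline': 300,
    }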

    Once the spider is built, run the following from the project root:

    scrapy crawl xueersi  # scrapy crawl <spider name>
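
    Before wiring up the MySQL pipeline, it can be convenient to dump the scraped items to a file with Scrapy's built-in feed export and inspect the result first (the output file name here is arbitrary):

    scrapy crawl xueersi -o questions.json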
  • Original article: https://www.cnblogs.com/dudeyouth/p/8795409.html