  • Scrapy: crawling sites that require login/registration

    Method 1: log in with a username and password, then crawl (anything like an image CAPTCHA on the login page needs extra handling)

    #!/usr/bin/env python2
    # -*- coding: utf-8 -*-
    
    from bs4 import BeautifulSoup
    from scrapy.http import Request, FormRequest
    from spider_test.items import *
    from scrapy.spiders import CrawlSpider
    from spider_test import settings
    
    class ScrapyTestSpider(CrawlSpider):
    
        name = "spider_test"
        allowed_domains = [settings.SPIDER_DOMAIN]    # custom setting defined in settings.py, its value looks like www.demo.com
    
        def start_requests(self):
            """第一次请求一下登录页面,设置开启cookie使其得到cookie,设置回调函数"""
            yield Request('http://%s/admin/account/login.html' % settings.SPIDER_DOMAIN, meta={'cookiejar': 1}, callback=self.parse)
    
        def parse(self, response):
            data = dict(username="xiaoming",    # account field of the login form
                        password="888888")      # password field of the login form
    
            print('Logging in....')
            """Second request: a form POST carrying the cookie, browser user agent and login credentials, so the cookie gets authorized."""
            yield FormRequest(url='http://%s/admin/account/dologin.html' % settings.SPIDER_DOMAIN,  # the actual POST login endpoint
                              meta={'cookiejar': 1},
                              formdata=data,
                              callback=self.jump_list)
    
        def jump_list(self, response):
            print('Requesting a page that requires login....')
            yield Request('http://%s/admin/office/getofficelist.html' % settings.SPIDER_DOMAIN,
                          meta={'cookiejar': 1}, callback=self.parser_list)
    
        def parser_list(self, response):
            soup = BeautifulSoup(response.body, 'html.parser')
            pagination = soup.find(attrs={'class': 'pagination'})    # guard: the pagination block may be missing
            page_list = pagination.find_all('a') if pagination else []
            if page_list:
                for page in page_list:
                    page_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, page.get('href'))
                    yield Request(page_url, meta={'cookiejar': 1}, callback=self.parser_list)
    
            office_list = soup.find_all('a', attrs={'class': 'ui-office-list'})
            if office_list:
                for office in office_list:
                    office_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, office.attrs['href'])
                    yield Request(office_url, meta={'cookiejar': 1}, callback=self.parse_article)
    
        def parse_article(self, response):
            test_item = SpiderTestItem()
            soup = BeautifulSoup(response.body, 'html.parser')
            container = soup.find('table', attrs={'class': 'index-statistics-table'})
            test_item['source_url'] = response.url
            test_item['title'] = soup.title.get_text()
            test_item['article_content'] = container.prettify()
            return test_item
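
    The spider above references SpiderTestItem and settings.SPIDER_DOMAIN without showing them. As a minimal sketch (the field names are inferred from the spider code, not taken from the original post), spider_test/items.py and settings.py might look roughly like this:

    # spider_test/items.py  (hypothetical; field names inferred from the spider above)
    import scrapy

    class SpiderTestItem(scrapy.Item):
        source_url = scrapy.Field()       # URL the page was fetched from
        title = scrapy.Field()            # text of the page <title>
        article_content = scrapy.Field()  # prettified HTML of the target table

    # spider_test/settings.py  (add the custom domain setting the spider reads)
    SPIDER_DOMAIN = 'www.demo.com'        # replace with the real site domain

    With those in place the spider runs as usual, e.g. scrapy crawl spider_test -o offices.json. The meta={'cookiejar': 1} mechanism relies on Scrapy's cookie middleware, which is enabled by default (COOKIES_ENABLED = True).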

    Method 2: crawl with the cookies of a session that is already logged in (this sidesteps image CAPTCHAs)

    #!/usr/bin/env python2
    # -*- coding: utf-8 -*-
    
    from bs4 import BeautifulSoup
    from scrapy.http import Request
    from spider_test.items import *
    from scrapy.spiders import CrawlSpider
    from spider_test import settings
    
    class ScrapyTestSpider(CrawlSpider):
    
        name = "spider_test"
        allowed_domains = [settings.SPIDER_DOMAIN]
    
        cookies = dict(PHPSESSID='qwertyuiopasdfghjklzxcvbnm')  # session ID obtained after logging in manually
    
        def start_requests(self):
            print('Requesting a page that requires login....')
            yield Request('http://%s/admin/office/getofficelist.html' % settings.SPIDER_DOMAIN, cookies=self.cookies,
                        callback=self.parser_list)
    
        def parser_list(self, response):
            soup = BeautifulSoup(response.body, 'html.parser')
            pagination = soup.find(attrs={'class': 'pagination'})    # guard: the pagination block may be missing
            page_list = pagination.find_all('a') if pagination else []
            if page_list:
                for page in page_list:
                    page_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, page.get('href'))
                    yield Request(page_url, cookies=self.cookies, callback=self.parser_list)
    
            office_list = soup.find_all('a', attrs={'class': 'ui-office-list'})
            if office_list:
                for office in office_list:
                    office_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, office.attrs['href'])
                    yield Request(office_url, cookies=self.cookies, callback=self.parse_article)
    
        def parse_article(self, response):
            test_item = SpiderTestItem()
            soup = BeautifulSoup(response.body, 'html.parser')
            container = soup.find('table', attrs={'class': 'index-statistics-table'})
            test_item['source_url'] = response.url
            test_item['title'] = soup.title.get_text()
            test_item['article_content'] = container.prettify()
            return test_item
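
    For method 2 the PHPSESSID value is usually copied from the browser's developer tools (or any HTTP client) after logging in manually. A small hypothetical helper, not part of the original post, that turns a raw Cookie header string into the dict that Request(cookies=...) expects:

    # hypothetical helper: convert a raw "Cookie:" header copied from the browser,
    # e.g. "PHPSESSID=abc123; theme=dark", into a dict for Request(cookies=...)
    def cookies_from_header(raw_header):
        cookies = {}
        for pair in raw_header.split(';'):
            name, _, value = pair.strip().partition('=')
            if name:
                cookies[name] = value
        return cookies

    # usage:
    # cookies = cookies_from_header('PHPSESSID=qwertyuiopasdfghjklzxcvbnm')

    A session cookie obtained this way is only valid as long as the server-side session, so the value has to be refreshed whenever it expires.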
  • Original article: https://www.cnblogs.com/funsion/p/12623276.html