  • scrapy

    # -*- coding: utf-8 -*-
    __author__ = 'Administrator'
    import scrapy

    class QuoteSpider(scrapy.Spider):
        name = 'poxiao'
        start_urls = ['https://www.poxiao.com/type/movie/']

        def parse(self, response):  # parse() is Scrapy's default callback
            quotes = response.xpath('//li/h3')  # one <h3> node per listed movie
            for quote in quotes:
                yield {
                    'name': quote.xpath('./a/text()').extract_first(),
                    # absolute URL of the movie's detail page
                    'author': 'https://www.poxiao.com' + quote.xpath('./a/@href').extract_first()
                }
            # follow the "next page" link once per page, not once per item
            next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
            if next_page:
                yield response.follow(next_page, self.parse)

    Use Scrapy to crawl the link addresses on a page
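
    The XPath expressions can be checked without running the spider by feeding sample markup to Scrapy's Selector. A minimal sketch; the HTML snippet below is invented for illustration:

    from scrapy import Selector

    html = '<li><h3><a href="/movie/abc.html">Example Movie</a></h3></li>'
    sel = Selector(text=html)
    for quote in sel.xpath('//li/h3'):
        print(quote.xpath('./a/text()').extract_first())  # -> Example Movie
        print(quote.xpath('./a/@href').extract_first())   # -> /movie/abc.html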

    scrapy runspider ***.py                    run the spider script directly (no project needed)

    scrapy runspider ***.py -o aa.json         save the scraped items as a JSON file

    scrapy runspider ***.py -o aa.csv -t csv   save as a CSV file (which Excel can open)
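
    Recent Scrapy versions infer the output format from the -o file extension, so -t csv is optional there; exports can also be configured in the project's settings.py instead. A sketch, assuming Scrapy 2.1 or newer:

    # settings.py -- same exports as the -o flags above
    FEEDS = {
        'aa.json': {'format': 'json', 'encoding': 'utf8'},
        'aa.csv': {'format': 'csv'},
    }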

    # -*- coding: utf-8 -*-
    import scrapy


    class MovieSpider(scrapy.Spider):
        name = 'movie'
        allowed_domains = ['poxiao.com']
        start_urls = ['https://www.poxiao.com/type/movie/index_2.html',
                      'https://www.poxiao.com/type/movie/index_3.html']

        def parse(self, response):
            # '.../index_2.html' -> 'index_2'
            filename = response.url.split('/')[-1].split('.')[-2]
            with open(filename, 'wb') as f:
                f.write(response.body)

    Crawl and save the raw HTML source of each page
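
    A quick check of the filename logic in parse(), using one of the start URLs above:

    url = 'https://www.poxiao.com/type/movie/index_2.html'
    filename = url.split('/')[-1].split('.')[-2]
    print(filename)  # -> index_2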

    # -*- coding: utf-8 -*-
    import scrapy
    from meiju.items import MeijuItem


    class Mj100Spider(scrapy.Spider):
        name = 'mj100'
        allowed_domains = ['meijutt.com']
        start_urls = ['https://www.meijutt.com/new100.html']

        def parse(self, response):
            movies = response.xpath('//h5/a')
            for each_movie in movies:
                item = MeijuItem()
                item['name'] = each_movie.xpath('./text()').extract_first()
                yield item


    # pipelines.py
    class MeijuPipeline(object):
        def process_item(self, item, spider):
            with open('my_meiju.txt', 'a') as fp:
                fp.write(item['name'] + '\n')
            return item


    # items.py
    class MeijuItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        name = scrapy.Field()

    Meiju top-100 example. Note: you also need to enable the pipeline in settings.py by uncommenting the ITEM_PIPELINES entry (the one with priority 300), as shown below.
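
    A sketch of that settings.py entry, assuming the project is named meiju:

    # settings.py -- enable the pipeline; 300 is its priority (lower runs earlier)
    ITEM_PIPELINES = {
        'meiju.pipelines.MeijuPipeline': 300,
    }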

    # -*- coding: utf-8 -*-
    import scrapy
    from poxiao.items import PoxiaoItem


    class NameSpider(scrapy.Spider):
        name = 'name'
        allowed_domains = ['poxiao.com']
        start_urls = ['https://www.poxiao.com/type/movie/']

        def parse(self, response):
            movie = response.xpath('//div[@class="gkpic"]//img')
            for i in movie:
                item = PoxiaoItem()
                item['src'] = i.xpath('./@src').extract_first()
                item['name'] = i.xpath('./@alt').extract_first()
                yield item
            # follow pagination once per page, not once per image
            next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
            if next_page:
                yield response.follow("https://www.poxiao.com" + next_page, self.parse)

    The first small spider

    # pipelines.py
    import os
    import requests


    class PoxiaoPipeline(object):
        def process_item(self, item, spider):
            # save each poster under d:\untitled1\poxiao, named after the movie
            filename = os.path.join(r"d:\untitled1\poxiao", item['name'] + '.jpg')
            with open(filename, 'wb') as f:
                f.write(requests.get(item['src']).content)
            return item
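
    A design note: process_item fetches each image synchronously with requests, which blocks the crawl while downloading. Scrapy's built-in ImagesPipeline downloads through the crawler instead; a minimal sketch of enabling it (it expects image_urls/images fields on the item and needs Pillow installed):

    # settings.py -- alternative to the requests-based pipeline above
    ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
    IMAGES_STORE = r'd:\untitled1\poxiao'  # assumed target folder, as above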
  • Original article: https://www.cnblogs.com/xupanfeng/p/11765545.html