zoukankan      html  css  js  c++  java
  • scrapy实战1分布式爬取有缘网(6.22接口已挂):

    直接上代码:

    items.py

     # -*- coding: utf-8 -*-
     """Item definitions for the youyuanwang Scrapy project."""

     # Define here the models for your scraped items
     #
     # See documentation in:
     # http://doc.scrapy.org/en/latest/topics/items.html

     import scrapy


     class YouyuanwangItem(scrapy.Item):
         """One member profile scraped from a personal home page."""

         # Avatar image URL
         header_url = scrapy.Field()
         # User name
         username = scrapy.Field()
         # Self-introduction ("inner monologue")
         monologue = scrapy.Field()
         # Photo-album image URLs
         pic_urls = scrapy.Field()
         # Place of origin
         place_from = scrapy.Field()
         # Education level
         education = scrapy.Field()
         # Age
         age = scrapy.Field()
         # Height
         height = scrapy.Field()
         # Salary
         salary = scrapy.Field()
         # Hobbies
         hobby = scrapy.Field()
         # Source website ("youyuan")
         source = scrapy.Field()
         # URL of the profile page the item was scraped from
         source_url = scrapy.Field()
         # Name of the spider that produced the item
         spider = scrapy.Field()
    View Code

    spiders > youyuan.py

      # -*- coding: utf-8 -*-
      import scrapy
      from scrapy.linkextractors import LinkExtractor
      from scrapy.spiders import Rule
      from scrapy_redis.spiders import RedisCrawlSpider
      from youyuanwang.items import YouyuanwangItem


      class youyuan(RedisCrawlSpider):
          """Distributed (scrapy-redis) spider for youyuan.com member profiles.

          Start URLs are taken from the Redis key ``redis_key`` instead of a
          ``start_urls`` attribute. An example list page to seed it with:
          http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/
          """

          name = 'youyuan'
          redis_key = 'youyuan:start_urls'

          # To match every region instead, use: r'http://www\.youyuan\.com/find/.+'
          # Search-result pages restricted to Beijing, women aged 18-25.
          # ``\d+`` must keep its backslash: without it the pattern only matches a
          # literal "pd.../" and no pagination link is ever followed.
          page_links = LinkExtractor(
              allow=r"http://www\.youyuan\.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")
          # Personal home pages, e.g. http://www.youyuan.com/<numeric id>-profile/
          profile_page = LinkExtractor(allow=r"http://www\.youyuan\.com/\d+-profile/")

          rules = (
              # List pages are only stepping stones: follow their links, no callback.
              Rule(page_links),
              # Profile pages are parsed by parse_profile_page() and not followed further.
              Rule(profile_page, callback="parse_profile_page", follow=False),
          )

          def __init__(self, *args, **kwargs):
              """Build ``allowed_domains`` from the optional ``domain`` argument.

              ``domain`` is a comma-separated string; empty entries are dropped.
              """
              domain = kwargs.pop('domain', '')
              # A real list (not a one-shot ``filter`` iterator on Python 3) so the
              # value can be iterated and truth-tested repeatedly.
              self.allowed_domains = [d for d in domain.split(',') if d]
              super(youyuan, self).__init__(*args, **kwargs)

          def parse_profile_page(self, response):
              """Extract one YouyuanwangItem from a personal home page response."""
              item = YouyuanwangItem()
              item['header_url'] = self.get_header_url(response)
              item['username'] = self.get_username(response)
              item['place_from'] = self.get_place_from(response)
              item['education'] = self.get_education(response)
              item['age'] = self.get_age(response)
              item['height'] = self.get_height(response)
              item['salary'] = self.get_salary(response)
              item['hobby'] = self.get_hobby(response)
              item['pic_urls'] = self.get_pic_urls(response)
              item['monologue'] = self.get_monologue(response)
              item['source_url'] = response.url
              item['source'] = "youyuan"
              item['spider'] = "youyuan"
              yield item

          def _first_text(self, response, xpath):
              """Return the first match of ``xpath`` stripped, or "" if nothing matches."""
              matches = response.xpath(xpath).extract()
              return matches[0].strip() if matches else ""

          def get_header_url(self, response):
              """Return the avatar image URL."""
              return self._first_text(response, '//dl[@class="personal_cen"]/dt/img/@src')

          def get_username(self, response):
              """Return the user name."""
              return self._first_text(
                  response, '//dl[@class="personal_cen"]/dd//div[@class="main"]/strong/text()')

          def get_age(self, response):
              """Return the second whitespace-separated token of the "local" line.

              Returns "" when the line is missing or has fewer than two tokens
              (the original indexed ``[1]`` unconditionally and could raise IndexError).
              """
              tokens = self._first_text(
                  response, '//dl[@class="personal_cen"]//p[@class="local"]/text()').split()
              return tokens[1] if len(tokens) > 1 else ""

          def get_height(self, response):
              """Return the height field."""
              return self._first_text(
                  response, '//div[@class="pre_data"]/ul/li[2]/div/ol[2]/li[2]/span/text()')

          def get_salary(self, response):
              """Return the salary field."""
              return self._first_text(
                  response, '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[4]/span/text()')

          def get_hobby(self, response):
              """Return all hobbies joined by commas, with spaces removed."""
              hobbies = response.xpath(
                  '//dl[@class="personal_cen"]//ol[@class="hoby"]//li/text()').extract()
              return ",".join(hobbies).replace(" ", "").strip()

          def get_pic_urls(self, response):
              """Return the album image URLs as one comma-separated string."""
              urls = response.xpath('//div[@class="ph_show"]/ul/li/a/img/@src').extract()
              return ",".join(urls)

          def get_monologue(self, response):
              """Return the self-introduction text."""
              return self._first_text(response, '//div[@class="pre_data"]/ul/li/p/text()')

          def get_place_from(self, response):
              """Return the place of origin."""
              return self._first_text(
                  response, '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[1]/span/text()')

          def get_education(self, response):
              """Return the education level."""
              return self._first_text(
                  response, '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[3]/span/text()')
    View Code
      # -*- coding: utf-8 -*-
      # Stand-alone (non-distributed) variant of the spider: a plain CrawlSpider
      # with fixed start_urls instead of a scrapy-redis RedisCrawlSpider.
      import scrapy
      from scrapy.linkextractors import LinkExtractor
      from scrapy.spiders import Rule,CrawlSpider
      from youyuanwang.items import YouyuanwangItem


      class YouyuanSpider(CrawlSpider):
          """Crawl youyuan.com search pages and scrape the linked member profiles."""

          name = 'youyuan'
          allowed_domains = ['www.youyuan.com']
          # First search-result (list) page.
          start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']

          # To match every region instead, use: r'http://www\.youyuan\.com/find/.+'
          # Search-result pages restricted to Beijing, women aged 18-25.
          # ``\d+`` must keep its backslash: without it the pattern only matches a
          # literal "pd.../" and no pagination link is ever followed.
          page_links = LinkExtractor(
              allow=r"http://www\.youyuan\.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")
          # Personal home pages, e.g. http://www.youyuan.com/<numeric id>-profile/
          profile_page = LinkExtractor(allow=r"http://www\.youyuan\.com/\d+-profile/")

          rules = (
              # List pages are only stepping stones: follow their links, no callback.
              Rule(page_links),
              # Profile pages are parsed by parse_profile_page() and not followed further.
              Rule(profile_page, callback="parse_profile_page", follow=False),
          )

          # (xpath, strip) is all most fields need; see _first_text().
          _FIELD_XPATHS = {
              'header_url': '//dl[@class="personal_cen"]/dt/img/@src',
              'username': '//dl[@class="personal_cen"]/dd//div[@class="main"]/strong/text()',
              'height': '//div[@class="pre_data"]/ul/li[2]/div/ol[2]/li[2]/span/text()',
              'salary': '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[4]/span/text()',
              'monologue': '//div[@class="pre_data"]/ul/li/p/text()',
              'place_from': '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[1]/span/text()',
              'education': '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[3]/span/text()',
          }

          def parse_profile_page(self, response):
              """Extract one YouyuanwangItem from a personal home page response."""
              item = YouyuanwangItem()
              item['header_url'] = self.get_header_url(response)
              item['username'] = self.get_username(response)
              item['place_from'] = self.get_place_from(response)
              item['education'] = self.get_education(response)
              item['age'] = self.get_age(response)
              item['height'] = self.get_height(response)
              item['salary'] = self.get_salary(response)
              item['hobby'] = self.get_hobby(response)
              item['pic_urls'] = self.get_pic_urls(response)
              item['monologue'] = self.get_monologue(response)
              item['source_url'] = response.url
              item['source'] = "youyuan"
              item['spider'] = "youyuan"
              yield item

          def _first_text(self, response, xpath):
              """Return the first match of ``xpath`` stripped, or "" if nothing matches."""
              matches = response.xpath(xpath).extract()
              return matches[0].strip() if matches else ""

          def get_header_url(self, response):
              """Return the avatar image URL."""
              return self._first_text(response, self._FIELD_XPATHS['header_url'])

          def get_username(self, response):
              """Return the user name."""
              return self._first_text(response, self._FIELD_XPATHS['username'])

          def get_age(self, response):
              """Return the second whitespace-separated token of the "local" line.

              Returns "" when the line is missing or has fewer than two tokens
              (the original indexed ``[1]`` unconditionally and could raise IndexError).
              """
              tokens = self._first_text(
                  response, '//dl[@class="personal_cen"]//p[@class="local"]/text()').split()
              return tokens[1] if len(tokens) > 1 else ""

          def get_height(self, response):
              """Return the height field."""
              return self._first_text(response, self._FIELD_XPATHS['height'])

          def get_salary(self, response):
              """Return the salary field."""
              return self._first_text(response, self._FIELD_XPATHS['salary'])

          def get_hobby(self, response):
              """Return all hobbies joined by commas, with spaces removed."""
              hobbies = response.xpath(
                  '//dl[@class="personal_cen"]//ol[@class="hoby"]//li/text()').extract()
              return ",".join(hobbies).replace(" ", "").strip()

          def get_pic_urls(self, response):
              """Return the album image URLs as one comma-separated string."""
              urls = response.xpath('//div[@class="ph_show"]/ul/li/a/img/@src').extract()
              return ",".join(urls)

          def get_monologue(self, response):
              """Return the self-introduction text."""
              return self._first_text(response, self._FIELD_XPATHS['monologue'])

          def get_place_from(self, response):
              """Return the place of origin."""
              return self._first_text(response, self._FIELD_XPATHS['place_from'])

          def get_education(self, response):
              """Return the education level."""
              return self._first_text(response, self._FIELD_XPATHS['education'])
    View Code

    pipelines.py

     # -*- coding: utf-8 -*-

     # Define your item pipelines here
     #
     # Don't forget to add your pipeline to the ITEM_PIPELINES setting
     # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

     # Alternative pipeline that dumps every item to a JSON file (kept for reference):
     #
     # import json
     #
     # class YouyuanwangPipeline(object):
     #     def __init__(self):
     #         self.filename=open("youyuanwang.json","wb")
     #     def process_item(self, item, spider):
     #         jsontext=json.dumps(dict(item),ensure_ascii=False) + ",\n"
     #         self.filename.write(jsontext.encode("utf-8"))
     #         return item
     #     def close_spider(self,spider):
     #         self.filename.close()

     import pymysql
     from .models.es_types import YouyuanType


     class XiciPipeline(object):
         """Insert every scraped item as a row of the MySQL table ``youyuanwang``.

         One connection is opened per spider run (open_spider) and closed in
         close_spider, instead of connecting and disconnecting for every item.
         """

         # Fallback connection parameters. A ``DBKWARGS`` dict in the project
         # settings, if present, overrides them key by key.
         # NOTE(review): credentials should not live in source; prefer DBKWARGS.
         DEFAULT_DB_KWARGS = {
             'host': '127.0.0.1',
             'user': 'root',
             'passwd': '229801',
             'db': 'yunyuan',
             'charset': 'utf8',
         }

         # Columns written, in the order of the placeholders in INSERT_SQL.
         FIELDS = ('header_url', 'username', 'monologue', 'pic_urls', 'place_from',
                   'education', 'age', 'height', 'salary', 'hobby', 'source')

         INSERT_SQL = ("insert into youyuanwang(header_url,username,monologue,pic_urls,place_from,education,age,height,salary,hobby,source)"
                       "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")

         def open_spider(self, spider):
             """Open the MySQL connection used for the whole crawl."""
             db_kwargs = dict(self.DEFAULT_DB_KWARGS)
             db_kwargs.update(spider.settings.get('DBKWARGS') or {})
             self.con = pymysql.connect(**db_kwargs)

         def close_spider(self, spider):
             """Close the MySQL connection when the spider finishes."""
             self.con.close()

         def process_item(self, item, spider):
             """Insert ``item`` and commit; roll back and re-raise on failure.

             Returns the item unchanged so later pipelines still receive it.
             """
             cur = self.con.cursor()
             try:
                 cur.execute(self.INSERT_SQL, tuple(item[name] for name in self.FIELDS))
                 self.con.commit()
             except Exception:
                 # Leave the shared connection in a clean state for the next item.
                 self.con.rollback()
                 raise
             finally:
                 cur.close()
             return item


     class ElasticsearchPipeline(object):
         """Save a subset of each item's fields through the ``YouyuanType`` document."""

         # Item fields copied onto the YouyuanType instance.
         FIELDS = ('header_url', 'username', 'age', 'salary',
                   'monologue', 'pic_urls', 'place_from')

         def process_item(self, item, spider):
             """Copy FIELDS from ``item`` to a new YouyuanType, save it, return the item."""
             youyuan = YouyuanType()
             for name in self.FIELDS:
                 setattr(youyuan, name, item[name])

             youyuan.save()

             return item
    View Code

    settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for youyuanwang project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    
    # Name of this Scrapy project (see the header comment of this settings file).
    BOT_NAME = 'youyuanwang'
    
    # Package that holds the project's spiders; per the Scrapy settings docs,
    # NEWSPIDER_MODULE is where newly generated spiders are created.
    SPIDER_MODULES = ['youyuanwang.spiders']
    NEWSPIDER_MODULE = 'youyuanwang.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'youyuanwang (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    #ROBOTSTXT_OBEY = True
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'youyuanwang.middlewares.YouyuanwangSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'youyuanwang.middlewares.MyCustomDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}

    # scrapy-redis: use its duplicate filter and scheduler in place of Scrapy's
    # defaults. These lines must sit at the same indentation as every other
    # setting, otherwise the module fails to import ("unexpected indent").
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    SCHEDULER_PERSIST = True

    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    # NOTE(review): the Redis-to-MongoDB / Redis-to-MySQL scripts read the Redis
    # list "youyuan:items"; that list is presumably filled by
    # scrapy_redis.pipelines.RedisPipeline, so enable it below to use them.
    ITEM_PIPELINES = {
       #'youyuanwang.pipelines.XiciPipeline': 300,
       'youyuanwang.pipelines.ElasticsearchPipeline': 300,
       # 'scrapy_redis.pipelines.RedisPipeline':400,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    View Code

    从redis保存到mongodb 在目录下新建文件process_item_mongo.py(名字随便取)

     #coding=utf-8
     """Move scraped items from the Redis list "youyuan:items" into MongoDB."""

     import json

     import pymongo
     import redis


     def process_item():
         """Pop items from Redis forever and insert each one into MongoDB.

         Items are JSON strings in the Redis list "youyuan:items"; each is decoded
         and stored in collection "beijing_18_25" of database "youyuan".
         The loop never returns; stop the script to end it.
         """
         redis_conn = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
         mongo_conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
         table = mongo_conn["youyuan"]["beijing_18_25"]
         while True:
             # blpop blocks until an element is available and returns (key, value).
             _key, raw = redis_conn.blpop(["youyuan:items"])
             document = json.loads(raw.decode("utf-8"))
             # Collection.insert() is deprecated (removed in PyMongo 4); insert_one()
             # is the supported single-document call.
             table.insert_one(document)


     if __name__ == "__main__":
         process_item()
    View Code

    从redis保存到mysql 在目录下新建文件process_item_mysql.py(名字随便取)

     #coding=utf-8
     """Move scraped items from the Redis list "youyuan:items" into MySQL."""

     import json

     import pymysql
     import redis


     def process_item():
         """Pop items from Redis forever and insert each one into MySQL.

         Items are JSON strings in the Redis list "youyuan:items"; each becomes one
         row of table ``youyuanwang``. The connection stays open for the whole loop
         and is closed only when the loop is left (error or interrupt); the
         original closed it after the first row, so the second insert failed.
         """
         redis_conn = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
         # charset matches the one used by the MySQL item pipeline so that
         # non-ASCII text is stored consistently.
         mysql_conn = pymysql.connect(host='127.0.0.1', user='root', passwd='229801',
                                      port=3306, db='yunyuan', charset='utf8')
         sql = ("insert into youyuanwang(header_url,username,monologue,pic_urls,place_from,education,age,height,salary,hobby,source)"
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
         # Keys read from each item, in the order of the placeholders in ``sql``.
         fields = ('header_url', 'username', 'monologue', 'pic_urls', 'place_from',
                   'education', 'age', 'height', 'salary', 'hobby', 'source')
         try:
             while True:
                 # blpop blocks until an element is available and returns (key, value).
                 _key, raw = redis_conn.blpop("youyuan:items")
                 data = json.loads(raw.decode("utf-8"))
                 cur = mysql_conn.cursor()
                 try:
                     cur.execute(sql, tuple(data[name] for name in fields))
                     mysql_conn.commit()
                 finally:
                     cur.close()
         finally:
             mysql_conn.close()


     # The guard must be at module level; nested inside the function it never ran.
     if __name__ == "__main__":
         process_item()
    View Code

     数据:

    申明:以上只限于参考学习交流!!! 更多:https://github.com/huwei86/Spideryouyuanwang

  • 相关阅读:
    Java多线程同步和异步问题
    最优二叉查找树
    岛屿的周长
    Mac 环境下 go 国内代理配置
    岛屿数量
    字符串解码
    环形链表 II
    颜色分类
    无重复字符的最长子串
    完全平方数
  • 原文地址:https://www.cnblogs.com/huwei934/p/6985380.html
Copyright © 2011-2022 走看看