zoukankan      html  css  js  c++  java
  • 123123

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2017-07-27 15:08:01
    # Project: 36_ke

    from pyspider.libs.base_handler import *
    from msxflibs.pyspider.public.database.tomysql import ToMysql
    from msxflibs.pyspider.projects.newmedia.images import extract_img_url
    from datetime import datetime
    import hashlib
    import time
    import json


    # 36氪
    class Handler(BaseHandler):
        """pyspider handler that crawls the 36Kr article columns.

        Flow: ``on_start`` requests the search API of every entry in
        ``columns``; ``index_page`` schedules one crawl per article id found in
        the JSON reply; ``detail_page`` builds the result row for an article
        page; ``on_result`` writes that row to the ``web_page_content`` table.
        """

        # Default request headers, applied to every crawl through crawl_config.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"
        }

        # Headers for the column search-API requests. on_start adds a
        # per-column "Referer" to a copy of this dict.
        ajax_headers = {
            "Host": "36kr.com",
            "Connection": "keep-alive",
            "Accept": "*/*",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            # "Referer": "http://jingji.cctv.com/caijing/index.shtml",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

        crawl_config = {
            "headers": headers,
            # NOTE(review): pyspider appears to interpret `timeout` in seconds,
            # which would make this many hours; 60 may have been intended.
            # Value left unchanged -- confirm before editing.
            "timeout": 60000
        }

        # Site name stored in the "web_name" field of every result.
        web_name = "36氪"

        #newsflashes_url = "http://36kr.com/newsflashes"

        # Column name -> [search-API URL prefix, search-page URL prefix].
        # on_start appends a timestamp to each prefix.
        columns = {
            "明星公司": ["http://36kr.com/api/search/articles/%E6%98%8E%E6%98%9F%E5%85%AC%E5%8F%B8?page=1&pageSize=40&_=",
                     "http://36kr.com/search/articles/%E6%98%8E%E6%98%9F%E5%85%AC%E5%8F%B8?page=1&ts="],
            "行业新闻": ["http://36kr.com/api/search/articles/%E8%A1%8C%E4%B8%9A%E6%96%B0%E9%97%BB?page=1&pageSize=40&_=",
                     "http://36kr.com/search/articles/%E8%A1%8C%E4%B8%9A%E6%96%B0%E9%97%BB?page=1&ts="],
            "行业研究": ["http://36kr.com/api/search/articles/%E8%A1%8C%E4%B8%9A%E7%A0%94%E7%A9%B6?page=1&pageSize=40&_=",
                     "http://36kr.com/search/articles/%E8%A1%8C%E4%B8%9A%E7%A0%94%E7%A9%B6?page=1&ts="]
        }

        # strptime format used for the article's "published_at" value.
        datetime_format_to_space_s = '%Y-%m-%d %H:%M:%S'

        @every(minutes=60)
        def on_start(self):
            """Queue the search-API request of every column in ``columns``.

            The current time is appended to both URL prefixes: milliseconds for
            the API URL, and seconds minus three for the page URL that is sent
            as the ``Referer`` header.
            """
            #self.crawl(self.newsflashes_url, callback=self.parse_newsflashes_page)
            for column in self.columns:
                urls = self.columns[column]
                # Sample the clock once so both timestamps describe the same instant.
                now = time.time()
                ajax_url = urls[0] + str(int(now * 1000))
                page_url = urls[1] + str(int(now) - 3)
                # Build a per-request copy instead of writing the Referer into
                # the class-level dict that every request shares.
                request_headers = dict(self.ajax_headers, Referer=page_url)
                self.crawl(ajax_url, headers=request_headers, callback=self.index_page)

        # Disabled newsflash crawling, kept for reference.
        # @config(age=60 * 60, priority=6)
        # def parse_newsflashes_page(self, response):
        #     props = self.get_script_props(response)
        #     if props is None:
        #         return
        #     newsflash_list = props.get('newsflashList|newsflash')
        #     for newsflash in newsflash_list:
        #         newsflash_r = self.get_newsflash_result(newsflash)
        #         self.on_result(newsflash_r)

        # def get_newsflash_result(self, newsflash):
        #     url = 'http://36kr.com/newsflashes?column_id=' + (newsflash.get('column_id') or '') + '&id=' + (newsflash.get('id') or '')
        #     publish_time_str = newsflash.get('published_at')
        #     publish_time = datetime.strptime(publish_time_str, self.datetime_format_to_space_s)
        #     return {
        #         "url": url,
        #         "url_hash_code": hashlib.sha256(url).hexdigest(),
        #         "title": newsflash.get('title') or '',
        #         "keywords": '',
        #         "description": '',
        #         "publish_time": publish_time or '',
        #         "article_resouce": '36氪 7x24 快讯',
        #         "article_resouce_link": newsflash.get('news_url') or '',
        #         "content": newsflash.get('description') or '',
        #         "gmt_create_time": datetime.now(),
        #         "gmt_update_time": datetime.now(),
        #         "web_name": self.web_name,
        #         "article_type": 1,
        #         "image_url": '',
        #         "is_image_inside": False
        #     }

        def get_script_props(self, response):
            """Return the JSON value assigned to ``var props=`` in a page script.

            Scans every ``<script>`` element for text that starts with
            ``var props=`` and parses the span from the first ``{`` up to the
            ``,locationnal=`` marker.

            :param response: fetched page; ``response.doc`` is queried for scripts.
            :return: the decoded value, or ``None`` when no script qualifies.
            """
            for script in response.doc("script").items():
                # Work on the text as returned: prefix test, find() and
                # json.loads() all accept it without a byte round trip.
                script_text = (script.text() or '').strip()
                if not script_text.startswith('var props='):
                    continue
                start = script_text.find('{')
                if start == -1:
                    continue
                end = script_text.find(',locationnal=', start)
                if end == -1:
                    continue
                try:
                    props = json.loads(script_text[start:end])
                except ValueError:
                    # Empty or malformed payload: try the remaining scripts
                    # instead of aborting the whole callback.
                    continue
                if props is not None:
                    return props
            return None

        @config(age=60 * 60, priority=6)
        def index_page(self, response):
            """Schedule a detail crawl for every article in a search-API reply.

            Reads ``response.json["data"]["data"]`` as the list of article
            records and crawls ``http://36kr.com/p/<id>.html`` for each record
            that has an id, passing the record's ``img`` value along in ``save``.
            """
            print(response.url)
            payload = response.json
            print('data_dict: ' + ((payload and str(payload)) or 'None'))
            # Guard each level so an unexpected reply shape ends the callback
            # quietly instead of raising AttributeError.
            data = payload.get(u'data') if isinstance(payload, dict) else None
            articles = data.get(u'data') if isinstance(data, dict) else None
            if not articles:
                return
            for article in articles:
                raw_id = article.get(u'id')
                if raw_id is None:
                    continue
                article_id = str(raw_id).strip()
                if not article_id:
                    continue
                article_url = "http://36kr.com/p/" + article_id + ".html"
                self.crawl(article_url, save={'article_image': article.get(u'img') or ''},
                           callback=self.detail_page)

        @config(age=60 * 60, priority=10)
        def detail_page(self, response):
            """Build the result row for one article page.

            :param response: fetched article page.
            :return: dict handed to ``on_result``, or ``None`` when the page
                carries no usable ``props`` script or article record.
            """
            props = self.get_script_props(response)
            if not isinstance(props, dict):
                return None
            article = props.get('detailArticle|post')
            if not article:
                # Without the article record there is nothing to store.
                return None

            title = response.doc('title').text().strip()
            # Prefer the image saved by index_page, then the article's cover.
            image_url = self.get_image_url(response) or article.get('cover') or ''
            author = article.get('user') or {}

            url = response.url
            # sha256 needs bytes; encode only when the URL is a text string.
            url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
            # One timestamp so creation and update time of a new row agree.
            now = datetime.now()
            return {
                "url": url,
                "url_hash_code": hashlib.sha256(url_bytes).hexdigest(),
                "title": title,
                "keywords": response.doc('head meta[name="keywords"]').attr('content') or '',
                "description": response.doc('head meta[name="description"]').attr('content') or '',
                "publish_time": self._parse_publish_time(article.get('published_at')),
                "article_resouce": author.get('name') or '',
                "article_resouce_link": article.get('source_urls') or '',
                "content": self.get_content(response) or '',
                "gmt_create_time": now,
                "gmt_update_time": now,
                "web_name": self.web_name,
                "article_type": 1,
                "image_url": image_url,
                "is_image_inside": bool(image_url)
            }

        def _parse_publish_time(self, value):
            """Parse ``value`` with ``datetime_format_to_space_s``.

            :return: the parsed ``datetime``, or ``''`` when ``value`` is
                missing or does not match the format.
            """
            try:
                return datetime.strptime(value, self.datetime_format_to_space_s)
            except (TypeError, ValueError):
                return ''

        def on_result(self, result):
            """Insert ``result`` into ``web_page_content`` when it has a title."""
            if not result or not result.get('title'):
                return
            sql = ToMysql()
            sql.into('web_page_content', **result)

        def get_content(self, response):
            """Return the whole page body as text after a few replacements.

            NOTE(review): the first three replaced literals all appear as a
            plain space in this listing; they may have been different
            characters or entities originally -- confirm before relying on it.
            NOTE(review): ``unicode`` is presumably the Python 2 builtin.
            """
            content = response.content.replace(' ', '').replace(' ', '').replace(' ', '').replace('&amp;', '&')
            content = unicode(content, response.encoding)
            print("content:")
            print(content)
            return content

        def get_image_url(self, response):
            """Return the image URL saved by ``index_page``, or ``''`` if none.

            :param response: article response; ``response.save`` is read for
                the ``article_image`` entry.
            """
            # extract_img_url(response.url, response.doc('div.art_context'), 'div[align="center"]>img')
            saved = response.save or {}
            # Always bind a value: the earlier version left the name unset when
            # no image had been saved and failed on the print below.
            image_url = saved.get('article_image') or ''
            print("image_url:")
            print(image_url)
            return image_url

  • 相关阅读:
    HTTP状态码
    NSData NSDate NSString NSArray NSDictionary 相互转换
    NSDictionary to jsonString || 对象转json格式
    git 上传本地文件到github
    NSAssert用法
    深入理解GCD(一)
    ug-Assertion failure in [MyClass layoutSublayersOfLayer:]
    构建之法阅读笔记01
    学习进度
    四则运算程序
  • 原文地址:https://www.cnblogs.com/feifang/p/7454890.html
Copyright © 2011-2022 走看看