zoukankan      html  css  js  c++  java
  • pyspider--post

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-08-19 14:47:28
    # Project: HBGGZY_SBJ


    import json
    import pymongo
    import hashlib
    from bs4 import BeautifulSoup
    from pyspider.libs.base_handler import *


    class Handler(BaseHandler):
        """pyspider handler for tender notices published on www.hebpr.cn.

        Workflow:
          1. ``on_start`` POSTs a JSON query to the site's full-text search
             endpoint.
          2. ``index_page`` schedules one detail-page crawl per returned record.
          3. ``detail_page`` extracts the notice body and stores it in the
             MongoDB collection ``Tender.HBGGZY``, skipping notices whose title
             hash is already present.
        """

        # Root of the target site; record links in the search result are
        # appended to this.
        BASE_URL = "http://www.hebpr.cn"
        # Full-text search endpoint queried by on_start.
        SEARCH_URL = (
            BASE_URL + "/inteligentsearch/rest/inteligentSearch/getFullTextDataNew"
        )

        crawl_config = {
        }

        def __init__(self):
            # Search request body. Values are reproduced exactly as the site's
            # own query sends them (including the literal string "null").
            # "sort" is itself a JSON document embedded as a string, so it is
            # written with single quotes and escaped by json.dumps.
            query = {
                "token": "",
                "pn": 0,
                "rn": 10,
                "sdt": "",
                "edt": "",
                "wd": "",
                "inc_wd": "",
                "exc_wd": "",
                "fields": "title",
                "cnum": "001",
                "sort": '{"showdate":"0"}',
                "ssort": "title",
                "cl": 200,
                "terminal": "",
                "condition": [
                    {
                        "fieldName": "categorynum",
                        "isLike": "true",
                        "likeType": 2,
                        "equal": "003005",
                    },
                    {
                        "fieldName": "infoc",
                        "isLike": "true",
                        "likeType": 2,
                        "equal": "1300",
                    },
                ],
                "time": "null",
                "highlights": "title",
                "statistics": "null",
                "unionCondition": "null",
                "accuracy": "",
                "noParticiple": "0",
                "searchRange": "null",
                "isBusiness": 1,
            }
            self.data = json.dumps(query)

            # MongoClient() with no arguments uses pymongo's default host/port.
            self.Client = pymongo.MongoClient()
            self.db = self.Client["Tender"]
            self.tb = self.db['HBGGZY']

        @every(minutes=15)
        def on_start(self):
            """Submit the search query; runs every 15 minutes."""
            self.crawl(
                self.SEARCH_URL,
                callback=self.index_page,
                # The body is a pre-serialised JSON string, so state the
                # method explicitly rather than relying on it being inferred.
                method="POST",
                data=self.data,
                age=60,
            )

        @config(age=20 * 24 * 60 * 60)
        def index_page(self, response):
            """Schedule a detail-page crawl for every record in the search result.

            The response body is parsed as JSON and the records are read from
            ``result.records``. Title, publication date and region metadata are
            forwarded to ``detail_page`` through ``save``.
            """
            records = json.loads(response.text)['result']['records']
            for item in records:
                self.crawl(
                    self.BASE_URL + item['linkurl'],
                    callback=self.detail_page,
                    save={
                        'title': item['title'],
                        'show_date': item['showdate'],
                        'province': '河北省',
                        'city': '省本级',
                        'county': item['zhuanzai'],
                    },
                )

        @config(priority=2)
        def detail_page(self, response):
            """Extract one notice and insert it into MongoDB if it is new.

            Notices are de-duplicated on the SHA-1 hex digest of the UTF-8
            encoded page ``<title>`` text.

            Raises:
                ValueError: if the page has no ``<div id="hideDeil">`` element,
                    so that the task is reported as failed instead of storing
                    an empty record.
            """
            # Extract the title once; it feeds both the hash and the document.
            title = response.doc('title').text()
            sha1_title = hashlib.sha1(title.encode('utf8')).hexdigest()

            # Name the parser explicitly so the result does not depend on which
            # parsers happen to be installed; lxml is a pyspider dependency.
            soup = BeautifulSoup(response.text, "lxml")
            content_div = soup.find("div", id="hideDeil")
            if content_div is None:
                raise ValueError(
                    "no div#hideDeil found on %s" % response.url
                )

            data = {
                "url": response.url,
                # Stored as text; BSON strings are UTF-8 encoded on write.
                "title": title,
                "content": str(content_div),
                # Keep only the part before the first space of the date string.
                "show_date": response.save["show_date"].split(" ")[0],
                "province": response.save["province"],
                "city": response.save["city"],
                "county": response.save["county"],
                "sha1_title": sha1_title,
                "is_indb": "0",
                "province_id": "130000",
                "city_id": "0",
                "county_id": "0",
            }

            # find_one/insert_one replace the deprecated Cursor.count() and
            # Collection.insert(), which were removed in PyMongo 4.
            if self.tb.find_one({"sha1_title": sha1_title}) is not None:
                print("存在了")
            else:
                self.tb.insert_one(data)









  • 相关阅读:
    编写登陆认证程序
    Ubuntu18.04安装MySQL
    python输出的高亮显示
    河北省赛
    dp
    迷宫问题
    牛客-幸运数字Ⅱ
    [管理运筹学]线性规划&单纯形法的各种姿势(题目:[NOI2008]志愿者招募)
    [管理运筹学]指派问题的匈牙利算法及其c++实现 (例:「网络流 24 题」分配问题 )
    打算在CSDN写了,虽然博客园也很好
  • 原文地址:https://www.cnblogs.com/pxfb/p/9502323.html
Copyright © 2011-2022 走看看