zoukankan      html  css  js  c++  java
  • shenzhensgxk

    # -*- coding: utf-8 -*-
    import scrapy
    import re

    class SpiderShenzhenSpider(scrapy.Spider):
        """Scrape construction-permit records from the szjs.gov.cn portal.

        The crawl has three stages:

        1. ``start_requests`` requests a fixed range of listing pages.
        2. ``get_parse`` reads the ``onclick`` handler of every link in a
           listing table and turns its two arguments into a detail-page URL.
        3. ``get_one`` extracts the table-cell texts of a detail page and
           appends them to ``output_path`` as one tab-separated line.
        """

        name = 'spider_shenzhen'

        # Not requested by this spider: ``start_requests`` below is overridden
        # and never reads this list. Kept so the attribute stays available.
        start_urls = ['http://www.szjs.gov.cn/bsfw/jggs/sgxk/']

        # Listing pages differ only in the ``page`` query parameter.
        list_url = ('http://portal.szjs.gov.cn:8888/gongshi/sgxkList.html'
                    '?page={}&qymc=&ann_serial=&pro_name=')
        # Detail page addressed by the two arguments found in the onclick handler.
        detail_url = ('http://portal.szjs.gov.cn:8888/gongshi/sgxkz.html'
                      '?instanceGuid={}&yxtywlsh={}')
        # Listing pages to crawl (end exclusive).
        page_range = range(400, 600)
        # NOTE(review): despite the .xlsx extension this file receives plain
        # tab-separated text, not a real Excel workbook.
        output_path = 'test400-600.xlsx'

        # The parentheses of the JavaScript call must be escaped. Unescaped
        # they are regex groups, the pattern never matches the handler text,
        # and ``.group()`` is then called on ``None``.
        # "serachbyId" is spelled exactly as the original pattern had it.
        _ONCLICK_RE = re.compile(r"serachbyId\('(.*?)','(.*?)'\)")

        def start_requests(self):
            """Yield one request per listing page in ``page_range``."""
            for page in self.page_range:
                yield scrapy.Request(
                    url=self.list_url.format(page),
                    callback=self.get_parse,
                    priority=1,
                )

        def get_parse(self, response):
            """Yield a detail-page request for every record link on a listing page.

            Links whose ``onclick`` attribute does not contain the expected
            call are skipped instead of aborting the whole page.
            """
            for onclick in response.xpath('//tr/td/a/@onclick').extract():
                found = self._ONCLICK_RE.search(onclick)
                if found is None:
                    continue
                instance_guid, serial = found.groups()
                # Detail pages get a higher priority value than listing pages,
                # as in the original code.
                yield scrapy.Request(
                    url=self.detail_url.format(instance_guid, serial),
                    callback=self.get_one,
                    priority=4,
                )

        def get_one(self, response):
            """Append the record shown on a detail page to ``output_path``.

            Only every second cell text (indices 1, 3, 5, ...) is written.
            Presumably the cells alternate label/value; verify against the
            page markup.
            """
            cells = response.xpath('//tr/td/text()').extract()
            self.logger.debug('%s: %d cell texts', response.url, len(cells))

            values = [self._clean(cell) for cell in cells[1::2]]
            # One record per line: the URL first, then the values.
            # NOTE(review): the separators in the published source had been
            # collapsed to plain spaces, with every record on the same line.
            # Tab and newline are restored here; confirm against consumers.
            line = '\t'.join([response.url, *values]) + '\n'
            with open(self.output_path, 'a', encoding='utf-8') as f:
                f.write(line)

        @staticmethod
        def _clean(text):
            """Return ``text`` with all whitespace removed.

            ``str.split()`` with no argument splits on every Unicode
            whitespace character, including the non-breaking space
            (``\\xa0``) that the original ``replace('xa0', '')`` call failed
            to remove.
            """
            return ''.join(text.split())

  • 相关阅读:
    JAVA编写的一个简单的Socket实现的HTTP响应服务器
    IPTV 质量评价方法已经不适用于 OTT TV 质量评价
    AAC规格(LC,HE,HEv2)及性能对比
    洛谷 P1572 计算分数
    洛谷 P2128 赤壁之战
    洛谷 P2818 天使的起誓
    洛谷 P3914 染色计数
    洛谷 P1193 洛谷团队训练VS传统团队训练
    洛谷 P1318 积水面积
    洛谷 P1061 Jam的计数法
  • 原文地址:https://www.cnblogs.com/currynashinians000/p/9014851.html
Copyright © 2011-2022 走看看