# win系统 apscheduler + asyncio 问题解决

    async def beijing_spider():
    '''开始抓'''
    #北京采购网

    tim=time_spider()
    for i in range(1,31):#设置20 为每天抓
    '''选择71 刚好是一个月'''
    print('北京采购网第%s页' % i)
    url='http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/sjzbjggg/index_%s.html'% i
    try:
    proxies = random.sample(proxies_list, 1)[0]

    page = requests.get(url=url, headers=headers,proxies=proxies).text
    print(111, proxies)
    except:
    proxies = random.sample(proxies_list, 1)[0]

    page = requests.get(url=url, headers=headers,proxies=proxies).text
    print(222, proxies)
    tree = etree.HTML(page)
    ul_list = tree.xpath('//ul[@class="xinxi_ul"]/li')
    for ul in ul_list:
    name = ul.xpath('./a/text()')[0].strip()
    for ii in ll:
    if ii in name:
    new_url = 'http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/sjzbjggg/' + ul.xpath('./a/@href')[0].split('/')[-1]
    datatime = ul.xpath('./span/text()')[0].strip()
    pag= requests.get(url=new_url, headers=headers).text
    source=name+datatime
    source_id = hashlib.md5(source.encode()).hexdigest() #设置唯一id 做去重

    #去重
    sql="select source from ZHAOBIAO_SPIDER where source='%s' "%source_id
    ret = ora_obj.open_oracle(sql)
    ora_obj.off()
    if len(ret)>0:
    print('%s>>>已经有了'% name)
    else:
    #没有就添加
    sql1="insert into ZHAOBIAO_SPIDER values(sys_guid(),'%s','%s',to_date('%s','yyyy-mm-dd'),'%s',to_date('%s','yyyy-mm-dd'))" %(source_id,name,datatime,new_url,tim)
    ora_obj.oracle_work(sql1)
    ora_obj.off()
    filename=os.path.join(pat,'html','%s.html')% source_id
    with open(filename,'w')as f:
    f.write(pag)
    print(source_id, name, datatime, new_url, tim)
    return


    async def jincaiwang_spider():
    '''开始抓 金菜网'''

    tim=time_spider()
    for i in range(1,31): #设置20 为每天抓
    '''选择71 刚好是一个月'''
    print('金菜网第%s页'% i)
    url='http://www.cfcpn.com/plist/caigou?pageNo=%s'% i
    try:
    proxies = random.sample(proxies_list, 1)[0]

    page = requests.get(url=url, headers=headers).text
    print(111, proxies)
    except:
    proxies = random.sample(proxies_list, 1)[0]

    page = requests.get(url=url, headers=headers,proxies=proxies).text
    print(222, proxies)
    tree = etree.HTML(page)
    p_list = tree.xpath('//div[@class="cfcpn_list_content text-left"]')
    for p in p_list:
    name = p.xpath('./p[1]/a/text()')[0].strip()
    for ii in ll:
    if ii in name:
    new_url = 'http://www.cfcpn.com' + p.xpath('./p[1]/a/@href')[0]
    datatime = p.xpath('./p[2]/text()')[0].strip().replace('发布时间:','')[:10]
    pag=requests.get(url=new_url, headers=headers).text
    source=name+datatime
    source_id = hashlib.md5(source.encode()).hexdigest() #设置唯一id 做去重

    #去重
    sql="select source from ZHAOBIAO_SPIDER where source='%s' "%source_id
    ret = ora_obj.open_oracle(sql)
    ora_obj.off()
    if len(ret)>0:
    print('%s>>>已经有了'% name)
    else:
    #没有就添加
    sql1="insert into ZHAOBIAO_SPIDER values(sys_guid(),'%s','%s',to_date('%s','yyyy-mm-dd'),'%s',to_date('%s','yyyy-mm-dd'))" %(source_id,name,datatime,new_url,tim)
    ora_obj.oracle_work(sql1)
    ora_obj.off()
    filename = os.path.join(pat, 'html', '%s.html') % source_id
    with open(filename, 'w',encoding='utf8')as f:
    f.write(pag)
    print(source_id, name, datatime, new_url, tim)

    return


    async def zhongyang_spider():
    '''开始抓 中央采购网'''
    from selenium.webdriver.chrome.options import Options
    from selenium import webdriver
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(executable_path=r'confchromedriver.exe', chrome_options=chrome_options)
    try:
    tim=time_spider()
    for i in range(1,11):
    '''选择11 刚好是一个月'''
    print('中央采购网第%s页'% i)
    url='http://www.zycg.gov.cn/article/llist?catalog=StockAffiche&page=%s'% i
    try:
    proxies = random.sample(proxies_list, 1)[0]

    page = requests.get(url=url, headers=headers,proxies=proxies).text
    print(111, proxies)
    except:
    proxies = random.sample(proxies_list, 1)[0]

    page = requests.get(url=url, headers=headers,proxies=proxies).text
    print(222, proxies)
    tree = etree.HTML(page)
    li_list = tree.xpath('//ul[@class="lby-list"]/li')
    for li in li_list:
    try:
    name = li.xpath('./a/text()')[0].strip()
    except:
    name=''
    for ii in ll:
    if ii in name:
    new_url = 'http://www.zycg.gov.cn' + li.xpath('./a/@href')[0]
    datatime = li.xpath('./span/text()')[0].strip().replace('[','').replace(']','')
    source = name + datatime
    source_id = hashlib.md5(source.encode()).hexdigest() # 设置唯一id 做去重

    #获取页面
    driver.get(new_url)
    time.sleep(random.uniform(0.5,1))
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    driver.execute_script(js)
    time.sleep(random.uniform(1, 2))
    driver.switch_to.frame("ueditor_0")
    pag = driver.page_source


    #去重
    sql="select source from ZHAOBIAO_SPIDER where source='%s' "%source_id
    ret = ora_obj.open_oracle(sql)
    ora_obj.off()
    if len(ret)>0:
    print('%s>>>已经有了'% name)
    else:
    #没有就添加
    sql1="insert into ZHAOBIAO_SPIDER values(sys_guid(),'%s','%s',to_date('%s','yyyy-mm-dd'),'%s',to_date('%s','yyyy-mm-dd'))" %(source_id,name,datatime,new_url,tim)
    ora_obj.oracle_work(sql1)
    ora_obj.off()
    filename = os.path.join(pat, 'html', '%s.html') % source_id
    with open(filename, 'w', encoding='GB18030')as f:
    f.write(pag)
    print(source_id, name, datatime, new_url, tim)
    except Exception as e:
    print(e)
    finally:
    driver.quit()
    return


    def start():
    print('----------------------%s-----------------------------' % time_time())
    try:
    # loop = asyncio.get_event_loop() 用这种会出现下面报错 使用apscheduler + asyncio 建议使用以下方式
    # 处理报错 RuntimeError: There is no current event loop in thread 'ThreadPoolExecutor-0_0'.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    bj = loop.create_task(beijing_spider())
    jc = loop.create_task(jincaiwang_spider())
    zy = loop.create_task(zhongyang_spider())

    loop.run_until_complete(asyncio.wait([bj,jc,zy]))
    except Exception as e:
    logging.error(traceback.format_exc())
    finally:
    print('---------------------结束---------------------------' )

    if __name__ == '__main__':
    print('等待中......')
    scheduler = BlockingScheduler()
    scheduler.add_job(start, 'cron', hour=8, minute=30) # 'interval', seconds=40
    try:
    scheduler.start()
    except (KeyboardInterrupt, SystemExit):
    pass
    except Exception as e:
    print(e)
    logging.error(traceback.format_exc())
# 相关阅读:
#   人类登月不可或缺 大型机半个世纪发展史
#   宽带上网知识(如何进行上网流程配置,路由器上网配置)
#   团队项目第二阶段——第二天
#   用户体验评价之QQ浏览器
#   团队项目第二阶段——第一天
#   第十四周学习进度总结
#   《梦断代码》阅读笔记02
#   第十三周学习进度总结
#   第一阶段意见评论
#   第十二周学习进度总结
# 原文地址: https://www.cnblogs.com/xdlzs/p/11346732.html