zoukankan      html  css  js  c++  java
  • 展开阅读全文 js 爬虫操作

    from selenium import webdriver
    import time
    import random
    from bs4 import *
    
    from selenium.webdriver.common.by import By

    # Open the poem page in a fresh Chrome session.
    browser = webdriver.Chrome()
    url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
    browser.get(url)

    # Visible text of the "expand full text" links that must be clicked.
    expand_text = '展开阅读全文 ∨'

    # find_elements(By.LINK_TEXT, ...) replaces find_elements_by_link_text,
    # which was removed from Selenium 4.
    ck_l_ori_len = len(browser.find_elements(By.LINK_TEXT, expand_text))
    ck_l_ori_ok = 0
    try:
        # Scroll down 100px per step (at most 100 steps) and try to click every
        # matching link after each step.
        for isc in range(100):
            # ">=" instead of "==": a link that is still present after being
            # clicked gets counted again on the next pass, so the counter can
            # jump past the initial total; "==" would then never match and the
            # loop would burn through all 100 one-second iterations.
            if ck_l_ori_ok >= ck_l_ori_len:
                break
            time.sleep(1)
            js = 'window.scrollTo(0,100*{})'.format(isc)
            browser.execute_script(js)
            ck_l = browser.find_elements(By.LINK_TEXT, expand_text)
            for i in ck_l:
                try:
                    i.click()
                    ck_l_ori_ok += 1
                except Exception as e:
                    # Best effort: report the failed click and try the link
                    # again after the next scroll step.
                    print(e)
    except Exception as e:
        print('window.scrollTo-->', e)
    
    # ck_l=browser.find_elements_by_link_text('展开阅读全文 ∨')
    # for i in ck_l:
    #     try:
    #         i.click()
    #     except Exception as e:
    #         print(e)
    
    
    # XPath candidate list; not referenced by any code visible in this snippet.
    xp_l = ['//*[@id="fanyi967"]/div/div[3]/a', ]

    # Raw string so the Windows backslashes are taken literally. The previous
    # plain literal relied on the invalid escapes "\m" and "\{", which newer
    # Python versions warn about. The resulting path is unchanged.
    myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))

    # Dump the rendered page (after the expand clicks) to a temporary file.
    with open(myhtml, 'w', encoding='utf-8') as fw:
        fw.write(browser.page_source)

    # SQL statement prefix; not used by any code visible in this snippet.
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '

    # Read the dump back and parse it with BeautifulSoup.
    with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
        bs = BeautifulSoup(myhtml_o, 'html.parser')

        # No-op assignment; presumably a debugger breakpoint anchor.
        dd = 9
    // Click every <a> on the page whose text is exactly '展开阅读全文 ∨'.
    // Variables are declared with `var` so the snippet no longer creates the
    // implicit globals a_, le and i (which could overwrite page variables).
    var links = document.getElementsByTagName('a');
    for (var idx = 0, total = links.length; idx < total; idx++) {
        if (links[idx].text == '展开阅读全文 ∨') {
            links[idx].click();
        }
    }
    

      

    // Click every <a> on the page whose text is exactly '展开阅读全文 ∨'.
    // A failing click is logged to the console and the loop carries on.
    // Variables are declared with `var` so the snippet no longer creates the
    // implicit globals a_, le and i (which could overwrite page variables).
    var links = document.getElementsByTagName('a');
    for (var idx = 0, total = links.length; idx < total; idx++) {
        if (links[idx].text == '展开阅读全文 ∨') {
            try {
                links[idx].click();
            } catch (err) {
                console.log(err);
            }
        }
    }
    

      

    from selenium import webdriver
    import time
    import random
    from bs4 import *
    
    from selenium.webdriver.common.by import By

    # Open the poem page in a fresh Chrome session.
    browser = webdriver.Chrome()
    url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
    browser.get(url)

    # Fast path: one in-page script clicks every "expand full text" link.
    # Errors from individual clicks are caught and logged inside the script.
    js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
    try:
        browser.execute_script(js)
    except Exception as e:
        # Fallback when the script itself could not be executed: scroll the
        # page step by step and click the links through WebDriver.
        print(e)
        expand_text = '展开阅读全文 ∨'
        # find_elements(By.LINK_TEXT, ...) replaces find_elements_by_link_text,
        # which was removed from Selenium 4.
        ck_l_ori_len = len(browser.find_elements(By.LINK_TEXT, expand_text))
        ck_l_ori_ok = 0
        try:
            for isc in range(100):
                # ">=" instead of "==": a link that is still present after
                # being clicked gets counted again, so the counter can jump
                # past the initial total and "==" would never match.
                if ck_l_ori_ok >= ck_l_ori_len:
                    break
                time.sleep(1)
                js = 'window.scrollTo(0,100*{})'.format(isc)
                browser.execute_script(js)
                ck_l = browser.find_elements(By.LINK_TEXT, expand_text)
                for i in ck_l:
                    try:
                        i.click()
                        ck_l_ori_ok += 1
                    except Exception as e:
                        # Best effort: report the failed click and retry the
                        # link after the next scroll step.
                        print(e)
        except Exception as e:
            print('window.scrollTo-->', e)
    from selenium import webdriver
    import time
    import random
    from bs4 import *
    from pyquery import PyQuery as pq
    
    from selenium.webdriver.common.by import By

    # Open the poem page in a fresh Chrome session.
    browser = webdriver.Chrome()
    url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
    browser.get(url)

    # Fast path: one in-page script clicks every "expand full text" link.
    # Errors from individual clicks are caught and logged inside the script.
    js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
    try:
        browser.execute_script(js)
    except Exception as e:
        # Fallback when the script itself could not be executed: scroll the
        # page step by step and click the links through WebDriver.
        print(e)
        expand_text = '展开阅读全文 ∨'
        # find_elements(By.LINK_TEXT, ...) replaces find_elements_by_link_text,
        # which was removed from Selenium 4.
        ck_l_ori_len = len(browser.find_elements(By.LINK_TEXT, expand_text))
        ck_l_ori_ok = 0
        try:
            for isc in range(100):
                # ">=" instead of "==": a link that is still present after
                # being clicked gets counted again, so the counter can jump
                # past the initial total and "==" would never match.
                if ck_l_ori_ok >= ck_l_ori_len:
                    break
                time.sleep(1)
                js = 'window.scrollTo(0,100*{})'.format(isc)
                browser.execute_script(js)
                ck_l = browser.find_elements(By.LINK_TEXT, expand_text)
                for i in ck_l:
                    try:
                        i.click()
                        ck_l_ori_ok += 1
                    except Exception as e:
                        # Best effort: report the failed click and retry the
                        # link after the next scroll step.
                        print(e)
        except Exception as e:
            print('window.scrollTo-->', e)
    
    # Parse the rendered page (after the expand clicks) with PyQuery.
    doc = pq(browser.page_source)

    # Namespace attribute to strip from the extracted HTML fragments.
    # pq_r_d holds the same pair as a dict; it is not used in the visible code.
    pq_r_d = {'xmlns="http://www.w3.org/1999/xhtml"': ''}
    r_k, r_v = 'xmlns="http://www.w3.org/1999/xhtml"', ''


    def _inner_html(selection):
        """Return the selection's inner HTML with the XHTML namespace removed.

        PyQuery's ``.html()`` returns ``None`` when the selector matched
        nothing. That case is mapped to an empty string so a page lacking one
        section does not abort the whole extraction with an AttributeError.
        """
        return (selection.html() or '').replace(r_k, r_v)


    # Poem body, title and author from the second child of ``.left``.
    article_ = _inner_html(doc('.left>:nth-child(2).sons>.cont>.contson'))
    title_d = {'h1': _inner_html(doc('.left>:nth-child(2).sons>.cont>:nth-child(2)'))}
    author_d = {'h3': doc('.left>:nth-child(2).sons>.cont>:nth-child(3)').text()}

    # Translation, explanation and reference sections from the fourth child.
    translation_ = _inner_html(doc('.left>:nth-child(4)>.contyishang>:nth-child(2)'))
    explanation_ = _inner_html(doc('.left>:nth-child(4)>.contyishang>:nth-child(3)'))
    refer_ = _inner_html(doc('.left>:nth-child(4)>.cankao'))

    # Take the value of the last src="..." attribute inside the first child of
    # ``.divimg``; an unmatched selector yields an empty string.
    author_img_html = doc('.left>.sonspic>.cont>.divimg>:nth-child(1)').html() or ''
    author_img_url = author_img_html.split('src="')[-1].split('"')[0]

    # No-op assignment; presumably a debugger breakpoint anchor.
    d = 4
    

      

  • 相关阅读:
    HDU 5319 Painter
    HDU 5328 Problem Killer
    HDU 5327 Olympiad
    HDU 5339 Untitled
    HDU 5335 Walk Out
    HDU 5317 RGCDQ
    HDU 5326 Work
    CF GYM 100703A Tea-drinking
    CF GYM 100703B Energy Saving
    CF GYM 100703F Game of words
  • 原文地址:https://www.cnblogs.com/rsapaper/p/8933564.html
Copyright © 2011-2022 走看看