zoukankan      html  css  js  c++  java
  • 局部优化与整体效果 新增时间>节省时间 权衡利弊

    原代码

    # Loads every host listed in DISTINCT_url.txt in Firefox and records the
    # hosts whose page source does not contain the ad-union script marker.
    import random
    import time

    import requests  # not used by this version; kept from the original import line
    from selenium import webdriver

    URL_FILE = 'DISTINCT_url.txt'
    CHECKPOINT_FILE = 'DISTINCT_url.break.log'
    MISSING_FILE = 'DISTINCT_url.404.txt'
    ERROR_FILE = 'DISTINCT_url.404.ex.txt'
    AD_SCRIPT_MARKER = 'us.com/adunion.js'
    CHECKPOINT_EVERY = 50


    def load_urls(path):
        """Return one 'http://'-prefixed URL per non-blank line of *path*.

        All whitespace is removed from each line, including the trailing
        newline, which would otherwise end up inside the URL.
        """
        urls = []
        with open(path, 'r', encoding='utf-8') as fo:
            for line in fo:
                host = ''.join(line.split())
                if host:
                    urls.append('http://' + host)
        return urls


    def read_checkpoint(path):
        """Return the 1-based position to resume from, or 0 to start over.

        A missing, empty or non-numeric checkpoint file means no progress has
        been recorded yet.
        """
        try:
            with open(path, 'r', encoding='utf-8') as fc:
                first_line = fc.readline()
        except FileNotFoundError:
            return 0
        try:
            return int(first_line)
        except ValueError:
            return 0


    def append_line(path, text):
        """Append *text* as one line to the UTF-8 file at *path*."""
        with open(path, 'a', encoding='utf-8') as fh:
            fh.write(text + '\n')


    def build_browser():
        """Start Firefox with a profile that sets the preferences below."""
        profile = webdriver.FirefoxProfile()
        # Some Firefox versions only need this one.
        profile.set_preference('permissions.default.image', 2)
        # Some versions need this as well.
        profile.set_preference('browser.migration.version', 9001)
        # Disable CSS.
        profile.set_preference('permissions.default.stylesheet', 2)
        # Disable Flash. Pass a real boolean: the string 'false' is not a
        # boolean preference value.
        profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
        return webdriver.Firefox(firefox_profile=profile)


    def check_url(browser, url):
        """Load *url* and record its host when the ad script marker is absent.

        Any error raised while loading the page is appended to ERROR_FILE and
        the URL is skipped, so one bad host does not stop the whole run.
        """
        try:
            browser.get(url)
            page_source = browser.page_source
        except Exception as e:
            time.sleep(1)
            message = str(e)
            append_line(ERROR_FILE, message)
            print(message)
            return

        if AD_SCRIPT_MARKER not in page_source:
            host = url.replace('http://', '')
            print(host)
            append_line(MISSING_FILE, host)
            print('NOT-IN', '---', url)
        else:
            print('OK')


    def main():
        """Check every URL once, resuming from the recorded checkpoint."""
        urls = load_urls(URL_FILE)
        total = len(urls)
        # Read once up front; this script is the only writer of the file.
        resume_from = read_checkpoint(CHECKPOINT_FILE)
        browser = build_browser()
        try:
            for position, url in enumerate(urls, start=1):
                if position < resume_from:
                    continue
                if position % CHECKPOINT_EVERY == 0:
                    # Random pause of 0-3 s, then persist the current position.
                    time.sleep(random.randint(0, 3))
                    with open(CHECKPOINT_FILE, 'w', encoding='utf-8') as flog:
                        flog.write(str(position))

                print(position, '/', total, '---', url)
                check_url(browser, url)
        finally:
            # Always shut the browser down, even on Ctrl-C or a crash.
            browser.quit()


    if __name__ == '__main__':
        main()


    优化后
    优化原因:在404情况下,上述代码(用浏览器加载页面)的执行时间相对于
    requests.get(i)
    if req.status_code != 404
    获取返回码的时间长

    但是404的url在总url池中所占的比例,相对于非404但需要检测html广告代码的部分要少,而后者必须调动浏览器执行;由此相当于对后者增加了requests的时间消耗,而其代价已经超过了检测404 url所节省的时间
    新增时间>节省时间
    弊大于利




    # Variant of the checker that first asks for the HTTP status with requests
    # and only opens the page in Firefox when the status is not 404.
    import random
    import time

    import requests
    from selenium import webdriver

    URL_FILE = 'DISTINCT_url.txt'
    CHECKPOINT_FILE = 'DISTINCT_url.break.log'
    MISSING_FILE = 'DISTINCT_url.404.txt'
    ERROR_FILE = 'DISTINCT_url.404.ex.txt'
    AD_SCRIPT_MARKER = 'us.com/adunion.js'
    CHECKPOINT_EVERY = 50
    # Seconds to wait for the status pre-check; without a timeout a silent
    # host can block the run forever. The value is a guess, tune as needed.
    REQUEST_TIMEOUT = 10


    def load_urls(path):
        """Return one 'http://'-prefixed URL per non-blank line of *path*.

        All whitespace is removed from each line, including the trailing
        newline, which would otherwise end up inside the URL.
        """
        urls = []
        with open(path, 'r', encoding='utf-8') as fo:
            for line in fo:
                host = ''.join(line.split())
                if host:
                    urls.append('http://' + host)
        return urls


    def read_checkpoint(path):
        """Return the 1-based position to resume from, or 0 to start over.

        A missing, empty or non-numeric checkpoint file means no progress has
        been recorded yet.
        """
        try:
            with open(path, 'r', encoding='utf-8') as fc:
                first_line = fc.readline()
        except FileNotFoundError:
            return 0
        try:
            return int(first_line)
        except ValueError:
            return 0


    def append_line(path, text):
        """Append *text* as one line to the UTF-8 file at *path*."""
        with open(path, 'a', encoding='utf-8') as fh:
            fh.write(text + '\n')


    def build_browser():
        """Start Firefox with a profile that sets the preferences below."""
        profile = webdriver.FirefoxProfile()
        # Some Firefox versions only need this one.
        profile.set_preference('permissions.default.image', 2)
        # Some versions need this as well.
        profile.set_preference('browser.migration.version', 9001)
        # Disable CSS.
        profile.set_preference('permissions.default.stylesheet', 2)
        # Disable Flash and JavaScript. Pass real booleans: the string 'false'
        # is not a boolean preference value.
        profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
        profile.set_preference('javascript.enabled', False)
        return webdriver.Firefox(firefox_profile=profile)


    def fetch_status(url):
        """Return the HTTP status code of *url*, or None if the request failed.

        Request failures are appended to ERROR_FILE instead of being dropped
        silently.
        """
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            append_line(ERROR_FILE, str(e))
            return None
        try:
            return response.status_code
        finally:
            # Release the connection on every path, not only on success.
            response.close()


    def check_url(browser, url):
        """Record the host of *url* if it answers 404 or lacks the ad script."""
        status = fetch_status(url)
        if status is None:
            return

        host = url.replace('http://', '')
        if status == 404:
            print('404---', host)
            append_line(MISSING_FILE, host)
            return

        try:
            browser.get(url)
            page_source = browser.page_source
        except Exception as e:
            # Keep going on browser errors, but leave a trace of them.
            time.sleep(1)
            append_line(ERROR_FILE, str(e))
            return

        if AD_SCRIPT_MARKER not in page_source:
            append_line(MISSING_FILE, host)
            print('NOT-IN', '---', url)
        else:
            print('OK', '---', url)


    def main():
        """Scan the URL list repeatedly, resuming from the recorded checkpoint."""
        urls = load_urls(URL_FILE)
        total = len(urls)
        # Read once up front; this script is the only writer of the file.
        resume_from = read_checkpoint(CHECKPOINT_FILE)
        browser = build_browser()
        position = 0
        try:
            # NOTE(review): the original wrapped the scan in `while True`, so the
            # list is re-scanned endlessly and `position` keeps growing past
            # `total`. That is preserved here; confirm it is intended. An empty
            # list now ends the run instead of spinning forever.
            while urls:
                for url in urls:
                    position += 1
                    if position < resume_from:
                        continue
                    if position % CHECKPOINT_EVERY == 0:
                        # Random pause of 0-3 s, then persist the position.
                        time.sleep(random.randint(0, 3))
                        with open(CHECKPOINT_FILE, 'w', encoding='utf-8') as flog:
                            flog.write(str(position))

                    print(position, '/', total, '---', url)
                    check_url(browser, url)
        finally:
            # Always shut the browser down, even on Ctrl-C or a crash.
            browser.quit()


    if __name__ == '__main__':
        main()





  • 相关阅读:
    SSDT
    SSDT
    Rootkit之SSDT hook(通过CR0)
    直接用编译器按ctrl+F5运行和双击运行结果不一样
    HDU 1754 I Hate It
    HDU 1166 敌兵布阵
    网易2017内推笔试题 合唱团
    CodeForces 1151F Sonya and Informatics
    CodeForces 1151E Number of Components
    洛谷 P1962 斐波那契数列
  • 原文地址:https://www.cnblogs.com/rsapaper/p/8403552.html
Copyright © 2011-2022 走看看