原代码
from selenium import webdriver
import requests,time
# Build the list of URLs to check: one 'http://'-prefixed entry for every
# non-empty line of DISTINCT_url.txt.
with open('DISTINCT_url.txt', 'r', encoding='utf-8') as fo:
    # Spaces are removed as before. strip() additionally drops the line break
    # that file iteration keeps at the end of each line, which would otherwise
    # end up inside the URL.
    hosts = (line.replace(' ', '').strip() for line in fo)
    # Blank lines are skipped; they would only produce a bare 'http://'.
    url_l = ['%s%s' % ('http://', host) for host in hosts if host]
# le: total number of URLs. c: position counter, advanced by the processing
# loop further down in this script.
le, c = len(url_l), 0
# Firefox profile that switches off content the check does not look at
# (presumably to make page loads cheaper). The single browser created here is
# the one the processing loop calls browser.get() on.
firefox_profile = webdriver.FirefoxProfile()
# Image permission. Per the original note, some Firefox versions only need
# this preference (value 2 presumably blocks image loading - TODO confirm) ...
firefox_profile.set_preference('permissions.default.image', 2)
# ... while others additionally need this one.
firefox_profile.set_preference('browser.migration.version', 9001)
# Disable CSS.
firefox_profile.set_preference('permissions.default.stylesheet', 2)
# Disable Flash. The value has to be the boolean False: the string 'false'
# is written to the profile as a string preference, not a boolean one.
firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
# JavaScript is deliberately left enabled in this version of the script.
# NOTE(review): the firefox_profile keyword is deprecated in Selenium 4; it is
# kept because the installed Selenium version is not visible from here.
browser = webdriver.Firefox(firefox_profile=firefox_profile)
import random
# Visit every URL in the shared browser session and record the hosts whose
# page source does not contain the ad-union script reference.
#
# NOTE(review): this copy of the script lost its indentation, so the nesting
# below is reconstructed. The checkpoint is written for every URL here; if the
# original only wrote it inside the every-50th-URL branch, this version simply
# resumes more precisely after an interruption.

# Read the resume position once. The loop only ever writes positions it has
# already reached, so re-reading the file for every URL cannot change which
# URLs are skipped.
try:
    with open('DISTINCT_url.break.log', 'r', encoding='utf-8') as fc:
        # Only the first line of the checkpoint file is used.
        saved_position = fc.readline().strip()
except FileNotFoundError:
    # First run: there is no checkpoint yet, so start from the first URL.
    saved_position = ''
# An empty checkpoint also means "start from the beginning"; any other
# content must be an integer, otherwise int() raises ValueError.
resume_from = int(saved_position) if saved_position else 0

for c, i in enumerate(url_l, c + 1):
    if c < resume_from:
        continue  # position lies before the saved checkpoint
    if c % 50 == 0:
        # Every 50th URL, pause for a random 0-3 seconds.
        time.sleep(random.randint(0, 3))
    with open('DISTINCT_url.break.log', 'w', encoding='utf-8') as flog:
        flog.write(str(c))
    print(c, '/', le, '---', i)
    try:
        # Only the browser calls are guarded, so an unrelated error (for
        # example a failing file write) is not logged as a page failure.
        browser.get(i)
        page_source = browser.page_source
    except Exception as e:
        # Per-URL boundary: log the failure and carry on with the next URL.
        time.sleep(1)
        es = '%s%s' % (str(e), ' ')
        with open('DISTINCT_url.404.ex.txt', 'a', encoding='utf-8') as fex:
            fex.write(es)
        print(es)
        continue
    if 'us.com/adunion.js' in page_source:
        print('OK')
        continue
    # NOTE(review): records are separated by a single space, exactly as in the
    # original; confirm whether a line break was intended instead.
    s = '%s%s' % (i.replace('http://', ''), ' ')
    print(s)
    with open('DISTINCT_url.404.txt', 'a', encoding='utf-8') as fr:
        fr.write(s)
    print('NOT-IN', '---', i)
优化后
优化原因:上述代码在遇到404页面时,通过浏览器加载页面的执行时间,相对于先执行 `requests.get(i)` 再判断 `if req.status_code != 404` 来获取返回码的时间要长。
但是,404的url在总url池中所占的比例,相对于非404但需要检测html广告代码的部分要少,而后一部分必须调动浏览器执行;由此相当于给后一部分额外增加了requests的时间消耗,而其代价已经超过了提前识别404 url所节省的时间。
新增时间>节省时间
弊大于利
from selenium import webdriver
import requests, time
# Build the list of URLs to check: one 'http://'-prefixed entry for every
# non-empty line of DISTINCT_url.txt.
with open('DISTINCT_url.txt', 'r', encoding='utf-8') as fo:
    # Spaces are removed as before. strip() additionally drops the line break
    # that file iteration keeps at the end of each line, which would otherwise
    # end up inside the URL.
    hosts = (line.replace(' ', '').strip() for line in fo)
    # Blank lines are skipped; they would only produce a bare 'http://'.
    url_l = ['%s%s' % ('http://', host) for host in hosts if host]
# le: total number of URLs. c: position counter, advanced by the processing
# loop further down in this script.
le, c = len(url_l), 0
# Firefox profile that switches off content the check does not look at
# (presumably to make page loads cheaper). The single browser created here is
# the one the processing loop calls browser.get() on.
firefox_profile = webdriver.FirefoxProfile()
# Image permission. Per the original note, some Firefox versions only need
# this preference (value 2 presumably blocks image loading - TODO confirm) ...
firefox_profile.set_preference('permissions.default.image', 2)
# ... while others additionally need this one.
firefox_profile.set_preference('browser.migration.version', 9001)
# Disable CSS.
firefox_profile.set_preference('permissions.default.stylesheet', 2)
# Disable Flash. The value has to be the boolean False: the string 'false'
# is written to the profile as a string preference, not a boolean one.
firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
# Disable JavaScript (boolean for the same reason as above).
# NOTE(review): with JavaScript really switched off, script tags that pages
# inject dynamically will no longer show up in page_source - confirm that the
# ad-union reference is part of the static HTML.
firefox_profile.set_preference('javascript.enabled', False)
# NOTE(review): the firefox_profile keyword is deprecated in Selenium 4; it is
# kept because the installed Selenium version is not visible from here.
browser = webdriver.Firefox(firefox_profile=firefox_profile)
import random
# For every URL, first ask for the HTTP status with requests; a 404 is
# recorded straight away, anything else is loaded in the shared browser and
# recorded when its page source lacks the ad-union script reference.
#
# NOTE(review): this copy of the script lost its indentation, so the nesting
# below is reconstructed. The checkpoint is written for every URL here; if the
# original only wrote it inside the every-50th-URL branch, this version simply
# resumes more precisely after an interruption.


def _log_exception(error):
    """Pause for a second, then append the error text to the exception log."""
    time.sleep(1)
    with open('DISTINCT_url.404.ex.txt', 'a', encoding='utf-8') as fex:
        fex.write('%s%s' % (str(error), ' '))


# Read the resume position once. The loop only ever writes positions it has
# already reached, so re-reading the file for every URL cannot change which
# URLs are skipped.
try:
    with open('DISTINCT_url.break.log', 'r', encoding='utf-8') as fc:
        # Only the first line of the checkpoint file is used.
        saved_position = fc.readline().strip()
except FileNotFoundError:
    # First run: there is no checkpoint yet, so start from the first URL.
    saved_position = ''
# An empty checkpoint also means "start from the beginning"; any other
# content must be an integer, otherwise int() raises ValueError.
resume_from = int(saved_position) if saved_position else 0

# NOTE(review): as in the original, the outer loop starts another pass over
# the list as soon as one finishes and never ends on its own, with c counting
# on across passes - confirm that continuous re-checking is intended.
# `while url_l` behaves like `while True` for a non-empty list but does not
# spin forever when there is nothing to check.
while url_l:
    for c, i in enumerate(url_l, c + 1):
        if c < resume_from:
            continue  # position lies before the saved checkpoint
        if c % 50 == 0:
            # Every 50th URL, pause for a random 0-3 seconds.
            time.sleep(random.randint(0, 3))
        with open('DISTINCT_url.break.log', 'w', encoding='utf-8') as flog:
            flog.write(str(c))
        print(c, '/', le, '---', i)
        # NOTE(review): records are separated by a single space, exactly as in
        # the original; confirm whether a line break was intended instead.
        s = '%s%s' % (i.replace('http://', ''), ' ')

        try:
            # The timeout keeps one unresponsive server from stalling the run.
            req = requests.get(i, timeout=30)
        except Exception as e:
            # Failed requests used to be dropped silently by a bare `except`,
            # which also swallowed Ctrl-C; log them like browser failures.
            _log_exception(e)
            continue
        status_code = req.status_code
        # Only the status is needed, so release the connection right away.
        req.close()

        if status_code == 404:
            print('404---', s)
            with open('DISTINCT_url.404.txt', 'a', encoding='utf-8') as fr:
                fr.write(s)
            continue

        try:
            browser.get(i)
            page_source = browser.page_source
        except Exception as e:
            # Per-URL boundary: log the failure and carry on with the next URL.
            _log_exception(e)
            continue

        if 'us.com/adunion.js' in page_source:
            print('OK', '---', i)
            continue
        with open('DISTINCT_url.404.txt', 'a', encoding='utf-8') as fr:
            fr.write(s)
        print('NOT-IN', '---', i)