zoukankan      html  css  js  c++  java
  • spider_action

    Spiders Baidu Map place listings through Sogou's mobile search, using Chrome mobile emulation.

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    # from selenium.webdriver.firefox.options import Options
    import time
    from time import sleep
    import math
    import random
    import sys
    
    # Separators used to strip decorations from place names; note the first
    # entry is the full-width '(' and the second the ASCII '('.
    tag_jmtool_list = ['(', '(', '-']
    
    # Pool of mobile User-Agent strings, one per line in mobile_ua.txt.
    # FIX: the original source had a literal line break inside the replace()
    # argument (scrape-mangled '\n'), which is a syntax error.
    ua_list = []
    with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
        for i in uafile:
            if i.find('Mozilla') > -1:
                ua_list.append(i.replace('\n', '').strip())
    
    # Highest valid index into ua_list, used with random.randint() below.
    ua_list_len_ = len(ua_list) - 1
    
    
    def extract_name(name_):
        """Return the base place name: everything before the first occurrence
        of any separator listed in the module-level ``tag_jmtool_list``."""
        base = name_
        for sep in tag_jmtool_list:
            # str.partition keeps only the text before the separator, which is
            # exactly what split(sep)[0] produced.
            base = base.partition(sep)[0]
        return base
    
    
    target_type_list = ['住宅小区', '写字楼']
    # The second assignment deliberately overrides the first: only 住宅小区 rows
    # are crawled in this run.
    target_type_list = ['住宅小区']
    
    # target_dic[city][district][type_][name_reduction] ->
    #   {'name_reduction_list': [raw names], 'history_list': [raw csv rows]}
    target_dic = {}
    with open('JMTool0819am/任务JMTool.csv', 'r', encoding='utf-8') as csvfile:
        for i in csvfile:
            # FIX: the original had a literal line break inside the replace()
            # argument (scrape-mangled '\n'); restore the escape.
            l = i.replace(' ', '').replace('\n', '').split('";"')
            if l[0].replace('"', '') in target_type_list:
                type_, city, district, addr, name_ = l
                type_, name_ = type_.replace('"', ''), name_.replace('"', '')
                name_reduction = extract_name(name_)
    
                node = target_dic.setdefault(city, {}).setdefault(district, {}).setdefault(type_, {})
                # BUG FIX: the original tested
                # `name_reduction not in target_dic[city][district]` (one level
                # too high), so the lists were re-created for almost every row
                # and previously accumulated entries were lost.
                if name_reduction not in node:
                    node[name_reduction] = {'name_reduction_list': [], 'history_list': []}
    
                node[name_reduction]['name_reduction_list'].append(name_)
                node[name_reduction]['history_list'].append(l)
    
    
    def write_res_html(browser, dir_='baidu_map_html/'):
        """Snapshot the browser's current page to a timestamped .html file.
    
        The file content is prefixed with an HTML comment recording the query
        string (module-level ``input_``) and the current URL so each snapshot's
        origin can be recovered later.  ``dir_`` must end with a path separator.
        """
        # NOTE(review): the original read the bare global ``input_``; use a
        # guarded lookup so the helper also works before the global exists.
        marker = globals().get('input_', '')
        current_url_ = '%s%s%s%s' % ('<!--', marker, browser.current_url, '-->')
        page_source = '%s%s' % (current_url_, browser.page_source)
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
        file_name = '%s%s%s%s' % (dir_, marker, localtime_, '.html')
        # FIX: the original ended with ``fo.closed`` which merely reads an
        # attribute and never closed the handle; use a context manager.
        with open(file_name, 'w', encoding='utf-8') as fo:
            fo.write(page_source)
    
    
    def gen_random_letter():
        """Return one random lowercase ASCII letter ('a'..'z')."""
        # ord('a') == 97, ord('z') == 122; randrange's stop bound is exclusive.
        return chr(random.randrange(97, 123))
    
    
    def gen_random_num():
        """Return one random decimal digit, 0-9 inclusive.
    
        BUG FIX: the original used ``random.randint(0, 10)``, which also
        returns 10 (two characters) and so occasionally produced a 17-char
        pid in gen_sougo_pid() instead of the fixed 16 characters.
        """
        return random.randint(0, 9)
    
    
    def gen_sougo_pid():
        """Build a 16-character pseudo pid for the Sogou search URL:
        letters at positions 1, 3, 4 and 15, digits everywhere else
        (mimicking pids like '123asd...')."""
        letter_slots = {1, 3, 4, 15}
        chars = []
        for pos in range(1, 17):
            if pos in letter_slots:
                chars.append(gen_random_letter())
            else:
                chars.append(str(gen_random_num()))
        return ''.join(chars)
    
    
    def close_alert(browser, attitude='accept'):
        """Accept or dismiss a JavaScript alert if one is showing.
    
        Any failure (most commonly: no alert present) is swallowed and logged
        to stdout.  ``attitude`` is 'accept' or 'dismiss'.
        """
        try:
            sleep(2)  # give the page a moment to raise the alert
            # BUG FIX: ``switch_to.alert`` is a property returning an Alert;
            # the original called it (``alert()``), raising TypeError and
            # always falling into the except branch.
            al = browser.switch_to.alert
            sleep(1)
            if attitude == 'accept':
                al.accept()
            elif attitude == 'dismiss':
                al.dismiss()
            print(sys._getframe().f_lineno, 'alert-closed-ok')
        except Exception as e:
            # FIX: print the caught exception instance, not the Exception class.
            print(sys._getframe().f_lineno, e, 'no-alert')
    
    
    # input_ = '深圳市南山区荟芳园'
    
    def mobile_mobile_pages_html(input_):
        """Search Sogou mobile for `input_`, follow the Baidu Map result and
        snapshot every place-list result page to baidu_map_html/ via
        write_res_html().  Returns None; all output is file side effects.

        NOTE(review): the early ``return`` paths below leave ``browser``
        running (no quit) — presumably intentional best-effort, but verify.
        """
        # mobile_emulation = {
        #     "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        #     "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
        # Pick a random mobile UA from the module-level pool for this session.
        ua_list_index = random.randint(0, ua_list_len_)
        mobile_emulation = {
            "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    
        mobile_emulation['userAgent'] = ua_list[ua_list_index]
        chrome_options = Options()
        chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
        browser = webdriver.Chrome(chrome_options=chrome_options)
    
        # Randomized pid makes each search URL look like a distinct client.
        url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-%s-0007&keyword=百度地图' % (gen_sougo_pid())
        print(url_seed)
        browser.get(url_seed)
        # Type the query into the search box via JS, then click the first
        # Baidu Map vertical-result link.
        js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
        browser.execute_script(js)
        xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
        browser.find_element_by_xpath(xp_newpage).click()
        sleep(2)
    
        # xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
        # sleep(1)
        # browser.find_element_by_xpath(xp).click()
        close_alert(browser)
        try:
            # "Show all N results" header of the place list; if absent there
            # is nothing to crawl for this query.
            xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
            sleep(2)
            close_alert(browser)
            browser.find_element_by_xpath(xp)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            return
        close_alert(browser)
        if browser.find_element_by_xpath(xp).text.find('全部') == -1:
            return
        # Parse the total result count out of text like "全部N条".
        res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
        res_num = int(res_num)
        page_num = 10
        # Number of result pages to walk (10 results per page).
        loop_breaker = math.ceil(res_num / page_num)
    
        close_alert(browser)
        if res_num <= page_num:
            # Single page of results: snapshot and finish.
            write_res_html(browser)
            browser.quit()
            return
        close_alert(browser)
        # Expand to the full result list, snapshot page 1, then page through.
        xp = '//*[@id="place-widget-placenewlist-showall"]'
        browser.find_element_by_xpath(xp).click()
        write_res_html(browser)
        close_alert(browser)
        js = "window.scrollTo(0,document.body.scrollHeight)"
        browser.execute_script(js)
        sleep(1)
        try:
            # "Next page" control on the expanded list.
            xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
            browser.find_element_by_xpath(xp_newpage).click()
            sleep(1)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            write_res_html(browser)
            browser.quit()
            return
    
        for i in range(1, loop_breaker, 1):
            sleep(1)
            try:
                # Dismiss the bottom banner that covers the pager.
                xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
                sleep(3)
                browser.find_element_by_xpath(xp).click()
            except Exception:
                print(sys._getframe().f_lineno, Exception)
                sleep(10)
                break
            try:
                js = "window.scrollTo(0,document.body.scrollHeight)"
                browser.execute_script(js)
                sleep(1)
            except Exception:
                print(sys._getframe().f_lineno, Exception)
                sleep(10)
            try:
                # Click "next page" and snapshot the page we navigated to.
                xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
                sleep(1)
                print(input_, i)
                browser.find_element_by_xpath(xp_newpage).click()
                write_res_html(browser)
            except Exception:
                print(sys._getframe().f_lineno, Exception)
                sleep(10)
        sleep(2)
        browser.quit()
    
    
    # Walk every city / district / type / deduplicated-name combination and
    # crawl it.  ``input_`` must stay a module-level global because
    # write_res_html() reads it when naming the snapshot files.
    for city, district_map in target_dic.items():
        for district, type_map in district_map.items():
            for type_, name_map in type_map.items():
                for name_reduction in name_map:
                    input_ = '%s%s%s' % (city, district, name_reduction)
                    mobile_mobile_pages_html(input_)
    

      

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    from time import sleep
    import math
    
    # --- One-off interactive version of the spider above (fixed query). ---
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
    # Emulate a 360x640 Android device so Sogou serves the mobile UI.
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    
    browser.get(url_seed)
    # Query string; also read as a global by write_res_html() below.
    input_ = '深圳市南山区荟芳园'
    
    # Type the query via JS and click the Baidu Map vertical result.
    js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
    browser.execute_script(js)
    xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(1)
    
    # Dismiss the bottom banner.
    xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    browser.find_element_by_xpath(xp).click()
    
    xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
    # NOTE(review): this lookup's result is discarded; presumably an
    # existence probe before reading .text on the next line.
    browser.find_element_by_xpath(xp)
    # Total result count parsed out of text like "全部N条"; 10 results/page.
    res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
    res_num = int(res_num)
    page_num = 10
    loop_breaker = math.ceil(res_num / page_num)
    
    
    def write_res_html(browser, dir_='baidu_map_html/'):
        """Snapshot the browser's current page to a timestamped .html file,
        prefixed with an HTML comment holding the query (global ``input_``)
        and the current URL.  ``dir_`` must end with a path separator.
        """
        # NOTE(review): the original read the bare global ``input_``; use a
        # guarded lookup so the helper also works before the global exists.
        marker = globals().get('input_', '')
        current_url_ = '%s%s%s%s' % ('<!--', marker, browser.current_url, '-->')
        page_source = '%s%s' % (current_url_, browser.page_source)
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
        file_name = '%s%s%s%s' % (dir_, marker, localtime_, '.html')
        # FIX: ``fo.closed`` only read an attribute and never closed the
        # handle; use a context manager instead.
        with open(file_name, 'w', encoding='utf-8') as fo:
            fo.write(page_source)
    
    
    # Expand the full place list, snapshot page 1, then page through the rest.
    xp = '//*[@id="place-widget-placenewlist-showall"]'
    browser.find_element_by_xpath(xp).click()
    write_res_html(browser)
    
    # Scroll to the bottom so the pager is rendered, then go to page 2.
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(1)
    
    # One iteration per remaining result page (10 results per page).
    for i in range(1, loop_breaker, 1):
        sleep(1)
        # Dismiss the bottom banner covering the pager, scroll down,
        # click "next page" and snapshot it.
        xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
        browser.find_element_by_xpath(xp).click()
        js = "window.scrollTo(0,document.body.scrollHeight)"
        browser.execute_script(js)
        sleep(1)
        xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
        browser.find_element_by_xpath(xp_newpage).click()
        write_res_html(browser)
    

      

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    
    # --- Minimal proof-of-concept: search Sogou mobile and click through
    # to the Baidu Map result for a hard-coded query. ---
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
    # Emulate a 360x640 Android device so the mobile UI is served.
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    
    browser.get(url_seed)
    # Fill the search box via JS, then open the first vertical result.
    js = 'document.getElementsByClassName("input-default js_input")[0].value="深圳市南山区海岸城"'
    browser.execute_script(js)
    xp = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp).click()
    

      

    ua list — sample User-Agent strings, one per line (data consumed by the UA-rotation code, e.g. ua_list.txt)

    Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
    Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
    Opera/9.25 (Windows NT 5.1; U; en)
    Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
    Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
    Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)
    Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)
    Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20
    Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
    Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
    Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)
    Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12
    Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0

    import os, sys
    import time
    import logging
    import requests
    import threading
    
    from random import choice
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    
    """
    全局约定,便于后期做日志分析
    os._exit(INT)
    4001 4002 4003 4004
    """
    os_sep = os.sep
    this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[
        -1]
    base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
    log_abspath = '%s%s%s' % (base_dir, os_sep, 'log')
    
    """
    日志的记录不能依赖于日志类
    """
    now_, e = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), '启动脚本'
    logf, s = '%s%s%s%s' % (log_abspath, os_sep, this_file_name, now_), '%s%s%s%s' % (__file__, now_, os.getcwd(), e)
    with open(logf, 'a') as fo:
        fo.write(s)
        print(s)
    
    # Import the project DB helper; the package root is added to sys.path
    # first.  On failure, log by hand and hard-exit with code 4001.
    try:
        sys.path.append(base_dir)
        from core.utils import MysqlHelper
    except Exception as e:
        s = '%s%s%s' % (
            'from core.utils import MysqlHelper EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())),
            e)
        with open(logf, 'a') as fo:
            fo.write(s)
            print(s)
            os._exit(4001)
    
    # Configure file logging (append mode, same file as the manual startup
    # record).  On failure, log by hand and hard-exit with code 4002.
    try:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename=logf,
                            filemode='a')
    except Exception as e:
        s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
        with open(logf, 'a') as fo:
            fo.write(s)
            print(s)
            os._exit(4002)
    
    # Load the pool of User-Agent strings shipped next to this script.
    # On failure, log and hard-exit with code 4003.
    try:
    
        # FIX: the original had a literal line break inside the replace()
        # argument (scrape-mangled '\n'), which is a syntax error.
        fua, lua = '%s%s%s' % (this_file_abspath, os_sep,
                               'ua_list.txt'), []
        with open(fua, 'r') as fo:
            for i in fo:
                lua.append(i.replace('\n', ''))
    except Exception as e:
        s = '%s%s' % ('打开文件 EXCEPTION  ua文件路径: ', fua)
        logging.error(s)
        print(s)
        os._exit(4003)
    
    # PhantomJS capabilities: random UA per run; blank out the browser
    # fingerprint fields.
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = choice(lua)
    dcap['browserName'], dcap['platform'] = '', ''
    
    
    class MyThread(threading.Thread):
        """Minimal worker thread: runs ``func(args)`` once, carrying an
        explicit thread name."""
    
        def __init__(self, func, args, name):
            super().__init__()
            self.func = func
            self.args = args
            self.name = name
    
        def run(self):
            # ``args`` is passed as a single positional argument, matching
            # how callers construct these threads.
            self.func(self.args)
    
    
    # Wall-clock start of the run and the hard runtime budget (4 hours).
    ctrl_start, max_script_time = time.time(), 3600 * 4
    
    
    def ctrl_runtime(exit_type=''):
        """Terminate the process once the runtime budget is exhausted.

        exit_type selects the mechanism: '' -> exit(), 'sys' -> sys.exit()
        (both raise SystemExit), 'os' -> os._exit(4004) which kills the
        process immediately — the only variant that stops worker threads.
        """
        if time.time() - ctrl_start >= max_script_time:
            s = '%s%s%s%s%s%s%s%s%s' % (
                '程序开始执行时间', ctrl_start, '执行时间阈值', max_script_time, '终止执行', ' exit_type =', exit_type, ' threadID ',
                threading.get_ident())
            logging.info(s)
            if exit_type == '':
                exit(s)
            elif exit_type == 'sys':
                sys.exit(s)
            elif exit_type == 'os':
                # an integer is required
                # Required argument 'status' (pos 1) not found
                os._exit(4004)
    
    
    # Count of urls checked so far, shared across worker threads.
    url_counter = 0
    
    
    def main():
        """
        Check every currently-active url in test_order and record failures in
        test_error.  On an initialization/database exception the script
        re-launches itself indefinitely.
        """
    
        try:
            # Collect the ids already present in test_error so they are
            # skipped in this run.
            mysql_obj = MysqlHelper()
            q = 'SELECT direct_order_id FROM test_error;'
            tuple_l = mysql_obj.select(q)
            pass_id_l = [i[0] for i in tuple_l]
            pass_id_l = [str(i) for i in pass_id_l]
            pass_id_l_s = ','.join(pass_id_l)
            del mysql_obj, tuple_l
    
            # Each currently-active business url has exactly one row in test_order.
            #
            """
            Later task:
            once test_error has accumulated enough data, re-check those urls.
            # 3 feature points: the current half hour; anomalies of the
            # currently-active urls in test_order (the current 2 features);
            # (later: re-check urls accumulated in test_error)

            q = 'SELECT  url,id FROM test_order WHERE  unix_timestamp(now()) - create_time<=3600*48 AND id NOT in  ( %s )  ORDER BY id DESC ;' % (
                pass_id_l_s)

            q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in  ( %s )  ORDER BY id DESC ;' % (
                pass_id_l_s)

            """
    
            # Fetch the urls still to be checked (not yet expired, not
            # already recorded in test_error).
            mysql_obj = MysqlHelper()
            q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in  ( %s )  ORDER BY id DESC ;' % (
                pass_id_l_s)
            tuple_l = mysql_obj.select(q)
            del mysql_obj
            if len(tuple_l) == 0:
                s = '无待检测url,程序退出'
                print(s)
                logging.info(s)
        except Exception as e:
            # Self-restart on any initialization/database failure.
            s = '%s%s%s' % ('初始数据,查询数据库异常,无限次重启该脚本', e, time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())))
            print(s)
            logging.warning(s)
            cmd = 'python %s' % (__file__)
            os.system(cmd)
            os._exit(1024)
    
        # Since this script runs about once an hour, abnormal urls are handled
        # as: if the first request meets expectation, stop; otherwise retry at
        # most a few more times, 10s apart.
        sleep_counter, sleep_step, sleep_seconds, mycode_l, repeat_times, repeat_sleep_times = 0, 20, 1, [
            'g3user.com', '51g3.com.cn'], 4, 10
    
        # TODO: refactor into a base class (where-list support).
        # Currently adapted to the needs of the f_l field list.
        def get_onerow(url, f_l=['title', 'uid', 'money_total'], tab='test_order'):
            # Fetch one row of the given fields for `url`; returns -1 on error.
            # NOTE(review): mutable default f_l is never mutated here, so safe.
            t = -1
            try:
                mysql_obj = MysqlHelper()
                f_s = ','.join(f_l)
                q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
                s = '%s%s' % (' DB ', q)
                logging.info(s)
                t = mysql_obj.select(q)
                if t != -1:
                    t = t[0]
                del mysql_obj
            except Exception as e:
                s = '%s%s' % (' DB ', e)
                logging.info(s)
                return t
            return t
    
        def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
            # Check one url: first with requests, then (if our marker code is
            # not in the raw html) re-render with PhantomJS and re-check.
            # Returns {'ok': 1|0|-1, 'status_code': int, ['info': str]}.
            time.sleep(sleep_seconds)
            global url_counter
    
            ret = {}
            # db url status values - 0: cannot open, 1: opens but no ad, 2: handled
            ret['ok'], ret['status_code'], s = -1, -1, '%s%s%s%s' % (
                time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
            try:
                if url.find('http') == -1:
                    url = '%s%s' % (http_tag, url)
                r = requests.get(url)
                ret['status_code'], txt_pos = int(r.status_code), -1
                s = '%s,%s,%s,%s,%s' % (s, ret['status_code'], url, r, r.reason)
            except Exception as e:
                ret['ok'] = 0
                s = '%s %s %s' % (s, ' SPIDER ', e)
                logging.error(s)
                print(e, url)
    
            # For now, only a 200 from the target site gets a content check.
            if ret['status_code'] == 200:
                for ii in mycode_l:
                    if r.text.find(ii) > -1:
                        ret['ok'], txt_pos = 1, 1
                        break
                if txt_pos == -1:
                    # Marker not in raw html: render with PhantomJS in case it
                    # is injected by JavaScript.
                    try:
                        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                                     executable_path='/usr/local/phantomjs/bin/phantomjs')
                        driver.get(url)
                        time.sleep(1)
                        page_source = driver.page_source
                        driver.quit()
                        for ii in mycode_l:
                            if page_source.find(ii) > -1:
                                ret['ok'] = 1
                                break
                        if ret['ok'] == -1:
                            s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                            ret['ok'], ret['info'] = 0, s
                    except Exception as e:
                        s = '%s %s %s' % (s, ' SPIDER ', e)
                        logging.error(s)
                        print(e, url)
    
            # elif ret['status_code'] == 403:
            # www.hsdcw.com/fenlei/41668214.html
            elif ret['status_code'] == 403:
                # 403 is deliberately ignored (neither ok nor failure).
                # NOTE(review): this path leaves ret without an 'info' key.
                pass
            else:
                ret['ok'], ret['info'] = 0, s
    
            url_counter += 1
            s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
            print(s)
            if ret['ok'] == 0:
                logging.warning(s)
            else:
                logging.info(s)
            return ret
    
        # tn: total urls; tl: thread list; tstep: urls per worker thread.
        tn, tl, tstep = len(tuple_l), [], 4000
    
        def tf(ts):
            # Worker: check tuple_l[ts:ts+tstep] and record failures.
    
            te = ts + tstep
            te = min(te, tn)
            for i in tuple_l[ts:te]:
                ctrl_runtime(exit_type='os')
                url, chk_id = i
                s = '%s%s%s%s' % (
                    time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
                if chk_id in pass_id_l:
                    s = '%s%s' % (s, ' 跳过,之前test_error已写入该url ')
                    logging.info(s)
                    print(s)
                """
              Rule for iask.sina.com urls: do not check them.
              """
                if url.find('iask.sina.com') > -1:
                    continue
                write_db_flag = 1
                for t in range(0, repeat_times, 1):
                    ret = chk_exception_url(url, repeat_sleep_times)
                    if ret['ok'] == 1:
                        write_db_flag = 0
                        break
    
                if write_db_flag == 1:
                    try:
                        title, uid, money_total = get_onerow(url)
                    except Exception as e:
                        s = '%s%s%s' % (s, ' DB Exception-去test_order查', e)
                        logging.info(s)
                        print(s)
                        break
    
                    # Multithreading: due to DB limits of the underlying
                    # package, instantiate the helper per use and delete after.
                    try:
                        # Could be wrapped into the class constructor.
                        mysql_obj = MysqlHelper()
                    except Exception as e:
                        s = '%s%s%s' % (s, ' DB Exception- ', e)
                        logging.error(s)
                        print(s)
                        break
    
                    """
                    Multi-process / multi-thread concurrency:
                    to be optimized, e.g. with a queue.
                    """
                    # NOTE(review): SQL built via % formatting — injection-prone
                    # if urls are attacker-controlled; parameterize if possible.
                    q = 'SELECT id FROM test_error WHERE url="%s" LIMIT 1' % (url)
                    try:
                        r = mysql_obj.select(q)
                        s = '%s%s%s' % (s, ' -SQL- ', q)
                        logging.info(s)
                        print(q)
                    except Exception as e:
                        s = '%s %s %s %s' % (s, ' DB Exception-', q, e)
                        logging.info(s)
                        print(s)
                        break
    
                    ctime = int(time.time())
                    # The database design here could be improved.
                    db_status = 1 if ret['status_code'] == 200 else 0
                    if len(r) == 0:
                        # NOTE(review): ret['info'] may be missing when the last
                        # attempt hit the 403 branch above — would KeyError here.
                        q = 'INSERT INTO test_error (title,url,status,remarks,update_time,create_time,uid,money,direct_order_id) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                            title, url, db_status, ret['info'], ctime, ctime, uid, money_total, chk_id)
                        try:
                            mysql_obj.execute(q)
                            mysql_obj.commit()
                            del mysql_obj
                            s = '%s%s%s' % (s, ' DB SQL ok ', q)
                            logging.info(s)
                            print(s)
                        except Exception as e:
                            s = '%s%s%s%s' % (s, ' DB Exception- ', q, e)
                            logging.error(s)
                            print(s)
    
                    elif len(r) == 1:
                        continue
    
        # One worker thread per tstep-sized slice of tuple_l.
        # NOTE(review): `(i)` is just an int (not a tuple) — works only because
        # MyThread.run passes self.args as a single argument.
        for i in range(0, tn, tstep):
            if i >= tn:
                break
            thread_instance = MyThread(tf, (i), tf.__name__)
            tl.append(thread_instance)
    
        for t in tl:
            # NOTE(review): this assigns an attribute named setDaemon, shadowing
            # the Thread.setDaemon method — it does NOT make the thread
            # non-daemon (threads are non-daemon by default anyway).
            t.setDaemon = False
            t.start()
        for t in tl:
            t.join()
    
    
    if __name__ == '__main__':
        main()
    

      

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    # from selenium.webdriver.firefox.options import Options
    import time
    from time import sleep
    import math
    import random
    import sys
    import threading
    from random import choice
    # import urllib.parse
    from bs4 import BeautifulSoup

    # Pool of mobile User-Agent strings, one per line in mobile_ua.txt.
    # FIX: the pasted source lost its indentation and the '\n' escape in the
    # replace() argument; reconstructed here.
    ua_list = []
    with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
        for i in uafile:
            if i.find('Mozilla') > -1:
                ua_list.append(i.replace('\n', '').strip())
    
    # Highest valid index into ua_list, used with random.randint() below.
    ua_list_len_ = len(ua_list) - 1


    def close_alert(browser, attitude='accept'):
        """No-op stub kept for interface compatibility with the earlier
        spider scripts; the JS alert-suppression attempts below were
        abandoned.  Always returns None.

        FIX: the pasted source lost this function's body indentation;
        reconstructed here.
        """
        # js='alert(window.alert=function(str){return;}'
        # browser.execute_script(js)
    
        # js= 'window.alert = function(str){return ;}'
        # browser.execute_script(js)
        return


    # mobile_emulation = {
    # "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    # "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    # NOTE(review): ua_list_index is computed but never used — the mobile
    # emulation block below is commented out and a plain Chrome is launched.
    ua_list_index = random.randint(0, ua_list_len_)
    # mobile_emulation = {
    # "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    #
    # mobile_emulation['userAgent'] = choice(ua_list)
    # chrome_options = Options()
    # chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    # browser = webdriver.Chrome(chrome_options=chrome_options)
    browser = webdriver.Chrome()
    # Search Baidu mobile for the seed keyword and scrape the "related
    # search" suggestion links (class 'rw-item').
    s_wd = '长尾'
    url_seed = 'https://m.baidu.com/s?word=s_wd'
    
    url_seed = url_seed.replace('s_wd', s_wd)
    print(url_seed)
    browser.get(url_seed)
    
    # Each suggestion becomes {'contents': node children, 'href': target url}.
    rd = BeautifulSoup(browser.page_source, 'html.parser').find_all('a', class_='rw-item')
    res_d_l = [{'contents': d.contents, 'href': d.attrs['href']} for d in rd]
    browser.quit()
    # NOTE(review): leftover debugging value; appears unused.
    d = 3
  • 相关阅读:
    github单独下载一个文件夹
    搭建github服务器
    ssh xshell 连接在vim中无法用 ctrl+insert 复制黏贴
    centos 下文件夹共享
    rootkit 内核函数hook
    centos dhcp获取不到ip解决方法 Bringing up interface eth0: Device eth0 does not seem to be present,delaying initialization.
    ipc 入侵步骤
    linux 无交互添加用户设置密码
    C++笔记
    感谢路遥 感谢平凡的世界
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7396160.html
Copyright © 2011-2022 走看看