zoukankan      html  css  js  c++  java
  • spider_action

    Spiders Baidu Map place listings through Sogou's mobile search, using Chrome mobile emulation.

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    # from selenium.webdriver.firefox.options import Options
    import time
    from time import sleep
    import math
    import random
    import sys
    
    # Separators used to strip decorations from place names; note the first
    # entry is the full-width '(' and the second the ASCII '('.
    tag_jmtool_list = ['(', '(', '-']
    
    # Pool of mobile User-Agent strings, one per line in mobile_ua.txt.
    # FIX: the original source had a literal line break inside the replace()
    # argument (scrape-mangled '\n'), which is a syntax error.
    ua_list = []
    with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
        for i in uafile:
            if i.find('Mozilla') > -1:
                ua_list.append(i.replace('\n', '').strip())
    
    # Highest valid index into ua_list, used with random.randint() below.
    ua_list_len_ = len(ua_list) - 1
    
    
    def extract_name(name_):
        """Return the base place name: everything before the first occurrence
        of any separator listed in the module-level ``tag_jmtool_list``."""
        base = name_
        for sep in tag_jmtool_list:
            # str.partition keeps only the text before the separator, which is
            # exactly what split(sep)[0] produced.
            base = base.partition(sep)[0]
        return base
    
    
    target_type_list = ['住宅小区', '写字楼']
    # The second assignment deliberately overrides the first: only 住宅小区 rows
    # are crawled in this run.
    target_type_list = ['住宅小区']
    
    # target_dic[city][district][type_][name_reduction] ->
    #   {'name_reduction_list': [raw names], 'history_list': [raw csv rows]}
    target_dic = {}
    with open('JMTool0819am/任务JMTool.csv', 'r', encoding='utf-8') as csvfile:
        for i in csvfile:
            # FIX: the original had a literal line break inside the replace()
            # argument (scrape-mangled '\n'); restore the escape.
            l = i.replace(' ', '').replace('\n', '').split('";"')
            if l[0].replace('"', '') in target_type_list:
                type_, city, district, addr, name_ = l
                type_, name_ = type_.replace('"', ''), name_.replace('"', '')
                name_reduction = extract_name(name_)
    
                node = target_dic.setdefault(city, {}).setdefault(district, {}).setdefault(type_, {})
                # BUG FIX: the original tested
                # `name_reduction not in target_dic[city][district]` (one level
                # too high), so the lists were re-created for almost every row
                # and previously accumulated entries were lost.
                if name_reduction not in node:
                    node[name_reduction] = {'name_reduction_list': [], 'history_list': []}
    
                node[name_reduction]['name_reduction_list'].append(name_)
                node[name_reduction]['history_list'].append(l)
    
    
    def write_res_html(browser, dir_='baidu_map_html/'):
        """Snapshot the browser's current page to a timestamped .html file.
    
        The file content is prefixed with an HTML comment recording the query
        string (module-level ``input_``) and the current URL so each snapshot's
        origin can be recovered later.  ``dir_`` must end with a path separator.
        """
        # NOTE(review): the original read the bare global ``input_``; use a
        # guarded lookup so the helper also works before the global exists.
        marker = globals().get('input_', '')
        current_url_ = '%s%s%s%s' % ('<!--', marker, browser.current_url, '-->')
        page_source = '%s%s' % (current_url_, browser.page_source)
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
        file_name = '%s%s%s%s' % (dir_, marker, localtime_, '.html')
        # FIX: the original ended with ``fo.closed`` which merely reads an
        # attribute and never closed the handle; use a context manager.
        with open(file_name, 'w', encoding='utf-8') as fo:
            fo.write(page_source)
    
    
    def gen_random_letter():
        """Return one random lowercase ASCII letter ('a'..'z')."""
        # ord('a') == 97, ord('z') == 122; randrange's stop bound is exclusive.
        return chr(random.randrange(97, 123))
    
    
    def gen_random_num():
        """Return one random decimal digit, 0-9 inclusive.
    
        BUG FIX: the original used ``random.randint(0, 10)``, which also
        returns 10 (two characters) and so occasionally produced a 17-char
        pid in gen_sougo_pid() instead of the fixed 16 characters.
        """
        return random.randint(0, 9)
    
    
    def gen_sougo_pid():
        """Build a 16-character pseudo pid for the Sogou search URL:
        letters at positions 1, 3, 4 and 15, digits everywhere else
        (mimicking pids like '123asd...')."""
        letter_slots = {1, 3, 4, 15}
        chars = []
        for pos in range(1, 17):
            if pos in letter_slots:
                chars.append(gen_random_letter())
            else:
                chars.append(str(gen_random_num()))
        return ''.join(chars)
    
    
    def close_alert(browser, attitude='accept'):
        """Accept or dismiss a JavaScript alert if one is showing.
    
        Any failure (most commonly: no alert present) is swallowed and logged
        to stdout.  ``attitude`` is 'accept' or 'dismiss'.
        """
        try:
            sleep(2)  # give the page a moment to raise the alert
            # BUG FIX: ``switch_to.alert`` is a property returning an Alert;
            # the original called it (``alert()``), raising TypeError and
            # always falling into the except branch.
            al = browser.switch_to.alert
            sleep(1)
            if attitude == 'accept':
                al.accept()
            elif attitude == 'dismiss':
                al.dismiss()
            print(sys._getframe().f_lineno, 'alert-closed-ok')
        except Exception as e:
            # FIX: print the caught exception instance, not the Exception class.
            print(sys._getframe().f_lineno, e, 'no-alert')
    
    
    # input_ = '深圳市南山区荟芳园'
    
    def mobile_mobile_pages_html(input_):
        """Search Sogou mobile for `input_`, follow the Baidu Map result and
        snapshot every place-list result page to baidu_map_html/ via
        write_res_html().  Returns None; all output is file side effects.

        NOTE(review): the early ``return`` paths below leave ``browser``
        running (no quit) — presumably intentional best-effort, but verify.
        """
        # mobile_emulation = {
        #     "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        #     "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
        # Pick a random mobile UA from the module-level pool for this session.
        ua_list_index = random.randint(0, ua_list_len_)
        mobile_emulation = {
            "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    
        mobile_emulation['userAgent'] = ua_list[ua_list_index]
        chrome_options = Options()
        chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
        browser = webdriver.Chrome(chrome_options=chrome_options)
    
        # Randomized pid makes each search URL look like a distinct client.
        url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-%s-0007&keyword=百度地图' % (gen_sougo_pid())
        print(url_seed)
        browser.get(url_seed)
        # Type the query into the search box via JS, then click the first
        # Baidu Map vertical-result link.
        js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
        browser.execute_script(js)
        xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
        browser.find_element_by_xpath(xp_newpage).click()
        sleep(2)
    
        # xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
        # sleep(1)
        # browser.find_element_by_xpath(xp).click()
        close_alert(browser)
        try:
            # "Show all N results" header of the place list; if absent there
            # is nothing to crawl for this query.
            xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
            sleep(2)
            close_alert(browser)
            browser.find_element_by_xpath(xp)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            return
        close_alert(browser)
        if browser.find_element_by_xpath(xp).text.find('全部') == -1:
            return
        # Parse the total result count out of text like "全部N条".
        res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
        res_num = int(res_num)
        page_num = 10
        # Number of result pages to walk (10 results per page).
        loop_breaker = math.ceil(res_num / page_num)
    
        close_alert(browser)
        if res_num <= page_num:
            # Single page of results: snapshot and finish.
            write_res_html(browser)
            browser.quit()
            return
        close_alert(browser)
        # Expand to the full result list, snapshot page 1, then page through.
        xp = '//*[@id="place-widget-placenewlist-showall"]'
        browser.find_element_by_xpath(xp).click()
        write_res_html(browser)
        close_alert(browser)
        js = "window.scrollTo(0,document.body.scrollHeight)"
        browser.execute_script(js)
        sleep(1)
        try:
            # "Next page" control on the expanded list.
            xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
            browser.find_element_by_xpath(xp_newpage).click()
            sleep(1)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            write_res_html(browser)
            browser.quit()
            return
    
        for i in range(1, loop_breaker, 1):
            sleep(1)
            try:
                # Dismiss the bottom banner that covers the pager.
                xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
                sleep(3)
                browser.find_element_by_xpath(xp).click()
            except Exception:
                print(sys._getframe().f_lineno, Exception)
                sleep(10)
                break
            try:
                js = "window.scrollTo(0,document.body.scrollHeight)"
                browser.execute_script(js)
                sleep(1)
            except Exception:
                print(sys._getframe().f_lineno, Exception)
                sleep(10)
            try:
                # Click "next page" and snapshot the page we navigated to.
                xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
                sleep(1)
                print(input_, i)
                browser.find_element_by_xpath(xp_newpage).click()
                write_res_html(browser)
            except Exception:
                print(sys._getframe().f_lineno, Exception)
                sleep(10)
        sleep(2)
        browser.quit()
    
    
    # Walk every city / district / type / deduplicated-name combination and
    # crawl it.  ``input_`` must stay a module-level global because
    # write_res_html() reads it when naming the snapshot files.
    for city, district_map in target_dic.items():
        for district, type_map in district_map.items():
            for type_, name_map in type_map.items():
                for name_reduction in name_map:
                    input_ = '%s%s%s' % (city, district, name_reduction)
                    mobile_mobile_pages_html(input_)
    

      

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    from time import sleep
    import math
    
    # --- One-off interactive version of the spider above (fixed query). ---
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
    # Emulate a 360x640 Android device so Sogou serves the mobile UI.
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    
    browser.get(url_seed)
    # Query string; also read as a global by write_res_html() below.
    input_ = '深圳市南山区荟芳园'
    
    # Type the query via JS and click the Baidu Map vertical result.
    js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
    browser.execute_script(js)
    xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(1)
    
    # Dismiss the bottom banner.
    xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    browser.find_element_by_xpath(xp).click()
    
    xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
    # NOTE(review): this lookup's result is discarded; presumably an
    # existence probe before reading .text on the next line.
    browser.find_element_by_xpath(xp)
    # Total result count parsed out of text like "全部N条"; 10 results/page.
    res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
    res_num = int(res_num)
    page_num = 10
    loop_breaker = math.ceil(res_num / page_num)
    
    
    def write_res_html(browser, dir_='baidu_map_html/'):
        """Snapshot the browser's current page to a timestamped .html file,
        prefixed with an HTML comment holding the query (global ``input_``)
        and the current URL.  ``dir_`` must end with a path separator.
        """
        # NOTE(review): the original read the bare global ``input_``; use a
        # guarded lookup so the helper also works before the global exists.
        marker = globals().get('input_', '')
        current_url_ = '%s%s%s%s' % ('<!--', marker, browser.current_url, '-->')
        page_source = '%s%s' % (current_url_, browser.page_source)
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
        file_name = '%s%s%s%s' % (dir_, marker, localtime_, '.html')
        # FIX: ``fo.closed`` only read an attribute and never closed the
        # handle; use a context manager instead.
        with open(file_name, 'w', encoding='utf-8') as fo:
            fo.write(page_source)
    
    
    # Expand the full place list, snapshot page 1, then page through the rest.
    xp = '//*[@id="place-widget-placenewlist-showall"]'
    browser.find_element_by_xpath(xp).click()
    write_res_html(browser)
    
    # Scroll to the bottom so the pager is rendered, then go to page 2.
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(1)
    
    # One iteration per remaining result page (10 results per page).
    for i in range(1, loop_breaker, 1):
        sleep(1)
        # Dismiss the bottom banner covering the pager, scroll down,
        # click "next page" and snapshot it.
        xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
        browser.find_element_by_xpath(xp).click()
        js = "window.scrollTo(0,document.body.scrollHeight)"
        browser.execute_script(js)
        sleep(1)
        xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
        browser.find_element_by_xpath(xp_newpage).click()
        write_res_html(browser)
    

      

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    
    # --- Minimal proof-of-concept: search Sogou mobile and click through
    # to the Baidu Map result for a hard-coded query. ---
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
    # Emulate a 360x640 Android device so the mobile UI is served.
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    
    browser.get(url_seed)
    # Fill the search box via JS, then open the first vertical result.
    js = 'document.getElementsByClassName("input-default js_input")[0].value="深圳市南山区海岸城"'
    browser.execute_script(js)
    xp = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp).click()
    

      

    ua list — sample User-Agent strings, one per line (data consumed by the UA-rotation code, e.g. ua_list.txt)

    Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
    Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
    Opera/9.25 (Windows NT 5.1; U; en)
    Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
    Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
    Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)
    Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)
    Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20
    Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
    Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
    Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)
    Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12
    Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0

    import os, sys
    import time
    import logging
    import requests
    import threading
    
    from random import choice
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    
    """
    全局约定,便于后期做日志分析
    os._exit(INT)
    4001 4002 4003 4004
    """
    os_sep = os.sep
    this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[
        -1]
    base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
    log_abspath = '%s%s%s' % (base_dir, os_sep, 'log')
    
    """
    日志的记录不能依赖于日志类
    """
    now_, e = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), '启动脚本'
    logf, s = '%s%s%s%s' % (log_abspath, os_sep, this_file_name, now_), '%s%s%s%s' % (__file__, now_, os.getcwd(), e)
    with open(logf, 'a') as fo:
        fo.write(s)
        print(s)
    
    # Import the project DB helper; the package root is added to sys.path
    # first.  On failure, log by hand and hard-exit with code 4001.
    try:
        sys.path.append(base_dir)
        from core.utils import MysqlHelper
    except Exception as e:
        s = '%s%s%s' % (
            'from core.utils import MysqlHelper EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())),
            e)
        with open(logf, 'a') as fo:
            fo.write(s)
            print(s)
            os._exit(4001)
    
    # Configure file logging (append mode, same file as the manual startup
    # record).  On failure, log by hand and hard-exit with code 4002.
    try:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename=logf,
                            filemode='a')
    except Exception as e:
        s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
        with open(logf, 'a') as fo:
            fo.write(s)
            print(s)
            os._exit(4002)
    
    # Load the pool of User-Agent strings shipped next to this script.
    # On failure, log and hard-exit with code 4003.
    try:
    
        # FIX: the original had a literal line break inside the replace()
        # argument (scrape-mangled '\n'), which is a syntax error.
        fua, lua = '%s%s%s' % (this_file_abspath, os_sep,
                               'ua_list.txt'), []
        with open(fua, 'r') as fo:
            for i in fo:
                lua.append(i.replace('\n', ''))
    except Exception as e:
        s = '%s%s' % ('打开文件 EXCEPTION  ua文件路径: ', fua)
        logging.error(s)
        print(s)
        os._exit(4003)
    
    # PhantomJS capabilities: random UA per run; blank out the browser
    # fingerprint fields.
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = choice(lua)
    dcap['browserName'], dcap['platform'] = '', ''
    
    
    class MyThread(threading.Thread):
        """Minimal worker thread: runs ``func(args)`` once, carrying an
        explicit thread name."""
    
        def __init__(self, func, args, name):
            super().__init__()
            self.func = func
            self.args = args
            self.name = name
    
        def run(self):
            # ``args`` is passed as a single positional argument, matching
            # how callers construct these threads.
            self.func(self.args)
    
    
    # Wall-clock start of the run and the hard runtime budget (4 hours).
    ctrl_start, max_script_time = time.time(), 3600 * 4
    
    
    def ctrl_runtime(exit_type=''):
        """Terminate the process once the runtime budget is exhausted.

        exit_type selects the mechanism: '' -> exit(), 'sys' -> sys.exit()
        (both raise SystemExit), 'os' -> os._exit(4004) which kills the
        process immediately — the only variant that stops worker threads.
        """
        if time.time() - ctrl_start >= max_script_time:
            s = '%s%s%s%s%s%s%s%s%s' % (
                '程序开始执行时间', ctrl_start, '执行时间阈值', max_script_time, '终止执行', ' exit_type =', exit_type, ' threadID ',
                threading.get_ident())
            logging.info(s)
            if exit_type == '':
                exit(s)
            elif exit_type == 'sys':
                sys.exit(s)
            elif exit_type == 'os':
                # an integer is required
                # Required argument 'status' (pos 1) not found
                os._exit(4004)
    
    
    # Count of urls checked so far, shared across worker threads.
    url_counter = 0
    
    
    def main():
        """
        Check every currently-active url in test_order and record failures in
        test_error.  On an initialization/database exception the script
        re-launches itself indefinitely.
        """
    
        try:
            # Collect the ids already present in test_error so they are
            # skipped in this run.
            mysql_obj = MysqlHelper()
            q = 'SELECT direct_order_id FROM test_error;'
            tuple_l = mysql_obj.select(q)
            pass_id_l = [i[0] for i in tuple_l]
            pass_id_l = [str(i) for i in pass_id_l]
            pass_id_l_s = ','.join(pass_id_l)
            del mysql_obj, tuple_l
    
            # Each currently-active business url has exactly one row in test_order.
            #
            """
            Later task:
            once test_error has accumulated enough data, re-check those urls.
            # 3 feature points: the current half hour; anomalies of the
            # currently-active urls in test_order (the current 2 features);
            # (later: re-check urls accumulated in test_error)

            q = 'SELECT  url,id FROM test_order WHERE  unix_timestamp(now()) - create_time<=3600*48 AND id NOT in  ( %s )  ORDER BY id DESC ;' % (
                pass_id_l_s)

            q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in  ( %s )  ORDER BY id DESC ;' % (
                pass_id_l_s)

            """
    
            # Fetch the urls still to be checked (not yet expired, not
            # already recorded in test_error).
            mysql_obj = MysqlHelper()
            q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in  ( %s )  ORDER BY id DESC ;' % (
                pass_id_l_s)
            tuple_l = mysql_obj.select(q)
            del mysql_obj
            if len(tuple_l) == 0:
                s = '无待检测url,程序退出'
                print(s)
                logging.info(s)
        except Exception as e:
            # Self-restart on any initialization/database failure.
            s = '%s%s%s' % ('初始数据,查询数据库异常,无限次重启该脚本', e, time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())))
            print(s)
            logging.warning(s)
            cmd = 'python %s' % (__file__)
            os.system(cmd)
            os._exit(1024)
    
        # Since this script runs about once an hour, abnormal urls are handled
        # as: if the first request meets expectation, stop; otherwise retry at
        # most a few more times, 10s apart.
        sleep_counter, sleep_step, sleep_seconds, mycode_l, repeat_times, repeat_sleep_times = 0, 20, 1, [
            'g3user.com', '51g3.com.cn'], 4, 10
    
        # TODO: refactor into a base class (where-list support).
        # Currently adapted to the needs of the f_l field list.
        def get_onerow(url, f_l=['title', 'uid', 'money_total'], tab='test_order'):
            # Fetch one row of the given fields for `url`; returns -1 on error.
            # NOTE(review): mutable default f_l is never mutated here, so safe.
            t = -1
            try:
                mysql_obj = MysqlHelper()
                f_s = ','.join(f_l)
                q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
                s = '%s%s' % (' DB ', q)
                logging.info(s)
                t = mysql_obj.select(q)
                if t != -1:
                    t = t[0]
                del mysql_obj
            except Exception as e:
                s = '%s%s' % (' DB ', e)
                logging.info(s)
                return t
            return t
    
        def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
            # Check one url: first with requests, then (if our marker code is
            # not in the raw html) re-render with PhantomJS and re-check.
            # Returns {'ok': 1|0|-1, 'status_code': int, ['info': str]}.
            time.sleep(sleep_seconds)
            global url_counter
    
            ret = {}
            # db url status values - 0: cannot open, 1: opens but no ad, 2: handled
            ret['ok'], ret['status_code'], s = -1, -1, '%s%s%s%s' % (
                time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
            try:
                if url.find('http') == -1:
                    url = '%s%s' % (http_tag, url)
                r = requests.get(url)
                ret['status_code'], txt_pos = int(r.status_code), -1
                s = '%s,%s,%s,%s,%s' % (s, ret['status_code'], url, r, r.reason)
            except Exception as e:
                ret['ok'] = 0
                s = '%s %s %s' % (s, ' SPIDER ', e)
                logging.error(s)
                print(e, url)
    
            # For now, only a 200 from the target site gets a content check.
            if ret['status_code'] == 200:
                for ii in mycode_l:
                    if r.text.find(ii) > -1:
                        ret['ok'], txt_pos = 1, 1
                        break
                if txt_pos == -1:
                    # Marker not in raw html: render with PhantomJS in case it
                    # is injected by JavaScript.
                    try:
                        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                                     executable_path='/usr/local/phantomjs/bin/phantomjs')
                        driver.get(url)
                        time.sleep(1)
                        page_source = driver.page_source
                        driver.quit()
                        for ii in mycode_l:
                            if page_source.find(ii) > -1:
                                ret['ok'] = 1
                                break
                        if ret['ok'] == -1:
                            s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                            ret['ok'], ret['info'] = 0, s
                    except Exception as e:
                        s = '%s %s %s' % (s, ' SPIDER ', e)
                        logging.error(s)
                        print(e, url)
    
            # elif ret['status_code'] == 403:
            # www.hsdcw.com/fenlei/41668214.html
            elif ret['status_code'] == 403:
                # 403 is deliberately ignored (neither ok nor failure).
                # NOTE(review): this path leaves ret without an 'info' key.
                pass
            else:
                ret['ok'], ret['info'] = 0, s
    
            url_counter += 1
            s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
            print(s)
            if ret['ok'] == 0:
                logging.warning(s)
            else:
                logging.info(s)
            return ret
    
        # tn: total urls; tl: thread list; tstep: urls per worker thread.
        tn, tl, tstep = len(tuple_l), [], 4000
    
        def tf(ts):
            # Worker: check tuple_l[ts:ts+tstep] and record failures.
    
            te = ts + tstep
            te = min(te, tn)
            for i in tuple_l[ts:te]:
                ctrl_runtime(exit_type='os')
                url, chk_id = i
                s = '%s%s%s%s' % (
                    time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
                if chk_id in pass_id_l:
                    s = '%s%s' % (s, ' 跳过,之前test_error已写入该url ')
                    logging.info(s)
                    print(s)
                """
              Rule for iask.sina.com urls: do not check them.
              """
                if url.find('iask.sina.com') > -1:
                    continue
                write_db_flag = 1
                for t in range(0, repeat_times, 1):
                    ret = chk_exception_url(url, repeat_sleep_times)
                    if ret['ok'] == 1:
                        write_db_flag = 0
                        break
    
                if write_db_flag == 1:
                    try:
                        title, uid, money_total = get_onerow(url)
                    except Exception as e:
                        s = '%s%s%s' % (s, ' DB Exception-去test_order查', e)
                        logging.info(s)
                        print(s)
                        break
    
                    # Multithreading: due to DB limits of the underlying
                    # package, instantiate the helper per use and delete after.
                    try:
                        # Could be wrapped into the class constructor.
                        mysql_obj = MysqlHelper()
                    except Exception as e:
                        s = '%s%s%s' % (s, ' DB Exception- ', e)
                        logging.error(s)
                        print(s)
                        break
    
                    """
                    Multi-process / multi-thread concurrency:
                    to be optimized, e.g. with a queue.
                    """
                    # NOTE(review): SQL built via % formatting — injection-prone
                    # if urls are attacker-controlled; parameterize if possible.
                    q = 'SELECT id FROM test_error WHERE url="%s" LIMIT 1' % (url)
                    try:
                        r = mysql_obj.select(q)
                        s = '%s%s%s' % (s, ' -SQL- ', q)
                        logging.info(s)
                        print(q)
                    except Exception as e:
                        s = '%s %s %s %s' % (s, ' DB Exception-', q, e)
                        logging.info(s)
                        print(s)
                        break
    
                    ctime = int(time.time())
                    # The database design here could be improved.
                    db_status = 1 if ret['status_code'] == 200 else 0
                    if len(r) == 0:
                        # NOTE(review): ret['info'] may be missing when the last
                        # attempt hit the 403 branch above — would KeyError here.
                        q = 'INSERT INTO test_error (title,url,status,remarks,update_time,create_time,uid,money,direct_order_id) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                            title, url, db_status, ret['info'], ctime, ctime, uid, money_total, chk_id)
                        try:
                            mysql_obj.execute(q)
                            mysql_obj.commit()
                            del mysql_obj
                            s = '%s%s%s' % (s, ' DB SQL ok ', q)
                            logging.info(s)
                            print(s)
                        except Exception as e:
                            s = '%s%s%s%s' % (s, ' DB Exception- ', q, e)
                            logging.error(s)
                            print(s)
    
                    elif len(r) == 1:
                        continue
    
        # One worker thread per tstep-sized slice of tuple_l.
        # NOTE(review): `(i)` is just an int (not a tuple) — works only because
        # MyThread.run passes self.args as a single argument.
        for i in range(0, tn, tstep):
            if i >= tn:
                break
            thread_instance = MyThread(tf, (i), tf.__name__)
            tl.append(thread_instance)
    
        for t in tl:
            # NOTE(review): this assigns an attribute named setDaemon, shadowing
            # the Thread.setDaemon method — it does NOT make the thread
            # non-daemon (threads are non-daemon by default anyway).
            t.setDaemon = False
            t.start()
        for t in tl:
            t.join()
    
    
    if __name__ == '__main__':
        main()
    

      

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    # from selenium.webdriver.firefox.options import Options
    import time
    from time import sleep
    import math
    import random
    import sys
    import threading
    from random import choice
    # import urllib.parse
    from bs4 import BeautifulSoup

    # Pool of mobile User-Agent strings, one per line in mobile_ua.txt.
    # FIX: the pasted source lost its indentation and the '\n' escape in the
    # replace() argument; reconstructed here.
    ua_list = []
    with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
        for i in uafile:
            if i.find('Mozilla') > -1:
                ua_list.append(i.replace('\n', '').strip())
    
    # Highest valid index into ua_list, used with random.randint() below.
    ua_list_len_ = len(ua_list) - 1


    def close_alert(browser, attitude='accept'):
        """No-op stub kept for interface compatibility with the earlier
        spider scripts; the JS alert-suppression attempts below were
        abandoned.  Always returns None.

        FIX: the pasted source lost this function's body indentation;
        reconstructed here.
        """
        # js='alert(window.alert=function(str){return;}'
        # browser.execute_script(js)
    
        # js= 'window.alert = function(str){return ;}'
        # browser.execute_script(js)
        return


    # mobile_emulation = {
    # "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    # "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    # NOTE(review): ua_list_index is computed but never used — the mobile
    # emulation block below is commented out and a plain Chrome is launched.
    ua_list_index = random.randint(0, ua_list_len_)
    # mobile_emulation = {
    # "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    #
    # mobile_emulation['userAgent'] = choice(ua_list)
    # chrome_options = Options()
    # chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    # browser = webdriver.Chrome(chrome_options=chrome_options)
    browser = webdriver.Chrome()
    # Search Baidu mobile for the seed keyword and scrape the "related
    # search" suggestion links (class 'rw-item').
    s_wd = '长尾'
    url_seed = 'https://m.baidu.com/s?word=s_wd'
    
    url_seed = url_seed.replace('s_wd', s_wd)
    print(url_seed)
    browser.get(url_seed)
    
    # Each suggestion becomes {'contents': node children, 'href': target url}.
    rd = BeautifulSoup(browser.page_source, 'html.parser').find_all('a', class_='rw-item')
    res_d_l = [{'contents': d.contents, 'href': d.attrs['href']} for d in rd]
    browser.quit()
    # NOTE(review): leftover debugging value; appears unused.
    d = 3
  • 相关阅读:
    github单独下载一个文件夹
    搭建github服务器
    ssh xshell 连接在vim中无法用 ctrl+insert 复制黏贴
    centos 下文件夹共享
    rootkit 内核函数hook
    centos dhcp获取不到ip解决方法 Bringing up interface eth0: Device eth0 does not seem to be present,delaying initialization.
    ipc 入侵步骤
    linux 无交互添加用户设置密码
    C++笔记
    感谢路遥 感谢平凡的世界
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7396160.html
Copyright © 2011-2022 走看看