zoukankan      html  css  js  c++  java
  • 反抗分析

    玩淘宝要做访问意图分析,玩今日头条要做访问路径、意图的反抗分析:在生态里边,没有上下班的概念,这才是all in

    from selenium import webdriver
    from  time import sleep
    import time
    from selenium.webdriver.common.keys import Keys
    import os
    
    import requests
    import time
    import threading
    import logging
    import random
    
    # Wall-clock timestamp taken once when the script starts.
    start_time = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))
    os_sep = os.sep
    _this_file = os.path.abspath(__file__)
    this_file_abspath, this_file_name = os.path.dirname(_this_file), os.path.basename(_this_file)
    # The log file is named after the script; the path is relative, so it is
    # created in the current working directory.
    logf = this_file_name + '.log'
    try:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename=logf,
                            filemode='a')
    except Exception as e:
        s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
        try:
            with open(logf, 'a') as fo:
                fo.write(s)
        except OSError as write_error:
            # The log file itself may be what is broken; still report below.
            print(write_error, flush=True)
        # os._exit() terminates without flushing buffers, so the file has to be
        # closed and stdout flushed before it is called.
        print(s, flush=True)
        os._exit(4002)

    logging.info('START')
    
    # Sample image URL.
    img_url = 'https://s3.pstatp.com/toutiao/static/img/logo.201f80d.png'
    # Directory for downloaded images. Backslashes are doubled so that sequences
    # such as "\U" and "\t" are not read as escapes and the final backslash does
    # not swallow the closing quote. The trailing separator is required because
    # the download helper builds file paths by plain string concatenation.
    img_dir = 'C:\\Users\\sas\\PycharmProjects\\py_win_to_unix\\crontab_chk_url\\personas\\trunk\\plugins\\spider\\dl_img_tmp\\'
    
    
    def spider_webimg_dl_return_local_img_path(img_dir, img_url, local_default='default.DONOT_REMOVE.png', timeout=30):
        """Download *img_url* into *img_dir* and return the path of the saved file.

        The download is best-effort: whenever no image could be stored, the path
        of the default image is returned instead.

        Args:
            img_dir: Target directory; it must already end with a path separator
                because paths are built by string concatenation.
            img_url: URL of the image to download.
            local_default: File name (inside *img_dir*) returned on failure.
            timeout: Seconds to wait for the HTTP request before giving up.

        Returns:
            Path of the written file, or ``img_dir + local_default`` when the
            request fails, the server answers with an error status, the body is
            empty, or the file cannot be written.
        """
        fallback_path = '%s%s' % (img_dir, local_default)
        try:
            response = requests.get(img_url, timeout=timeout)
            # An error page must not be stored as if it were an image.
            response.raise_for_status()
            payload = response.content
            if not payload:
                return fallback_path
            # Replace the characters of the URL that cannot be used in a file name.
            safe_name = img_url.translate(str.maketrans({
                '/': '_xl_',
                ':': '_fxl_',
                '?': '_fxlquestion_',
                '=': '_fxlequal_',
                '&': '_fxland_',
            }))
            # Timestamp plus thread id keeps concurrent downloads of the same URL apart.
            local_path = '%s%s%s%s%s' % (
                img_dir, time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())),
                str(threading.get_ident()), safe_name, '.png')
            with open(local_path, 'wb') as f:
                f.write(payload)
        except Exception as e:
            # Deliberately broad: any failure falls back to the default image.
            print(e)
            logging.exception(e)
            return fallback_path
        return local_path
    
    
    import pymysql
    
    # MySQL connection settings: host, port, user, password and schema name.
    h = '192.168.22.21'
    pt = 3306
    u = 'root'
    p = 'mp'
    db = 'tab_media_joke'
    
    
    def mysql_fetch(sql, res_type='tuple'):
        """Execute *sql* on a fresh connection and return every resulting row.

        Args:
            sql: Complete SQL statement; it is passed to the cursor unchanged.
            res_type: ``'dic'`` selects ``pymysql.cursors.DictCursor``; any other
                value uses the connection's default cursor.

        Returns:
            The result of ``cursor.fetchall()``, or an empty tuple when the
            connection cannot be opened.

        Raises:
            Errors raised while executing the statement propagate to the caller;
            the cursor and the connection are closed first.
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return ()
        try:
            if res_type == 'dic':
                cursor = conn.cursor(pymysql.cursors.DictCursor)
            else:
                cursor = conn.cursor()
            try:
                cursor.execute(sql)
                # Read the rows while the cursor and connection are still open.
                rows = cursor.fetchall()
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()
        return rows
    
    
    def mysql_write(sql):
        """Execute *sql* on a fresh connection and commit it.

        Args:
            sql: Complete SQL statement; it is passed to the cursor unchanged.

        Returns:
            0 after the statement was executed and committed, 1 when the
            connection cannot be opened.

        Raises:
            Errors raised while executing or committing propagate to the caller;
            the cursor and the connection are closed first.
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return 1
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql)
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()
        return 0
    
    
    import random
    
    from selenium.webdriver.common.by import By

    # Article pages; one of them is opened before the login page on every run.
    _ENTRY_URLS = (
        'https://www.toutiao.com/a6514526304476332552/',
        'https://www.toutiao.com/a6514661446876398088/',
        'https://www.toutiao.com/a6514778729951003150/',
        'https://www.toutiao.com/a6514216125151052291/',
        'https://www.toutiao.com/a6512315164463727111/',
        'https://www.toutiao.com/a6513334304318161411/',
    )
    _LOGIN_URL = 'https://sso.toutiao.com/login/?service=https://mp.toutiao.com/sso_confirm/?redirect_url=/'
    _HOME_URL = 'https://www.toutiao.com/'
    _PUBLISH_URL = 'https://mp.toutiao.com/profile_v3/graphic/publish'
    _WUKONG_HOME_URL = 'https://www.wukong.com/'
    # NOTE(review): the query is pinned to a single account id (7856582); the
    # original script also carried an "id>236" variant that was immediately
    # overwritten by this one. Confirm which selection is intended.
    _ACCOUNT_SQL = 'SELECT username,password,toutiaoid  FROM joke_tab_joke_namepwd WHERE status=1 AND category=1 AND id=7856582 AND NOT  (toutiaoid IS NULL OR toutiaoid="" )'


    def _js_goto(browser, url):
        """Navigate by assigning ``window.location.href`` from page script."""
        browser.execute_script('window.location.href=arguments[0];', url)


    def _login_with_qq(browser, myid, mypwd):
        """Pick the third login option, fill in the credentials and submit."""
        browser.find_element(By.XPATH, '/html/body/div/div/div[2]/div/div/div/ul/li[3]').click()
        time.sleep(10)
        # Values are passed as script arguments instead of being spliced into
        # the JavaScript source, so quotes in a credential cannot break it.
        browser.execute_script('document.getElementById("u").value=arguments[0];', myid)
        browser.execute_script('document.getElementById("p").value=arguments[0];', mypwd)
        time.sleep(random.randint(5, 15))
        browser.find_element(By.XPATH, '//*[@id="go"]').click()
        time.sleep(random.randint(10, 20))


    def _random_question_title():
        """Build a question title from a random month, audience, attribute and product."""
        month = random.randint(1, 12)
        audience = random.choice(['老人', '小孩', '白领', '前台妹子', '行政妹子', '大学生', '高中生'])
        attribute = random.choice(['类型', '特质', '种类', '价位', '原材料', '主要成分', '价格'])
        product = random.choice(['口红', '指甲油', '护发素', '沐浴露', '洗手液', '洗发水', '牙膏'])
        return '{}月份,{}适合使用什么{}{}'.format(month, audience, attribute, product)


    def _submit_question(browser, toutiao_uid):
        """Submit one generated question and return the URL of the first question
        link found on the user's page afterwards."""
        _js_goto(browser, _PUBLISH_URL)
        time.sleep(6)
        browser.execute_script('document.getElementsByClassName("ask")[0].click();')
        time.sleep(12 + random.randint(10, 20))

        title_input = browser.find_element(By.CLASS_NAME, 'input-box').find_element(By.TAG_NAME, 'input')
        browser.execute_script(
            'document.getElementsByClassName("input-box")[0].childNodes[0].value=arguments[0];',
            _random_question_title())
        time.sleep(12)
        # A real keyboard event is needed in addition to the scripted value
        # (anti-crawler measure on the page).
        title_input.send_keys(Keys.SPACE)

        browser.execute_script('document.getElementsByClassName("step-btn next")[0].click();')
        browser.execute_script('document.getElementsByClassName("step-btn submit")[0].click();')
        time.sleep(12)

        _js_goto(browser, 'https://www.wukong.com/user/?uid={}&type=1'.format(toutiao_uid))
        time.sleep(12)
        first_link = browser.find_element(By.CLASS_NAME, 'question-title').find_elements(By.TAG_NAME, 'a')[0]
        return first_link.get_attribute('href')


    while True:
        logging.info('LOOP----')
        for myid, mypwd, toutiao_uid in mysql_fetch(_ACCOUNT_SQL):
            # Publish-restriction logic: rows whose uid list contains this
            # account and whose time_effective is not in the future.
            sql = "SELECT * FROM joke_joke_article_publish  WHERE  INSTR(CONCAT(',',id_toutiao_uid_list,','),CONCAT(',','{}',',')) AND time_effective<={}  ORDER BY id DESC; ".format(
                toutiao_uid, int(time.time()))
            print(sql)
            logging.info(sql)
            publish_rows = mysql_fetch(sql, 'dic')
            if not publish_rows:
                continue

            # Articles from those rows with no entry in the publish-result table
            # for this account.
            sql = 'SELECT * FROM joke_joke_article WHERE id IN ({}) AND id  NOT IN (SELECT article_id FROM  joke_joke_article_publish_result WHERE 1 AND toutiao_uid="{}" ) LIMIT 2; '.format(
                ','.join(row['id_article_list'] for row in publish_rows), toutiao_uid)
            logging.info(sql)
            articles = mysql_fetch(sql, 'dic')
            if not articles:
                continue

            browser = webdriver.Chrome()
            try:
                browser.get(_ENTRY_URLS[int(time.time()) % len(_ENTRY_URLS)])
                time.sleep(random.randint(10, 20))

                _js_goto(browser, _LOGIN_URL)
                time.sleep(random.randint(10, 20))
                # NOTE(review): the original navigated to the login URL a second
                # time here without waiting; kept as-is, confirm it is needed.
                _js_goto(browser, _LOGIN_URL)

                _login_with_qq(browser, myid, mypwd)
                time.sleep(5)

                browser.refresh()
                _js_goto(browser, _HOME_URL)
                time.sleep(6)

                # Only the first pending article is handled per account and round.
                dbid = articles[0]['id']
                res_url = _submit_question(browser, toutiao_uid)

                # NOTE(review): the result goes to a different table than the
                # one the NOT IN filter above reads; confirm this is intended.
                sql = 'INSERT INTO  joke_tab_joke_toutiaouser_action_wukong_question_action (article_id,article_url,time_script,toutiao_uid) VALUE("%s","%s","%s","%s");' % (
                    dbid, res_url, int(time.time()), toutiao_uid)
                mysql_write(sql)
                print(sql)
                time.sleep(random.randint(20, 30))
                _js_goto(browser, _WUKONG_HOME_URL)
            finally:
                # Close every browser that was started, even when a step failed.
                try:
                    browser.quit()
                except Exception as e1:
                    print(e1)
                    logging.exception(e1)

        # Pause between polling rounds.
        time.sleep(random.randint(120, 300))
     
     
     
  • 相关阅读:
    CentOS 6.5环境实现corosync+pacemaker实现DRBD高可用
    通达OA2008优化前端web为lnmp环境及后续优化
    CentOS 6.5环境使用ansible剧本自动化部署Corosync + pacemaker环境及corosync常用配置详解
    利用mycat实现基于mysql5.5主从复制的读写分离
    登录服务器windows2008出现:远程桌面服务当前正忙,因此无法完成您尝试执行的任务。请在几分钟后重试。其他用户应该仍然能够登录
    CentOS 6.5使用Corosync + pacemaker实现httpd服务的高可用
    ansible的安装部署及简单应用
    centos6.7安装系统后看不到网卡无法配置IP的解决办法
    Error: Cannot retrieve metalink for repository: epel. Please verify its path and try again
    centos6环境创建局域网http方式的yum源
  • 原文地址:https://www.cnblogs.com/rsapaper/p/8855058.html
Copyright © 2011-2022 走看看