# 玩淘宝要做访问意图分析,玩今日头条要做访问路径、意图的反抗分析:在生态里边,没有上下班的概念,这才是 all in
"""Selenium-driven posting script.

For every account row read from MySQL the script looks up pending articles,
opens Chrome, signs in through the QQ option of the SSO page, submits one
randomly generated question title and records the resulting URL in MySQL.

NOTE(review): the original source had lost all line breaks and indentation;
the nesting below was reconstructed from syntax and should be confirmed
against a pristine copy.
"""
import logging
import os
import random
import threading
import time
from time import sleep

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

start_time = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))
os_sep = os.sep
this_file_abspath = os.path.dirname(os.path.abspath(__file__))
this_file_name = os.path.abspath(__file__).split(os_sep)[-1]
logf = this_file_name + '.log'
try:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename=logf,
        filemode='a',
    )
except Exception as e:
    # Logging itself is unavailable: leave a trace in the log file by hand,
    # echo it to stdout and terminate the process immediately.
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ',
                    time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4002)
logging.info('START')

img_url = 'https://s3.pstatp.com/toutiao/static/img/logo.201f80d.png'
# Raw string: the path contains "\U", "\t", ... which would otherwise be read
# as escape sequences.  A raw literal cannot end in a backslash, so the
# trailing separator is appended separately.
img_dir = r'C:\Users\sas\PycharmProjects\py_win_to_unix\crontab_chk_url\personas\trunk\plugins\spider\dl_img_tmp' + '\\'

# SECURITY(review): database credentials are hard-coded in the source.
h, pt, u, p, db = '192.168.22.21', 3306, 'root', 'mp', 'tab_media_joke'

# NOTE(review): the query is pinned to a single account id.
ACCOUNT_SQL = 'SELECT username,password,toutiaoid FROM joke_tab_joke_namepwd WHERE status=1 AND category=1 AND id=7856582 AND NOT (toutiaoid IS NULL OR toutiaoid="" )'

# Pages visited before login; one is picked per run from the current time.
WARM_UP_URLS = [
    'https://www.toutiao.com/a6514526304476332552/',
    'https://www.toutiao.com/a6514661446876398088/',
    'https://www.toutiao.com/a6514778729951003150/',
    'https://www.toutiao.com/a6514216125151052291/',
    'https://www.toutiao.com/a6512315164463727111/',
    'https://www.toutiao.com/a6513334304318161411/',
]

LOGIN_JS = 'window.location.href="https://sso.toutiao.com/login/?service=https://mp.toutiao.com/sso_confirm/?redirect_url=/";'

# Only the 'qq' login path is implemented.
AC_TYPE = 'qq'


def spider_webimg_dl_return_local_img_path(img_dir, img_url, local_default='default.DONOT_REMOVE.png'):
    """Download *img_url* into *img_dir* and return the local file path.

    The file name is built from the current timestamp, the calling thread id
    and the URL with ``/ : ? = &`` replaced by marker strings, plus ``.png``.

    Args:
        img_dir: Directory prefix; it is concatenated as-is, so it must end
            with a path separator.
        img_url: URL of the image to fetch.
        local_default: File name (inside *img_dir*) returned when the
            download or the write fails, or the response body is empty.

    Returns:
        Path of the written file, or ``img_dir + local_default`` on failure.
    """
    fallback = '%s%s' % (img_dir, local_default)
    try:
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()
        payload = response.content
        if not payload:
            return fallback
        safe_name = (img_url.replace('/', '_xl_')
                     .replace(':', '_fxl_')
                     .replace('?', '_fxlquestion_')
                     .replace('=', '_fxlequal_')
                     .replace('&', '_fxland_'))
        target = '%s%s%s%s%s' % (
            img_dir,
            time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())),
            str(threading.get_ident()),
            safe_name,
            '.png')
        with open(target, 'wb') as f:
            f.write(payload)
    except (requests.RequestException, OSError) as e:
        print(e)
        logging.exception(e)
        return fallback
    return target


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query and return all rows.

    Args:
        sql: SQL text; use ``%s`` placeholders when *args* is given.
        res_type: ``'dic'`` selects ``pymysql.cursors.DictCursor``; any other
            value uses the connection's default cursor.
        args: Optional parameters handed to ``cursor.execute`` so values are
            escaped by the driver instead of being formatted into *sql*.

    Returns:
        The result of ``cursor.fetchall()``, or ``()`` when the connection
        cannot be established.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except pymysql.MySQLError as e:
        print(e)
        logging.exception(e)
        return ()
    try:
        cursor_cls = pymysql.cursors.DictCursor if res_type == 'dic' else None
        with conn.cursor(cursor_cls) as cursor:
            cursor.execute(sql, args)
            # Read the rows while cursor and connection are still open.
            rows = cursor.fetchall()
        conn.commit()
        return rows
    finally:
        conn.close()


def mysql_write(sql, args=None):
    """Execute a write statement and commit it.

    Args:
        sql: SQL text; use ``%s`` placeholders when *args* is given.
        args: Optional parameters handed to ``cursor.execute``.

    Returns:
        ``0`` after a successful commit, ``1`` when the connection cannot be
        established.  Errors raised by the statement itself propagate.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except pymysql.MySQLError as e:
        print(e)
        logging.exception(e)
        return 1
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, args)
        conn.commit()
    finally:
        conn.close()
    return 0


def _pending_articles(toutiao_uid):
    """Return up to two article rows (as dicts) still pending for *toutiao_uid*.

    Publish-restriction logic: only publish rows whose ``id_toutiao_uid_list``
    contains the uid and whose ``time_effective`` is not in the future are
    considered.
    """
    uid = str(toutiao_uid)
    sql = ("SELECT * FROM joke_joke_article_publish "
           "WHERE INSTR(CONCAT(',',id_toutiao_uid_list,','),CONCAT(',',%s,',')) "
           "AND time_effective<=%s ORDER BY id DESC")
    params = (uid, int(time.time()))
    print(sql, params)
    logging.info('%s args=%s', sql, params)
    publish_rows = mysql_fetch(sql, 'dic', params)
    if not publish_rows:
        return []

    # presumably id_article_list holds comma-separated article ids -- TODO confirm
    article_ids = []
    for row in publish_rows:
        for piece in str(row['id_article_list'] or '').split(','):
            piece = piece.strip()
            if piece:
                article_ids.append(piece)
    if not article_ids:
        return []

    # NOTE(review): results are recorded in a different table
    # (joke_tab_joke_toutiaouser_action_wukong_question_action) from the one
    # consulted here, so this exclusion may never match -- confirm intent.
    placeholders = ','.join(['%s'] * len(article_ids))
    sql = ('SELECT * FROM joke_joke_article WHERE id IN ({}) AND id NOT IN '
           '(SELECT article_id FROM joke_joke_article_publish_result '
           'WHERE 1 AND toutiao_uid=%s ) LIMIT 2').format(placeholders)
    logging.info('%s args=%s', sql, article_ids + [uid])
    return mysql_fetch(sql, 'dic', tuple(article_ids) + (uid,))


def _random_question_title():
    """Build the question text from a random month, audience, aspect and product."""
    tmp_l = ['口红', '指甲油', '护发素', '沐浴露', '洗手液', '洗发水', '牙膏']
    tmp_l_1 = ['老人', '小孩', '白领', '前台妹子', '行政妹子', '大学生', '高中生']
    tmp_l_2 = ['类型', '特质', '种类', '价位', '原材料', '主要成分', '价格']
    return '{}{}{}{}{}{}{}'.format(str(random.randint(1, 12)), '月份,', random.choice(tmp_l_1), '适合使用什么',
                                   random.choice(tmp_l_2), '的', random.choice(tmp_l))


def _login(browser, myid, mypwd):
    """Open a warm-up page, go to the SSO login page and sign in.

    Returns ``False`` when the account type is one this script skips
    (``'mail_qq'``), ``True`` otherwise.
    """
    browser.get(WARM_UP_URLS[int(time.time()) % len(WARM_UP_URLS)])
    time.sleep(random.randint(10, 20))

    browser.execute_script(LOGIN_JS)
    time.sleep(random.randint(10, 20))
    # NOTE(review): the source issues the same navigation a second time (the
    # alternative URL that preceded it was commented out); kept as-is.
    browser.execute_script(LOGIN_JS)

    if AC_TYPE == 'qq':
        browser.find_element(By.XPATH, '/html/body/div/div/div[2]/div/div/div/ul/li[3]').click()
        time.sleep(10)
        # Pass the values as script arguments so quotes or backslashes in a
        # user name / password cannot break the JavaScript source.
        browser.execute_script('document.getElementById("u").value = arguments[0];', myid)
        browser.execute_script('document.getElementById("p").value = arguments[0];', mypwd)
        time.sleep(random.randint(5, 15))
        browser.find_element(By.XPATH, '//*[@id="go"]').click()
        time.sleep(random.randint(10, 20))
    elif AC_TYPE == 'mail_qq':
        return False
    return True


def _submit_question(browser):
    """Navigate to the publish page, submit one generated title, return its URL."""
    time.sleep(5)
    browser.refresh()
    browser.execute_script('window.location.href="https://www.toutiao.com/";')
    time.sleep(6)
    browser.execute_script('window.location.href="https://mp.toutiao.com/profile_v3/graphic/publish";')
    time.sleep(6)
    browser.execute_script('document.getElementsByClassName("ask")[0].click();')
    time.sleep(12)
    time.sleep(random.randint(10, 20))

    # The source notes that keyboard events are needed here (anti-crawler);
    # the key presses themselves were disabled.  The lookup is kept because
    # it raises early when the input box is missing.
    browser.find_element(By.CLASS_NAME, 'input-box').find_element(By.TAG_NAME, 'input')

    browser.execute_script(
        'document.getElementsByClassName("input-box")[0].childNodes[0].value = arguments[0];',
        _random_question_title())
    time.sleep(12)
    browser.execute_script('document.getElementsByClassName("step-btn next")[0].click();')
    submit_js = 'document.getElementsByClassName("step-btn submit")[0].click();'
    browser.execute_script(submit_js)
    time.sleep(12)
    # NOTE(review): the source runs the submit click a second time (the
    # navigation that preceded it was commented out); kept as-is.
    browser.execute_script(submit_js)
    time.sleep(12)
    return (browser.find_element(By.CLASS_NAME, 'question-title')
            .find_elements(By.TAG_NAME, 'a')[0]
            .get_attribute('href'))


def _process_account(ac):
    """Run the whole browser flow for one account dict (keys u, p, toutiao_uid)."""
    myid, mypwd, toutiao_uid = ac['u'], ac['p'], ac['toutiao_uid']
    articles = _pending_articles(toutiao_uid)
    if not articles:
        return

    browser = webdriver.Chrome()
    try:
        if not _login(browser, myid, mypwd):
            return
        res_url = _submit_question(browser)

        # Only the first pending article is recorded per run.
        dbid = articles[0]['id']
        sql = ('INSERT INTO joke_tab_joke_toutiaouser_action_wukong_question_action '
               '(article_id,article_url,time_script,toutiao_uid) VALUE(%s,%s,%s,%s);')
        params = (dbid, res_url, int(time.time()), toutiao_uid)
        if mysql_write(sql, params) != 0:
            logging.error('result row not written: %s', params)
        print(sql, params)
        time.sleep(random.randint(20, 30))
        browser.execute_script('window.location.href="https://www.wukong.com/"')
    finally:
        # Always release the Chrome process, also on early return or error.
        try:
            browser.quit()
        except Exception as e1:
            print(e1)
            logging.exception(e1)


def main():
    """Process every matching account, pause, and repeat forever."""
    while True:
        logging.info('LOOP----')
        accounts = [{'u': row[0], 'p': row[1], 'toutiao_uid': row[2]}
                    for row in mysql_fetch(ACCOUNT_SQL)]
        for ac in accounts:
            _process_account(ac)
        # NOTE(review): placed at loop level so that a pass with nothing to
        # do still waits instead of re-querying the database immediately.
        time.sleep(random.randint(120, 300))


if __name__ == '__main__':
    main()