# -*- coding: utf-8 -*- from __future__ import division from selenium import webdriver import time from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import requests from threading import Thread from pyquery import PyQuery as pq import chardet import copy import xlwt import os import mPing import datetime import xlwt from xlrd import open_workbook now_time = time.strftime('%H-%M-%S', time.localtime(time.time())) print now_time # print chardet.detect(now_time) # print chardet.detect(time_now_time) #xls_name = ("京东爬虫数据.xls").decode("utf-8") xls_name = ("京东爬虫数据"+str(now_time)+".xls").decode("utf-8") #print type(xls_name) #print "京东爬虫数据"+str(now_time)+".xls" title = ["链接", "名称", "价格", "晒图", "好评", "中评", "差评", "全部评价"] urllist = ["https://item.jd.com/11936238.html", "https://item.jd.com/11841674.html" ] URLSource = "京东URL.txt".decode('utf-8') if os.path.isfile(URLSource): print "发现URL文件,准备开始爬虫".decode('utf-8') else: print "亲!!! 当前目录下的url文件: "".decode('utf-8')+URLSource+"" 不存在,请添加后再运行".decode('utf-8') exit(1) def msleep1(): time.sleep(1) def msleep2(): print "...2", time.sleep(1) print "...1", time.sleep(1) print "...0" def msleep3(): print "5", time.sleep(1) print "...4", time.sleep(1) print "...3", time.sleep(1) print "...2", time.sleep(1) print "...1", time.sleep(1) print "...0" def warnningtext(): return "这里无法正确获取数据(偶尔网速问题会影响一两个数据),请手动检查,如果是代码问题请联系开发修改".decode("utf-8") def cannotgetdataprint(text): print ("无法获取"+text+" 请手动检查一下然后联系开发人员").decode('utf-8') def mprint(str): #print "", print "############# " + str.decode('utf-8') + " #############" def debugprint(str): print "", #不换行空输出 "" 后面加 , print "debugprint@@@ " + str.decode('utf-8') def totwrite(str): return str.decode('utf-8') # mPing.mNetPing('jd.com') # chromeOptions = webdriver.ChromeOptions() # prefs = {"profile.managed_default_content_settings.images":2} # chromeOptions.add_experimental_option("prefs",prefs) # driver = webdriver.Chrome(chrome_options=chromeOptions) prefs = {"profile.managed_default_content_settings.images":2} option = webdriver.ChromeOptions() option.add_argument("test-type")#不显示警告 option.add_experimental_option("prefs",prefs)#不显示图片 global timesurl timesurl = 1 global webdriver_chrome #webdriver_chrome = webdriver.PhantomJS()#phantomjs无法加载ajax 所以这里不能用 还是要用chrome来模拟动态的加载 webdriver_chrome = webdriver.Chrome(chrome_options=option) #webdriver_chrome.set_window_size(2000,2000) def isUrlBefore(): pass#打开url后地址是否被跳转 如果跳转那就跳过该地址并写入警告 def isString(isstr, data): if isstr in str(data.encode("utf-8")): return True else: return False def openweb(url): global starttime global driver_wait global isOffsale COUNTINUE = False SKIP = 1 TIAOZHUAN = 2 LOADERROR = 3 FATALERROR = 4 mprint("努力加载链接中,请耐心等待") try: try:#获取源码进行判断 respone = requests.get(url) #正确打开连接 isOffsale = False #初始化设置为不下柜 if respone.status_code == 200:#正确加载价格页面包括下柜的页面 if "商品评价" in str(respone.text.encode("utf-8")):#说明页面正常访问到商品页面 否则可能被跳转了 # print respone.text isOffsale = False if "商品已下柜" in str(respone.text.encode("utf-8")): isOffsale = True else: return TIAOZHUAN #说明页面不是价格页面 被跳转了? else:#无法打开连接 return LOADERROR#状态码不是200说明访问有问题 except Exception, e: print Exception, e#无法获取源码 return FATALERROR #以下代码应该不会被执行 webdriver_chrome.get(url) # mprint("获取当前地址") if "?c" in getcurrenturl():#有了上面的if "商品评价" in判断后这段代码应该不会被执行到 mprint("地址已经被跳转") return SKIP driver_wait = WebDriverWait(webdriver_chrome, 10) return COUNTINUE except Exception: mprint("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!请注意,链接有问题 无法打开 程序可能停止!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") print url print getcurrenturl() return SKIP finally: debugprint("打印url") def get_element_bycssselector(css_selector): element = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) # print element.text return element def get_datanum_bycssselectorlist(css_selector_list, text): for css_selector in css_selector_list: try: # print css_selector element = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) data_num = element.get_attribute('data-num') if isString(text, element.text): print element.text + ":" + str(data_num) # mprint ("显示好评") return data_num else: mprint("无法获取") except: pass return warnningtext() def get_element_byxpathlist(xpath_list, text): for xpath in xpath_list: try: element = driver_wait.until(EC.element_to_be_clickable((By.XPATH, xpath))) # print element.text if isString(text, element.text): print element.text return element else: mprint("无法获取xpath如下") print xpath except: mprint(xpath) pass return None # def try_element(element): # try: # element # except: # pass def getname(): debugprint("start find name btn") try: myname = webdriver_chrome.find_element_by_class_name('sku-name') mprint("1名称:") print myname.text return myname.text except Exception: pass try: myname = webdriver_chrome.find_element_by_css_selector('#name > h1') mprint("2名称:")#生鲜 书籍 print myname.text return myname.text except Exception: pass try: myname = webdriver_chrome.find_element_by_css_selector('#name') mprint("3名称:")#生鲜 书籍 print myname.text return myname.text except Exception: mprint("第 3次 抓取商品名称失败") return warnningtext() def getprice(): debugprint("start getprice") try: myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.summary.summary-first > div > div.summary-price.J-summary-price > div.dd > span'))) mprint("1价格:") # print myprice.text finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep1() finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep2 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep3 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') print finalprice return finalprice except Exception:#估计下架 做下架的抓取 pass try: # 生鲜 书籍 抓取价格 myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#jd-price"))) # 生鲜 可用 # myprice = webdriver_chrome.find_element_by_xpath("/html/body/div[7]/div/div[2]/div[3]/div/div[1]/div[2]/span/span[2]") mprint("2价格:") # print myprice.text finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep1 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep2 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep3 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') print finalprice return finalprice except Exception: # 估计下架 做下架的抓取 pass try: # 生鲜 书籍 抓取价格 myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.summary-price.J-summary-price > div > div.dd > span > span"))) # 生鲜 可用 mprint("3价格:") # print myprice.text finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep1 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep2 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep3 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') print finalprice return finalprice except Exception: # 估计下架 做下架的抓取 pass try: # 下架的抓取 前面判断了下架 这里基本上不会执行了 mprint("4下架:") soldout = webdriver_chrome.find_element_by_class_name('itemover-tip') # 抓下柜 下架 “该商品已下柜,欢迎挑选其他商品!” print soldout.text return soldout.text except Exception: mprint("抓不到价格 也不是下架 请检查") return warnningtext() def scrolldown(): debugprint("准备开始滚动500") webdriver_chrome.execute_script("window.scrollBy(0,500)") debugprint("已向下滚动500") def clickcommentbtn(): xpath1 = '//*[@id="detail"]/div[1]/ul/li[5]' xpath2 = '//*[@id="detail"]/div[1]/ul/li[4]' # xpath3 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)' btn = get_element_byxpathlist([xpath1, xpath2], "商品评价") if btn is not None: try: btn.click() # mprint("xpath点击") except Exception, e: mprint("btn非空 不过点击失败了 一般不会这样的 报错是否是:Element is not clickable at point (697, 299). Other element would receive the click") print Exception, e else: # pass#其他判断 基本上不会到这里 css_sele1 = '# detail > div.tab-main.large > ul > li:nth-child(4)' css_sele2= '#detail > div.tab-main.large > ul > li.current' try: get_element_bycssselector(css_sele1).click() mprint("通过csssele获取到") print css_sele1 except: try: get_element_bycssselector(css_sele2).click() mprint("通过csssele获取到") print css_sele1 except: mprint("实在找不到 联系开发 程序可能终止") """ try:#1#detail > div.tab-main.large > ul > li.current > s mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[5]') mprint("1点击") print mysumcommentbtn.text, # 三个按钮的链接要用其他的(运动户外类) # mprint("运动户外类?") if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~点击了按钮") # 这句有问题 return True else: mprint("找不到按钮 商品评价 继续寻找2") except: pass try:#2 mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[4]') mprint("2点击") print mysumcommentbtn.text if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~点击了评论总量按钮") return True else: mprint("找到按钮 不是商品评价 继续寻找3") except: mprint("2点击找不到继续下一步") pass try:#3 css_sele = '# detail > div.tab-main.large > ul > li:nth-child(4)' # 香蕉 # http: // item.jd.com / 11461683.html mysumcommentbtn = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) mprint("3点击") print mysumcommentbtn.text if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~点击了评论总量按钮") return True except: mprint("找不到按钮 商品评价 继续寻找4 ") pass try:#4 css_sele = '#detail-tab-comm' # 书籍类比较多 mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele))) mprint("4点击") print mysumcommentbtn.text if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~点击了评论总量按钮") return True except: mprint("找不到按钮 商品评价 继续寻找5") pass try:#5 css_sele = '#detail > div.tab-main.large > ul > li.current' # 香蕉 书籍 # http: // item.jd.com / 11461683.html mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele))) mprint("5点击") print mysumcommentbtn.text if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~点击了评论总量按钮") return True else: mprint("第五次也找不到 只能手动找了") print getcurrenturl() return warnningtext() except: mprint("无法找到商品评价按钮 请联系开发 提供url:") print getcurrenturl() return warnningtext() """ def getshowpicnum(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(2)' css_sele2 = '#comments-list > div.mt > div > ul > li:nth-child(2)' for i in range(3):#循环查找3次 pic_num = get_datanum_bycssselectorlist ([css_sele1, css_sele2], "晒图") if pic_num is not None: # mprint(pic_num) return pic_num else: # pass mprint("shaitu") # print u"第"+str(i+1)+u"次没找到,准备开始第"+str(i+2)+u"次查找" """ global data_num global myshowpic try:#comments-list > div.mt > div > ul > li:nth-child(2) # comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(2) css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(2)' myshowpic = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) data_num = myshowpic.get_attribute('data-num') mprint("1晒图") print myshowpic.text, if "晒图" in str(myshowpic.text.encode("utf-8")): debugprint("第一次判断正确 是晒图按钮") if data_num is not None: return data_num else: mprint("晒图的值没有正确加载 5s后再次验证") msleep3() data_num = myshowpic.get_attribute ('data-num') if data_num is not None: mprint("找到晒图值") print myshowpic.text return data_num else: mprint ("晒图的值没有正确加载 5s后再次验证") msleep3 () msleep3 () data_num = myshowpic.get_attribute ('data-num') if data_num is not None: mprint ("找到晒图值") print myshowpic.text return data_num else:#多次查找无法找到值 mprint("#多次查找无法找到值") return warnningtext() else: debugprint("第一次判断错误 按钮找到不是晒图 联系开发提供截图") except: debugprint("第一次判断没找到按钮 开始第二次") try: css_sele = '#comments-list > div.mt > div > ul > li:nth-child(2)' myshowpic = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) mprint("2晒图") print myshowpic.text if "晒图" in str(myshowpic.text.encode("utf-8")): debugprint("第2次判断正确 是晒图按钮") if myshowpic.get_attribute('data-num') is not None: return myshowpic.get_attribute('data-num') else: mprint ("晒图的值没有正确加载 5s后再次验证") msleep3 () data_num = myshowpic.get_attribute ('data-num') if data_num is not None: mprint ("找到晒图值") print myshowpic.text return data_num else: mprint ("晒图的值没有正确加载 5s后再次验证") msleep3 () msleep3 () data_num = myshowpic.get_attribute ('data-num') if data_num is not None: mprint ("找到晒图值") print myshowpic.text return data_num else: # 多次查找无法找到值 return warnningtext () else: debugprint("第2次判断错误 按钮找到不是晒图 联系开发提供截图") except: debugprint("第2次判断没找到按钮 联系开发") return warnningtext() """ def totalcomment(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.current' css_sele2 = '#comments-list > div.mt > div > ul > li.ui-switchable-item.trig-item.curr' return get_datanum_bycssselectorlist([css_sele1, css_sele2], "全部评价") """ try: css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.current' mypositivecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) # mypositivecomment = webdriver_chrome.find_element_by_css_selector("#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)") data_num = mypositivecomment.get_attribute('data-num') mprint("1全部评价") print mypositivecomment.text, data_num if "全部评价" in str(mypositivecomment.text.encode("utf-8")): debugprint("第1次判断正确 是全部评价按钮") if data_num is not None: return data_num else: mprint("全部评价的值没有正确加载 请手动查找") return cannotgetdataprint(mypositivecomment.text) else: debugprint("第1次判断错误 按钮找到不是全部评价 联系开发提供截图") except: debugprint("第一次抓全部评价失败 继续第二次") pass try: css_sele = '#comments-list > div.mt > div > ul > li.ui-switchable-item.trig-item.curr' mypositivecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) # mypositivecomment = webdriver_chrome.find_element_by_css_selector("#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)") data_num = mypositivecomment.get_attribute('data-num') mprint("2全部评价") print mypositivecomment.text, data_num if "全部评价" in str(mypositivecomment.text.encode("utf-8")): debugprint("第2次判断正确 是全部评价按钮") if data_num is not None: return data_num else: mprint("全部评价的值没有正确加载 请手动查找") return cannotgetdataprint(mypositivecomment.text) else: debugprint("第2次判断错误 按钮找到不是全部评价 联系开发提供截图") except: debugprint("第2次抓全部评价失败 继续第二次") return cannotgetdataprint("全部评价") """ def getpositivecomment(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(4)' css_sele2 = '#comments-list > div.mt > div > ul > li:nth-child(3)' css_sele3 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)' return get_datanum_bycssselectorlist([css_sele1, css_sele2, css_sele3], "好评(") """ try: mypositivecomment = get_element_bycssselector(css_sele1) data_num = mypositivecomment.get_attribute('data-num') mprint("1好评") if isString("好评(", mypositivecomment.text): print mypositivecomment.text + ":" + str(data_num) # mprint ("显示好评") return data_num else: mprint("好评数量无法获取") except: debugprint("第一次抓好评失败 继续第二次") pass try:#书籍 香蕉 mypositivecomment = get_element_bycssselector(css_sele2) data_num = mypositivecomment.get_attribute('data-num') mprint("2好评") if isString("好评(", mypositivecomment.text): print mypositivecomment.text + ":" + str(data_num) # mprint ("显示好评") return data_num else: mprint("好评数量无法获取") except: pass try:#?? mypositivecomment = get_element_bycssselector(css_sele3) data_num = mypositivecomment.get_attribute('data-num') if isString("好评(", mypositivecomment.text): mprint ("第3次获取到好评") print mypositivecomment.text + ":" + str(data_num) # mprint ("显示好评") return data_num else: mprint("好评数量无法获取") print mypositivecomment.text + ":" + str(data_num) # mprint ("显示好评") except: mprint("无法获取到好评") return warnningtext() """ def getmoderatecomment(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(5)' css_sele2 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)' return get_datanum_bycssselectorlist([css_sele1, css_sele2], "中评(") """ try: css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(5)' mymoderatecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) # mymoderatecomment = webdriver_chrome.find_element_by_css_selector( # "#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)") data_num = mymoderatecomment.get_attribute('data-num') mprint("1中评") print mymoderatecomment.text + ":" + str(data_num) # mprint("显示中评") return data_num except: pass try: css_sele = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)' mymoderatecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) # mymoderatecomment = webdriver_chrome.find_element_by_css_selector( # "#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)") data_num = mymoderatecomment.get_attribute('data-num') print mymoderatecomment.text + ":" + str(data_num) # mprint("显示中评") mprint("2中评") return data_num except: mprint("第二次中评失败 联系开发") """ def getnegativecomment(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(6)' css_sele2 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(5)' return get_datanum_bycssselectorlist([css_sele1, css_sele2], "差评(") """ try: css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(6)' mynegativecomment = driver_wait.until (EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) data_num = mynegativecomment.get_attribute('data-num') mprint("1差评") print mynegativecomment.text+":"+str(data_num) # mprint ("显示差评") return data_num except: debugprint("第一次差评失败") try: css_sele = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(5)' mynegativecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) data_num = mynegativecomment.get_attribute('data-num') print mynegativecomment.text + ":" + str(data_num) # mprint ("显示差评") mprint("2差评") return data_num except: mprint("第2次差评失败 联系开发") """ def getaddcomment():#追评 css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.J-addComment' return get_datanum_bycssselectorlist ([css_sele1, ], "追评(") """ try: css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.J-addComment' maddcomment = driver_wait.until(EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele))) data_num = maddcomment.get_attribute('data-num') print maddcomment return data_num except: return "如果前面都没问题 可能这个链接没有追评 可以手动确认".decode("utf-8") """ def getcurrenturl(): # debugprint("打印当前页面url: "+str(webdriver_chrome.current_url)) return webdriver_chrome.current_url def mwrite(linenum, zlist): #放一个 要保存的 行数 和 数据list count = len(zlist) #列表数据的长度 mprint("准备插入第 "+str(linenum+1)+" 条数据,一共:"+str(count)+"列") title_style = xlwt.easyxf('font: name Times New Roman, color-index red, bold on', num_format_str='#,##0.00') if linenum == 0: global wb global ws wb = xlwt.Workbook() ws = wb.add_sheet("京东666".decode("utf-8")) for i in range(0, count):#列数 if i == 0: mprint("写入如下数据") if linenum == 0:#第1条数据待插入 需要先把标题插入0 再把第一条数据插入1 ws.write(linenum, i, title[i].decode("utf-8"), title_style)#写标题 ws.write(linenum+1, i, zlist[i])#这个write是一个覆盖操作 如果没write就放空 print title[i].decode("utf-8"), zlist[i] wb.save(xls_name) # if i == (count-1): # mprint("完成本条数据写入") else: # 第2+条数据开始插入 ws = wb.get_sheet(0) ws.write(linenum+1, i, zlist[i]) print title[i].decode ("utf-8"), zlist[i] wb.save(xls_name) # mprint("第"+str(linenum+1)+"条数据写入成功,还剩"+(sumurlcount-linenum)+"条数据待解析") class MyThread_totalcom(Thread): def __init__(self): Thread.__init__(self) def run(self): # totalcom = totalcomment() self.totalcom = totalcomment() def get_result(self): return self.totalcom class MyThread_showpic(Thread): def __init__(self): Thread.__init__(self) def run(self): self.showpic = getshowpicnum() def get_result(self): return self.showpic def getall(url): starttime = datetime.datetime.now() RETURN_CODE = openweb(url) print RETURN_CODE,'RETURN_CODE' if RETURN_CODE:#TRUE: skip and warning try: if RETURN_CODE == 2: mprint("页面被跳转") skiplist = [url, "!!页面被跳转".decode("utf-8"), RETURN_CODE, "", "", "", "", ""] return skiplist else:#1 mprint("无法访问 检查网络是否故障") skiplist = [url, "!!检查是否无法打开网页".decode("utf-8"), RETURN_CODE, "", "", "", "", ""] return skiplist except: mprint("???") skiplist = [url, "!!跳过该条链接".decode("utf-8"), "???????????????????".decode("utf-8"), "", "", "", "", ""] return skiplist else:#FALSE :continue to get the data # starttime = datetime.datetime.now () endtime = datetime.datetime.now() timed = (endtime - starttime).seconds mprint("网页已经被打开,耗时:"+str(timed)+"秒") debugprint('scrolldown1') #urlcurrent = getcurrenturl()#写一个 如果链接被跳转到其他页面就跳过的判断 有时间再写吧 urlcurrent可能变成 jd.com scrolldown() # msleep1() #scrolldown() # msleep2() debugprint('scrolldown2') name = getname() if isOffsale: # 下柜 price = "商品已下柜".decode ("utf-8") else: price = getprice() clickcommentbtn() # msleep2() #好评度能加载完成就能显示晒图 try: print u"好评度:", get_element_bycssselector("#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div").text except: mprint("无法获取好评度,说明网络加载缓慢") #想写个多线程 不过单独一个的时候正常 如果两个都放进去就会出问题 难道是selenium不能同时find两个element? mprint("多线程开始") thd1 = MyThread_totalcom() # thd2 = MyThread_showpic() thd1.start() mprint("MyThread_totalcom线程开始") # thd2.start() # mprint("MyThread_showpic程开始") thd1.join() # thd2.join() totalcom = thd1.get_result() # showpic = thd2.get_result() mprint("多线程结束") # totalcom = totalcomment()#上面用多线程这里就注释掉 showpic = getshowpicnum() #上面多线程 只能跑一个 totalcomment和getshowpicnum一起就出问题 好像不是我多线程代码有问题 是selenium不能同时find多个元素 positivcom = getpositivecomment() modertcom = getmoderatecomment() negtivcom = getnegativecomment() # addcomment = getaddcomment() sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom, totalcom] # sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom ,addcomment] # print sumlist return sumlist # a list if __name__ == '__main__': try:#__main__ # print type(now_time), type("时间") print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) cc = 0 # URLSource total_starttime = datetime.datetime.now() f = open(URLSource, "r") lines = f.readlines() # 读取全部内容 global sumurlcount sumurlcount = len(lines) print sumurlcount mprint("一共 "+str(sumurlcount)+" 条数据要爬虫") for jdurl in lines: #for i in urllist: s = [] print jdurl one_starttime = datetime.datetime.now () goodsinfo_list = getall(jdurl.replace(" ", "")) print "test111111111" # print goodsinfo_list mwrite(cc, goodsinfo_list) oneurl_endtime = datetime.datetime.now () oneurl_timed = (oneurl_endtime - one_starttime).seconds mprint ("该条数据写入完成耗时:" + str (oneurl_timed) + "秒,还剩"+str(sumurlcount - cc - 1)+"条数据待分析,即将开始下一个链接的抓取!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") cc = cc + 1 mprint("@@@@@$$$$$$$$@@@@@ 所有代码正常运行 无报错 @@@@@@@@@@@$$$$$$$$$$$$$$$@@@@@@@@@@@@@@@@") total_endtime = datetime.datetime.now () total_timed = (total_endtime - total_starttime).seconds mprint ("整个爬虫一共耗时:" + str (total_timed) + "秒"+",单条链接平均爬虫耗时:"+str((round(total_timed/sumurlcount,2)))+ "秒") except Exception, e: print Exception, e mprint("~~~~~~~~中间有 报错了@@@@@@@@@@@@@@@@") finally: mprint("sleep 10s后关闭浏览器") time.sleep(10) webdriver_chrome.quit()