zoukankan      html  css  js  c++  java
  • 【爬虫】京东商品链接

    # -*- coding: utf-8 -*-
    from __future__ import division
    from selenium import webdriver
    import time
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import requests
    from threading import Thread
    from pyquery import PyQuery as pq
    import chardet
    import copy
    import xlwt
    import os
    import mPing
    import datetime
    import xlwt
    from xlrd import open_workbook
    # Timestamp (HH-MM-SS) of this run; it is embedded in the output file name.
    now_time = time.strftime('%H-%M-%S', time.localtime(time.time()))
    print(now_time)
    # Name of the Excel workbook that mwrite() saves to (unicode, not bytes).
    xls_name = u"京东爬虫数据" + now_time + u".xls"
    # Column headers of the sheet, in the same order as the list getall() returns.
    # Kept as UTF-8 byte strings because mwrite() decodes them itself.
    title = ["链接", "名称", "价格", "晒图", "好评", "中评", "差评", "全部评价"]
    # Sample product URLs (only referenced by a commented-out loop in the main script).
    urllist = ["https://item.jd.com/11936238.html",
               "https://item.jd.com/11841674.html"
               ]
    # Text file, looked up in the current directory, holding one product URL per line.
    URLSource = u"京东URL.txt"
    if os.path.isfile(URLSource):
        print(u"发现URL文件,准备开始爬虫")
    else:
        # The URL list is mandatory: report the missing file name in quotes and stop.
        print(u"亲!!! 当前目录下的url文件:   \"" + URLSource + u"\"    不存在,请添加后再运行")
        raise SystemExit(1)
    
    
    def msleep1():
        """Pause the crawler for one second."""
        pause_seconds = 1
        time.sleep(pause_seconds)
    
    
    def msleep2():
        print "...2",
        time.sleep(1)
        print "...1",
        time.sleep(1)
        print "...0"
    
    
    def msleep3():
        print "5",
        time.sleep(1)
        print "...4",
        time.sleep(1)
        print "...3",
        time.sleep(1)
        print "...2",
        time.sleep(1)
        print "...1",
        time.sleep(1)
        print "...0"
    
    
    def warnningtext():
        """Return the unicode placeholder stored in a cell when a value could not be scraped.

        The message tells the reader to check the product manually (occasional
        network slowness can affect a value or two) and to contact the developer
        if the code is at fault.
        """
        # A unicode literal replaces decoding a byte literal on every call.
        return u"这里无法正确获取数据(偶尔网速问题会影响一两个数据),请手动检查,如果是代码问题请联系开发修改"
    
    
    def cannotgetdataprint(text):
        """Print a "cannot get <text>, please check manually" notice.

        Args:
            text: Name of the value that could not be scraped, either a UTF-8
                byte string or a unicode string (e.g. a Selenium element text).

        Returns:
            None; the notice is only printed.
        """
        # Decode only byte input: concatenating the byte-string message with a
        # unicode ``text`` used to raise UnicodeDecodeError.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        print(u"无法获取" + text + u" 请手动检查一下然后联系开发人员")
    
    
    def mprint(str):
        """Print a progress message framed by rows of ``#`` characters.

        Args:
            str: The message, either a UTF-8 byte string (what the callers in
                this file pass) or an already-decoded unicode string. The name
                shadows the builtin but is kept for interface compatibility.
        """
        # Only bytes need decoding; calling .decode on a non-ASCII unicode
        # string would raise UnicodeEncodeError under Python 2.
        text = str.decode('utf-8') if isinstance(str, bytes) else str
        print(u"#############   " + text + u"   #############")
    
    
    def debugprint(str):
        print  "",   #不换行空输出   "" 后面加 ,
        print "debugprint@@@   " + str.decode('utf-8')
    
    
    def totwrite(str):
        """Decode the UTF-8 byte string *str* and return the resulting unicode text."""
        decoded = str.decode('utf-8')
        return decoded
    
    # mPing.mNetPing('jd.com')
    
    # chromeOptions = webdriver.ChromeOptions()
    # prefs = {"profile.managed_default_content_settings.images":2}
    # chromeOptions.add_experimental_option("prefs",prefs)
    # driver = webdriver.Chrome(chrome_options=chromeOptions)
    
    # Module-level browser setup: builds the Chrome options and starts the single
    # Chrome instance shared by every scraping function in this file.
    # Content-setting pref passed to Chrome; the original note says it hides images.
    prefs = {"profile.managed_default_content_settings.images":2}
    option = webdriver.ChromeOptions()
    option.add_argument("test-type")  # do not show warnings
    option.add_experimental_option("prefs",prefs)  # do not show images
    # NOTE(review): `global` at module level has no effect; the names are already global.
    global timesurl
    timesurl = 1
    global webdriver_chrome
    # PhantomJS could not load the AJAX content, so it cannot be used here; Chrome is
    # still needed to simulate the dynamic loading:
    #webdriver_chrome = webdriver.PhantomJS()
    webdriver_chrome = webdriver.Chrome(chrome_options=option)
    #webdriver_chrome.set_window_size(2000,2000)
    
    def isUrlBefore():
        """Placeholder, not implemented yet.

        Planned purpose: after opening a URL, detect whether the address was
        redirected; if so, skip that address and record a warning.
        """
        return None
    
    def isString(isstr, data):
        """Tell whether *isstr* occurs in the UTF-8 encoded form of *data*.

        Args:
            isstr: The substring to look for.
            data: Text exposing ``.encode`` (callers pass Selenium element text).

        Returns:
            True when the substring is found, otherwise False.
        """
        encoded = str(data.encode("utf-8"))
        return isstr in encoded
    
    
    def openweb(url):
        global  starttime
        global driver_wait
        global isOffsale
        COUNTINUE = False
        SKIP = 1
        TIAOZHUAN = 2
        LOADERROR = 3
        FATALERROR = 4
    
        mprint("努力加载链接中,请耐心等待")
        try:
            try:#获取源码进行判断
                respone = requests.get(url)
                #正确打开连接
                isOffsale = False #初始化设置为不下柜
                if respone.status_code == 200:#正确加载价格页面包括下柜的页面
                    if "商品评价" in str(respone.text.encode("utf-8")):#说明页面正常访问到商品页面  否则可能被跳转了
                        # print respone.text
                        isOffsale = False
                        if "商品已下柜" in str(respone.text.encode("utf-8")):
                            isOffsale = True
                    else:
                        return TIAOZHUAN #说明页面不是价格页面  被跳转了?
                else:#无法打开连接
                    return LOADERROR#状态码不是200说明访问有问题
            except Exception, e:
                print Exception, e#无法获取源码
                return FATALERROR
        #以下代码应该不会被执行
            webdriver_chrome.get(url)
            # mprint("获取当前地址")
            if "?c" in getcurrenturl():#有了上面的if "商品评价" in判断后这段代码应该不会被执行到
                mprint("地址已经被跳转")
                return SKIP
            driver_wait = WebDriverWait(webdriver_chrome, 10)
            return COUNTINUE
        except Exception:
            mprint("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!请注意,链接有问题 无法打开 程序可能停止!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print url
            print getcurrenturl()
            return SKIP
        finally:
            debugprint("打印url")
    
    
    def get_element_bycssselector(css_selector):
        """Wait for an element matching *css_selector* to be present and return it.

        Uses the module-level ``driver_wait``; whatever ``until`` raises when the
        element does not appear propagates to the caller.
        """
        locator = (By.CSS_SELECTOR, css_selector)
        condition = EC.presence_of_element_located(locator)
        return driver_wait.until(condition)
    
    
    def get_datanum_bycssselectorlist(css_selector_list, text):
        for css_selector in css_selector_list:
            try:
                # print css_selector
                element = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))
                data_num = element.get_attribute('data-num')
                if isString(text, element.text):
                    print element.text + ":" + str(data_num)  # mprint ("显示好评")
                    return data_num
                else:
                    mprint("无法获取")
            except:
                pass
        return warnningtext()
    
    
    def get_element_byxpathlist(xpath_list, text):
        for xpath in xpath_list:
            try:
                element = driver_wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
                # print element.text
                if isString(text, element.text):
                    print element.text
                    return element
                else:
                    mprint("无法获取xpath如下")
                    print xpath
            except:
                mprint(xpath)
                pass
        return None
    
    # def try_element(element):
    #     try:
    #         element
    #     except:
    #         pass
    
    
    
    def getname():
        debugprint("start find name btn")
        try:
            myname = webdriver_chrome.find_element_by_class_name('sku-name')
            mprint("1名称:")
            print myname.text
            return myname.text
        except Exception:
            pass
        try:
            myname = webdriver_chrome.find_element_by_css_selector('#name > h1')
            mprint("2名称:")#生鲜 书籍
            print myname.text
            return myname.text
        except Exception:
            pass
        try:
            myname = webdriver_chrome.find_element_by_css_selector('#name')
            mprint("3名称:")#生鲜 书籍
            print myname.text
            return myname.text
        except Exception:
            mprint("第 3次 抓取商品名称失败")
            return warnningtext()
    
    
    def getprice():
        debugprint("start getprice")
        try:
            myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.summary.summary-first > div > div.summary-price.J-summary-price > div.dd > span')))
            mprint("1价格:")
            # print myprice.text
            finalprice = myprice.text.encode ('utf-8').replace ('', '')
            if finalprice == "":
                msleep1()
                finalprice = myprice.text.encode ('utf-8').replace ('', '')
                if finalprice == "":
                    msleep2 ()
                    finalprice = myprice.text.encode ('utf-8').replace ('', '')
                    if finalprice == "":
                        msleep3 ()
                        finalprice = myprice.text.encode ('utf-8').replace ('', '')
            print finalprice
            return finalprice
        except Exception:#估计下架 做下架的抓取
            pass
        try:  # 生鲜 书籍 抓取价格
            myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#jd-price")))  # 生鲜 可用
            # myprice = webdriver_chrome.find_element_by_xpath("/html/body/div[7]/div/div[2]/div[3]/div/div[1]/div[2]/span/span[2]")
            mprint("2价格:")
            # print myprice.text
            finalprice = myprice.text.encode ('utf-8').replace ('', '')
            if finalprice == "":
                msleep1 ()
                finalprice = myprice.text.encode ('utf-8').replace ('', '')
                if finalprice == "":
                    msleep2 ()
                    finalprice = myprice.text.encode ('utf-8').replace ('', '')
                    if finalprice == "":
                        msleep3 ()
                        finalprice = myprice.text.encode ('utf-8').replace ('', '')
            print finalprice
            return finalprice
        except Exception:  # 估计下架 做下架的抓取
            pass
        try:  # 生鲜 书籍 抓取价格
            myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.summary-price.J-summary-price > div > div.dd > span > span")))  # 生鲜 可用
            mprint("3价格:")
            # print myprice.text
            finalprice = myprice.text.encode ('utf-8').replace ('', '')
            if finalprice == "":
                msleep1 ()
                finalprice = myprice.text.encode ('utf-8').replace ('', '')
                if finalprice == "":
                    msleep2 ()
                    finalprice = myprice.text.encode ('utf-8').replace ('', '')
                    if finalprice == "":
                        msleep3 ()
                        finalprice = myprice.text.encode ('utf-8').replace ('', '')
            print finalprice
            return finalprice
        except Exception:  # 估计下架 做下架的抓取
            pass
    
        try:  # 下架的抓取  前面判断了下架 这里基本上不会执行了
            mprint("4下架:")
            soldout = webdriver_chrome.find_element_by_class_name('itemover-tip')  # 抓下柜 下架 “该商品已下柜,欢迎挑选其他商品!”
    
            print  soldout.text
    
            return soldout.text
        except Exception:
            mprint("抓不到价格 也不是下架 请检查")
            return warnningtext()
    
    
    def scrolldown():
        """Scroll the page in the shared Chrome driver down by 500 pixels."""
        scroll_script = "window.scrollBy(0,500)"
        debugprint("准备开始滚动500")
        webdriver_chrome.execute_script(scroll_script)
        debugprint("已向下滚动500")
    
    
    def clickcommentbtn():
        xpath1 = '//*[@id="detail"]/div[1]/ul/li[5]'
        xpath2 = '//*[@id="detail"]/div[1]/ul/li[4]'
        # xpath3 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)'
        btn = get_element_byxpathlist([xpath1, xpath2], "商品评价")
        if btn is not None:
            try:
                btn.click()
                # mprint("xpath点击")
            except Exception, e:
                mprint("btn非空 不过点击失败了 一般不会这样的 报错是否是:Element is not clickable at point (697, 299). Other element would receive the click")
                print Exception, e
        else:
            # pass#其他判断  基本上不会到这里
            css_sele1 = '# detail > div.tab-main.large > ul > li:nth-child(4)'
            css_sele2= '#detail > div.tab-main.large > ul > li.current'
            try:
                get_element_bycssselector(css_sele1).click()
                mprint("通过csssele获取到")
                print css_sele1
            except:
                try:
                    get_element_bycssselector(css_sele2).click()
                    mprint("通过csssele获取到")
                    print css_sele1
                except:
                    mprint("实在找不到 联系开发 程序可能终止")
    
        """
        try:#1#detail > div.tab-main.large > ul > li.current > s
            mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[5]')
            mprint("1点击")
            print mysumcommentbtn.text,  # 三个按钮的链接要用其他的(运动户外类)
            # mprint("运动户外类?")
            if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
                mysumcommentbtn.click()
                mprint("~~~~~~点击了按钮") #    这句有问题
                return True
            else:
                mprint("找不到按钮 商品评价  继续寻找2")
        except:
            pass
    
        try:#2
    
            mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[4]')
            mprint("2点击")
            print mysumcommentbtn.text
            if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
                mysumcommentbtn.click()
                mprint("~~~~~~点击了评论总量按钮")
                return True
            else:
                mprint("找到按钮 不是商品评价  继续寻找3")
        except:
            mprint("2点击找不到继续下一步")
            pass
    
        try:#3
            css_sele = '# detail > div.tab-main.large > ul > li:nth-child(4)'  # 香蕉
           # http: // item.jd.com / 11461683.html
            mysumcommentbtn = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele)))
            mprint("3点击")
            print mysumcommentbtn.text
            if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
                mysumcommentbtn.click()
                mprint("~~~~~~点击了评论总量按钮")
                return True
        except:
            mprint("找不到按钮 商品评价  继续寻找4 ")
            pass
    
    
        try:#4
            css_sele = '#detail-tab-comm'  # 书籍类比较多
            mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele)))
            mprint("4点击")
            print mysumcommentbtn.text
            if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
                mysumcommentbtn.click()
                mprint("~~~~~~点击了评论总量按钮")
                return True
        except:
            mprint("找不到按钮 商品评价  继续寻找5")
            pass
        try:#5
            css_sele = '#detail > div.tab-main.large > ul > li.current'  # 香蕉 书籍
            # http: // item.jd.com / 11461683.html
            mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele)))
            mprint("5点击")
            print mysumcommentbtn.text
            if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
                mysumcommentbtn.click()
                mprint("~~~~~~点击了评论总量按钮")
                return True
            else:
                mprint("第五次也找不到 只能手动找了")
                print getcurrenturl()
                return warnningtext()
    
        except:
            mprint("无法找到商品评价按钮 请联系开发 提供url:")
            print getcurrenturl()
            return warnningtext()
            """
    
    
    def getshowpicnum():
        """Return the count shown on the "晒图" (reviews with photos) tab.

        The lookup is repeated up to three times while the tab's ``data-num``
        attribute is still missing (None).

        Returns:
            The ``data-num`` value, or ``warnningtext()`` when the tab could not
            be found or its value never loaded.
        """
        css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(2)'
        css_sele2 = '#comments-list > div.mt > div > ul > li:nth-child(2)'
        for _ in range(3):  # look the value up at most three times
            pic_num = get_datanum_bycssselectorlist([css_sele1, css_sele2], "晒图")
            if pic_num is not None:
                return pic_num
            mprint("shaitu")
        # Still no value: return the placeholder like the other getters do,
        # instead of implicitly returning None.
        return warnningtext()
    
    
    def totalcomment():
        """Return the count shown on the "全部评价" (all reviews) tab.

        Two CSS selectors are tried in order; the result is whatever
        ``get_datanum_bycssselectorlist`` returns (the tab's ``data-num`` value,
        or the warning text when no selector matched).
        """
        css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.current'
        css_sele2 = '#comments-list > div.mt > div > ul > li.ui-switchable-item.trig-item.curr'
        return get_datanum_bycssselectorlist([css_sele1, css_sele2], "全部评价")
    
    
    def getpositivecomment():
        """Return the count shown on the "好评" (positive reviews) tab.

        Three CSS selectors are tried in order; the result is whatever
        ``get_datanum_bycssselectorlist`` returns (the tab's ``data-num`` value,
        or the warning text when no selector matched).
        """
        css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(4)'
        css_sele2 = '#comments-list > div.mt > div > ul > li:nth-child(3)'
        css_sele3 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)'
        return get_datanum_bycssselectorlist([css_sele1, css_sele2, css_sele3], "好评(")
    
    
    def getmoderatecomment():
        """Return the count shown on the "中评" (neutral reviews) tab.

        Two CSS selectors are tried in order; the result is whatever
        ``get_datanum_bycssselectorlist`` returns (the tab's ``data-num`` value,
        or the warning text when no selector matched).
        """
        css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(5)'
        css_sele2 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)'
        return get_datanum_bycssselectorlist([css_sele1, css_sele2], "中评(")
    
    
    def getnegativecomment():
        """Return the count shown on the "差评" (negative reviews) tab.

        Two CSS selectors are tried in order; the result is whatever
        ``get_datanum_bycssselectorlist`` returns (the tab's ``data-num`` value,
        or the warning text when no selector matched).
        """
        css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(6)'
        css_sele2 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(5)'
        return get_datanum_bycssselectorlist([css_sele1, css_sele2], "差评(")
    
    
    def getaddcomment():
        """Return the count shown on the "追评" (follow-up reviews) tab.

        A single CSS selector is used; the result is whatever
        ``get_datanum_bycssselectorlist`` returns (the tab's ``data-num`` value,
        or the warning text when the selector did not match).
        """
        css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.J-addComment'
        return get_datanum_bycssselectorlist([css_sele1], "追评(")
    
    
    def getcurrenturl():
        """Return the address currently loaded in the shared Chrome driver."""
        current_address = webdriver_chrome.current_url
        return current_address
    
    
    def mwrite(linenum, zlist): #放一个 要保存的 行数 和 数据list
        count = len(zlist) #列表数据的长度
    
        mprint("准备插入第 "+str(linenum+1)+" 条数据,一共:"+str(count)+"")
        title_style = xlwt.easyxf('font: name Times New Roman, color-index red, bold on', num_format_str='#,##0.00')
        if linenum == 0:
            global wb
            global ws
            wb = xlwt.Workbook()
            ws = wb.add_sheet("京东666".decode("utf-8"))
        for i in range(0, count):#列数
            if i == 0:
                mprint("写入如下数据")
            if linenum == 0:#第1条数据待插入  需要先把标题插入0 再把第一条数据插入1
                ws.write(linenum, i, title[i].decode("utf-8"), title_style)#写标题
                ws.write(linenum+1, i, zlist[i])#这个write是一个覆盖操作 如果没write就放空
                print title[i].decode("utf-8"), zlist[i]
                wb.save(xls_name)
                # if i == (count-1):
                #     mprint("完成本条数据写入")
            else:   #  第2+条数据开始插入
                ws = wb.get_sheet(0)
                ws.write(linenum+1, i, zlist[i])
                print title[i].decode ("utf-8"), zlist[i]
                wb.save(xls_name)
    
        # mprint(""+str(linenum+1)+"条数据写入成功,还剩"+(sumurlcount-linenum)+"条数据待解析")
    
    class MyThread_totalcom(Thread):
        """Thread that runs ``totalcomment()`` and keeps its return value."""

        def __init__(self):
            super(MyThread_totalcom, self).__init__()
            # Initialised here so get_result() returns None rather than raising
            # AttributeError when run() has not completed.
            self.totalcom = None

        def run(self):
            """Fetch the total review count and store it on the instance."""
            self.totalcom = totalcomment()

        def get_result(self):
            """Return the value stored by run(), or None if run() did not complete."""
            return self.totalcom
    
    class MyThread_showpic(Thread):
        """Thread that runs ``getshowpicnum()`` and keeps its return value."""

        def __init__(self):
            super(MyThread_showpic, self).__init__()
            # Initialised here so get_result() returns None rather than raising
            # AttributeError when run() has not completed.
            self.showpic = None

        def run(self):
            """Fetch the photo-review count and store it on the instance."""
            self.showpic = getshowpicnum()

        def get_result(self):
            """Return the value stored by run(), or None if run() did not complete."""
            return self.showpic
    
    def getall(url):
        starttime = datetime.datetime.now()
        RETURN_CODE = openweb(url)
        print RETURN_CODE,'RETURN_CODE'
    
    
        if RETURN_CODE:#TRUE: skip and warning
            try:
                if RETURN_CODE == 2:
                    mprint("页面被跳转")
                    skiplist = [url, "!!页面被跳转".decode("utf-8"), RETURN_CODE, "", "", "", "", ""]
                    return skiplist
                else:#1
                    mprint("无法访问 检查网络是否故障")
                    skiplist = [url, "!!检查是否无法打开网页".decode("utf-8"), RETURN_CODE, "", "", "", "", ""]
                    return skiplist
            except:
                mprint("???")
                skiplist = [url, "!!跳过该条链接".decode("utf-8"), "???????????????????".decode("utf-8"), "", "", "", "", ""]
                return skiplist
    
        else:#FALSE :continue to get the data
            # starttime = datetime.datetime.now ()
            endtime = datetime.datetime.now()
            timed = (endtime - starttime).seconds
            mprint("网页已经被打开,耗时:"+str(timed)+"")
            debugprint('scrolldown1')
            #urlcurrent = getcurrenturl()#写一个 如果链接被跳转到其他页面就跳过的判断  有时间再写吧 urlcurrent可能变成 jd.com
            scrolldown()
            # msleep1()
            #scrolldown()
            # msleep2()
            debugprint('scrolldown2')
            name = getname()
            if isOffsale:  # 下柜
                price = "商品已下柜".decode ("utf-8")
            else:
                price = getprice()
            clickcommentbtn()
            # msleep2()
            #好评度能加载完成就能显示晒图
            try:
                print u"好评度:", get_element_bycssselector("#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div").text
            except:
                mprint("无法获取好评度,说明网络加载缓慢")
            #想写个多线程  不过单独一个的时候正常 如果两个都放进去就会出问题 难道是selenium不能同时find两个element?
    
            mprint("多线程开始")
            thd1 = MyThread_totalcom()
    
            # thd2 = MyThread_showpic()
            thd1.start()
            mprint("MyThread_totalcom线程开始")
            # thd2.start()
            # mprint("MyThread_showpic程开始")
            thd1.join()
            # thd2.join()
            totalcom = thd1.get_result()
            # showpic = thd2.get_result()
            mprint("多线程结束")
    
            # totalcom = totalcomment()#上面用多线程这里就注释掉
            showpic = getshowpicnum()
            #上面多线程 只能跑一个 totalcomment和getshowpicnum一起就出问题 好像不是我多线程代码有问题 是selenium不能同时find多个元素
            positivcom = getpositivecomment()
            modertcom = getmoderatecomment()
            negtivcom = getnegativecomment()
            # addcomment = getaddcomment()
    
            sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom, totalcom]
            # sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom ,addcomment]
            # print sumlist
            return sumlist # a list
    
    if __name__ == '__main__':
        try:#__main__
            # print type(now_time), type("时间")
            print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            # print ".",
            # time.sleep(0.2)
            cc = 0
            # URLSource
    
            total_starttime = datetime.datetime.now()
            f = open(URLSource, "r")
            lines = f.readlines()  # 读取全部内容
            global sumurlcount
            sumurlcount = len(lines)
            print sumurlcount
            mprint("一共 "+str(sumurlcount)+" 条数据要爬虫")
            for jdurl in lines:
            #for i in urllist:
                s = []
                print jdurl
                one_starttime = datetime.datetime.now ()
                goodsinfo_list = getall(jdurl.replace("
    ", ""))
                print "test111111111"
                # print goodsinfo_list
                mwrite(cc, goodsinfo_list)
                oneurl_endtime = datetime.datetime.now ()
                oneurl_timed = (oneurl_endtime - one_starttime).seconds
                mprint ("该条数据写入完成耗时:" + str (oneurl_timed) + "秒,还剩"+str(sumurlcount - cc - 1)+"条数据待分析,即将开始下一个链接的抓取!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                cc = cc + 1
    
    
            mprint("@@@@@$$$$$$$$@@@@@            所有代码正常运行 无报错          @@@@@@@@@@@$$$$$$$$$$$$$$$@@@@@@@@@@@@@@@@")
            total_endtime = datetime.datetime.now ()
            total_timed = (total_endtime - total_starttime).seconds
            mprint ("整个爬虫一共耗时:" + str (total_timed) + ""+",单条链接平均爬虫耗时:"+str((round(total_timed/sumurlcount,2)))+ "")
    
        except Exception, e:
            print Exception, e
    
            mprint("~~~~~~~~中间有 报错了@@@@@@@@@@@@@@@@")
        finally:
            mprint("sleep 10s后关闭浏览器")
            time.sleep(10)
            webdriver_chrome.quit()
  • 相关阅读:
    CBV进阶(一)
    uva 11748 Rigging Elections
    uva 11573 Ocean Currents(bfs+优先队列)
    无向图的欧拉路
    poj 3254 Corn Fields
    hdu 1114
    hdu 2639 (第k小的01背包)
    uva 1347 tour
    uva 437 The Tower of Babylon
    uva 1025 A Spy in the Metro(动态规划)
  • 原文地址:https://www.cnblogs.com/hanxing/p/8919962.html
Copyright © 2011-2022 走看看