zoukankan      html  css  js  c++  java
  • 爬虫-动态获取京东某页面的信息并存储(15)

    创建 ORM 模型(保存为 jd_spider/models.py,主文件通过 from jd_spider.models import * 导入):

    from peewee import *
    
    # Connection to the local MySQL schema "spider"; every model below is bound to it.
    db = MySQLDatabase(
        "spider",
        host="127.0.0.1",
        port=3306,
        user="root",
        password="root",
    )
    
    
    class BaseModel(Model):
        """Common base class that binds every model in this module to ``db``."""

        class Meta:
            # All subclasses inherit this database binding.
            database = db
    
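    # Key points to keep in mind when designing the tables:
    """
    CharField columns must be given a maximum length.
    Use a TextField when the maximum length cannot be determined.
    Clean and normalize the scraped data as far as possible before storing it.
    Decide for every field whether it needs a default value or null=True.
    """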
    
    
    class Good(BaseModel):
        """One JD product: descriptive fields plus the review counters scraped from its page."""

        # The primary key is supplied by the scraper (the product id from the URL), not auto-generated.
        # NOTE(review): IntegerField is presumably a 32-bit INT on MySQL - confirm all product ids fit.
        id = IntegerField(primary_key=True, verbose_name="商品id")
        name = CharField(max_length=500, verbose_name="商品名称")
        # HTML of the product detail tab.
        content = TextField(default="", verbose_name="商品描述")
        # Shop sub-domain taken from the service summary, or "京东" when none is found.
        supplier = CharField(max_length=500, default="")
        # HTML of the "specification and packaging" tab.
        ggbz = TextField(default="", verbose_name="规格和包装")
        # JSON-encoded list of carousel image URLs.
        image_list = TextField(default="", verbose_name="商品的轮播图")
        price = FloatField(default=0.0, verbose_name="商品价格")

        # Review statistics shown in the review tab.
        good_rate = IntegerField(default=0, verbose_name="好评率")
        comments_nums = IntegerField(default=0, verbose_name="评论数")
        has_image_comment_nums = IntegerField(default=0, verbose_name="晒图数")
        has_video_comment_nums = IntegerField(default=0, verbose_name="视频晒单数")
        has_add_comment_nums = IntegerField(default=0, verbose_name="追评数")
        well_comment_nums = IntegerField(default=0, verbose_name="好评数")
        middle_comment_nums = IntegerField(default=0, verbose_name="中评数")
        bad_comment_nums = IntegerField(default=0, verbose_name="差评数")
    
    
    class GoodEvaluate(BaseModel):
        """A single customer review of a product."""

        # String primary key supplied by the scraper (the review's data-guid attribute).
        id = CharField(primary_key=True)
        good = ForeignKeyField(Good,verbose_name="商品")
        # No max_length is given for the next two fields, so peewee's CharField default applies.
        user_head_url = CharField(verbose_name="用户头像")
        user_name = CharField(verbose_name="用户名")
        # JSON-encoded list of the order-info texts shown under the review.
        good_info = CharField(max_length=500, verbose_name="购买的商品的信息")
        evaluate_time = DateTimeField(verbose_name="评价时间")
        content = TextField(default="", verbose_name="评论内容")
        star = IntegerField(default=0, verbose_name="评分")
        comment_nums = IntegerField(default=0, verbose_name="评论数")
        praised_nums = IntegerField(default=0, verbose_name="点赞数")
        # JSON-encoded lists of the media URLs attached to the review.
        image_list = TextField(default="", verbose_name="图片")
        video_list = TextField(default="", verbose_name="视频")
    
    
    class GoodEvaluateSummary(BaseModel):
        """A review tag of a product together with how many reviews carry it."""

        good = ForeignKeyField(Good, verbose_name="商品")
        # Tag text without the trailing "(count)" part.
        tag = CharField(max_length=20, verbose_name="标签")
        num = IntegerField(default=0, verbose_name="数量")
    
    
    if __name__ == "__main__":
        # Running this module as a script creates the tables for all three models.
        db.create_tables(
            [
                Good,
                GoodEvaluate,
                GoodEvaluateSummary,
            ]
        )

    主文件代码:

    import re
    import time
    import json
    from datetime import datetime
    
    from selenium import webdriver
    from scrapy import Selector
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.chrome.options import Options
    
    from jd_spider.models import *
    
    # Build the Chrome launch options once, before the driver is started.
    chrome_options = Options()

    # Headless mode is left switched off; uncomment the next line to enable it.
    # chrome_options.add_argument("--headless")
    for launch_flag in (
        '--disable-gpu',  # Google's documentation mentions this flag as a bug workaround
        "blink-settings=imagesEnabled=false",  # do not load images
    ):
        chrome_options.add_argument(launch_flag)

    browser = webdriver.Chrome(
        executable_path="E:/chromedriver.exe",
        chrome_options=chrome_options,
    )

    # 1. Start selenium without a visible window
    # 2. Configure selenium not to load images
    
    def process_value(nums_str):
        """
        Convert a count scraped as text into an integer.

        :param nums_str: text containing a number, optionally followed by the unit
            "万" (ten thousand), e.g. "(300)", "2万+" or "1.5万"
        :return: the parsed integer; 0 when the text is empty or holds no digits
        """
        if not nums_str:
            return 0

        # First run of digits, with an optional decimal part such as "1.5".
        re_match = re.search(r"(\d+(?:\.\d+)?)", nums_str)
        if not re_match:
            return 0

        value = float(re_match.group(1))
        if "万" in nums_str:
            # "万" means the number is expressed in units of 10,000.
            return int(round(value * 10000))
        return int(value)
    
    
    def parse_good(good_id):
        """
        Scrape one JD product page with the shared ``browser`` and persist what was found.

        The function loads the item page, saves the product row (``Good``), then the
        review tags (``GoodEvaluateSummary``) and finally walks through every page of
        reviews, saving each one as a ``GoodEvaluate`` row. Rows whose primary key
        already exists are updated instead of inserted.

        :param good_id: product id used in the ``https://item.jd.com/<id>.html`` URL
        :return: None
        """
        browser.get("https://item.jd.com/{}.html".format(good_id))

        sel = Selector(text=browser.page_source)

        # Basic product information.
        good = Good(id=good_id)
        name = "".join(sel.xpath("//div[@class='sku-name']/text()").extract()).strip()
        price = float("".join(sel.xpath("//span[@class='price J-p-{}']/text()".format(good_id)).extract()).strip())

        detail = "".join(sel.xpath("//div[@id='detail']//div[@class='tab-con']").extract())
        good_images = sel.xpath("//div[@id='spec-list']//img/@src").extract()
        supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())

        # The shop name is taken from the sub-domain of the first shop link; without
        # such a link the product is treated as sold by JD itself.
        re_match = re.search('<a href="//(.*).jd.com', supplier_info)
        if re_match:
            good.supplier = re_match.group(1)
        else:
            good.supplier = "京东"

        good.name = name
        good.price = price
        good.content = detail
        good.image_list = json.dumps(good_images)

        # Click the "specification and packaging" tab and capture its HTML.
        ggbz_ele = browser.find_element_by_xpath("//div[@class='tab-main large']//li[contains(text(), '规格与包装')]")
        ggbz_ele.click()
        time.sleep(3)
        sel = Selector(text=browser.page_source)
        ggbz_detail = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract())
        good.ggbz = ggbz_detail

        # Click the review tab, wait for it to load, then read the review statistics.
        sppj_ele = browser.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']")
        sppj_ele.click()
        time.sleep(5)
        sel = Selector(text=browser.page_source)
        tag_list = sel.xpath("//div[@class='tag-list tag-available']//span/text()").extract()
        good_rate = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])
        good.good_rate = good_rate

        # Filter label shown on the page -> counter attribute on Good.
        counter_fields = {
            "晒图": "has_image_comment_nums",
            "视频晒单": "has_video_comment_nums",
            "追评": "has_add_comment_nums",
            "好评": "well_comment_nums",
            "中评": "middle_comment_nums",
            "差评": "bad_comment_nums",
            "全部评价": "comments_nums",
        }
        for summary_link in sel.xpath("//ul[@class='filter-list']/li/a"):
            label = summary_link.xpath("./text()").extract()[0]
            count = process_value(summary_link.xpath("./em/text()").extract()[0])
            field_name = counter_fields.get(label)
            if field_name:
                setattr(good, field_name, count)

        # Save the product. The primary key is set by hand, so a new row has to be
        # written with force_insert=True; an existing row is updated.
        existed_good = Good.select().where(Good.id == good.id)
        if existed_good:
            good.save()
        else:
            good.save(force_insert=True)

        # Review tags look like "<tag text>(<count>)".
        for tag in tag_list:
            re_match = re.match(r"(.*)\((\d+)\)", tag)
            if re_match:
                tag_name = re_match.group(1)
                nums = int(re_match.group(2))

                existed_summarys = GoodEvaluateSummary.select().where(GoodEvaluateSummary.good==good, GoodEvaluateSummary.tag==tag_name)
                if existed_summarys:
                    summary = existed_summarys[0]
                else:
                    summary = GoodEvaluateSummary(good=good)

                summary.tag = tag_name
                summary.num = nums
                summary.save()

        # Walk through the review pages until there is no "next page" link.
        has_next_page = True
        while has_next_page:
            all_evalutes = sel.xpath("//div[@class='comment-item']")
            for item in all_evalutes:
                good_evaluate = GoodEvaluate(good=good)

                evaluate_id = item.xpath("./@data-guid").extract()[0]
                print(evaluate_id)
                good_evaluate.id = evaluate_id
                user_head_url = item.xpath(".//div[@class='user-info']//img/@src").extract()[0]
                user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()

                good_evaluate.user_head_url = user_head_url
                good_evaluate.user_name = user_name

                # The star rating is the last character of the element's class value.
                star = item.xpath("./div[2]/div[1]/@class").extract()[0]
                star = int(star[-1])
                good_evaluate.star = star
                evaluate = item.xpath("./div[2]/p[1]/text()").extract()[0].strip()
                good_evaluate.content = evaluate

                image_list = item.xpath("./div[2]//div[@class='pic-list J-pic-list']/a/img/@src").extract()
                video_list = item.xpath("./div[2]//div[@class='J-video-view-wrap clearfix']//video/@src").extract()

                good_evaluate.image_list = json.dumps(image_list)
                good_evaluate.video_list = json.dumps(video_list)

                praised_nums = int(item.xpath(".//div[@class='comment-op']/a[2]/text()").extract()[0])
                comment_nums = int(item.xpath(".//div[@class='comment-op']/a[3]/text()").extract()[0])

                good_evaluate.praised_nums = praised_nums
                good_evaluate.comment_nums = comment_nums

                # The last span is the review time; the spans before it describe the order.
                comment_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
                order_info = comment_info[:-1]
                evaluate_time = comment_info[-1]
                good_evaluate.good_info = json.dumps(order_info)
                evaluate_time = datetime.strptime(evaluate_time, "%Y-%m-%d %H:%M")
                good_evaluate.evaluate_time = evaluate_time

                # Save the review; same insert-or-update rule as for the product.
                existed_good_evaluates = GoodEvaluate.select().where(GoodEvaluate.id==good_evaluate.id)
                if existed_good_evaluates:
                    good_evaluate.save()
                else:
                    good_evaluate.save(force_insert=True)

            try:
                next_page_ele = browser.find_element_by_xpath("//div[@id='comment']//a[@class='ui-pager-next']")
                # The link is activated by sending it a newline rather than by click().
                # next_page_ele.click()
                next_page_ele.send_keys("\n")
                time.sleep(5)
                sel = Selector(text=browser.page_source)
            except NoSuchElementException:
                # No "next page" link on the page: the last page has been processed.
                has_next_page = False
    
    
    
    if __name__ == "__main__":
        # Always end the browser session, even when scraping fails, so that no
        # Chrome/chromedriver process is left running.
        try:
            parse_good(7652013)
        finally:
            browser.quit()
    好好学习,天天向上
  • 相关阅读:
    ECharts
    JSON(及其在ajax前后端交互的过程)小识
    fullpage.js全屏滚动插件使用小结
    php json_encode数据格式化
    jQuery的DOM操作实例(3)——创建节点&&编写一个弹窗
    jQuery的DOM操作实例(2)——拖拽效果&&拓展插件
    jQuery的DOM操作实例(1)——选项卡&&Tab切换
    日常css技巧小结(2)-- inline-block带来的迷惑
    日常css技巧小结(1)--背景透明度改变对内容无影响
    浅析JavaScript事件流——冒泡
  • 原文地址:https://www.cnblogs.com/topass123/p/13354099.html
Copyright © 2011-2022 走看看