zoukankan      html  css  js  c++  java
  • 爬虫-动态获取京东某页面的信息并存储(15)

    创建orm:

    from peewee import *
    
    # MySQL connection for the "spider" database (local dev credentials; move to config for production).
    db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="root")
    
    
    class BaseModel(Model):
        """Abstract base binding every model in this module to the shared MySQL db."""
        class Meta:
            database = db
    
    # Several important points to keep in mind when designing the tables
    """
    Char fields must declare a max length.
    For fields whose maximum length cannot be determined, use Text instead.
    Normalize/clean the scraped data as early as possible when designing tables.
    Remember to set default values and null=True where appropriate.
    """
    
    
    class Good(BaseModel):
        """A JD product: basic info plus aggregated review counters.

        The primary key is the JD product id taken from the item URL, so
        re-scraping the same product updates the existing row.
        """
        id = IntegerField(primary_key=True, verbose_name="商品id")
        name = CharField(max_length=500, verbose_name="商品名称")
        content = TextField(default="", verbose_name="商品描述")
        supplier = CharField(max_length=500, default="")
        ggbz = TextField(default="", verbose_name="规格和包装")
        # JSON-encoded list of carousel image URLs (see json.dumps in the spider).
        image_list = TextField(default="", verbose_name="商品的轮播图")
        price = FloatField(default=0.0, verbose_name="商品价格")
    
        # Aggregated review statistics scraped from the "商品评价" tab.
        good_rate = IntegerField(default=0, verbose_name="好评率")
        comments_nums = IntegerField(default=0, verbose_name="评论数")
        has_image_comment_nums = IntegerField(default=0, verbose_name="晒图数")
        has_video_comment_nums = IntegerField(default=0, verbose_name="视频晒单数")
        has_add_comment_nums = IntegerField(default=0, verbose_name="追评数")
        well_comment_nums = IntegerField(default=0, verbose_name="好评数")
        middle_comment_nums = IntegerField(default=0, verbose_name="中评数")
        bad_comment_nums = IntegerField(default=0, verbose_name="差评数")
    
    
    class GoodEvaluate(BaseModel):
        """A single customer review of a product.

        The primary key is JD's review guid (the ``data-guid`` attribute on the
        comment element), so re-scraping updates rather than duplicates rows.
        """
        id = CharField(primary_key=True)
        good = ForeignKeyField(Good,verbose_name="商品")
        user_head_url = CharField(verbose_name="用户头像")
        user_name = CharField(verbose_name="用户名")
        # JSON-encoded list describing the purchased SKU (color, size, date, ...).
        good_info = CharField(max_length=500, verbose_name="购买的商品的信息")
        evaluate_time = DateTimeField(verbose_name="评价时间")
        content = TextField(default="", verbose_name="评论内容")
        star = IntegerField(default=0, verbose_name="评分")
        comment_nums = IntegerField(default=0, verbose_name="评论数")
        praised_nums = IntegerField(default=0, verbose_name="点赞数")
        # JSON-encoded lists of attached media URLs.
        image_list = TextField(default="", verbose_name="图片")
        video_list = TextField(default="", verbose_name="视频")
    
    
    class GoodEvaluateSummary(BaseModel):
        """One review tag-cloud entry for a product, e.g. tag="质量不错", num=120."""
        good = ForeignKeyField(Good, verbose_name="商品")
        tag = CharField(max_length=20, verbose_name="标签")
        num = IntegerField(default=0, verbose_name="数量")
    
    
    if __name__ == "__main__":
        # Run this module directly once to create the schema in MySQL.
        db.create_tables([Good, GoodEvaluate, GoodEvaluateSummary])

    主文件代码:

    import re
    import time
    import json
    from datetime import datetime
    
    from selenium import webdriver
    from scrapy import Selector
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.chrome.options import Options
    
    from jd_spider.models import *
    
    chrome_options = Options()
    
    # Headless mode (currently disabled for debugging).
    # chrome_options.add_argument("--headless")
    # Chrome docs mention this flag is needed to work around a headless-mode bug.
    chrome_options.add_argument('--disable-gpu')
    # Do not load images — speeds up page loads considerably.
    chrome_options.add_argument("blink-settings=imagesEnabled=false")
    
    browser = webdriver.Chrome(executable_path="E:/chromedriver.exe", chrome_options=chrome_options)
    
    #1. Launch selenium without a visible browser window.
    #2. Configure selenium not to load images.
    
    def process_value(nums_str):
        """Convert a count string scraped from JD into an integer.

        :param nums_str: a string containing a number, possibly suffixed with
                         "万" (= 10,000), e.g. "1200", "2万+", "全部评价(56)"
        :return: the parsed count, or 0 when no digits are found
        """
        nums = 0
        # FIX: the pattern was "(d+)" (backslash lost), which matched the
        # letter 'd' instead of digits.
        re_math = re.search(r"(\d+)", nums_str)
        if re_math:
            nums = int(re_math.group(1))
            # FIX: the check was `"" in nums_str`, which is always True, so
            # every count was multiplied by 10,000. The "万" suffix means the
            # count is expressed in units of ten thousand.
            if "万" in nums_str:
                nums *= 10000
        return nums
    
    
    def parse_good(good_id):
        """Scrape a JD product page and persist it to MySQL.

        Loads https://item.jd.com/<good_id>.html in the module-level ``browser``,
        extracts basic product info, clicks through the "规格与包装" and
        "商品评价" tabs, then pages through every review, upserting
        Good / GoodEvaluateSummary / GoodEvaluate rows.

        :param good_id: numeric JD product id used to build the item URL
        """
        browser.get("https://item.jd.com/{}.html".format(good_id))
    
        sel = Selector(text=browser.page_source)
    
        # --- basic product info ---
        good = Good(id=good_id)
        name = "".join(sel.xpath("//div[@class='sku-name']/text()").extract()).strip()
        price = float("".join(sel.xpath("//span[@class='price J-p-{}']/text()".format(good_id)).extract()).strip())
    
        detail = "".join(sel.xpath("//div[@id='detail']//div[@class='tab-con']").extract())
        good_images = sel.xpath("//div[@id='spec-list']//img/@src").extract()
        supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())
    
        # FIX: escape the dots so "." is matched literally, and make the capture
        # non-greedy; the shop subdomain sits between "//" and ".jd.com".
        re_match = re.search(r'<a href="//(.*?)\.jd\.com', supplier_info)
        if re_match:
            good.supplier = re_match.group(1)
        else:
            # No shop link → sold directly by JD.
            good.supplier = "京东"
    
        good.name = name
        good.price = price
        good.content = detail
        good.image_list = json.dumps(good_images)
    
        # --- "规格与包装" tab (loaded lazily, so click and wait) ---
        ggbz_ele = browser.find_element_by_xpath("//div[@class='tab-main large']//li[contains(text(), '规格与包装')]")
        ggbz_ele.click()
        time.sleep(3)
        sel = Selector(text=browser.page_source)
        ggbz_detail = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract())
        good.ggbz = ggbz_detail
    
        # --- "商品评价" tab: aggregated counters and tag cloud ---
        sppj_ele = browser.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']")
        sppj_ele.click()
        time.sleep(5)
        sel = Selector(text=browser.page_source)
        tag_list = sel.xpath("//div[@class='tag-list tag-available']//span/text()").extract()
        good_rate = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])
        good.good_rate = good_rate
    
        summary_as = sel.xpath("//ul[@class='filter-list']/li/a")
        for summary in summary_as:
            name = summary.xpath("./text()").extract()[0]
            nums = summary.xpath("./em/text()").extract()[0]
            nums = process_value(nums)
    
            if name == "晒图":
                good.has_image_comment_nums = nums
            elif name == "视频晒单":
                good.has_video_comment_nums = nums
            elif name == "追评":
                good.has_add_comment_nums = nums
            elif name == "好评":
                good.well_comment_nums = nums
            elif name == "中评":
                good.middle_comment_nums = nums
            elif name == "差评":
                good.bad_comment_nums = nums
            elif name == "全部评价":
                good.comments_nums = nums
    
        # Upsert the product: peewee's save() issues an UPDATE by default when a
        # primary key is set, so a first-time row needs force_insert=True.
        existed_good = Good.select().where(Good.id == good.id)
        if existed_good:
            good.save()
        else:
            good.save(force_insert=True)
    
        for tag in tag_list:
            # FIX: the pattern was "(.*)((d+))" with the backslashes lost in
            # transit; restored to capture "标签名(123)" — NOTE(review): assumes
            # ASCII parentheses in the tag text; verify against live pages.
            re_match = re.match(r"(.*?)\((\d+)\)", tag)
            if re_match:
                tag_name = re_match.group(1)
                nums = int(re_match.group(2))
    
                existed_summarys = GoodEvaluateSummary.select().where(GoodEvaluateSummary.good==good, GoodEvaluateSummary.tag==tag_name)
                if existed_summarys:
                    summary = existed_summarys[0]
                else:
                    summary = GoodEvaluateSummary(good=good)
    
                summary.tag = tag_name
                summary.num = nums
                summary.save()
    
        # --- page through every review ---
        has_next_page = True
        while has_next_page:
            all_evalutes = sel.xpath("//div[@class='comment-item']")
            for item in all_evalutes:
                good_evaluate = GoodEvaluate(good=good)
    
                # JD's review guid doubles as our primary key.
                evaluate_id = item.xpath("./@data-guid").extract()[0]
                print(evaluate_id)
                good_evaluate.id = evaluate_id
                user_head_url = item.xpath(".//div[@class='user-info']//img/@src").extract()[0]
                user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()
    
                good_evaluate.user_head_url = user_head_url
                good_evaluate.user_name = user_name
    
                # The star rating is encoded in the class name, e.g. "star5" →
                # its last character is the rating.
                star = item.xpath("./div[2]/div[1]/@class").extract()[0]
                star = int(star[-1])
                good_evaluate.star = star
                evaluate = "".join(item.xpath("./div[2]/p[1]/text()").extract()[0]).strip()
                good_evaluate.content = evaluate
    
                image_list = item.xpath("./div[2]//div[@class='pic-list J-pic-list']/a/img/@src").extract()
                video_list = item.xpath("./div[2]//div[@class='J-video-view-wrap clearfix']//video/@src").extract()
    
                good_evaluate.image_list = json.dumps(image_list)
                good_evaluate.video_list = json.dumps(video_list)
    
                praised_nums = int(item.xpath(".//div[@class='comment-op']/a[2]/text()").extract()[0])
                comment_nums = int(item.xpath(".//div[@class='comment-op']/a[3]/text()").extract()[0])
    
                good_evaluate.praised_nums = praised_nums
                good_evaluate.comment_nums = comment_nums
    
                # The last span of the order info is the timestamp; everything
                # before it describes the purchased SKU.
                comment_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
                order_info = comment_info[:-1]
                evaluate_time = comment_info[-1]
                good_evaluate.good_info = json.dumps(order_info)
                evaluate_time = datetime.strptime(evaluate_time, "%Y-%m-%d %H:%M")
                good_evaluate.evaluate_time = evaluate_time
    
                # Upsert the review (same force_insert logic as for Good above).
                existed_good_evaluates = GoodEvaluate.select().where(GoodEvaluate.id==good_evaluate.id)
                if existed_good_evaluates:
                    good_evaluate.save()
                else:
                    good_evaluate.save(force_insert=True)
    
            try:
                next_page_ele = browser.find_element_by_xpath("//div[@id='comment']//a[@class='ui-pager-next']")
                # .click() can be intercepted by overlays; sending ENTER to the
                # link is more reliable.
                # FIX: the original contained a raw newline inside the string
                # literal (a SyntaxError); use the escaped form.
                next_page_ele.send_keys("\n")
                time.sleep(5)
                sel = Selector(text=browser.page_source)
            except NoSuchElementException:
                # No "next page" link → we have consumed every review page.
                has_next_page = False
    
    
    
    if __name__ == "__main__":
        # Scrape a single hard-coded product id as a demo run.
        parse_good(7652013)
    好好学习,天天向上
  • 相关阅读:
    4 Apr 18 软件开发目录 logging模块的使用 序列化(Json, Pickle) os模块
    3 Apr 18 内置函数 列表生成式与生成器表达式 模块的使用之import 模块的使用之from…import…
    2 Apr 18 三元表达式 函数递归 匿名函数 内置函数
    30 Mar 18 迭代器 生成器 面向过程的编程
    29 Mar 18 函数 有参、无参装饰器
    28 Mar 18 函数
    27 Mar 18 函数的参数
    26 Mar 18 函数介绍
    23 Mar 18 文件处理
    22 Mar 18 补充数据类型+字符编码+文件处理
  • 原文地址:https://www.cnblogs.com/topass123/p/13354099.html
Copyright © 2011-2022 走看看