zoukankan html css js c++ java

爬取京东的商品信息，并保存至CSV文件中（使用类实现）

# -*- coding: utf-8 -*-
# @Time    : 2018/12/27  17:06
# @Author  : zhangxinxin
# 感谢小六，小勾， 小胖给予的帮助
# @Email   : 778786617@qq.com
# @Software: PyCharm
import csv
import requests
import time
import json
from lxml import etree

# https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=3d144eda1ae5453e80e1debdea954230
# https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=1&s=1&click=0
# https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=60&click=0


class JdMessage(object):
    """Fetch one JD.com phone search page and save the listing to a CSV file.

    The workflow is ``get_html`` -> ``parse_html`` -> ``save_csv``; ``run``
    chains the three steps. Product name, price and shop name are read from
    the search page; the comment counts are not part of that HTML and are
    fetched with a second (JSONP) request to ``js_url``.
    """

    # Seconds to wait for each HTTP response before giving up.
    REQUEST_TIMEOUT = 10
    # Output file and header row written by save_csv().
    CSV_PATH = 'JD_phone.csv'
    CSV_HEADER = ['商品名', '价格', '评论数', '店铺名']

    def __init__(self):
        # NOTE(review): the referenceIds below are hard-coded, so the comment
        # counts presumably do not correspond to the products on the page
        # fetched from self.url -- confirm before trusting that column.
        self.js_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=7652027,5089253,5853579,100001172674,100000822981,7437780,7081550,7321794,100002338246,8895275,7694047,7421462,100000727128,5089275,5089267,8735304,7437564,100000503295,8514651,100000773889,7920226,100001906474,6735790,100000971366,7651931,100000982034,8790545,8051124,7479810,1861102&callback=jQuery3699627&_=1545911372294'
        # self.url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=1&s=1&click=0'
        self.url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=3d144eda1ae5453e80e1debdea954230'
        self.index_url = 'https://www.jd.com/'
        # Parsed lxml tree of the search page; '' until get_html() succeeds.
        self.html = ''
        # Raw JSONP text of the comment-summary response; '' until fetched.
        self.js_content = ''
        # Rows of (name, price, comment count, shop) built by parse_html().
        self.content_list = []
        # NOTE(review): 'method', 'path' and 'scheme' look like HTTP/2
        # pseudo-headers copied from browser dev tools; they are sent here as
        # ordinary headers and are probably unnecessary -- confirm and drop.
        self.headers = {
            # 'authority': 'search.jd.com',
            'method': 'GET',
            'path': '/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=59&click=0',
            'scheme': 'https',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            # Placeholder ("your cookie"): replace with a real cookie string.
            # NOTE(review): a non-ASCII header value presumably cannot be
            # encoded when the request is sent -- confirm.
            'cookie': '您的cookie'
        }

    @staticmethod
    def _extract_jsonp_payload(text):
        """Return the text between the first '(' and the last ')' of *text*.

        Using the outermost pair keeps parentheses that occur inside the JSON
        itself. Text without a ``callback(...)`` wrapper is returned stripped.
        """
        start = text.find('(')
        end = text.rfind(')')
        if start == -1 or end <= start:
            return text.strip()
        return text[start + 1:end]

    def get_html(self):
        """Download the search page and the comment-summary JSONP.

        On success ``self.html`` holds the parsed tree and ``self.js_content``
        the raw JSONP text. On failure the error is printed and the previous
        state is left untouched.

        Returns:
            True when both downloads and the HTML parse succeeded, else False.
        """
        try:
            page = requests.get(self.url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
            page.raise_for_status()
            comments = requests.get(self.js_url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            comments.raise_for_status()
            tree = etree.HTML(page.content.decode(encoding='utf-8'))
        except (requests.RequestException, etree.LxmlError, ValueError) as e:
            # ValueError also covers UnicodeDecodeError/UnicodeEncodeError.
            print(e)
            return False
        if tree is None:
            print('The search page could not be parsed as HTML')
            return False
        self.html = tree
        self.js_content = comments.text
        return True

    def parse_html(self):
        """Build ``self.content_list`` from the fetched page and comment data.

        Rows are ``(name, price, comment count, shop)`` tuples paired by
        position; the shortest of the four columns decides the row count.
        When no page has been fetched yet the list is simply emptied.

        Raises:
            ValueError: the comment response is not valid JSON.
            KeyError: the comment JSON lacks the expected keys.
        """
        if self.html is None or isinstance(self.html, str):
            print('No page has been fetched; nothing to parse')
            self.content_list = []
            return

        p_name = self.html.xpath('//li/div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/@title')
        # The comment counts are loaded by JavaScript and are absent from the
        # returned page source, hence the separate JSONP response.
        p_price = self.html.xpath('//li/div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()')
        p_shop = self.html.xpath('//div[@class="p-shop"]/span/a/text()')

        # Accept a response object as well as plain text, and work on a local
        # copy so that calling parse_html() twice gives the same result.
        raw = getattr(self.js_content, 'text', self.js_content)
        jd_json = json.loads(self._extract_jsonp_payload(raw))
        comment_list = [data['CommentCountStr']
                        for data in jd_json['CommentsCount']]

        # A list (not a one-shot zip iterator) so the rows can be read again.
        self.content_list = list(zip(p_name, p_price, comment_list, p_shop))

    def save_csv(self):
        """Write the header row and ``self.content_list`` to ``CSV_PATH``.

        The file is overwritten. It is written as UTF-8 with a BOM so the
        Chinese text does not depend on the platform's default encoding and
        spreadsheet tools can detect it.
        """
        with open(self.CSV_PATH, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(self.CSV_HEADER)
            writer.writerows(self.content_list)
        # Leaving the with-block flushes and closes the file, so no sleep is
        # needed before reporting success.
        print('信息写入成功')

    def run(self):
        """Fetch, parse and save; stop early if the download failed."""
        if not self.get_html():
            print('Fetch failed; %s was not written' % self.CSV_PATH)
            return
        self.parse_html()
        self.save_csv()


if __name__ == '__main__':
    # Script entry point: build the scraper and run fetch -> parse -> save.
    JdMessage().run()

结果图：

爬取内容有限，但时间已晚，剩余代码以后抽时间写。

查看全文

相关阅读:
Jupyter notebook添加Anaconda中的虚拟环境
 Django踩坑之在Django中创建项目时ImportError: No module named django.core
Django踩坑之ExtendsNode: extends 'base/base.html'> must be the first tag in the template.
git 中文文件名乱码
 Django开发过程中遇到的问题和解决方案
 Advanced pandas
Pycomcad中的过滤机制及访问引用块内对象简述
 头大的一篇日志细节总结唐诗三百首
 UIBarItem 的设置
 MJExtension json快速解析

原文地址：https://www.cnblogs.com/UTF-8-xinxin/p/10240947.html