  • Using XPath [scraping Zhihu Explore] with file storage [txt, json, csv, mongodb]

    Using XPath

    import requests
    import json
    from lxml import etree
    from urllib import parse

    url = 'https://www.zhihu.com/explore'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    html = requests.get(url, headers=headers).text
    # The response body is a string; parse it into an HTML DOM with etree.HTML
    text = etree.HTML(html)
    # Get the node of every feed item on the page
    node_list = text.xpath('//div[@class="explore-feed feed-item"]')
    items = {}
    for node in node_list:
        # xpath returns a list; each of these lists has a single element, taken out by index
        # Question
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
        # Answer
        answer = node.xpath('.//*[@class="content"]')[0].text
        #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

        items = {
            "question": question,
            "author": author,
            "answer": answer,
        }

        # Append one JSON object per line; utf-8 keeps the Chinese text readable
        with open("explore.json", "a", encoding="utf-8") as f:
            #f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")
            f.write(json.dumps(items, ensure_ascii=False) + "\n")

    Save as TXT

    import requests
    from lxml import etree
    from urllib import parse

    url = 'https://www.zhihu.com/explore'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    html = requests.get(url, headers=headers).text
    # The response body is a string; parse it into an HTML DOM with etree.HTML
    text = etree.HTML(html)
    # Get the node of every feed item on the page
    node_list = text.xpath('//div[@class="explore-feed feed-item"]')

    for node in node_list:
        # xpath returns a list; each of these lists has a single element, taken out by index
        # Question
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
        # Answer
        answer = node.xpath('.//*[@class="content"]')[0].text
        #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

        with open('explore.txt', 'a', encoding='utf-8') as file:
            file.write('\n'.join([question, author, answer]))
            file.write('\n' + '=' * 50 + '\n')

     Save as CSV

    import requests
    from lxml import etree
    from urllib import parse
    import csv

    url = 'https://www.zhihu.com/explore'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    html = requests.get(url, headers=headers).text
    # The response body is a string; parse it into an HTML DOM with etree.HTML
    text = etree.HTML(html)
    # Get the node of every feed item on the page
    node_list = text.xpath('//div[@class="explore-feed feed-item"]')

    for node in node_list:
        # xpath returns a list; each of these lists has a single element, taken out by index
        # Question
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
        # Answer: only the first 10 characters are kept here, text[:10], to keep the output compact
        answer = node.xpath('.//*[@class="content"]')[0].text[:10]

        #answer = node.xpath('.//*[@class="content"]')[0].text
        #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

        with open('explore.csv', 'a', encoding='utf-8') as csvfile:
            fieldnames = ['question', 'author', 'answer']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerow({'question': question, 'author': author, 'answer': answer})
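
    Since the file is reopened and writeheader() runs on every pass through the loop, a header row is repeated before each record. A hedged rearrangement (same field names and node_list as above) that opens the file once and writes the header once:

    import csv

    fieldnames = ['question', 'author', 'answer']
    # Open once, write the header once, then one row per feed item
    with open('explore.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for node in node_list:
            writer.writerow({
                'question': node.xpath('.//h2/a')[0].text.replace("\n", ""),
                'author': node.xpath('.//*[@class="author-link-line"]/*')[0].text,
                'answer': node.xpath('.//*[@class="content"]')[0].text[:10],
            })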

    Read the CSV

    import csv

    with open('explore.csv', 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            print(row)
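
    csv.reader yields every row as a plain list of strings, including the header row. When the file has a header, csv.DictReader from the standard library maps each row to the column names instead; a minimal sketch:

    import csv

    with open('explore.csv', 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Each row is a dict keyed by the header: question, author, answer
            print(row['question'], row['author'])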

    If the CSV shows garbled characters when opened in Excel, see: excel打开csv文件显示乱码的处理方法 (Baidu Jingyan).
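
    A common cause is that Excel does not assume UTF-8 for a CSV that has no byte order mark. One hedged workaround is to write the file with the 'utf-8-sig' encoding, which prepends a BOM; the sketch below uses placeholder row values:

    import csv

    # 'utf-8-sig' adds a BOM so Excel recognizes the file as UTF-8
    with open('explore.csv', 'w', encoding='utf-8-sig', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['question', 'author', 'answer'])
        writer.writeheader()
        writer.writerow({'question': '问题', 'author': '作者', 'answer': '回答'})  # placeholder values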

    Save to MongoDB

    import requests
    from lxml import etree
    from urllib import parse
    from pymongo import MongoClient

    client = MongoClient()
    db = client['explore']
    collection = db['explore']

    url = 'https://www.zhihu.com/explore'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    html = requests.get(url, headers=headers).text
    # The response body is a string; parse it into an HTML DOM with etree.HTML
    text = etree.HTML(html)
    # Get the node of every feed item on the page
    node_list = text.xpath('//div[@class="explore-feed feed-item"]')

    for node in node_list:
        # xpath returns a list; each of these lists has a single element, taken out by index
        # Question
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
        # Answer
        answer = node.xpath('.//*[@class="content"]')[0].text
        #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

        # A new dict is built for every item, then inserted
        items = {
            "question": question,
            "author": author,
            "answer": answer,
        }

        if collection.insert(items):
            print('Saved to Mongo')
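
    collection.insert() is the legacy pymongo call; it was deprecated in pymongo 3 and removed in pymongo 4. A hedged equivalent of the last step of the loop using insert_one():

    # Replacement for the last two lines of the loop body above
    result = collection.insert_one(items)
    if result.acknowledged:
        print('Saved to Mongo')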

    A slight change

    import requests
    from lxml import etree
    from urllib import parse
    from pymongo import MongoClient

    client = MongoClient()
    db = client['explore']
    collection = db['explore']

    url = 'https://www.zhihu.com/explore'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    html = requests.get(url, headers=headers).text
    # The response body is a string; parse it into an HTML DOM with etree.HTML
    text = etree.HTML(html)
    # Get the node of every feed item on the page
    node_list = text.xpath('//div[@class="explore-feed feed-item"]')
    explore = {}
    for node in node_list:
        # xpath returns a list; each of these lists has a single element, taken out by index
        # Question
        explore['question'] = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        explore['author'] = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
        # Answer
        explore['answer'] = node.xpath('.//*[@class="content"]')[0].text
        #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

        # The same explore dict is reused for every iteration
        if collection.insert(explore):
            print('Saved to Mongo')

    This raises an exception:

        raise DuplicateKeyError(error.get("errmsg"), 11000, error)

    pymongo.errors.DuplicateKeyError: E11000 duplicate key error collection: explore.explore index: _id_ dup key: { : ObjectId('5b3792ae393e0d0c38123bbc') }

    Every document carries one field that identifies it uniquely: the _id primary key, a special and very important type that defaults to an ObjectId. MongoDB adds this field to every document automatically.
    MongoDB also creates a unique index on _id by default, so two documents with the same _id cannot exist in the same collection.
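
    The "slight change" fails because pymongo writes the generated _id back into the Python dict it was given. A small sketch (hypothetical values, same behavior with the legacy insert()):

    from pymongo import MongoClient

    collection = MongoClient()['explore']['explore']

    doc = {'question': 'q1'}
    collection.insert_one(doc)
    print(doc)  # {'question': 'q1', '_id': ObjectId('...')} -- _id was added in place

    doc['question'] = 'q2'
    collection.insert_one(doc)  # DuplicateKeyError: the dict still carries the first _id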

    import requests
    from lxml import etree
    from urllib import parse
    from pymongo import MongoClient
    import pymongo.errors

    client = MongoClient()
    db = client['explore']
    collection = db['explore']

    url = 'https://www.zhihu.com/explore'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    html = requests.get(url, headers=headers).text
    # The response body is a string; parse it into an HTML DOM with etree.HTML
    text = etree.HTML(html)
    # Get the node of every feed item on the page
    node_list = text.xpath('//div[@class="explore-feed feed-item"]')
    explore = {}
    for node in node_list:
        # xpath returns a list; each of these lists has a single element, taken out by index
        # Question
        explore['question'] = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        explore['author'] = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
        # Answer
        explore['answer'] = node.xpath('.//*[@class="content"]')[0].text
        #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]
        try:
            if collection.insert(explore):
                print('Saved to Mongo')
        except pymongo.errors.DuplicateKeyError:
            # pymongo raises this when a duplicate value is inserted into a unique field,
            # and the insert fails
            print("Duplicate insert")
            pass

    Everything after the first item still fails to insert: the first insert adds an _id to the explore dict, the same dict is reused on every iteration, so every later insert carries that _id and trips the unique index.
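
    A hedged fix that keeps the structure of the loop above: build a fresh dict on each pass (as the first MongoDB version already does), or drop the _id that pymongo wrote back before reusing the dict.

    for node in node_list:
        explore = {
            'question': node.xpath('.//h2/a')[0].text.replace("\n", ""),
            'author': node.xpath('.//*[@class="author-link-line"]/*')[0].text,
            'answer': node.xpath('.//*[@class="content"]')[0].text,
        }
        # A new dict carries no _id, so MongoDB generates a fresh one for each insert
        collection.insert_one(explore)
        print('Saved to Mongo')

    # Alternative when reusing one dict: remove the _id left over from the previous insert
    #     explore.pop('_id', None)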

  • Original post: https://www.cnblogs.com/wanglinjie/p/9248573.html