zoukankan      html  css  js  c++  java
  • 一种通用数据采集的schema定义形式

    {
      "name": "凤凰金融",
      "notice": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ],
        "comments": "网站通告"
      },
      "url": {
        "data": "attribute",
        "value": "http://www.fengjr.com/financing/list?type=cx",
        "comments": "本平台数据的采集URL"
      },
      "project": {
        "data": "url",
        "url": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "template": ""
        },
        "title": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ]
        },
        "detail": {
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "member": {
        "data": "sub_item",
        "sub_item": {
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "src-save": 0,
          "url": {
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ],
            "template": ""
          }
        },
        "detail": {
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "src-save": 1
    }

    补充:

    {
      "name": "凤凰金融",
      "notice": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "url": {
        "data": "attribute",
        "value": "http://www.fengjr.com/financing/list?type=cx"
      },
      "project": {
        "data": "url",
        "url": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "template": ""
        },
        "title": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
            }
          ]
        },
        "detail": {
          "name": "网贷列表",
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "member": {
        "data": "sub_item",
        "sub_item": {
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "src-save": 0,
          "url": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
              }
            ],
            "template": ""
          }
        },
        "detail": {
          "name": "会员材料",
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "src-save": 1,

      "crawler": {

          "handler":"httpClient|selenium",
          "results":"html|json|text",
          "next_page": {
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id="page-financing"]/div[1]/div[5]/div/div/div[3]"
              }
             ],
            "template": ""
          },
          "history": "re-crawl|skip|stop"
        }

    }
  • 相关阅读:
    python 默认编码( UnicodeDecodeError: 'ascii' codec can't decode)
    python发送各类邮件的主要方法
    python输出htmltestrunner中文乱码如何解决
    Python unittest 官方文档
    Python pip 安装包
    Python easy_install 安装包
    linux 解压操作命令
    vim 操作指令2
    vim 操作指令1
    (转)水波纹过渡特效
  • 原文地址:https://www.cnblogs.com/feika/p/4281864.html
Copyright © 2011-2022 走看看