zoukankan      html  css  js  c++  java
  • 一种通用数据采集的schema定义形式

    {
      "name": "凤凰金融",
      "notice": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ],
        "comments": "网站通告"
      },
      "url": {
        "data": "attribute",
        "value": "http://www.fengjr.com/financing/list?type=cx",
        "comments": "本平台数据的采集URL"
      },
      "project": {
        "data": "url",
        "url": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "template": ""
        },
        "title": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ]
        },
        "detail": {
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "member": {
        "data": "sub_item",
        "sub_item": {
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "src-save": 0,
          "url": {
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ],
            "template": ""
          }
        },
        "detail": {
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "src-save": 1
    }

    补充:

    {
      "name": "凤凰金融",
      "notice": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "url": {
        "data": "attribute",
        "value": "http://www.fengjr.com/financing/list?type=cx"
      },
      "project": {
        "data": "url",
        "url": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "template": ""
        },
        "title": {
          "data": "attribute",
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ]
        },
        "detail": {
          "name": "网贷列表",
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "member": {
        "data": "sub_item",
        "sub_item": {
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "src-save": 0,
          "url": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ],
            "template": ""
          }
        },
        "detail": {
          "name": "会员材料",
          "title": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          },
          "amount": {
            "data": "attribute",
            "matcher": [
              {
                "match": "xpath",
                "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
              }
            ]
          }
        }
      },
      "src-save": 1,
      "crawler": {
        "handler": "httpClient|selenium",
        "results": "html|json|text",
        "next_page": {
          "matcher": [
            {
              "match": "xpath",
              "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
            }
          ],
          "template": ""
        },
        "history": "re-crawl|skip|stop"
      }
    }
  • 相关阅读:
    简版一致性hash算法实现
    js类型转换问题
    VIVADO 2017.4配置MIG IP注意事项
    工作笔记2
    工作笔记1
    spring5 + hibernate5(redisson二级缓存) + JPA + JTA + ActiveMQ(JMS)
    spring data jpa 缓存(hibernate)
    JPA @Temporal
    C++ RTTI
    二叉树遍历方法总结
  • 原文地址:https://www.cnblogs.com/feika/p/4281864.html
Copyright © 2011-2022 走看看