zoukankan      html  css  js  c++  java
  • 抓取 RiceData 数据

    完整代码 -- 爬取国家粮食局历年水稻数据

    import requests
    from lxml import etree
    import time 
    
    # Stage 1: download the variety index page and build the list of
    # per-province listing URLs (``urls_province``).
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    }
    url = r"https://ricedata.cn/variety/"
    # The dict must be passed as ``headers=``: the second positional argument
    # of requests.get() is ``params``, so passing it positionally sends the
    # User-Agent as a query string instead of as an HTTP header.
    # The timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = "utf-8"

    # Every matching <a href> is a link relative to the index URL.
    html = etree.HTML(response.text)
    results = html.xpath('/html/body//tr[4]/td/div/a/@href')
    # Make the relative links absolute: https://ricedata.cn/variety/ + result
    urls_province = ["https://ricedata.cn/variety/" + result for result in results]
    
    
    # Stage 2: expand every province listing into the URLs of all of its
    # paginated pages (``privince_pages``).
    # Example listing URL (Ministry of Agriculture):
    #   https://ricedata.cn/variety/identified/nation_1.htm
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    }
    privince_pages = []
    for url_province in urls_province:
        # ``headers=`` is required; the second positional argument of
        # requests.get() is ``params``, not ``headers``.
        response = requests.get(url_province, headers=headers, timeout=30)
        response.encoding = "utf-8"
        html_page = etree.HTML(response.text)

        # Pagination links in the table caption; the last one is taken to
        # point at the final page, e.g. "nation_12.htm".
        results_page = html_page.xpath('/html/body/table[2]/caption/b/a/@href')
        if not results_page:
            # No pagination links found. Previously this raised IndexError.
            # NOTE(review): assumes such a listing is a single page, so the
            # listing URL itself is kept as its only page; confirm on the site.
            privince_pages.append(url_province)
            time.sleep(0.2)
            continue

        # Split "<prefix>_<num>.<ext>" positionally rather than with
        # str.split(num), which breaks when the digits of ``num`` also occur
        # earlier in the file name.
        name, sep, tail = results_page[-1].rpartition('_')
        num, dot, ext = tail.partition('.')
        prefix = name + sep
        suffix = dot + ext

        privince_page = [
            "https://ricedata.cn/variety/identified/" + prefix + str(i) + suffix
            for i in range(1, int(num) + 1)
        ]
        privince_pages.extend(privince_page)
        # Small pause between requests to avoid hammering the server.
        time.sleep(0.2)
        
    print(len(privince_pages))
        
    # Stage 3: download every listing page and collect, for each table row,
    # the list of text nodes found directly inside its <td> cells.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    }
    data_content = []
    for privince_page in privince_pages:
        print(privince_page)
        # ``headers=`` is required; the second positional argument of
        # requests.get() is ``params``, not ``headers``.
        response = requests.get(privince_page, headers=headers, timeout=30)
        # NOTE(review): these pages are decoded as gbk while the index pages
        # are decoded as utf-8; confirm against the pages' declared charset.
        response.encoding = "gbk"

        contents = etree.HTML(response.text, etree.HTMLParser())
        # Every <tr> anywhere under the second top-level table of the page.
        tr_content = contents.xpath('/html/body/table[2]//tr')

        # One entry per row; rows without <td> text yield an empty list.
        data_content.extend(tr.xpath('./td/text()') for tr in tr_content)
        # Small pause between requests to avoid hammering the server.
        time.sleep(0.2)
    print(len(data_content))
  • 相关阅读:
    【Linux开发】Linux下jpeglib库的安装详解
    【Linux开发】Linux下jpeglib库的安装详解
    【Linux开发】jpeglib使用指南
    【Linux开发】jpeglib使用指南
    【Linux开发】为qt-embedded添加jpeg库的交叉编译方法for arm
    【Linux开发】为qt-embedded添加jpeg库的交叉编译方法for arm
    Windows 7 64bit上安装Oracle Database 12c [INS-30131] 错误的解决方法
    Log4j 日志记录
    如何根据Ip获取地址信息--Java----待整理完善!!!
    Struts如何获取客户端ip地址
  • 原文地址:https://www.cnblogs.com/Skypeduty1225/p/15569210.html
Copyright © 2011-2022 走看看