zoukankan      html  css  js  c++  java
  • 爬虫_中国天气网_文字天气预报(xpath)

     1 import requests
     2 from lxml import etree
     3 
     # Browser-like User-Agent string sent with every request.  The name is kept
     # as ``headers`` (although it holds a single header value) so existing
     # references to it keep working.
     headers = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'


     def get_html(url, timeout=10):
         """Download *url* and return the decoded page text.

         Args:
             url: Address of the page to fetch.
             timeout: Seconds to wait for the server before giving up, so a
                 stalled connection cannot block the script forever.

         Returns:
             The response body as ``str`` when the server answers with HTTP
             200; otherwise the integer ``0``, which callers use as the
             failure sentinel.
         """
         try:
             # Pass the module-level User-Agent value itself, not the literal
             # string 'headers'.
             html = requests.get(url, headers={'User-Agent': headers}, timeout=timeout)
         except requests.RequestException:
             # Base class of the exceptions raised by requests (connection
             # errors, timeouts, invalid URLs, ...).
             return 0

         if html.status_code != 200:
             return 0

         # Let requests guess the charset from the body instead of trusting
         # the HTTP header before decoding the text.
         html.encoding = html.apparent_encoding
         return html.text
    16 '''
    17 conMidtab:  华北  
    18     conMidtab2    北京
    19     conMidtab2    天津
    20     conMidtab2    河北
    21     conMidtab2    山西
    22     conMidtab2    内蒙古
    23 
    24 
    25 //div[@class="conMidtab"][1] 今天
    26 //div[@class="conMidtab"][2] 明天 
    27                 ...
    28 //div[@class="conMidtab"][7] 最后一天
    29 '''
    30 
    def parse_html(html):
        """Extract forecast rows from the first ``conMidtab`` block of a page.

        Every ``<tr>`` after the first two rows of each ``conMidtab2`` table
        becomes one dict with the keys ``'city'``, ``'phenomenon'``,
        ``'wind'`` and ``'hightest'``.  Each value is the list of text nodes
        returned by the corresponding XPath query; bare ``'\n'`` text nodes
        are dropped from ``'wind'``.

        Args:
            html: Page source as a string.  A falsy value (for example the
                ``0`` failure sentinel returned by ``get_html``) yields an
                empty result instead of raising.

        Returns:
            The list of row dicts, which is also printed.
        """
        wea = []
        if html:
            html_element = etree.HTML(html)
            provinces = html_element.xpath(
                '//div[@class="conMidtab"][1]//div[@class="conMidtab2"]')
            for province in provinces:
                # Slice per table rather than over all rows at once so the
                # first two rows of *every* table are skipped (presumably
                # header rows -- confirm against the page markup).
                for tr in province.xpath('.//tr')[2:]:
                    wind = [
                        text
                        for text in tr.xpath('.//td[@width="162"]//text()')
                        if text != '\n'
                    ]
                    wea.append({
                        'city': tr.xpath('.//td[@width="83"]/a/text()'),
                        'phenomenon': tr.xpath('.//td[@width="89"]/text()'),
                        'wind': wind,
                        # Key spelling kept as-is; it is part of the output.
                        'hightest': tr.xpath('.//td[@width="92"]/text()'),
                    })

        print(wea)
        return wea
    53     
    54         
    def main():
        """Fetch each regional forecast page and parse it.

        A page whose download fails is retried once.  If the retry fails as
        well, the page is reported and skipped instead of handing the failure
        sentinel to the parser.
        """
        urls = [
            'http://www.weather.com.cn/textFC/hb.shtml',
            'http://www.weather.com.cn/textFC/db.shtml',
            'http://www.weather.com.cn/textFC/hd.shtml',
            'http://www.weather.com.cn/textFC/hz.shtml',
            'http://www.weather.com.cn/textFC/hn.shtml',
            'http://www.weather.com.cn/textFC/xb.shtml',
            'http://www.weather.com.cn/textFC/xn.shtml',
            # NOTE(review): the author's notes say this page is laid out
            # differently, so its results may be incomplete.
            'http://www.weather.com.cn/textFC/gat.shtml',
        ]
        for url in urls:
            html = get_html(url)
            if html == 0:
                # get_html() returns 0 on failure; give the page one more try.
                html = get_html(url)
            if not html:
                print('failed to fetch', url)
                continue
            parse_html(html)


    if __name__ == '__main__':
        main()

    。。港澳台的格式不太一样,暂时不想管他们了

    。。运行结果的话,我爬取的是当日白天的天气,现在晚上了,数据都没有了(一开始还以为是代码改错了,还一直撤销)

  • 相关阅读:
    Fiddler 简介
    jQuery 属性操作
    Win7的虚拟Wi-Fi
    接口与内部类
    继承(二)
    J2EE框架(Struts&Hibernate&Spring)的理解
    继承(一)
    对象与类
    控制流程
    数据类型
  • 原文地址:https://www.cnblogs.com/MC-Curry/p/9426936.html
Copyright © 2011-2022 走看看