zoukankan      html  css  js  c++  java
  • <爬虫实例> 8684公交网-太原公交线路信息

     1 import requests
     2 from lxml import etree
     3 
     4 '''访问“8684公交查询网”,抓取太原市公交路线:'''
     5 
     6 headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
     7                          'AppleWebKit/537.36 (KHTML, like Gecko) '
     8                          'Chrome/73.0.3683.86 Safari/537.36'}
     9 
    10 #用来保存公交路线信息
    11 items = []
    12 
    13 def get_navigation():
    14     '''获取内容'''
    15     url = 'https://taiyuan.8684.cn/'
    16     print('正在获取导航链接')
    17     r = requests.get(url,headers=headers)
    18 
    19     #解析内容,获取导航跳转链接
    20     tree = etree.HTML(r.text)
    21     #查找以数字开头的路线链接
    22     num_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href')
    23     # 查找以字母开头的路线链接
    24     char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href')
    25     #返回导航链接
    26     return num_href_list + char_href_list
    27 
    28 def get_route(navi_list):
    29     #循环遍历链接列表,发送请求,获取每个链接下的公交路线
    30     route_list = []
    31     for i in navi_list:
    32         route_url = 'https://taiyuan.8684.cn' + i
    33         print('正在获取以%s开头的公交路线' %i)
    34         r = requests.get(route_url,headers=headers)
    35 
    36         #解析内容,获取公交路线
    37         tree = etree.HTML(r.text)
    38         href_list = tree.xpath('//div[@id="con_site_1"]/a/@href')
    39         for href in href_list:
    40             route_list.append(href)
    41     return route_list
    42 
    43 def get_info(route_list):
    44     for route in route_list:
    45         info_url = 'https://taiyuan.8684.cn' + route
    46         r = requests.get(info_url,headers=headers)
    47 
    48         #解析获取具体信息
    49         tree = etree.HTML(r.text)
    50         route_name = tree.xpath('//div[@class="bus_i_t1"]/h1/text()')[0]
    51         print('正在获取%s的路线信息' % route_name)
    52         run_time = tree.xpath('//p[@class="bus_i_t4"][1]/text()')[0]
    53         ticket_price = tree.xpath('//p[@class="bus_i_t4"][2]/text()')[0]
    54         update_time = tree.xpath('//p[@class="bus_i_t4"][4]/text()')[0]
    55         station_num = tree.xpath('//div[@class="bus_line_top "]/span/text()')
    56         if len(station_num) == 2:
    57             up_num = station_num[0]
    58             up_station_name = tree.xpath('//div[@class="bus_line_site "][1]/div/div/a/text()')
    59             down_num = station_num[1]
    60             down_station_name = tree.xpath('//div[@class="bus_line_site "][2]/div/div/a/text()')
    61         else:
    62             up_num = station_num[0]
    63             down_num = station_num[0]
    64             up_station_name = tree.xpath('//div[@class="bus_line_site "]/div/div/a/text()')
    65             down_station_name = tree.xpath('//div[@class="bus_line_site "]/div/div/a/text()')
    66 
    67         #写入字典
    68         item = {'路线名':route_name,
    69                 '运行时间':run_time,
    70                 '票价':ticket_price,
    71                 '更新时间':update_time,
    72                 '上行站数':up_num,
    73                 '上行站名':up_station_name,
    74                 '下行站数':down_num,
    75                 '下行站名':down_station_name,}
    76 
    77         items.append(item)
    78 
    79 def main():
    80     #获取所有公交路线导航链接
    81     navi_list = get_navigation()
    82     print('导航链接爬取完毕')
    83 
    84     #循环遍历导航链接列表,找到所有公交路线
    85     route_list = get_route(navi_list)
    86     print('公交路线爬取完毕')
    87 
    88     # 遍历路线表,获取具体信息
    89     info_list = get_info(route_list)
    90     print('具体信息爬取完毕')
    91 
    92     #爬取完毕,写入文件
    93     fp = open('8684_太原公交路线.txt','w',encoding='utf8')
    94     for item in items:
    95         fp.write(str(item) + '
    ')
    96     fp.close()
    97 
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()
  • 相关阅读:
    idea配置tomcat
    idea中配置tomcat乱码问题--记录处理经验
    svn提交报错值 remains tree in conflict
    Vue之MVVM
    python 时间日期处理
    SVN使用指南
    linux查看硬件信息
    软件集成过程标准化的建议
    术语辨析
    科学计数e+转成正常str
  • 原文地址:https://www.cnblogs.com/Finance-IT-gao/p/11136977.html
Copyright © 2011-2022 走看看