  • Python Web Scraper

    1. Configure the third-party packages

    # Date and time handling
    import datetime
    # Data analysis module, used here to generate the date sequence
    import pandas as pd
    # Module for writing xlsx files
    import xlsxwriter as xlw
    # Module for fetching the web pages
    from urllib import request
    # Parser for the contents of HTML/XML tags
    from bs4 import BeautifulSoup as bs
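
    pandas, xlsxwriter and beautifulsoup4 are not part of the standard library and need to be installed first; a typical install (assuming pip is available in your environment) looks like this:

    # Assumed install command, run in a shell rather than inside Python:
    #   pip install pandas xlsxwriter beautifulsoup4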

    2. Date-sequence function

    # Generate the sequence of months between start and end
    def dateRange1(start, end):
        datelist1 = [datetime.datetime.strftime(x, '%Y%m') for x in list(
            pd.date_range(start=start, end=end))]
        datelist = sorted(list(set(datelist1)))
        return datelist
    # e.g. ['202005', '202006', '202007', '202008', '202009', '202010']
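
    A quick check of the function (a minimal usage sketch): pd.date_range defaults to daily frequency, so formatting every day as '%Y%m' and de-duplicating leaves one entry per month.

    months = dateRange1('2020-05', '2020-10')
    print(months)  # ['202005', '202006', '202007', '202008', '202009', '202010']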

    3. Scrape the pages

    # Fetch the page for each month, parse the HTML, pick out the weather rows
    # and collect them as a list of lists.
    def getCommentsById(city, start, end):
        weather_result = []
        # Get the sequence of months
        datelist = dateRange1(start, end)  # e.g. ['202009', '202010']
        for month in datelist:
            url = 'http://lishi.tianqi.com/' + city + '/' + month + '.html'
            # Build the request for the weather page
            opener = request.Request(url)
            # Add an HTTP User-Agent header
            opener.add_header(
                'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            req = request.urlopen(opener).read()
            # Parse the HTML
            soup = bs(req, 'html.parser')

            # 'div .thrui > li' selects one <li> per day
            weather_m = soup.select('div .thrui > li')
            # Loop over the daily rows
            for item in weather_m:
                tt = []
                for j in range(5):
                    t = item.find_all('div')[j].string
                    if t is not None:
                        tt.append(t)
                    else:
                        # Replace missing values, otherwise they cannot be written to Excel
                        tt.append('None')
                weather_result.append(tt)
                print(weather_result)
        return weather_result
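
    The loop above assumes that every monthly page loads; if the site is unreachable or a month does not exist, urlopen raises an exception and the whole run stops. A minimal sketch of guarding a single request (this helper and its timeout are assumptions, not part of the original script):

    from urllib import error

    def fetch_month(url):
        # Hypothetical helper: return the raw HTML, or None if the request fails.
        req = request.Request(url)
        req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        try:
            return request.urlopen(req, timeout=10).read()
        except error.URLError as exc:
            print('Failed to fetch %s: %s' % (url, exc))
            return None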

    4. Write the Excel file

    # Write the list data to a local Excel file
    def list_to_excel(weather_result, filename):
        # Create the workbook (name and path)
        workbook = xlw.Workbook(r'E:\%s.xlsx' % filename)
        # Add a worksheet
        sheet = workbook.add_worksheet('weather_report')
        # Header row: date, high temperature, low temperature, weather, wind direction
        title = ['日期', '最高气温', '最低气温', '天气', '风向']
        for i in range(len(title)):
            # Write the header cells in bold
            sheet.write_string(0, i, title[i], workbook.add_format({'bold': True}))
        row, col = 1, 0
        for a, b, c, d, e in weather_result:
            # Write each field of the record into its column
            sheet.write_string(row, col, a)
            sheet.write_string(row, col + 1, b)
            sheet.write_string(row, col + 2, c)
            sheet.write_string(row, col + 3, d)
            sheet.write_string(row, col + 4, e)
            row += 1
        # Close (and save) the workbook
        workbook.close()
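
    xlsxwriter can also write a whole row in one call with worksheet.write_row, so the per-cell writes above could be condensed (an equivalent sketch, assuming every record in weather_result has exactly five string fields):

    # Equivalent to the unpacking loop above: write each five-field record as one row.
    for row, record in enumerate(weather_result, start=1):
        sheet.write_row(row, 0, record)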

    5. Call the functions

    # City name in pinyin, start date, end date
    data = getCommentsById('hunan', '2020-09', '2020-10')

    # The scraped data and the Excel file name
    list_to_excel(data, '湖南天气202009-202010')
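
    After these two calls the workbook should appear on the E: drive, since the path is hard-coded in list_to_excel; a quick check (a sketch, assuming the same drive letter):

    import os
    print(os.path.exists(r'E:\湖南天气202009-202010.xlsx'))  # True if the export succeeded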

    Full source code

    # Date and time handling
    import datetime
    # Data analysis module, used here to generate the date sequence
    import pandas as pd
    # Module for writing xlsx files
    import xlsxwriter as xlw
    # Module for fetching the web pages
    from urllib import request
    # Parser for the contents of HTML/XML tags
    from bs4 import BeautifulSoup as bs


    # Generate the sequence of months between start and end
    def dateRange1(start, end):
        datelist1 = [datetime.datetime.strftime(x, '%Y%m') for x in list(
            pd.date_range(start=start, end=end))]
        datelist = sorted(list(set(datelist1)))
        return datelist
    # e.g. ['202005', '202006', '202007', '202008', '202009', '202010']


    # Fetch the page for each month, parse the HTML, pick out the weather rows
    # and collect them as a list of lists.
    def getCommentsById(city, start, end):
        weather_result = []
        # Get the sequence of months
        datelist = dateRange1(start, end)  # e.g. ['202009', '202010']
        for month in datelist:
            url = 'http://lishi.tianqi.com/' + city + '/' + month + '.html'
            # Build the request for the weather page
            opener = request.Request(url)
            # Add an HTTP User-Agent header
            opener.add_header(
                'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            req = request.urlopen(opener).read()
            # Parse the HTML
            soup = bs(req, 'html.parser')

            # 'div .thrui > li' selects one <li> per day
            weather_m = soup.select('div .thrui > li')
            # Loop over the daily rows
            for item in weather_m:
                tt = []
                for j in range(5):
                    t = item.find_all('div')[j].string
                    if t is not None:
                        tt.append(t)
                    else:
                        # Replace missing values, otherwise they cannot be written to Excel
                        tt.append('None')
                weather_result.append(tt)
                print(weather_result)
        return weather_result


    # Write the list data to a local Excel file
    def list_to_excel(weather_result, filename):
        # Create the workbook (name and path)
        workbook = xlw.Workbook(r'E:\%s.xlsx' % filename)
        # Add a worksheet
        sheet = workbook.add_worksheet('weather_report')
        # Header row: date, high temperature, low temperature, weather, wind direction
        title = ['日期', '最高气温', '最低气温', '天气', '风向']
        for i in range(len(title)):
            # Write the header cells in bold
            sheet.write_string(0, i, title[i], workbook.add_format({'bold': True}))
        row, col = 1, 0
        for a, b, c, d, e in weather_result:
            # Write each field of the record into its column
            sheet.write_string(row, col, a)
            sheet.write_string(row, col + 1, b)
            sheet.write_string(row, col + 2, c)
            sheet.write_string(row, col + 3, d)
            sheet.write_string(row, col + 4, e)
            row += 1
        # Close (and save) the workbook
        workbook.close()


    # City name in pinyin, start date, end date
    data = getCommentsById('hunan', '2020-09', '2020-10')

    # The scraped data and the Excel file name
    list_to_excel(data, '湖南天气202009-202010')